feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整
- 回忆录细项上限收紧为合计 100 分,去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线;无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据(会话/用户聚合、截断)
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD;移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试
This commit is contained in:
@@ -26,6 +26,37 @@ _CONV_MAX = 8192
|
||||
_CONV_JUDGE_JSON_MAX = 2048
|
||||
_MEMOIR_MAX = 12000
|
||||
_COMPARE_STREAM_MAX = 6144
|
||||
_MEMOIR_EVIDENCE_MAX = 12000
|
||||
|
||||
|
||||
def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Assemble an evidence-aware memoir judging prompt.

    The prompt is a newline-joined sequence of sections, in this order:

    1. ``MEMOIR_JUDGE_INSTRUCTIONS``.
    2. Reviewer notes (only when ``evidence_notes`` is non-blank), capped at
       1200 characters.
    3. The source transcript, capped at ``_MEMOIR_EVIDENCE_MAX`` characters.
       When no transcript is supplied, a fixed notice is emitted instead that
       tells the judge to score the fidelity/accuracy dimensions
       conservatively.
    4. The reference/baseline memoir (only when non-blank), capped at
       ``_MEMOIR_EVIDENCE_MAX`` characters.
    5. The memoir under review, capped at ``_MEMOIR_MAX`` characters.

    Args:
        memoir_markdown: Memoir text to be judged. ``None`` is treated as an
            empty string, matching how the optional arguments are handled.
        source_transcript: Optional interview transcript used as evidence.
        reference_memoir_markdown: Optional baseline memoir for comparison.
        evidence_notes: Optional free-form notes for the judge.

    Returns:
        The assembled prompt string.
    """
    notes_max = 1200  # cap on reviewer notes, in characters

    # Guard against None the same way as the optional inputs; the memoir body
    # is deliberately NOT stripped so its content is passed through unchanged.
    memoir = memoir_markdown or ""
    source = (source_transcript or "").strip()
    reference = (reference_memoir_markdown or "").strip()
    notes = (evidence_notes or "").strip()

    sections = [MEMOIR_JUDGE_INSTRUCTIONS, ""]

    if notes:
        sections.extend(["【评审说明】", notes[:notes_max], ""])

    if source:
        sections.extend(["【原始访谈/证据】", source[:_MEMOIR_EVIDENCE_MAX], ""])
    else:
        # No evidence available: keep the section header and tell the judge
        # explicitly not to award high fidelity scores without support.
        sections.extend(
            [
                "【原始访谈/证据】",
                "无可用原始访谈证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
                "",
            ]
        )

    if reference:
        sections.extend(["【参考基线/导出成稿】", reference[:_MEMOIR_EVIDENCE_MAX], ""])

    sections.extend(["【当前回忆录正文】", memoir[:_MEMOIR_MAX]])
    return "\n".join(sections)
|
||||
|
||||
|
||||
class EvalJudgeService:
|
||||
@@ -124,7 +155,7 @@ class EvalJudgeService:
|
||||
{r_json}
|
||||
|
||||
请依次撰写:
|
||||
1) 两段对话在整体体验上的主要差异(共情、追问、重复感、自然度等);
|
||||
1) 两段对话在整体体验上的主要差异(情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等);
|
||||
2) B 相对 A 的优点与不足;
|
||||
3) 若 B 在关键维度明显弱于 A,给出可操作的改进方向(系统提示、访谈策略、模型或温度等)。
|
||||
|
||||
@@ -154,14 +185,22 @@ class EvalJudgeService:
|
||||
logger.warning("conversation compare stream failed: {}", e)
|
||||
yield f"\n\n[流式输出中断:{e}]"
|
||||
|
||||
async def judge_memoir(self, *, memoir_markdown: str) -> MemoirJudgeOutput | None:
|
||||
async def judge_memoir(
|
||||
self,
|
||||
*,
|
||||
memoir_markdown: str,
|
||||
source_transcript: str = "",
|
||||
reference_memoir_markdown: str = "",
|
||||
evidence_notes: str = "",
|
||||
) -> MemoirJudgeOutput | None:
|
||||
if not self._llm:
|
||||
return None
|
||||
prompt = f"""{MEMOIR_JUDGE_INSTRUCTIONS}
|
||||
|
||||
【回忆录正文】
|
||||
{memoir_markdown[:_MEMOIR_MAX]}
|
||||
"""
|
||||
prompt = _build_memoir_judge_prompt(
|
||||
memoir_markdown=memoir_markdown,
|
||||
source_transcript=source_transcript,
|
||||
reference_memoir_markdown=reference_memoir_markdown,
|
||||
evidence_notes=evidence_notes,
|
||||
)
|
||||
try:
|
||||
return await allm_json_call(
|
||||
self._llm,
|
||||
|
||||
Reference in New Issue
Block a user