feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分,去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线;无证据时提示对真实性相关维度保守打分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据(会话/用户聚合、截断)
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD;移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试
This commit is contained in:
Kevin
2026-04-07 10:34:59 +08:00
parent ea97427767
commit 5972b0e721
9 changed files with 616 additions and 235 deletions

View File

@@ -26,6 +26,37 @@ _CONV_MAX = 8192
# NOTE(review): the use sites of _CONV_JUDGE_JSON_MAX and _COMPARE_STREAM_MAX
# are outside this view — presumably length caps for prompt text; confirm.
_CONV_JUDGE_JSON_MAX = 2048
# Slice bound applied to the memoir body before it is placed in the judge prompt.
_MEMOIR_MAX = 12000
_COMPARE_STREAM_MAX = 6144
# Slice bound applied separately to the source transcript and to the reference
# memoir when they are embedded in the memoir judge prompt as evidence.
_MEMOIR_EVIDENCE_MAX = 12000
def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Assemble an evidence-aware memoir judging prompt.

    The prompt is built from newline-joined sections, in this order:

    1. ``MEMOIR_JUDGE_INSTRUCTIONS`` followed by a blank line.
    2. Reviewer notes (only when ``evidence_notes`` is non-blank), limited to
       the first 1200 characters.
    3. The source transcript, limited to ``_MEMOIR_EVIDENCE_MAX`` characters.
       When no transcript is available, a fixed notice is emitted instead
       that tells the judge to score the fidelity/accuracy dimensions
       conservatively.
    4. The reference/baseline memoir (only when non-blank), limited to
       ``_MEMOIR_EVIDENCE_MAX`` characters.
    5. The memoir under review, limited to ``_MEMOIR_MAX`` characters.

    Args:
        memoir_markdown: Memoir text to be judged. ``None`` is treated as an
            empty string, matching how the optional arguments are handled.
        source_transcript: Raw interview text used as evidence; blank or
            ``None`` triggers the "no evidence" notice.
        reference_memoir_markdown: Optional baseline memoir for comparison.
        evidence_notes: Optional free-form notes for the judge.

    Returns:
        The assembled prompt string.
    """
    source = (source_transcript or "").strip()
    reference = (reference_memoir_markdown or "").strip()
    notes = (evidence_notes or "").strip()
    # Guard against None so the slice below cannot raise TypeError. The text
    # is deliberately NOT stripped, so output for str inputs is unchanged.
    memoir = memoir_markdown or ""

    sections = [MEMOIR_JUDGE_INSTRUCTIONS, ""]

    if notes:
        sections.extend(["【评审说明】", notes[:1200], ""])

    if source:
        sections.extend(["【原始访谈/证据】", source[:_MEMOIR_EVIDENCE_MAX], ""])
    else:
        # Without a transcript the judge cannot verify facts, so the prompt
        # explicitly asks for conservative scores on those dimensions.
        sections.extend(
            [
                "【原始访谈/证据】",
                "无可用原始访谈证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
                "",
            ]
        )

    if reference:
        sections.extend(["【参考基线/导出成稿】", reference[:_MEMOIR_EVIDENCE_MAX], ""])

    sections.extend(["【当前回忆录正文】", memoir[:_MEMOIR_MAX]])
    return "\n".join(sections)
class EvalJudgeService:
@@ -124,7 +155,7 @@ class EvalJudgeService:
{r_json}
请依次撰写:
1) 两段对话在整体体验上的主要差异(共情、追问、重复感、自然度等);
1) 两段对话在整体体验上的主要差异(情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等);
2) B 相对 A 的优点与不足;
3) 若 B 在关键维度明显弱于 A给出可操作的改进方向系统提示、访谈策略、模型或温度等
@@ -154,14 +185,22 @@ class EvalJudgeService:
logger.warning("conversation compare stream failed: {}", e)
yield f"\n\n[流式输出中断:{e}]"
async def judge_memoir(self, *, memoir_markdown: str) -> MemoirJudgeOutput | None:
async def judge_memoir(
self,
*,
memoir_markdown: str,
source_transcript: str = "",
reference_memoir_markdown: str = "",
evidence_notes: str = "",
) -> MemoirJudgeOutput | None:
if not self._llm:
return None
prompt = f"""{MEMOIR_JUDGE_INSTRUCTIONS}
【回忆录正文】
{memoir_markdown[:_MEMOIR_MAX]}
"""
prompt = _build_memoir_judge_prompt(
memoir_markdown=memoir_markdown,
source_transcript=source_transcript,
reference_memoir_markdown=reference_memoir_markdown,
evidence_notes=evidence_notes,
)
try:
return await allm_json_call(
self._llm,