feat: 回忆录证据血缘与内部评测可追溯,顺带对齐本地评测台与 CI

数据库与模型:新增多版迁移(章节证据快照、对话血缘、记忆事实/时间线 lineage 等),把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路:会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照;新增章节证据快照与评测侧 EvalTraceService 等模块,方便组装评审用的证据包。
内部评测:自动化 run 与手工 memoir 评审共用可追溯证据;rubric/judge 相关脚本与文档有配套调整。
app-eval-web:Memoir/实验详情里能展开看证据摘要与 evidence_trace(含对话轮次 id);Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致,避免改端口后页面连错服务。
工程杂项:GitHub Actions / 仓库说明有更新;各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾;新增/扩充了…
This commit is contained in:
Kevin
2026-04-08 15:37:09 +08:00
parent 6772e1269c
commit 309a051038
109 changed files with 4125 additions and 858 deletions

View File

@@ -91,28 +91,45 @@ def _build_memoir_judge_prompt(
*,
memoir_markdown: str,
source_transcript: str = "",
structured_evidence: str = "",
reference_memoir_markdown: str = "",
evidence_notes: str = "",
) -> str:
"""Assemble an evidence-aware memoir judging prompt."""
source = (source_transcript or "").strip()
struct = (structured_evidence or "").strip()
reference = (reference_memoir_markdown or "").strip()
notes = (evidence_notes or "").strip()
sections = [
MEMOIR_JUDGE_INSTRUCTIONS,
"",
"【证据与输入顺序】以下区块按优先级给出:评审说明(若有)→ 原始访谈证据 → 参考基线(若有)→ 待评成稿。**真实性相关细项必须以原始访谈证据为准。**",
"【证据与输入顺序】以下区块按优先级给出:"
"评审说明(若有)→ 原始访谈/对话证据segment 绑定)→ 结构化记忆证据chunk/fact/timeline/summary"
"→ 参考基线(若有)→ 待评成稿。**真实性、覆盖率、可追溯性以「artifact 绑定证据闭包」为准**"
"若存在 `lineage_tier=fallback` 或证据不足,须保守打分并写 `insufficient_evidence`。",
"",
]
if notes:
sections.extend(["【评审说明】", notes[:1200], ""])
if source:
sections.extend(["【原始访谈/证据】", source[:_MEMOIR_EVIDENCE_MAX], ""])
sections.extend(["【原始访谈/对话证据】", source[:_MEMOIR_EVIDENCE_MAX], ""])
else:
sections.extend(
[
"【原始访谈/证据】",
"无可用原始访谈证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
"【原始访谈/对话证据】",
"无可用局部对话证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
"",
]
)
if struct:
sections.extend(
["【结构化记忆证据】", struct[:_MEMOIR_EVIDENCE_MAX], ""]
)
else:
sections.extend(
[
"【结构化记忆证据】",
"(本 artifact 未绑定或未解析到 chunk/fact/timeline/summary 证据。)",
"",
]
)
@@ -268,12 +285,14 @@ class EvalJudgeService:
*,
memoir_markdown: str,
source_transcript: str = "",
structured_evidence: str = "",
reference_memoir_markdown: str = "",
evidence_notes: str = "",
) -> MemoirJudgeOutput | None:
result = await self.judge_memoir_result(
memoir_markdown=memoir_markdown,
source_transcript=source_transcript,
structured_evidence=structured_evidence,
reference_memoir_markdown=reference_memoir_markdown,
evidence_notes=evidence_notes,
)
@@ -284,6 +303,7 @@ class EvalJudgeService:
*,
memoir_markdown: str,
source_transcript: str = "",
structured_evidence: str = "",
reference_memoir_markdown: str = "",
evidence_notes: str = "",
) -> JudgeCallResult[MemoirJudgeOutput]:
@@ -292,6 +312,7 @@ class EvalJudgeService:
prompt = _build_memoir_judge_prompt(
memoir_markdown=memoir_markdown,
source_transcript=source_transcript,
structured_evidence=structured_evidence,
reference_memoir_markdown=reference_memoir_markdown,
evidence_notes=evidence_notes,
)