feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分,去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线;无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据(会话/用户聚合、截断)
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD;移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试
This commit is contained in:
Kevin
2026-04-07 10:34:59 +08:00
parent ea97427767
commit 5972b0e721
9 changed files with 616 additions and 235 deletions

View File

@@ -26,6 +26,8 @@ logger = get_logger(__name__)
# Default character cap used by _clip_md_for_judge for markdown handed to the judge.
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
# NOTE(review): presumably the maximum number of chapters / stories judged per
# eval run; the code that reads these two is outside this view -- confirm.
_MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40
# How many conversations (taken from the front of the repo result) are stitched
# together when building user-level transcript evidence.
_MAX_EVIDENCE_CONVERSATIONS = 8
# Default character cap applied by _trim_evidence_text to transcript evidence.
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
@@ -56,6 +58,50 @@ def _assistant_text_for_eval_display(raw: str) -> str:
return (raw or "").replace("[SPLIT]", "\n")
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Strip ``text`` and cap it at ``max_chars`` characters of evidence.

    ``None`` or an empty value is treated as an empty string. When the
    stripped text is longer than ``max_chars``, only its first ``max_chars``
    characters are kept and a truncation marker is appended, so the returned
    string is then longer than ``max_chars`` by the length of that marker.
    """
    cleaned = text.strip() if text else ""
    if len(cleaned) > max_chars:
        # Tell the reader (the judge prompt) that the transcript was cut short.
        return cleaned[:max_chars] + "\n\n…(访谈证据已截断)"
    return cleaned
def _dialogue_transcript_from_pairs(pairs: list[tuple[str, str]]) -> str:
    """Render ``(role, content)`` pairs as a labelled plain-text transcript.

    Entries whose content is empty or whitespace-only are dropped. The role
    ``"human"`` is labelled ``用户`` and its stripped text is used as-is; every
    other role is labelled ``AI`` and its stripped text is first passed through
    ``_assistant_text_for_eval_display``. Rendered turns are joined with a
    blank line between them.
    """

    def _render(role: str, text: str) -> str:
        # Only non-human text goes through the assistant display clean-up.
        if role == "human":
            return f"用户: {text}"
        return f"AI: {_assistant_text_for_eval_display(text)}"

    stripped = ((role, (content or "").strip()) for role, content in pairs)
    return "\n\n".join(_render(role, text) for role, text in stripped if text)
async def _conversation_transcript_for_eval(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load one conversation's messages and return them as a labelled transcript.

    Each message's role is stringified and lower-cased and its content is
    stringified (missing values become ``""``) before the pairs are handed to
    ``_dialogue_transcript_from_pairs``.
    """
    # Function-scope import, as in the original block.
    from app.features.conversation import repo as conversation_repo

    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
    pairs: list[tuple[str, str]] = []
    for message in messages:
        role = str(message.role or "").lower()
        content = str(message.content or "")
        pairs.append((role, content))
    return _dialogue_transcript_from_pairs(pairs)
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Build a combined, length-capped transcript from a user's conversations.

    Takes the first ``_MAX_EVIDENCE_CONVERSATIONS`` conversations returned by
    the conversation repo, walks them in reverse order, and emits one
    ``## 会话 <id>`` section per conversation that has a non-empty transcript.
    The joined sections are passed through ``_trim_evidence_text``. Returns
    ``""`` when the user has no conversations.
    """
    # Function-scope import, as in the original block.
    from app.features.conversation import repo as conversation_repo

    conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not conversations:
        return ""

    # NOTE(review): reversing the leading slice suggests the repo returns
    # newest-first and the evidence should read oldest-first -- confirm.
    selected = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
    sections: list[str] = []
    for conv in reversed(selected):
        conv_id = str(conv.id)
        transcript = await _conversation_transcript_for_eval(db, conv_id)
        if not transcript:
            continue
        sections.append(f"## 会话 {conv_id}\n{transcript}")
    return _trim_evidence_text("\n\n".join(sections))
async def execute_eval_run(
db: AsyncSession,
*,
@@ -150,7 +196,7 @@ async def execute_eval_run(
rationale = tj.rationale if tj else None
await eval_repo.add_turn(
db,
run_id=run.id,
run_id=str(run.id),
turn_index=idx,
user_utterance=u,
assistant_reply=replies[idx],
@@ -166,11 +212,36 @@ async def execute_eval_run(
conv_total = conv_out.total_score if conv_out else None
memoir_md = simple_memoir_from_transcript(utterances, replies)
mem_out = await judge.judge_memoir(memoir_markdown=memoir_md)
source_transcript = _trim_evidence_text(full_transcript)
reference_memoir = (case.reference_memoir_markdown or "").strip()
mem_out = await judge.judge_memoir(
memoir_markdown=memoir_md,
source_transcript=source_transcript,
reference_memoir_markdown=reference_memoir,
evidence_notes="严格按文档核对真实性、覆盖率、可追溯性;以原始访谈为主,参考基线仅作辅助。",
)
chapter_entries: list[dict[str, Any]] = []
story_entries: list[dict[str, Any]] = []
uid = (case.source_user_id or "").strip()
source_conversation_id = (case.source_conversation_id or "").strip()
evidence_transcript = source_transcript
if source_conversation_id:
try:
conversation_evidence = await _conversation_transcript_for_eval(
db, source_conversation_id
)
if conversation_evidence:
evidence_transcript = _trim_evidence_text(conversation_evidence)
except Exception as e:
logger.warning("eval source conversation evidence skipped: {}", e)
elif uid:
try:
user_evidence = await _user_transcript_evidence(db, uid)
if user_evidence:
evidence_transcript = user_evidence
except Exception as e:
logger.warning("eval user transcript evidence skipped: {}", e)
if uid:
from app.features.memoir.repo import get_chapters_for_memoir_list
from app.features.story.repo import get_stories_for_user
@@ -184,7 +255,14 @@ async def execute_eval_run(
if not body:
continue
md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
cj = await judge.judge_memoir(memoir_markdown=md)
cj = await judge.judge_memoir(
memoir_markdown=md,
source_transcript=evidence_transcript,
reference_memoir_markdown=reference_memoir,
evidence_notes=(
"这是用户现有章节的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
),
)
chapter_entries.append(
{
"id": ch.id,
@@ -203,7 +281,14 @@ async def execute_eval_run(
if not body:
continue
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
sj = await judge.judge_memoir(memoir_markdown=md)
sj = await judge.judge_memoir(
memoir_markdown=md,
source_transcript=evidence_transcript,
reference_memoir_markdown=reference_memoir,
evidence_notes=(
"这是用户现有故事的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
),
)
story_entries.append(
{
"id": st.id,
@@ -228,8 +313,12 @@ async def execute_eval_run(
mem_parts.append(float(j["total_score"]))
mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None
exp = await eval_repo.get_experiment(db, run.experiment_id)
weights = exp.composite_weights_json if exp else None
exp = await eval_repo.get_experiment(db, str(run.experiment_id))
weights = (
exp.composite_weights_json
if exp and isinstance(exp.composite_weights_json, dict)
else None
)
comp = _composite(conv_total, mem_total, weights)
bundle: dict[str, Any] = {
@@ -257,13 +346,13 @@ async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> Non
exp = await eval_repo.get_experiment(db, experiment_id)
if not exp:
return
cases = await eval_repo.list_cases(db, exp.regression_set_id)
cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
incomplete = [r for r in runs if r.status not in ("completed", "failed")]
incomplete = [r for r in runs if str(r.status) not in ("completed", "failed")]
if incomplete:
return
failed = [r for r in runs if r.status == "failed"]
failed = [r for r in runs if str(r.status) == "failed"]
if failed:
await eval_repo.update_experiment(
db,
@@ -301,10 +390,10 @@ async def execute_experiment_full(experiment_id: str) -> None:
await eval_repo.update_experiment(db, exp, status="running")
await db.commit()
cases = await eval_repo.list_cases(db, exp.regression_set_id)
base_v = await eval_repo.get_version(db, exp.baseline_version_id)
cand_v = await eval_repo.get_version(db, exp.candidate_version_id)
if not base_v or not cand_v:
cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
base_v = await eval_repo.get_version(db, str(exp.baseline_version_id))
cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id))
if base_v is None or cand_v is None:
await eval_repo.update_experiment(
db,
exp,
@@ -317,12 +406,12 @@ async def execute_experiment_full(experiment_id: str) -> None:
for case in cases:
for side, ver in ("baseline", base_v), ("candidate", cand_v):
run = await eval_repo.get_run(db, experiment_id, case.id, side)
run = await eval_repo.get_run(db, experiment_id, str(case.id), side)
if not run:
run = await eval_repo.create_run(
db,
experiment_id=experiment_id,
case_id=case.id,
case_id=str(case.id),
side=side,
)
await db.commit()