feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分,去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线;无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据(会话/用户聚合、截断)
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD;移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试
This commit is contained in:
Kevin
2026-04-07 10:34:59 +08:00
parent ea97427767
commit 5972b0e721
9 changed files with 616 additions and 235 deletions

View File

@@ -10,6 +10,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.core.dependencies import get_eval_judge_langchain_llm
from app.core.logging import get_logger
from app.features.conversation import repo as conversation_repo
from app.features.evaluation.errors import (
EvaluationBadRequestError,
EvaluationNotFoundError,
@@ -27,6 +28,8 @@ logger = get_logger(__name__)
# Default character cap applied by _clip_md_for_judge (its ``max_chars`` default).
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
# NOTE(review): presumably upper bounds on how many chapters / stories a single
# evaluation judges; their use is not visible in this part of the file — confirm.
_MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40
# How many conversations (taken from the head of the repo result) are read by
# _user_transcript_evidence when assembling transcript evidence.
_MAX_EVIDENCE_CONVERSATIONS = 8
# Default character cap applied by _trim_evidence_text before it appends its
# truncation marker.
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
@@ -48,6 +51,41 @@ def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
return "\n\n".join(parts)
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Strip *text* and cap it at *max_chars* characters.

    A falsy ``text`` is treated as an empty string. When the stripped text is
    longer than ``max_chars``, only its first ``max_chars`` characters are kept
    and a truncation marker is appended; the returned string is therefore
    longer than ``max_chars`` by the length of that marker.

    Args:
        text: Raw transcript evidence; may be empty or ``None``-like.
        max_chars: Maximum number of characters of the stripped text to keep.

    Returns:
        The stripped text, unchanged when it fits, otherwise its leading
        ``max_chars`` characters followed by the truncation marker.
    """
    cleaned = (text or "").strip()
    if len(cleaned) > max_chars:
        head = cleaned[:max_chars]
        # The marker tells the judge that the evidence shown is incomplete.
        return f"{head}\n\n…(访谈证据已截断)"
    return cleaned
async def _conversation_transcript_for_manual(
    db: AsyncSession, conversation_id: str
) -> str:
    """Render one conversation's messages as a plain-text transcript.

    Messages are loaded through ``conversation_repo.get_conversation_messages``.
    A message whose content is empty after stripping is skipped. A message
    whose lowercased role equals ``"human"`` is labelled ``用户`` and kept as
    is; any other message is labelled ``AI`` and its text is first passed
    through ``_assistant_text_for_eval_display``.

    Args:
        db: Async database session handed to the repository call.
        conversation_id: Identifier of the conversation to render.

    Returns:
        The ``"<label>: <text>"`` entries joined by blank lines, or an empty
        string when no message has content.
    """
    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
    entries: list[str] = []
    for message in messages:
        speaker = (message.role or "").lower()
        content = (message.content or "").strip()
        if not content:
            continue
        if speaker == "human":
            # User text is shown verbatim.
            entries.append(f"用户: {content}")
        else:
            # Every non-human role is treated as the assistant.
            shown = _assistant_text_for_eval_display(content)
            entries.append(f"AI: {shown}")
    return "\n\n".join(entries)
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Build a combined transcript of a user's conversations for the judge.

    Takes the first ``_MAX_EVIDENCE_CONVERSATIONS`` entries returned by
    ``conversation_repo.get_user_conversations``, walks them in reverse order,
    renders each with ``_conversation_transcript_for_manual`` and puts every
    non-empty transcript under a ``## 会话 <id>`` heading. The joined text is
    finally passed through ``_trim_evidence_text``.

    NOTE(review): presumably the repository returns the newest conversation
    first, so the reversal yields chronological order — confirm against the
    repo implementation.

    Args:
        db: Async database session handed to the repository calls.
        user_id: Identifier of the user whose conversations are collected.

    Returns:
        The trimmed, concatenated transcript, or ``""`` when the user has no
        conversations.
    """
    conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not conversations:
        return ""

    selected = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
    sections: list[str] = []
    for conversation in reversed(selected):
        conversation_id = str(conversation.id)
        rendered = await _conversation_transcript_for_manual(db, conversation_id)
        if not rendered:
            # Conversations without any usable message add no evidence.
            continue
        sections.append(f"## 会话 {conversation_id}\n{rendered}")

    combined = "\n\n".join(sections)
    return _trim_evidence_text(combined)
def _normalize_title_key(title: str) -> str:
t = (title or "").strip().lower()
t = re.sub(r"^#+\s*", "", t)
@@ -271,6 +309,7 @@ class EvalJudgeManualService:
judge_llm = get_eval_judge_langchain_llm()
judge = EvalJudgeService(judge_llm)
baselines = list(baseline_sections or [])
evidence_transcript = await _user_transcript_evidence(self._db, uid)
chapter_results: list[dict[str, Any]] = []
try:
@@ -281,7 +320,7 @@ class EvalJudgeManualService:
body = (ch.canonical_markdown or "").strip()
if not body:
continue
bl = _baseline_for_chapter_title(baselines, ch.title or "", i)
bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
baseline_excerpt = ""
if bl and (bl.body or "").strip():
baseline_excerpt = _clip_md_for_judge(bl.body, max_chars=6000)
@@ -289,7 +328,14 @@ class EvalJudgeManualService:
if baseline_excerpt:
md += f"## 导出基线(节选)\n\n{baseline_excerpt}\n\n"
md += f"## 当前成稿\n\n{_clip_md_for_judge(body)}"
cj = await judge.judge_memoir(memoir_markdown=md)
cj = await judge.judge_memoir(
memoir_markdown=md,
source_transcript=evidence_transcript,
reference_memoir_markdown=baseline_excerpt,
evidence_notes=(
"严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
),
)
chapter_results.append(
{
"id": ch.id,
@@ -310,7 +356,13 @@ class EvalJudgeManualService:
if not body:
continue
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
sj = await judge.judge_memoir(memoir_markdown=md)
sj = await judge.judge_memoir(
memoir_markdown=md,
source_transcript=evidence_transcript,
evidence_notes=(
"严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
),
)
story_results.append(
{
"id": st.id,