feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整
- 回忆录细项上限收紧为合计 100 分,去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线;无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据(会话/用户聚合、截断)
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD;移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试
This commit is contained in:
@@ -26,6 +26,8 @@ logger = get_logger(__name__)
|
||||
# Default character cap used by _clip_md_for_judge (its max_chars default).
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
|
||||
# NOTE(review): presumably the cap on chapters judged per eval run — usage not visible here; confirm.
_MAX_EVAL_CHAPTERS = 30
|
||||
# NOTE(review): presumably the cap on stories judged per eval run — usage not visible here; confirm.
_MAX_EVAL_STORIES = 40
|
||||
# Number of conversations _user_transcript_evidence takes from the user's conversation list.
_MAX_EVIDENCE_CONVERSATIONS = 8
|
||||
# Default character cap used by _trim_evidence_text (its max_chars default).
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
|
||||
|
||||
|
||||
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
|
||||
@@ -56,6 +58,50 @@ def _assistant_text_for_eval_display(raw: str) -> str:
|
||||
return (raw or "").replace("[SPLIT]", "\n")
|
||||
|
||||
|
||||
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Strip *text* and cap it at *max_chars* characters.

    Falsy input (``None`` or ``""``) is treated as an empty string. When the
    stripped text is longer than *max_chars*, only its first *max_chars*
    characters are kept and a truncation notice is appended, so the returned
    string exceeds *max_chars* by the length of that notice.
    """
    cleaned = text.strip() if text else ""
    if len(cleaned) > max_chars:
        # Keep the head of the evidence and flag the cut for the reader.
        return cleaned[:max_chars] + "\n\n…(访谈证据已截断)"
    return cleaned
|
||||
|
||||
|
||||
def _dialogue_transcript_from_pairs(pairs: list[tuple[str, str]]) -> str:
|
||||
parts: list[str] = []
|
||||
for role, content in pairs:
|
||||
body = (content or "").strip()
|
||||
if not body:
|
||||
continue
|
||||
label = "用户" if role == "human" else "AI"
|
||||
out = _assistant_text_for_eval_display(body) if role != "human" else body
|
||||
parts.append(f"{label}: {out}")
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
async def _conversation_transcript_for_eval(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load one conversation's messages and render them as a transcript.

    Messages are fetched through the conversation repo. Each row's role is
    coerced to ``str`` and lower-cased, and its content coerced to ``str``
    (falsy values become ``""``), before the pairs are handed to
    ``_dialogue_transcript_from_pairs``.
    """
    # Function-scope import of the conversation repo.
    from app.features.conversation import repo as conversation_repo

    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
    pairs: list[tuple[str, str]] = []
    for message in messages:
        role = str(message.role or "").lower()
        content = str(message.content or "")
        pairs.append((role, content))
    return _dialogue_transcript_from_pairs(pairs)
|
||||
|
||||
|
||||
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Build combined transcript evidence from a user's conversations.

    Takes the first ``_MAX_EVIDENCE_CONVERSATIONS`` entries returned by the
    conversation repo, walks them in reverse order, and emits one
    ``## 会话 <id>`` section per conversation whose transcript is non-empty.
    The joined sections are passed through ``_trim_evidence_text``.

    Returns ``""`` when the repo returns no conversations for the user.
    """
    # Function-scope import of the conversation repo.
    from app.features.conversation import repo as conversation_repo

    conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not conversations:
        return ""

    selected = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
    sections: list[str] = []
    # NOTE(review): the reversal presumably turns a newest-first listing into
    # chronological order — confirm against the repo's ordering.
    for conv in selected[::-1]:
        conv_id = str(conv.id)
        # Transcripts are loaded one at a time on the same session.
        transcript = await _conversation_transcript_for_eval(db, conv_id)
        if not transcript:
            continue
        sections.append(f"## 会话 {conv_id}\n{transcript}")
    return _trim_evidence_text("\n\n".join(sections))
|
||||
|
||||
|
||||
async def execute_eval_run(
|
||||
db: AsyncSession,
|
||||
*,
|
||||
@@ -150,7 +196,7 @@ async def execute_eval_run(
|
||||
rationale = tj.rationale if tj else None
|
||||
await eval_repo.add_turn(
|
||||
db,
|
||||
run_id=run.id,
|
||||
run_id=str(run.id),
|
||||
turn_index=idx,
|
||||
user_utterance=u,
|
||||
assistant_reply=replies[idx],
|
||||
@@ -166,11 +212,36 @@ async def execute_eval_run(
|
||||
conv_total = conv_out.total_score if conv_out else None
|
||||
|
||||
memoir_md = simple_memoir_from_transcript(utterances, replies)
|
||||
mem_out = await judge.judge_memoir(memoir_markdown=memoir_md)
|
||||
source_transcript = _trim_evidence_text(full_transcript)
|
||||
reference_memoir = (case.reference_memoir_markdown or "").strip()
|
||||
mem_out = await judge.judge_memoir(
|
||||
memoir_markdown=memoir_md,
|
||||
source_transcript=source_transcript,
|
||||
reference_memoir_markdown=reference_memoir,
|
||||
evidence_notes="严格按文档核对真实性、覆盖率、可追溯性;以原始访谈为主,参考基线仅作辅助。",
|
||||
)
|
||||
|
||||
chapter_entries: list[dict[str, Any]] = []
|
||||
story_entries: list[dict[str, Any]] = []
|
||||
uid = (case.source_user_id or "").strip()
|
||||
source_conversation_id = (case.source_conversation_id or "").strip()
|
||||
evidence_transcript = source_transcript
|
||||
if source_conversation_id:
|
||||
try:
|
||||
conversation_evidence = await _conversation_transcript_for_eval(
|
||||
db, source_conversation_id
|
||||
)
|
||||
if conversation_evidence:
|
||||
evidence_transcript = _trim_evidence_text(conversation_evidence)
|
||||
except Exception as e:
|
||||
logger.warning("eval source conversation evidence skipped: {}", e)
|
||||
elif uid:
|
||||
try:
|
||||
user_evidence = await _user_transcript_evidence(db, uid)
|
||||
if user_evidence:
|
||||
evidence_transcript = user_evidence
|
||||
except Exception as e:
|
||||
logger.warning("eval user transcript evidence skipped: {}", e)
|
||||
if uid:
|
||||
from app.features.memoir.repo import get_chapters_for_memoir_list
|
||||
from app.features.story.repo import get_stories_for_user
|
||||
@@ -184,7 +255,14 @@ async def execute_eval_run(
|
||||
if not body:
|
||||
continue
|
||||
md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
|
||||
cj = await judge.judge_memoir(memoir_markdown=md)
|
||||
cj = await judge.judge_memoir(
|
||||
memoir_markdown=md,
|
||||
source_transcript=evidence_transcript,
|
||||
reference_memoir_markdown=reference_memoir,
|
||||
evidence_notes=(
|
||||
"这是用户现有章节的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
|
||||
),
|
||||
)
|
||||
chapter_entries.append(
|
||||
{
|
||||
"id": ch.id,
|
||||
@@ -203,7 +281,14 @@ async def execute_eval_run(
|
||||
if not body:
|
||||
continue
|
||||
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
|
||||
sj = await judge.judge_memoir(memoir_markdown=md)
|
||||
sj = await judge.judge_memoir(
|
||||
memoir_markdown=md,
|
||||
source_transcript=evidence_transcript,
|
||||
reference_memoir_markdown=reference_memoir,
|
||||
evidence_notes=(
|
||||
"这是用户现有故事的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
|
||||
),
|
||||
)
|
||||
story_entries.append(
|
||||
{
|
||||
"id": st.id,
|
||||
@@ -228,8 +313,12 @@ async def execute_eval_run(
|
||||
mem_parts.append(float(j["total_score"]))
|
||||
mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None
|
||||
|
||||
exp = await eval_repo.get_experiment(db, run.experiment_id)
|
||||
weights = exp.composite_weights_json if exp else None
|
||||
exp = await eval_repo.get_experiment(db, str(run.experiment_id))
|
||||
weights = (
|
||||
exp.composite_weights_json
|
||||
if exp and isinstance(exp.composite_weights_json, dict)
|
||||
else None
|
||||
)
|
||||
comp = _composite(conv_total, mem_total, weights)
|
||||
|
||||
bundle: dict[str, Any] = {
|
||||
@@ -257,13 +346,13 @@ async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> Non
|
||||
exp = await eval_repo.get_experiment(db, experiment_id)
|
||||
if not exp:
|
||||
return
|
||||
cases = await eval_repo.list_cases(db, exp.regression_set_id)
|
||||
cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
|
||||
|
||||
incomplete = [r for r in runs if r.status not in ("completed", "failed")]
|
||||
incomplete = [r for r in runs if str(r.status) not in ("completed", "failed")]
|
||||
if incomplete:
|
||||
return
|
||||
|
||||
failed = [r for r in runs if r.status == "failed"]
|
||||
failed = [r for r in runs if str(r.status) == "failed"]
|
||||
if failed:
|
||||
await eval_repo.update_experiment(
|
||||
db,
|
||||
@@ -301,10 +390,10 @@ async def execute_experiment_full(experiment_id: str) -> None:
|
||||
await eval_repo.update_experiment(db, exp, status="running")
|
||||
await db.commit()
|
||||
|
||||
cases = await eval_repo.list_cases(db, exp.regression_set_id)
|
||||
base_v = await eval_repo.get_version(db, exp.baseline_version_id)
|
||||
cand_v = await eval_repo.get_version(db, exp.candidate_version_id)
|
||||
if not base_v or not cand_v:
|
||||
cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
|
||||
base_v = await eval_repo.get_version(db, str(exp.baseline_version_id))
|
||||
cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id))
|
||||
if base_v is None or cand_v is None:
|
||||
await eval_repo.update_experiment(
|
||||
db,
|
||||
exp,
|
||||
@@ -317,12 +406,12 @@ async def execute_experiment_full(experiment_id: str) -> None:
|
||||
|
||||
for case in cases:
|
||||
for side, ver in ("baseline", base_v), ("candidate", cand_v):
|
||||
run = await eval_repo.get_run(db, experiment_id, case.id, side)
|
||||
run = await eval_repo.get_run(db, experiment_id, str(case.id), side)
|
||||
if not run:
|
||||
run = await eval_repo.create_run(
|
||||
db,
|
||||
experiment_id=experiment_id,
|
||||
case_id=case.id,
|
||||
case_id=str(case.id),
|
||||
side=side,
|
||||
)
|
||||
await db.commit()
|
||||
|
||||
Reference in New Issue
Block a user