feat: 回忆录证据血缘与内部评测可追溯,顺带对齐本地评测台与 CI
数据库与模型:新增多版迁移(章节证据快照、对话血缘、记忆事实/时间线 lineage 等),把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。 业务链路:会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照;新增章节证据快照与评测侧 EvalTraceService 等模块,方便组评审用的证据包。 内部评测:自动化 run 与手工 memoir 评审共用可追溯证据;rubric/judge 相关脚本与文档有配套调整。 app-eval-web:Memoir/实验详情里能展开看证据摘要与 evidence_trace(含对话轮次 id);Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致,避免改端口后页面连错服务。 工程杂项:GitHub Actions / 仓库说明有更新;各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾;新增/扩充了…
This commit is contained in:
@@ -16,6 +16,7 @@ from app.features.evaluation.candidate_runner import (
|
||||
EvalCandidateRunner,
|
||||
simple_memoir_from_transcript,
|
||||
)
|
||||
from app.features.evaluation.eval_trace_service import EvalTraceService
|
||||
from app.features.evaluation.gate_report_service import gate_result_to_details
|
||||
from app.features.evaluation.gating_service import compute_gate
|
||||
from app.features.evaluation.judge_service import EvalJudgeService
|
||||
@@ -30,7 +31,6 @@ logger = get_logger(__name__)
|
||||
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
|
||||
_MAX_EVAL_CHAPTERS = 30
|
||||
_MAX_EVAL_STORIES = 40
|
||||
_MAX_EVIDENCE_CONVERSATIONS = 8
|
||||
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
|
||||
|
||||
|
||||
@@ -74,43 +74,6 @@ def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHA
|
||||
return f"{s[:max_chars]}\n\n…(访谈证据已截断)"
|
||||
|
||||
|
||||
def _dialogue_transcript_from_pairs(pairs: list[tuple[str, str]]) -> str:
|
||||
parts: list[str] = []
|
||||
for role, content in pairs:
|
||||
body = (content or "").strip()
|
||||
if not body:
|
||||
continue
|
||||
label = "用户" if role == "human" else "AI"
|
||||
out = assistant_text_for_eval_display(body) if role != "human" else body
|
||||
parts.append(f"{label}: {out}")
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
async def _conversation_transcript_for_eval(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load one conversation's messages and render them as a transcript.

    Args:
        db: Async database session handed to the conversation repo.
        conversation_id: Identifier of the conversation to load.

    Returns:
        The transcript text produced by ``_dialogue_transcript_from_pairs``.
    """
    # Imported at call time, as in the original (presumably to avoid an
    # import cycle with the conversation feature -- TODO confirm).
    from app.features.conversation import repo as conversation_repo

    messages = await conversation_repo.get_conversation_messages(conversation_id, db)

    role_content_pairs: list[tuple[str, str]] = []
    for message in messages:
        # Missing role/content collapse to ""; roles are lower-cased.
        normalized_role = str(message.role or "").lower()
        text = str(message.content or "")
        role_content_pairs.append((normalized_role, text))

    return _dialogue_transcript_from_pairs(role_content_pairs)
|
||||
|
||||
|
||||
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Build a trimmed, multi-conversation transcript for one user.

    Takes at most ``_MAX_EVIDENCE_CONVERSATIONS`` conversations from the
    front of the repo result, renders them in reverse of that order, heads
    each section with its conversation id, and trims the joined text with
    ``_trim_evidence_text``.

    Args:
        db: Async database session handed to the conversation repo.
        user_id: Identifier of the user whose conversations are loaded.

    Returns:
        The trimmed evidence text, or ``""`` when the user has no
        conversations.
    """
    from app.features.conversation import repo as conversation_repo

    user_conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not user_conversations:
        return ""

    # NOTE(review): reversing the leading slice suggests the repo returns
    # newest-first and the transcript should read oldest-first -- confirm.
    selected = user_conversations[:_MAX_EVIDENCE_CONVERSATIONS]

    sections: list[str] = []
    for conversation in reversed(selected):
        conv_id = str(conversation.id)
        rendered = await _conversation_transcript_for_eval(db, conv_id)
        if not rendered:
            # Conversations with no usable content add no section.
            continue
        sections.append(f"## 会话 {conv_id}\n{rendered}")

    return _trim_evidence_text("\n\n".join(sections))
|
||||
|
||||
|
||||
async def execute_eval_run(
|
||||
db: AsyncSession,
|
||||
*,
|
||||
@@ -118,6 +81,14 @@ async def execute_eval_run(
|
||||
case: EvalCase,
|
||||
version: EvalVersion,
|
||||
) -> None:
|
||||
fresh_run = await db.get(EvalRun, run.id)
|
||||
if not fresh_run:
|
||||
return
|
||||
if (fresh_run.status or "").lower() == "completed":
|
||||
logger.info("eval run skip already completed run_id={}", fresh_run.id)
|
||||
return
|
||||
run = fresh_run
|
||||
|
||||
if not settings.eval_execution_enabled:
|
||||
await eval_repo.update_run(
|
||||
db,
|
||||
@@ -227,34 +198,39 @@ async def execute_eval_run(
|
||||
memoir_md = simple_memoir_from_transcript(utterances, replies)
|
||||
source_transcript = _trim_evidence_text(full_transcript)
|
||||
reference_memoir = (case.reference_memoir_markdown or "").strip()
|
||||
synthetic_notes = (
|
||||
"本项为 replay 合成的短 memoir:证据闭包仅为重放对话 transcript(无 library artifact lineage)。"
|
||||
f" turns={len(utterances)}"
|
||||
)
|
||||
mem_out = await judge.judge_memoir(
|
||||
memoir_markdown=memoir_md,
|
||||
source_transcript=source_transcript,
|
||||
structured_evidence=(
|
||||
"(结构化记忆证据:自动化 replay 路径未绑定用户 memory chunk/fact/timeline/summary。)"
|
||||
),
|
||||
reference_memoir_markdown=reference_memoir,
|
||||
evidence_notes="严格按文档核对真实性、覆盖率、可追溯性;以原始访谈为主,参考基线仅作辅助。",
|
||||
evidence_notes=synthetic_notes,
|
||||
)
|
||||
|
||||
chapter_entries: list[dict[str, Any]] = []
|
||||
story_entries: list[dict[str, Any]] = []
|
||||
uid = (case.source_user_id or "").strip()
|
||||
source_conversation_id = (case.source_conversation_id or "").strip()
|
||||
evidence_transcript = source_transcript
|
||||
if source_conversation_id:
|
||||
try:
|
||||
conversation_evidence = await _conversation_transcript_for_eval(
|
||||
db, source_conversation_id
|
||||
)
|
||||
if conversation_evidence:
|
||||
evidence_transcript = _trim_evidence_text(conversation_evidence)
|
||||
except Exception as e:
|
||||
logger.warning("eval source conversation evidence skipped: {}", e)
|
||||
elif uid:
|
||||
try:
|
||||
user_evidence = await _user_transcript_evidence(db, uid)
|
||||
if user_evidence:
|
||||
evidence_transcript = user_evidence
|
||||
except Exception as e:
|
||||
logger.warning("eval user transcript evidence skipped: {}", e)
|
||||
trace_svc = EvalTraceService(db)
|
||||
|
||||
def _library_evidence_notes(
|
||||
lineage_tier: str,
|
||||
evidence_summary: str,
|
||||
truncated: bool,
|
||||
dropped: list[str],
|
||||
) -> str:
|
||||
drops = ",".join(dropped[:12]) if dropped else ""
|
||||
return (
|
||||
"library artifact 评审:以证据闭包为准;若 lineage 为 fallback 或不足须保守打分。"
|
||||
f" lineage_tier={lineage_tier};summary={evidence_summary};"
|
||||
f" prompt_truncated={truncated};dropped_sections={drops or 'none'}。"
|
||||
" 单章节/单故事节选;跨篇上下文不足写入 insufficient_evidence。"
|
||||
)
|
||||
|
||||
if uid:
|
||||
from app.features.memoir.repo import get_chapters_for_memoir_list
|
||||
from app.features.story.repo import get_stories_for_user
|
||||
@@ -268,13 +244,19 @@ async def execute_eval_run(
|
||||
if not body:
|
||||
continue
|
||||
md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
|
||||
cb = await trace_svc.build_chapter_bundle(uid, ch)
|
||||
formatted, cb2 = await trace_svc.format_chapter_bundle(cb)
|
||||
fm = formatted.format_meta
|
||||
cj = await judge.judge_memoir(
|
||||
memoir_markdown=md,
|
||||
source_transcript=evidence_transcript,
|
||||
source_transcript=formatted.source_transcript,
|
||||
structured_evidence=formatted.structured_evidence,
|
||||
reference_memoir_markdown=reference_memoir,
|
||||
evidence_notes=(
|
||||
"这是用户现有章节的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
|
||||
" 评审范围:单章节节选;跨全书连贯性仅在与证据一致时评估,否则保守打分并在 insufficient_evidence 说明。"
|
||||
evidence_notes=_library_evidence_notes(
|
||||
cb2.lineage_tier,
|
||||
formatted.evidence_summary,
|
||||
fm.truncated,
|
||||
fm.dropped_sections,
|
||||
),
|
||||
)
|
||||
chapter_entries.append(
|
||||
@@ -282,6 +264,10 @@ async def execute_eval_run(
|
||||
"id": ch.id,
|
||||
"title": ch.title,
|
||||
"order_index": ch.order_index,
|
||||
"lineage_tier": cb2.lineage_tier,
|
||||
"evidence_summary": formatted.evidence_summary,
|
||||
"evidence_trace": cb2.model_dump(),
|
||||
"format_meta": fm.model_dump(),
|
||||
"judge": cj.model_dump() if cj else None,
|
||||
}
|
||||
)
|
||||
@@ -295,13 +281,19 @@ async def execute_eval_run(
|
||||
if not body:
|
||||
continue
|
||||
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
|
||||
sb = await trace_svc.build_story_bundle(uid, str(st.id))
|
||||
formatted, sb2 = await trace_svc.format_story_bundle(sb)
|
||||
fm = formatted.format_meta
|
||||
sj = await judge.judge_memoir(
|
||||
memoir_markdown=md,
|
||||
source_transcript=evidence_transcript,
|
||||
source_transcript=formatted.source_transcript,
|
||||
structured_evidence=formatted.structured_evidence,
|
||||
reference_memoir_markdown=reference_memoir,
|
||||
evidence_notes=(
|
||||
"这是用户现有故事的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
|
||||
" 评审范围:单故事节选;跨篇章关联若证据不足须保守并在 insufficient_evidence 说明。"
|
||||
evidence_notes=_library_evidence_notes(
|
||||
sb2.lineage_tier,
|
||||
formatted.evidence_summary,
|
||||
fm.truncated,
|
||||
fm.dropped_sections,
|
||||
),
|
||||
)
|
||||
story_entries.append(
|
||||
@@ -309,6 +301,10 @@ async def execute_eval_run(
|
||||
"id": st.id,
|
||||
"title": st.title,
|
||||
"stage": st.stage,
|
||||
"lineage_tier": sb2.lineage_tier,
|
||||
"evidence_summary": formatted.evidence_summary,
|
||||
"evidence_trace": sb2.model_dump(),
|
||||
"format_meta": fm.model_dump(),
|
||||
"judge": sj.model_dump() if sj else None,
|
||||
}
|
||||
)
|
||||
@@ -352,6 +348,7 @@ async def execute_eval_run(
|
||||
bundle: dict[str, Any] = {
|
||||
"conversation_judge": conv_out.model_dump() if conv_out else None,
|
||||
"memoir_judge": mem_out.model_dump() if mem_out else None,
|
||||
"synthetic_memoir_judge": mem_out.model_dump() if mem_out else None,
|
||||
"chapters": chapter_entries,
|
||||
"stories": story_entries,
|
||||
"judge_meta": {
|
||||
@@ -359,10 +356,14 @@ async def execute_eval_run(
|
||||
"memoir_synthetic_ok": mem_out is not None,
|
||||
"memoir_synth_scores_n": len(synth_scores),
|
||||
"memoir_library_scores_n": len(library_scores),
|
||||
"synthetic_memoir_lineage_tier": "replay_transcript_only",
|
||||
"synthetic_memoir_evidence_summary": (
|
||||
f"replay_turns={len(utterances)};structured_memory=unbound"
|
||||
),
|
||||
"memoir_aggregate_rule": (
|
||||
"synth_plus_library_weighted_mean"
|
||||
"synthetic_memoir_judge_plus_library_memoir_judge_weighted_mean"
|
||||
if synth_scores and library_scores
|
||||
else ("synthetic_only" if synth_scores else "library_only")
|
||||
else ("synthetic_memoir_only" if synth_scores else "library_memoir_only")
|
||||
),
|
||||
},
|
||||
}
|
||||
@@ -422,38 +423,52 @@ async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> Non
|
||||
|
||||
|
||||
async def execute_experiment_full(experiment_id: str) -> None:
|
||||
async with AsyncSessionLocal() as db:
|
||||
exp = await eval_repo.get_experiment(db, experiment_id)
|
||||
if not exp:
|
||||
return
|
||||
await eval_repo.update_experiment(db, exp, status="running")
|
||||
await db.commit()
|
||||
from app.core.redis_lock import acquire_redis_lock, release_redis_lock
|
||||
|
||||
cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
|
||||
base_v = await eval_repo.get_version(db, str(exp.baseline_version_id))
|
||||
cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id))
|
||||
if base_v is None or cand_v is None:
|
||||
await eval_repo.update_experiment(
|
||||
db,
|
||||
exp,
|
||||
status="failed",
|
||||
error_message="version 不存在",
|
||||
completed_at=datetime.now(timezone.utc),
|
||||
)
|
||||
lock_key = f"lock:eval_experiment:{experiment_id}"
|
||||
lock_handle = acquire_redis_lock(lock_key, ttl_seconds=7200)
|
||||
if lock_handle is None:
|
||||
logger.warning(
|
||||
"eval experiment already running or lock busy experiment_id={}",
|
||||
experiment_id,
|
||||
)
|
||||
return
|
||||
|
||||
try:
|
||||
async with AsyncSessionLocal() as db:
|
||||
exp = await eval_repo.get_experiment(db, experiment_id)
|
||||
if not exp:
|
||||
return
|
||||
await eval_repo.update_experiment(db, exp, status="running")
|
||||
await db.commit()
|
||||
return
|
||||
|
||||
for case in cases:
|
||||
for side, ver in ("baseline", base_v), ("candidate", cand_v):
|
||||
run = await eval_repo.get_run(db, experiment_id, str(case.id), side)
|
||||
if not run:
|
||||
run = await eval_repo.create_run(
|
||||
db,
|
||||
experiment_id=experiment_id,
|
||||
case_id=str(case.id),
|
||||
side=side,
|
||||
)
|
||||
cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
|
||||
base_v = await eval_repo.get_version(db, str(exp.baseline_version_id))
|
||||
cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id))
|
||||
if base_v is None or cand_v is None:
|
||||
await eval_repo.update_experiment(
|
||||
db,
|
||||
exp,
|
||||
status="failed",
|
||||
error_message="version 不存在",
|
||||
completed_at=datetime.now(timezone.utc),
|
||||
)
|
||||
await db.commit()
|
||||
await execute_eval_run(db, run=run, case=case, version=ver)
|
||||
return
|
||||
|
||||
await _finalize_experiment_gate(db, experiment_id)
|
||||
for case in cases:
|
||||
for side, ver in ("baseline", base_v), ("candidate", cand_v):
|
||||
run = await eval_repo.get_run(db, experiment_id, str(case.id), side)
|
||||
if not run:
|
||||
run = await eval_repo.create_run(
|
||||
db,
|
||||
experiment_id=experiment_id,
|
||||
case_id=str(case.id),
|
||||
side=side,
|
||||
)
|
||||
await db.commit()
|
||||
await execute_eval_run(db, run=run, case=case, version=ver)
|
||||
|
||||
await _finalize_experiment_gate(db, experiment_id)
|
||||
finally:
|
||||
release_redis_lock(lock_handle)
|
||||
|
||||
Reference in New Issue
Block a user