数据库与模型:新增多版迁移(章节证据快照、对话血缘、记忆事实/时间线 lineage 等),把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。 业务链路:会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照;新增章节证据快照与评测侧 EvalTraceService 等模块,方便组评审用的证据包。 内部评测:自动化 run 与手工 memoir 评审共用可追溯证据;rubric/ judge 相关脚本与文档有配套调整。 app-eval-web:Memoir/实验详情里能展开看证据摘要与 evidence_trace(含对话轮次 id);Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致,避免改端口后页面连错服务。 工程杂项:GitHub Actions / 仓库说明有更新;各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾;新增/扩充了?
475 lines
17 KiB
Python
475 lines
17 KiB
Python
"""执行单次评测 run 与整实验(供 Celery / 内联调试)。"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from datetime import datetime, timezone
|
||
from typing import Any
|
||
|
||
from sqlalchemy.ext.asyncio import AsyncSession
|
||
|
||
from app.core.config import settings
|
||
from app.core.db import AsyncSessionLocal
|
||
from app.core.dependencies import get_eval_judge_langchain_llm, get_llm_provider
|
||
from app.core.logging import get_logger
|
||
from app.features.evaluation import repo as eval_repo
|
||
from app.features.evaluation.candidate_runner import (
|
||
EvalCandidateRunner,
|
||
simple_memoir_from_transcript,
|
||
)
|
||
from app.features.evaluation.eval_trace_service import EvalTraceService
|
||
from app.features.evaluation.gate_report_service import gate_result_to_details
|
||
from app.features.evaluation.gating_service import compute_gate
|
||
from app.features.evaluation.judge_service import EvalJudgeService
|
||
from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion
|
||
from app.features.evaluation.transcript_for_judge import (
|
||
assistant_text_for_eval_display,
|
||
format_eval_turn_block,
|
||
)
|
||
|
||
# Module-level logger; call sites in this file pass "{}" placeholders plus positional args.
logger = get_logger(__name__)

# Character budget for chapter/story markdown handed to the judge
# (default limit of _clip_md_for_judge).
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
# At most this many library chapters are judged per run.
_MAX_EVAL_CHAPTERS = 30
# At most this many library stories are judged per run.
_MAX_EVAL_STORIES = 40
# Character budget for the replay transcript used as memoir evidence
# (default limit of _trim_evidence_text).
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
|
||
|
||
|
||
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
    """Strip ``text`` and cap it at ``max_chars`` characters for the judge prompt.

    Falsy input (``None`` or ``""``) yields ``""``. When the stripped text is
    longer than ``max_chars`` only its first ``max_chars`` characters are kept
    and a truncation marker is appended so the excerpt is recognisably partial.
    """
    stripped = (text or "").strip()
    if len(stripped) > max_chars:
        return stripped[:max_chars] + "\n\n…(已截断供评审)"
    return stripped
|
||
|
||
|
||
def _composite(
|
||
conv: float | None, mem: float | None, weights: dict[str, Any] | None
|
||
) -> float | None:
|
||
"""合成总分;缺失的一侧不计为 0,避免把评审失败误标为极差。
|
||
|
||
仅一侧有分:返回该侧原始分(不乘权重),表示当前 run 仅完成了部分评审维度。
|
||
"""
|
||
w = weights or {}
|
||
wc = float(w.get("conversation", 0.5))
|
||
wm = float(w.get("memoir", 0.5))
|
||
has_c = conv is not None
|
||
has_m = mem is not None
|
||
if not has_c and not has_m:
|
||
return None
|
||
if has_c and has_m:
|
||
return float(wc) * float(conv) + float(wm) * float(mem)
|
||
if has_c:
|
||
return float(conv)
|
||
return float(mem)
|
||
|
||
|
||
def _utterances_for_case(case: EvalCase) -> list[str]:
|
||
raw = case.user_utterances or []
|
||
return [str(u).strip() for u in raw if str(u).strip()]
|
||
|
||
|
||
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Strip ``text`` and limit it to ``max_chars`` characters of evidence.

    Falsy input yields ``""``. Text longer than ``max_chars`` after stripping
    is cut to its first ``max_chars`` characters and followed by a marker
    stating that the interview evidence was truncated.
    """
    cleaned = (text or "").strip()
    overflow = len(cleaned) > max_chars
    return f"{cleaned[:max_chars]}\n\n…(访谈证据已截断)" if overflow else cleaned
|
||
|
||
|
||
async def _fail_eval_run(db: AsyncSession, run: EvalRun, message: str) -> None:
    """Mark ``run`` as failed with ``message`` and commit right away."""
    await eval_repo.update_run(
        db,
        run,
        status="failed",
        error_message=message,
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()


async def execute_eval_run(
    db: AsyncSession,
    *,
    run: EvalRun,
    case: EvalCase,
    version: EvalVersion,
) -> None:
    """Execute one evaluation run: replay the case, judge it, persist the result.

    Steps, in order:

    1. Reload the run from ``db``; return without changes when it no longer
       exists or its status is already ``completed``.
    2. Fail the run when ``settings.eval_execution_enabled`` is false, the case
       has no usable utterances, or the LLM provider exposes no
       ``langchain_llm``.
    3. Replay the utterances with ``EvalCandidateRunner``; judge and store each
       turn, committing after every turn.
    4. Judge the whole conversation and a synthetic memoir derived from the
       replay transcript.
    5. When the case carries a ``source_user_id``, additionally judge that
       user's chapters and stories (bounded by ``_MAX_EVAL_CHAPTERS`` /
       ``_MAX_EVAL_STORIES``). An exception in either pass is logged and the
       rest of that pass is skipped.
    6. Aggregate the scores, store the judge bundle and mark the run
       ``completed``.

    Every failure exit goes through ``_fail_eval_run`` and is therefore
    committed; exceptions raised after the replay step are not caught here.

    Args:
        db: Async session used for all reads and writes.
        run: Run to execute; it is re-fetched by id before use.
        case: Evaluation case providing utterances and the reference memoir.
        version: Version whose ``config_json`` (when a dict) configures replay.
    """
    fresh_run = await db.get(EvalRun, run.id)
    if not fresh_run:
        return
    if (fresh_run.status or "").lower() == "completed":
        logger.info("eval run skip already completed run_id={}", fresh_run.id)
        return
    run = fresh_run

    if not settings.eval_execution_enabled:
        await _fail_eval_run(db, run, "EVAL_EXECUTION_ENABLED=false")
        return

    utterances = _utterances_for_case(case)
    if not utterances:
        await _fail_eval_run(db, run, "empty user_utterances")
        return

    await eval_repo.update_run(
        db,
        run,
        status="running",
        started_at=datetime.now(timezone.utc),
        error_message=None,
    )
    await db.commit()

    provider_llm = getattr(get_llm_provider(), "langchain_llm", None)
    if provider_llm is None:
        await _fail_eval_run(db, run, "生产 LLM 未配置")
        return

    judge_llm = get_eval_judge_langchain_llm()
    judge = EvalJudgeService(judge_llm)
    runner = EvalCandidateRunner(provider_llm)
    cfg = version.config_json if isinstance(version.config_json, dict) else None

    try:
        replies, latencies = await runner.replay_utterances(
            utterances,
            version_config=cfg,
            temperature=settings.eval_candidate_temperature,
        )
    except Exception as e:
        logger.exception("eval replay failed: {}", e)
        await _fail_eval_run(db, run, str(e)[:2000])
        return

    # Judge turn by turn. Each formatted block serves both as context for the
    # following turns and as a piece of the full transcript. zip() stops at
    # the shorter sequence, so utterances without a reply are ignored.
    turn_blocks: list[str] = []
    for idx, (u, raw_reply) in enumerate(zip(utterances, replies)):
        reply = assistant_text_for_eval_display(raw_reply)
        lat = latencies[idx] if idx < len(latencies) else None
        # Only the most recent 8000 characters of earlier turns are sent along.
        prior = "\n\n".join(turn_blocks)[-8000:]
        tj = await judge.judge_turn(
            prior_transcript=prior,
            user_utterance=u,
            assistant_reply=reply,
            turn_index_0=idx,
        )
        await eval_repo.add_turn(
            db,
            run_id=str(run.id),
            turn_index=idx,
            user_utterance=u,
            # The stored reply is the raw one; the judge saw the display form.
            assistant_reply=raw_reply,
            duration_ms=lat,
            judge_scores_json=tj.model_dump() if tj else None,
            judge_rationale=tj.rationale if tj else None,
        )
        await db.commit()
        turn_blocks.append(format_eval_turn_block(idx, u, reply))

    full_transcript = "\n\n".join(turn_blocks)
    conv_out = await judge.judge_conversation(full_transcript=full_transcript)
    conv_total = conv_out.total_score if conv_out else None

    # Synthetic memoir: built from the replay only, judged against the transcript.
    memoir_md = simple_memoir_from_transcript(utterances, replies)
    source_transcript = _trim_evidence_text(full_transcript)
    reference_memoir = (case.reference_memoir_markdown or "").strip()
    synthetic_notes = (
        "本项为 replay 合成的短 memoir:证据闭包仅为重放对话 transcript(无 library artifact lineage)。"
        f" turns={len(utterances)}"
    )
    mem_out = await judge.judge_memoir(
        memoir_markdown=memoir_md,
        source_transcript=source_transcript,
        structured_evidence=(
            "(结构化记忆证据:自动化 replay 路径未绑定用户 memory chunk/fact/timeline/summary。)"
        ),
        reference_memoir_markdown=reference_memoir,
        evidence_notes=synthetic_notes,
    )

    chapter_entries: list[dict[str, Any]] = []
    story_entries: list[dict[str, Any]] = []
    uid = (case.source_user_id or "").strip()
    trace_svc = EvalTraceService(db)

    def _library_evidence_notes(
        lineage_tier: str,
        evidence_summary: str,
        truncated: bool,
        dropped: list[str],
    ) -> str:
        """Build the evidence note passed to the judge for one library artifact."""
        # At most 12 dropped section names are listed.
        drops = ",".join(dropped[:12]) if dropped else ""
        return (
            "library artifact 评审:以证据闭包为准;若 lineage 为 fallback 或不足须保守打分。"
            f" lineage_tier={lineage_tier};summary={evidence_summary};"
            f" prompt_truncated={truncated};dropped_sections={drops or 'none'}。"
            " 单章节/单故事节选;跨篇上下文不足写入 insufficient_evidence。"
        )

    if uid:
        # Imported here rather than at module top, as in the original code.
        from app.features.memoir.repo import get_chapters_for_memoir_list
        from app.features.story.repo import get_stories_for_user

        try:
            chapters = await get_chapters_for_memoir_list(
                uid, db, active_only=True, is_new_only=None
            )
            for ch in chapters[:_MAX_EVAL_CHAPTERS]:
                body = (ch.canonical_markdown or "").strip()
                if not body:
                    continue
                md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
                cb = await trace_svc.build_chapter_bundle(uid, ch)
                formatted, cb2 = await trace_svc.format_chapter_bundle(cb)
                fm = formatted.format_meta
                cj = await judge.judge_memoir(
                    memoir_markdown=md,
                    source_transcript=formatted.source_transcript,
                    structured_evidence=formatted.structured_evidence,
                    reference_memoir_markdown=reference_memoir,
                    evidence_notes=_library_evidence_notes(
                        cb2.lineage_tier,
                        formatted.evidence_summary,
                        fm.truncated,
                        fm.dropped_sections,
                    ),
                )
                chapter_entries.append(
                    {
                        "id": ch.id,
                        "title": ch.title,
                        "order_index": ch.order_index,
                        "lineage_tier": cb2.lineage_tier,
                        "evidence_summary": formatted.evidence_summary,
                        "evidence_trace": cb2.model_dump(),
                        "format_meta": fm.model_dump(),
                        "judge": cj.model_dump() if cj else None,
                    }
                )
        except Exception as e:
            # Best effort: chapters judged before the error are kept.
            logger.warning("eval chapter judges skipped: {}", e)

        try:
            stories = await get_stories_for_user(db, uid, status="active")
            for st in stories[:_MAX_EVAL_STORIES]:
                body = (st.canonical_markdown or "").strip()
                if not body:
                    continue
                md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
                sb = await trace_svc.build_story_bundle(uid, str(st.id))
                formatted, sb2 = await trace_svc.format_story_bundle(sb)
                fm = formatted.format_meta
                sj = await judge.judge_memoir(
                    memoir_markdown=md,
                    source_transcript=formatted.source_transcript,
                    structured_evidence=formatted.structured_evidence,
                    reference_memoir_markdown=reference_memoir,
                    evidence_notes=_library_evidence_notes(
                        sb2.lineage_tier,
                        formatted.evidence_summary,
                        fm.truncated,
                        fm.dropped_sections,
                    ),
                )
                story_entries.append(
                    {
                        "id": st.id,
                        "title": st.title,
                        "stage": st.stage,
                        "lineage_tier": sb2.lineage_tier,
                        "evidence_summary": formatted.evidence_summary,
                        "evidence_trace": sb2.model_dump(),
                        "format_meta": fm.model_dump(),
                        "judge": sj.model_dump() if sj else None,
                    }
                )
        except Exception as e:
            # Best effort: stories judged before the error are kept.
            logger.warning("eval story judges skipped: {}", e)

    synth_scores: list[float] = (
        [float(mem_out.total_score)] if mem_out is not None else []
    )

    # Chapters first, then stories; entries whose judge produced no total are skipped.
    library_scores: list[float] = []
    for row in (*chapter_entries, *story_entries):
        j = row.get("judge")
        if isinstance(j, dict) and j.get("total_score") is not None:
            library_scores.append(float(j["total_score"]))

    def _mean(xs: list[float]) -> float:
        return sum(xs) / len(xs) if xs else 0.0

    mem_total: float | None
    if synth_scores and library_scores:
        # Synthetic and library memoir scores contribute half each.
        mem_total = 0.5 * _mean(synth_scores) + 0.5 * _mean(library_scores)
        aggregate_rule = "synthetic_memoir_judge_plus_library_memoir_judge_weighted_mean"
    elif synth_scores:
        mem_total = _mean(synth_scores)
        aggregate_rule = "synthetic_memoir_only"
    elif library_scores:
        mem_total = _mean(library_scores)
        aggregate_rule = "library_memoir_only"
    else:
        # Nothing was scored: do not claim a library-only aggregate.
        mem_total = None
        aggregate_rule = "no_memoir_scores"

    exp = await eval_repo.get_experiment(db, str(run.experiment_id))
    weights = (
        exp.composite_weights_json
        if exp and isinstance(exp.composite_weights_json, dict)
        else None
    )
    comp = _composite(conv_total, mem_total, weights)

    bundle: dict[str, Any] = {
        "conversation_judge": conv_out.model_dump() if conv_out else None,
        # "memoir_judge" and "synthetic_memoir_judge" carry the same payload.
        "memoir_judge": mem_out.model_dump() if mem_out else None,
        "synthetic_memoir_judge": mem_out.model_dump() if mem_out else None,
        "chapters": chapter_entries,
        "stories": story_entries,
        "judge_meta": {
            "conversation_judge_ok": conv_out is not None,
            "memoir_synthetic_ok": mem_out is not None,
            "memoir_synth_scores_n": len(synth_scores),
            "memoir_library_scores_n": len(library_scores),
            "synthetic_memoir_lineage_tier": "replay_transcript_only",
            "synthetic_memoir_evidence_summary": (
                f"replay_turns={len(utterances)};structured_memory=unbound"
            ),
            "memoir_aggregate_rule": aggregate_rule,
        },
    }
    await eval_repo.update_run(
        db,
        run,
        status="completed",
        memoir_markdown=memoir_md,
        conversation_score_total=conv_total,
        memoir_score_total=mem_total,
        composite_score=comp,
        judge_bundle_json=bundle,
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()
|
||
|
||
|
||
async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> None:
    """Close out an experiment once all of its runs have reached a final status.

    Does nothing when the experiment is missing or any run has a status other
    than ``completed``/``failed``. If at least one run failed, the experiment
    is marked ``failed``. Otherwise the gate is computed from the regression
    set's cases and the runs, the verdict is upserted, and the experiment is
    marked ``completed``. Both outcomes are committed.
    """
    all_runs = await eval_repo.list_runs_for_experiment(db, experiment_id)
    experiment = await eval_repo.get_experiment(db, experiment_id)
    if not experiment:
        return
    case_rows = await eval_repo.list_cases(db, str(experiment.regression_set_id))

    statuses = [str(item.status) for item in all_runs]

    # Still waiting on at least one run: leave the experiment untouched.
    if any(status not in ("completed", "failed") for status in statuses):
        return

    if "failed" in statuses:
        await eval_repo.update_experiment(
            db,
            experiment,
            status="failed",
            error_message="部分 run 失败",
            completed_at=datetime.now(timezone.utc),
        )
        await db.commit()
        return

    verdict = compute_gate(cases=case_rows, runs=all_runs)
    await eval_repo.upsert_gate_verdict(
        db,
        experiment_id=experiment_id,
        passed=verdict.passed,
        mean_composite_delta=verdict.mean_delta,
        protected_regressions_json=verdict.protected_regressions,
        details_json=gate_result_to_details(verdict),
    )
    await eval_repo.update_experiment(
        db,
        experiment,
        status="completed",
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()
|
||
|
||
|
||
async def _record_experiment_crash(experiment_id: str, exc: BaseException) -> None:
    """Best effort: mark the experiment ``failed`` from a fresh session after a crash.

    Any error raised while recording is logged and swallowed so that it cannot
    mask the exception that triggered the call.
    """
    try:
        async with AsyncSessionLocal() as db:
            exp = await eval_repo.get_experiment(db, experiment_id)
            if not exp:
                return
            await eval_repo.update_experiment(
                db,
                exp,
                status="failed",
                error_message=(str(exc) or type(exc).__name__)[:2000],
                completed_at=datetime.now(timezone.utc),
            )
            await db.commit()
    except Exception as record_err:
        logger.warning(
            "eval experiment failure status not recorded experiment_id={}: {}",
            experiment_id,
            record_err,
        )


async def execute_experiment_full(experiment_id: str) -> None:
    """Run every case of an experiment on both sides, then finalize its gate.

    A Redis lock keyed by the experiment id (TTL 7200 seconds) is acquired
    first; when it cannot be obtained the call logs a warning and returns.
    Under the lock the experiment is marked ``running``, a run is fetched or
    created for each case and each of the ``baseline`` / ``candidate`` sides
    and passed to ``execute_eval_run``, and finally
    ``_finalize_experiment_gate`` is called.

    If anything raises, the experiment is marked ``failed`` (best effort, in a
    separate session) so it does not remain ``running``, and the exception is
    re-raised. The lock is released in all cases.

    Args:
        experiment_id: Id of the experiment to execute.
    """
    from app.core.redis_lock import acquire_redis_lock, release_redis_lock

    lock_key = f"lock:eval_experiment:{experiment_id}"
    lock_handle = acquire_redis_lock(lock_key, ttl_seconds=7200)
    if lock_handle is None:
        logger.warning(
            "eval experiment already running or lock busy experiment_id={}",
            experiment_id,
        )
        return

    try:
        async with AsyncSessionLocal() as db:
            exp = await eval_repo.get_experiment(db, experiment_id)
            if not exp:
                return
            await eval_repo.update_experiment(db, exp, status="running")
            await db.commit()

            cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
            base_v = await eval_repo.get_version(db, str(exp.baseline_version_id))
            cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id))
            if base_v is None or cand_v is None:
                await eval_repo.update_experiment(
                    db,
                    exp,
                    status="failed",
                    error_message="version 不存在",
                    completed_at=datetime.now(timezone.utc),
                )
                await db.commit()
                return

            for case in cases:
                for side, ver in (("baseline", base_v), ("candidate", cand_v)):
                    run = await eval_repo.get_run(db, experiment_id, str(case.id), side)
                    if not run:
                        run = await eval_repo.create_run(
                            db,
                            experiment_id=experiment_id,
                            case_id=str(case.id),
                            side=side,
                        )
                        await db.commit()
                    await execute_eval_run(db, run=run, case=case, version=ver)

            await _finalize_experiment_gate(db, experiment_id)
    except Exception as exc:
        # Without this the experiment would stay in "running" after a crash.
        logger.exception(
            "eval experiment crashed experiment_id={}: {}", experiment_id, exc
        )
        await _record_experiment_crash(experiment_id, exc)
        raise
    finally:
        release_redis_lock(lock_handle)
|