Files
life-echo/api/app/features/evaluation/execution_service.py
Kevin 309a051038 feat: 回忆录证据血缘与内部评测可追溯,顺带对齐本地评测台与 CI
数据库与模型:新增多版迁移(章节证据快照、对话血缘、记忆事实/时间线 lineage 等),把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路:会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照;新增章节证据快照与评测侧 EvalTraceService 等模块,方便组评审用的证据包。
内部评测:自动化 run 与手工 memoir 评审共用可追溯证据;rubric/judge 相关脚本与文档有配套调整。
app-eval-web:Memoir/实验详情里能展开看证据摘要与 evidence_trace(含对话轮次 id);Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致,避免改端口后页面连错服务。
工程杂项:GitHub Actions / 仓库说明有更新;各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾;新增/扩充了?
2026-04-08 15:37:09 +08:00

475 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""执行单次评测 run 与整实验(供 Celery / 内联调试)。"""
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.config import settings
from app.core.db import AsyncSessionLocal
from app.core.dependencies import get_eval_judge_langchain_llm, get_llm_provider
from app.core.logging import get_logger
from app.features.evaluation import repo as eval_repo
from app.features.evaluation.candidate_runner import (
EvalCandidateRunner,
simple_memoir_from_transcript,
)
from app.features.evaluation.eval_trace_service import EvalTraceService
from app.features.evaluation.gate_report_service import gate_result_to_details
from app.features.evaluation.gating_service import compute_gate
from app.features.evaluation.judge_service import EvalJudgeService
from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion
from app.features.evaluation.transcript_for_judge import (
assistant_text_for_eval_display,
format_eval_turn_block,
)
logger = get_logger(__name__)

# Default character cap used by _clip_md_for_judge for markdown sent to the judge.
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
# Maximum number of library chapters execute_eval_run judges in a single run.
_MAX_EVAL_CHAPTERS = 30
# Maximum number of library stories execute_eval_run judges in a single run.
_MAX_EVAL_STORIES = 40
# Default character cap used by _trim_evidence_text for transcript evidence.
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
    """Strip ``text`` and cap it at ``max_chars`` characters for the judge prompt.

    Falsy input (``None`` or ``""``) yields ``""``. When the stripped text is
    longer than ``max_chars`` it is cut to that many characters and a
    truncation marker is appended, so the result may exceed ``max_chars`` by
    the length of the marker.
    """
    stripped = (text or "").strip()
    if len(stripped) > max_chars:
        return stripped[:max_chars] + "\n\n…(已截断供评审)"
    return stripped
def _composite(
conv: float | None, mem: float | None, weights: dict[str, Any] | None
) -> float | None:
"""合成总分;缺失的一侧不计为 0避免把评审失败误标为极差。
仅一侧有分:返回该侧原始分(不乘权重),表示当前 run 仅完成了部分评审维度。
"""
w = weights or {}
wc = float(w.get("conversation", 0.5))
wm = float(w.get("memoir", 0.5))
has_c = conv is not None
has_m = mem is not None
if not has_c and not has_m:
return None
if has_c and has_m:
return float(wc) * float(conv) + float(wm) * float(mem)
if has_c:
return float(conv)
return float(mem)
def _utterances_for_case(case: EvalCase) -> list[str]:
raw = case.user_utterances or []
return [str(u).strip() for u in raw if str(u).strip()]
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Strip transcript evidence and cap it at ``max_chars`` characters.

    Falsy input yields ``""``. Text longer than ``max_chars`` after stripping
    is cut to ``max_chars`` characters and followed by a truncation marker;
    shorter text is returned unchanged.
    """
    body = (text or "").strip()
    overflow = len(body) > max_chars
    return f"{body[:max_chars]}\n\n…(访谈证据已截断)" if overflow else body
async def execute_eval_run(
    db: AsyncSession,
    *,
    run: EvalRun,
    case: EvalCase,
    version: EvalVersion,
) -> None:
    """Execute one evaluation run end to end and persist its outcome.

    Steps:
      1. Reload the run from ``db``; return if it is gone or already completed.
      2. Mark the run failed (and commit) when execution is disabled, the case
         has no usable utterances, or the LLM provider exposes no
         ``langchain_llm``.
      3. Replay the case's utterances through ``EvalCandidateRunner``; a replay
         exception marks the run failed.
      4. Judge each turn (one turn row is stored and committed per turn), then
         judge the whole conversation.
      5. Judge a memoir synthesized from the replay and, when the case has a
         ``source_user_id``, that user's active chapters and stories together
         with their evidence bundles. Chapter and story judging are each
         best-effort: an exception is logged and the remaining items of that
         kind are skipped.
      6. Aggregate the memoir score, compute the composite score, store the
         judge bundle and mark the run completed.

    Args:
        db: Session used for all reads and writes; this function commits on it.
        run: The run to execute; it is re-fetched by id before use.
        case: Supplies the utterances, reference memoir and source user id.
        version: Its ``config_json`` (when a dict) is passed to the replay.

    Exceptions raised by the judge calls for turns, conversation and the
    synthetic memoir, or by the repository, are not caught here.
    """
    fresh_run = await db.get(EvalRun, run.id)
    if not fresh_run:
        return
    if (fresh_run.status or "").lower() == "completed":
        logger.info("eval run skip already completed run_id={}", fresh_run.id)
        return
    run = fresh_run

    if not settings.eval_execution_enabled:
        await _mark_run_failed(db, run, "EVAL_EXECUTION_ENABLED=false")
        return

    utterances = _utterances_for_case(case)
    if not utterances:
        await _mark_run_failed(db, run, "empty user_utterances")
        return

    await eval_repo.update_run(
        db,
        run,
        status="running",
        started_at=datetime.now(timezone.utc),
        error_message=None,
    )
    await db.commit()

    provider_llm = getattr(get_llm_provider(), "langchain_llm", None)
    if provider_llm is None:
        await _mark_run_failed(db, run, "生产 LLM 未配置")
        return

    judge_llm = get_eval_judge_langchain_llm()
    judge = EvalJudgeService(judge_llm)
    runner = EvalCandidateRunner(provider_llm)
    cfg = version.config_json if isinstance(version.config_json, dict) else None
    try:
        replies, latencies = await runner.replay_utterances(
            utterances,
            version_config=cfg,
            temperature=settings.eval_candidate_temperature,
        )
    except Exception as e:
        logger.exception("eval replay failed: {}", e)
        await _mark_run_failed(db, run, str(e)[:2000])
        return

    # zip() stops at the shorter sequence, so utterances without a reply are
    # left out of both the transcript and the per-turn judging.
    transcript_parts: list[str] = [
        format_eval_turn_block(i, u, assistant_text_for_eval_display(r))
        for i, (u, r) in enumerate(zip(utterances, replies))
    ]

    prior_blocks: list[str] = []
    for idx, (u, raw_reply) in enumerate(zip(utterances, replies)):
        reply = assistant_text_for_eval_display(raw_reply)
        lat = latencies[idx] if idx < len(latencies) else None
        # Only the most recent 8000 characters of earlier turns are shown to
        # the turn judge.
        prior = "\n\n".join(prior_blocks)[-8000:]
        tj = await judge.judge_turn(
            prior_transcript=prior,
            user_utterance=u,
            assistant_reply=reply,
            turn_index_0=idx,
        )
        scores = tj.model_dump() if tj else None
        rationale = tj.rationale if tj else None
        # The stored reply is the raw one; the judge saw the display form.
        await eval_repo.add_turn(
            db,
            run_id=str(run.id),
            turn_index=idx,
            user_utterance=u,
            assistant_reply=raw_reply,
            duration_ms=lat,
            judge_scores_json=scores,
            judge_rationale=rationale,
        )
        await db.commit()
        prior_blocks.append(format_eval_turn_block(idx, u, reply))

    full_transcript = "\n\n".join(transcript_parts)
    conv_out = await judge.judge_conversation(full_transcript=full_transcript)
    conv_total = conv_out.total_score if conv_out else None

    # Synthetic memoir: built only from the replayed conversation.
    memoir_md = simple_memoir_from_transcript(utterances, replies)
    source_transcript = _trim_evidence_text(full_transcript)
    reference_memoir = (case.reference_memoir_markdown or "").strip()
    synthetic_notes = (
        "本项为 replay 合成的短 memoir证据闭包仅为重放对话 transcript无 library artifact lineage"
        f" turns={len(utterances)}"
    )
    mem_out = await judge.judge_memoir(
        memoir_markdown=memoir_md,
        source_transcript=source_transcript,
        structured_evidence=(
            "(结构化记忆证据:自动化 replay 路径未绑定用户 memory chunk/fact/timeline/summary。"
        ),
        reference_memoir_markdown=reference_memoir,
        evidence_notes=synthetic_notes,
    )

    chapter_entries: list[dict[str, Any]] = []
    story_entries: list[dict[str, Any]] = []
    uid = (case.source_user_id or "").strip()
    trace_svc = EvalTraceService(db)

    def _library_evidence_notes(
        lineage_tier: str,
        evidence_summary: str,
        truncated: bool,
        dropped: list[str],
    ) -> str:
        """Build the evidence note passed to the judge for a library artifact."""
        # At most the first 12 dropped section names are listed.
        drops = ",".join(dropped[:12]) if dropped else ""
        return (
            "library artifact 评审:以证据闭包为准;若 lineage 为 fallback 或不足须保守打分。"
            f" lineage_tier={lineage_tier}summary={evidence_summary}"
            f" prompt_truncated={truncated}dropped_sections={drops or 'none'}"
            " 单章节/单故事节选;跨篇上下文不足写入 insufficient_evidence。"
        )

    if uid:
        from app.features.memoir.repo import get_chapters_for_memoir_list
        from app.features.story.repo import get_stories_for_user

        try:
            chapters = await get_chapters_for_memoir_list(
                uid, db, active_only=True, is_new_only=None
            )
            for ch in chapters[:_MAX_EVAL_CHAPTERS]:
                body = (ch.canonical_markdown or "").strip()
                if not body:
                    continue
                md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
                cb = await trace_svc.build_chapter_bundle(uid, ch)
                formatted, cb2 = await trace_svc.format_chapter_bundle(cb)
                fm = formatted.format_meta
                cj = await judge.judge_memoir(
                    memoir_markdown=md,
                    source_transcript=formatted.source_transcript,
                    structured_evidence=formatted.structured_evidence,
                    reference_memoir_markdown=reference_memoir,
                    evidence_notes=_library_evidence_notes(
                        cb2.lineage_tier,
                        formatted.evidence_summary,
                        fm.truncated,
                        fm.dropped_sections,
                    ),
                )
                chapter_entries.append(
                    {
                        "id": ch.id,
                        "title": ch.title,
                        "order_index": ch.order_index,
                        "lineage_tier": cb2.lineage_tier,
                        "evidence_summary": formatted.evidence_summary,
                        "evidence_trace": cb2.model_dump(),
                        "format_meta": fm.model_dump(),
                        "judge": cj.model_dump() if cj else None,
                    }
                )
        except Exception as e:
            # Best-effort: chapters judged before the error are kept.
            logger.warning("eval chapter judges skipped: {}", e)

        try:
            stories = await get_stories_for_user(db, uid, status="active")
            for st in stories[:_MAX_EVAL_STORIES]:
                body = (st.canonical_markdown or "").strip()
                if not body:
                    continue
                md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
                sb = await trace_svc.build_story_bundle(uid, str(st.id))
                formatted, sb2 = await trace_svc.format_story_bundle(sb)
                fm = formatted.format_meta
                sj = await judge.judge_memoir(
                    memoir_markdown=md,
                    source_transcript=formatted.source_transcript,
                    structured_evidence=formatted.structured_evidence,
                    reference_memoir_markdown=reference_memoir,
                    evidence_notes=_library_evidence_notes(
                        sb2.lineage_tier,
                        formatted.evidence_summary,
                        fm.truncated,
                        fm.dropped_sections,
                    ),
                )
                story_entries.append(
                    {
                        "id": st.id,
                        "title": st.title,
                        "stage": st.stage,
                        "lineage_tier": sb2.lineage_tier,
                        "evidence_summary": formatted.evidence_summary,
                        "evidence_trace": sb2.model_dump(),
                        "format_meta": fm.model_dump(),
                        "judge": sj.model_dump() if sj else None,
                    }
                )
        except Exception as e:
            # Best-effort: stories judged before the error are kept.
            logger.warning("eval story judges skipped: {}", e)

    synth_scores: list[float] = []
    if mem_out is not None:
        synth_scores.append(float(mem_out.total_score))

    library_scores: list[float] = []
    for row in chapter_entries:
        j = row.get("judge")
        if isinstance(j, dict) and j.get("total_score") is not None:
            library_scores.append(float(j["total_score"]))
    for row in story_entries:
        j = row.get("judge")
        if isinstance(j, dict) and j.get("total_score") is not None:
            library_scores.append(float(j["total_score"]))

    def _mean(xs: list[float]) -> float:
        return sum(xs) / len(xs) if xs else 0.0

    # The memoir total and the rule label are chosen together so the label
    # stored in judge_meta always describes how mem_total was derived.
    if synth_scores and library_scores:
        mem_total = 0.5 * _mean(synth_scores) + 0.5 * _mean(library_scores)
        aggregate_rule = "synthetic_memoir_judge_plus_library_memoir_judge_weighted_mean"
    elif synth_scores:
        mem_total = _mean(synth_scores)
        aggregate_rule = "synthetic_memoir_only"
    elif library_scores:
        mem_total = _mean(library_scores)
        aggregate_rule = "library_memoir_only"
    else:
        # No memoir judge produced a score; do not claim a library-only result.
        mem_total = None
        aggregate_rule = "no_memoir_scores"

    exp = await eval_repo.get_experiment(db, str(run.experiment_id))
    weights = (
        exp.composite_weights_json
        if exp and isinstance(exp.composite_weights_json, dict)
        else None
    )
    comp = _composite(conv_total, mem_total, weights)

    bundle: dict[str, Any] = {
        "conversation_judge": conv_out.model_dump() if conv_out else None,
        "memoir_judge": mem_out.model_dump() if mem_out else None,
        "synthetic_memoir_judge": mem_out.model_dump() if mem_out else None,
        "chapters": chapter_entries,
        "stories": story_entries,
        "judge_meta": {
            "conversation_judge_ok": conv_out is not None,
            "memoir_synthetic_ok": mem_out is not None,
            "memoir_synth_scores_n": len(synth_scores),
            "memoir_library_scores_n": len(library_scores),
            "synthetic_memoir_lineage_tier": "replay_transcript_only",
            "synthetic_memoir_evidence_summary": (
                f"replay_turns={len(utterances)};structured_memory=unbound"
            ),
            "memoir_aggregate_rule": aggregate_rule,
        },
    }
    await eval_repo.update_run(
        db,
        run,
        status="completed",
        memoir_markdown=memoir_md,
        conversation_score_total=conv_total,
        memoir_score_total=mem_total,
        composite_score=comp,
        judge_bundle_json=bundle,
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()


async def _mark_run_failed(db: AsyncSession, run: EvalRun, message: str) -> None:
    """Record ``run`` as failed with ``message`` and commit immediately.

    Shared by every failure path of ``execute_eval_run`` so that a failed
    status is always committed, whichever check rejected the run.
    """
    await eval_repo.update_run(
        db,
        run,
        status="failed",
        error_message=message,
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()
async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> None:
    """Settle an experiment's final status once all of its runs are terminal.

    Does nothing when the experiment cannot be found, or while any run has a
    status other than ``"completed"`` or ``"failed"``. If at least one run
    failed, the experiment is marked failed. Otherwise the gate is computed
    from the regression set's cases and the runs, the verdict is upserted and
    the experiment is marked completed. Both outcomes are committed.
    """
    terminal = ("completed", "failed")

    all_runs = await eval_repo.list_runs_for_experiment(db, experiment_id)
    experiment = await eval_repo.get_experiment(db, experiment_id)
    if not experiment:
        return
    regression_cases = await eval_repo.list_cases(
        db, str(experiment.regression_set_id)
    )

    statuses = [str(item.status) for item in all_runs]
    if any(status not in terminal for status in statuses):
        # Some run is still pending or in progress; leave the experiment as is.
        return

    if "failed" in statuses:
        await eval_repo.update_experiment(
            db,
            experiment,
            status="failed",
            error_message="部分 run 失败",
            completed_at=datetime.now(timezone.utc),
        )
        await db.commit()
        return

    verdict = compute_gate(cases=regression_cases, runs=all_runs)
    await eval_repo.upsert_gate_verdict(
        db,
        experiment_id=experiment_id,
        passed=verdict.passed,
        mean_composite_delta=verdict.mean_delta,
        protected_regressions_json=verdict.protected_regressions,
        details_json=gate_result_to_details(verdict),
    )
    await eval_repo.update_experiment(
        db,
        experiment,
        status="completed",
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()
async def execute_experiment_full(experiment_id: str) -> None:
    """Run every case of an experiment on both sides, then finalize its gate.

    A Redis lock keyed by the experiment id (TTL 7200 seconds) guards the
    whole execution. If the lock cannot be acquired, a warning is logged and
    the function returns without doing anything. The lock is released in a
    ``finally`` block once it has been acquired.

    Inside the lock the experiment is marked ``running``; it is marked
    ``failed`` if either the baseline or the candidate version is missing.
    Otherwise, for each case of the regression set, the baseline run and then
    the candidate run are looked up (created when absent) and executed, and
    finally the experiment gate is evaluated.

    No exception is caught here: an error raised while executing a run
    propagates to the caller after the lock has been released.

    Args:
        experiment_id: Id of the experiment to execute.
    """
    from app.core.redis_lock import acquire_redis_lock, release_redis_lock

    lock_key = f"lock:eval_experiment:{experiment_id}"
    lock_handle = acquire_redis_lock(lock_key, ttl_seconds=7200)
    if lock_handle is None:
        # A None handle is treated as "someone else holds the lock".
        logger.warning(
            "eval experiment already running or lock busy experiment_id={}",
            experiment_id,
        )
        return
    try:
        async with AsyncSessionLocal() as db:
            exp = await eval_repo.get_experiment(db, experiment_id)
            if not exp:
                return
            await eval_repo.update_experiment(db, exp, status="running")
            await db.commit()
            cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
            base_v = await eval_repo.get_version(db, str(exp.baseline_version_id))
            cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id))
            if base_v is None or cand_v is None:
                await eval_repo.update_experiment(
                    db,
                    exp,
                    status="failed",
                    error_message="version 不存在",
                    completed_at=datetime.now(timezone.utc),
                )
                await db.commit()
                return
            for case in cases:
                # Baseline is always executed before candidate for a case.
                for side, ver in ("baseline", base_v), ("candidate", cand_v):
                    # Reuse an existing run for this (case, side) if there is one.
                    run = await eval_repo.get_run(db, experiment_id, str(case.id), side)
                    if not run:
                        run = await eval_repo.create_run(
                            db,
                            experiment_id=experiment_id,
                            case_id=str(case.id),
                            side=side,
                        )
                        # Persist the new run row before executing it.
                        await db.commit()
                    await execute_eval_run(db, run=run, case=case, version=ver)
            await _finalize_experiment_gate(db, experiment_id)
    finally:
        release_redis_lock(lock_handle)