2026-04-03 14:44:46 +08:00
|
|
|
"""执行单次评测 run 与整实验(供 Celery / 内联调试)。"""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
|
|
|
|
|
|
from app.core.config import settings
|
|
|
|
|
from app.core.db import AsyncSessionLocal
|
|
|
|
|
from app.core.dependencies import get_eval_judge_langchain_llm, get_llm_provider
|
|
|
|
|
from app.core.logging import get_logger
|
|
|
|
|
from app.features.evaluation import repo as eval_repo
|
|
|
|
|
from app.features.evaluation.candidate_runner import (
|
|
|
|
|
EvalCandidateRunner,
|
|
|
|
|
simple_memoir_from_transcript,
|
|
|
|
|
)
|
|
|
|
|
from app.features.evaluation.gate_report_service import gate_result_to_details
|
|
|
|
|
from app.features.evaluation.gating_service import compute_gate
|
|
|
|
|
from app.features.evaluation.judge_service import EvalJudgeService
|
|
|
|
|
from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion
|
|
|
|
|
|
|
|
|
|
# Module-level logger used by the evaluation runner functions below.
logger = get_logger(__name__)
|
|
|
|
|
|
2026-04-06 13:45:04 +08:00
|
|
|
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
|
|
|
|
|
_MAX_EVAL_CHAPTERS = 30
|
|
|
|
|
_MAX_EVAL_STORIES = 40
|
2026-04-07 10:34:59 +08:00
|
|
|
_MAX_EVIDENCE_CONVERSATIONS = 8
|
|
|
|
|
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
|
2026-04-06 13:45:04 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
    """Strip ``text`` and cap it at ``max_chars`` characters for the judge.

    Args:
        text: Markdown to hand to the judge; a falsy value is treated as "".
        max_chars: Maximum number of characters kept from the stripped text.

    Returns:
        The stripped text unchanged when it fits, otherwise its first
        ``max_chars`` characters followed by a truncation marker.
    """
    stripped = (text or "").strip()
    if len(stripped) > max_chars:
        return f"{stripped[:max_chars]}\n\n…(已截断供评审)"
    return stripped
|
|
|
|
|
|
2026-04-03 14:44:46 +08:00
|
|
|
|
|
|
|
|
def _composite(
|
|
|
|
|
conv: float | None, mem: float | None, weights: dict[str, Any] | None
|
|
|
|
|
) -> float:
|
|
|
|
|
w = weights or {}
|
|
|
|
|
wc = float(w.get("conversation", 0.5))
|
|
|
|
|
wm = float(w.get("memoir", 0.5))
|
|
|
|
|
c = float(conv or 0)
|
|
|
|
|
m = float(mem or 0)
|
|
|
|
|
return wc * c + wm * m
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _utterances_for_case(case: EvalCase) -> list[str]:
|
|
|
|
|
raw = case.user_utterances or []
|
|
|
|
|
return [str(u).strip() for u in raw if str(u).strip()]
|
|
|
|
|
|
|
|
|
|
|
2026-04-06 22:22:50 +08:00
|
|
|
def _assistant_text_for_eval_display(raw: str) -> str:
|
|
|
|
|
"""评审与 transcript 展示:避免字面量 [SPLIT] 干扰 judge 阅读。"""
|
|
|
|
|
return (raw or "").replace("[SPLIT]", "\n")
|
|
|
|
|
|
|
|
|
|
|
2026-04-07 10:34:59 +08:00
|
|
|
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Strip ``text`` and cap it at ``max_chars`` characters of evidence.

    Args:
        text: Transcript evidence; a falsy value is treated as "".
        max_chars: Maximum number of characters kept from the stripped text.

    Returns:
        The stripped text unchanged when it fits, otherwise its first
        ``max_chars`` characters followed by a truncation marker.
    """
    stripped = (text or "").strip()
    if len(stripped) > max_chars:
        return f"{stripped[:max_chars]}\n\n…(访谈证据已截断)"
    return stripped
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _dialogue_transcript_from_pairs(pairs: list[tuple[str, str]]) -> str:
|
|
|
|
|
parts: list[str] = []
|
|
|
|
|
for role, content in pairs:
|
|
|
|
|
body = (content or "").strip()
|
|
|
|
|
if not body:
|
|
|
|
|
continue
|
|
|
|
|
label = "用户" if role == "human" else "AI"
|
|
|
|
|
out = _assistant_text_for_eval_display(body) if role != "human" else body
|
|
|
|
|
parts.append(f"{label}: {out}")
|
|
|
|
|
return "\n\n".join(parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def _conversation_transcript_for_eval(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load one conversation's messages and render them as a transcript.

    Each message's role is stringified and lower-cased, and its content is
    stringified (a falsy value becomes ""), before the pairs are handed to
    ``_dialogue_transcript_from_pairs``.
    """
    # The conversation repo is imported at call time, as in the original code.
    from app.features.conversation import repo as conversation_repo

    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
    pairs: list[tuple[str, str]] = []
    for message in messages:
        role = str(message.role or "").lower()
        content = str(message.content or "")
        pairs.append((role, content))
    return _dialogue_transcript_from_pairs(pairs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Build an evidence transcript from a user's stored conversations.

    Takes the first ``_MAX_EVIDENCE_CONVERSATIONS`` conversations returned by
    the repo, walks them in reverse order, renders each non-empty one under a
    per-conversation heading and trims the joined result with
    ``_trim_evidence_text``. Returns "" when the user has no conversations.
    """
    # The conversation repo is imported at call time, as in the original code.
    from app.features.conversation import repo as conversation_repo

    conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not conversations:
        return ""

    # NOTE(review): presumably the repo returns newest first, so reversing
    # the slice yields chronological order -- confirm against the repo.
    selected = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
    sections: list[str] = []
    for conversation in reversed(selected):
        conversation_id = str(conversation.id)
        transcript = await _conversation_transcript_for_eval(db, conversation_id)
        if not transcript:
            continue
        sections.append(f"## 会话 {conversation_id}\n{transcript}")
    return _trim_evidence_text("\n\n".join(sections))
|
|
|
|
|
|
|
|
|
|
|
2026-04-03 14:44:46 +08:00
|
|
|
async def _mark_run_failed(db: AsyncSession, run: EvalRun, message: str | None) -> None:
    """Record ``run`` as failed with ``message`` and commit immediately."""
    await eval_repo.update_run(
        db,
        run,
        status="failed",
        error_message=message,
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()


async def execute_eval_run(
    db: AsyncSession,
    *,
    run: EvalRun,
    case: EvalCase,
    version: EvalVersion,
) -> None:
    """Execute a single evaluation run and persist its scores.

    Steps:
      1. Fail fast (and commit) when execution is disabled, the case has no
         utterances, or the production LLM is not configured.
      2. Replay the case's user utterances against the candidate runner.
      3. Judge each turn, storing one turn row per utterance/reply pair.
      4. Judge the whole conversation and a memoir built from the transcript.
      5. When the case has a source user, additionally judge that user's
         existing chapters and stories against transcript evidence.
      6. Average all memoir-type scores, combine them with the conversation
         score via ``_composite`` and mark the run completed.

    Args:
        db: Session used for all reads and writes; committed several times.
        run: Run row to update.
        case: Case supplying utterances, reference memoir and source ids.
        version: Version whose ``config_json`` (when a dict) is passed to the
            candidate runner.
    """
    # Every failure branch goes through _mark_run_failed so the failed status
    # is always committed; previously the two checks below updated the run
    # without committing, unlike the later failure branches.
    if not settings.eval_execution_enabled:
        await _mark_run_failed(db, run, "EVAL_EXECUTION_ENABLED=false")
        return

    utterances = _utterances_for_case(case)
    if not utterances:
        await _mark_run_failed(db, run, "empty user_utterances")
        return

    await eval_repo.update_run(
        db,
        run,
        status="running",
        started_at=datetime.now(timezone.utc),
        error_message=None,
    )
    await db.commit()

    provider_llm = getattr(get_llm_provider(), "langchain_llm", None)
    if provider_llm is None:
        await _mark_run_failed(db, run, "生产 LLM 未配置")
        return

    judge_llm = get_eval_judge_langchain_llm()
    judge = EvalJudgeService(judge_llm)
    runner = EvalCandidateRunner(provider_llm)
    cfg = version.config_json if isinstance(version.config_json, dict) else None

    try:
        replies, latencies = await runner.replay_utterances(
            utterances,
            version_config=cfg,
            temperature=settings.eval_candidate_temperature,
        )
    except Exception as e:
        logger.exception("eval replay failed: {}", e)
        await _mark_run_failed(db, run, str(e)[:2000])
        return

    # zip() stops at the shorter sequence, so utterances without a reply are
    # ignored for both the transcript and the per-turn judging.
    shown_replies = [
        _assistant_text_for_eval_display(raw_reply)
        for _, raw_reply in zip(utterances, replies)
    ]
    transcript_parts: list[str] = [
        f"用户: {u}\nAI: {shown}" for u, shown in zip(utterances, shown_replies)
    ]

    # NOTE(review): nothing below is guarded; if a judge call or DB write
    # raises, the run keeps the "running" status set above -- confirm that
    # callers handle this.
    prior = ""
    for idx, (u, reply) in enumerate(zip(utterances, shown_replies)):
        lat = latencies[idx] if idx < len(latencies) else None
        tj = await judge.judge_turn(
            prior_transcript=prior,
            user_utterance=u,
            assistant_reply=reply,
        )
        scores = tj.model_dump() if tj else None
        rationale = tj.rationale if tj else None
        await eval_repo.add_turn(
            db,
            run_id=str(run.id),
            turn_index=idx,
            user_utterance=u,
            # The raw reply is stored; only the judge sees the display form.
            assistant_reply=replies[idx],
            duration_ms=lat,
            judge_scores_json=scores,
            judge_rationale=rationale,
        )
        await db.commit()
        # Only the most recent 8000 characters are kept as context for the
        # next turn's judgement.
        prior = (prior + f"\n用户: {u}\nAI: {reply}")[-8000:]

    full_transcript = "\n\n".join(transcript_parts)
    conv_out = await judge.judge_conversation(full_transcript=full_transcript)
    conv_total = conv_out.total_score if conv_out else None

    memoir_md = simple_memoir_from_transcript(utterances, replies)
    source_transcript = _trim_evidence_text(full_transcript)
    reference_memoir = (case.reference_memoir_markdown or "").strip()
    mem_out = await judge.judge_memoir(
        memoir_markdown=memoir_md,
        source_transcript=source_transcript,
        reference_memoir_markdown=reference_memoir,
        evidence_notes="严格按文档核对真实性、覆盖率、可追溯性;以原始访谈为主,参考基线仅作辅助。",
    )

    chapter_entries: list[dict[str, Any]] = []
    story_entries: list[dict[str, Any]] = []
    uid = (case.source_user_id or "").strip()
    source_conversation_id = (case.source_conversation_id or "").strip()

    # Evidence for chapter/story judging: the source conversation if the case
    # names one, else the user's conversations, else the replayed transcript.
    # Lookup failures are logged and the current evidence is kept.
    evidence_transcript = source_transcript
    if source_conversation_id:
        try:
            conversation_evidence = await _conversation_transcript_for_eval(
                db, source_conversation_id
            )
            if conversation_evidence:
                evidence_transcript = _trim_evidence_text(conversation_evidence)
        except Exception as e:
            logger.warning("eval source conversation evidence skipped: {}", e)
    elif uid:
        try:
            user_evidence = await _user_transcript_evidence(db, uid)
            if user_evidence:
                evidence_transcript = user_evidence
        except Exception as e:
            logger.warning("eval user transcript evidence skipped: {}", e)

    if uid:
        from app.features.memoir.repo import get_chapters_for_memoir_list
        from app.features.story.repo import get_stories_for_user

        # Best effort: a failure stops judging the remaining chapters but
        # keeps the entries collected so far.
        try:
            chapters = await get_chapters_for_memoir_list(
                uid, db, active_only=True, is_new_only=None
            )
            for ch in chapters[:_MAX_EVAL_CHAPTERS]:
                body = (ch.canonical_markdown or "").strip()
                if not body:
                    continue
                md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
                cj = await judge.judge_memoir(
                    memoir_markdown=md,
                    source_transcript=evidence_transcript,
                    reference_memoir_markdown=reference_memoir,
                    evidence_notes=(
                        "这是用户现有章节的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
                    ),
                )
                chapter_entries.append(
                    {
                        # NOTE(review): stored as-is in the JSON bundle while
                        # other ids here are wrapped in str() -- confirm the
                        # value is JSON-serialisable.
                        "id": ch.id,
                        "title": ch.title,
                        "order_index": ch.order_index,
                        "judge": cj.model_dump() if cj else None,
                    }
                )
        except Exception as e:
            logger.warning("eval chapter judges skipped: {}", e)

        # Same best-effort policy for the user's stories.
        try:
            stories = await get_stories_for_user(db, uid, status="active")
            for st in stories[:_MAX_EVAL_STORIES]:
                body = (st.canonical_markdown or "").strip()
                if not body:
                    continue
                md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
                sj = await judge.judge_memoir(
                    memoir_markdown=md,
                    source_transcript=evidence_transcript,
                    reference_memoir_markdown=reference_memoir,
                    evidence_notes=(
                        "这是用户现有故事的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
                    ),
                )
                story_entries.append(
                    {
                        # NOTE(review): same JSON-serialisability question as
                        # the chapter id above.
                        "id": st.id,
                        "title": st.title,
                        "stage": st.stage,
                        "judge": sj.model_dump() if sj else None,
                    }
                )
        except Exception as e:
            logger.warning("eval story judges skipped: {}", e)

    # The memoir score is the plain average of every available memoir-type
    # judgement: the replay memoir, then chapters, then stories.
    mem_parts: list[float] = []
    if mem_out is not None:
        mem_parts.append(float(mem_out.total_score))
    for row in (*chapter_entries, *story_entries):
        j = row.get("judge")
        if isinstance(j, dict) and j.get("total_score") is not None:
            mem_parts.append(float(j["total_score"]))
    mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None

    exp = await eval_repo.get_experiment(db, str(run.experiment_id))
    weights = (
        exp.composite_weights_json
        if exp and isinstance(exp.composite_weights_json, dict)
        else None
    )
    comp = _composite(conv_total, mem_total, weights)

    bundle: dict[str, Any] = {
        "conversation_judge": conv_out.model_dump() if conv_out else None,
        "memoir_judge": mem_out.model_dump() if mem_out else None,
        "chapters": chapter_entries,
        "stories": story_entries,
    }
    await eval_repo.update_run(
        db,
        run,
        status="completed",
        memoir_markdown=memoir_md,
        conversation_score_total=conv_total,
        memoir_score_total=mem_total,
        composite_score=comp,
        judge_bundle_json=bundle,
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> None:
    """Close out an experiment once all of its runs have finished.

    Does nothing when the experiment is missing or any run has a status other
    than "completed" or "failed". If at least one run failed, the experiment
    is marked failed. Otherwise the gate is computed from the cases and runs,
    the verdict is upserted and the experiment is marked completed. Both
    outcomes are committed.
    """
    runs = await eval_repo.list_runs_for_experiment(db, experiment_id)
    exp = await eval_repo.get_experiment(db, experiment_id)
    if not exp:
        return
    cases = await eval_repo.list_cases(db, str(exp.regression_set_id))

    statuses = [str(r.status) for r in runs]
    if any(status not in ("completed", "failed") for status in statuses):
        # Some run is still pending or in progress; leave the experiment as is.
        return

    if "failed" in statuses:
        await eval_repo.update_experiment(
            db,
            exp,
            status="failed",
            error_message="部分 run 失败",
            completed_at=datetime.now(timezone.utc),
        )
        await db.commit()
        return

    gate = compute_gate(cases=cases, runs=runs)
    await eval_repo.upsert_gate_verdict(
        db,
        experiment_id=experiment_id,
        passed=gate.passed,
        mean_composite_delta=gate.mean_delta,
        protected_regressions_json=gate.protected_regressions,
        details_json=gate_result_to_details(gate),
    )
    await eval_repo.update_experiment(
        db,
        exp,
        status="completed",
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def execute_experiment_full(experiment_id: str) -> None:
    """Run every case of an experiment for both versions, then finalize it.

    Opens its own session, marks the experiment running, and for each case
    executes a "baseline" run followed by a "candidate" run, creating the run
    row first when it does not exist yet. Afterwards
    ``_finalize_experiment_gate`` is invoked. Returns silently when the
    experiment does not exist; marks it failed when either version is missing.
    """
    async with AsyncSessionLocal() as db:
        exp = await eval_repo.get_experiment(db, experiment_id)
        if not exp:
            return

        await eval_repo.update_experiment(db, exp, status="running")
        await db.commit()

        cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
        base_v = await eval_repo.get_version(db, str(exp.baseline_version_id))
        cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id))
        if base_v is None or cand_v is None:
            await eval_repo.update_experiment(
                db,
                exp,
                status="failed",
                error_message="version 不存在",
                completed_at=datetime.now(timezone.utc),
            )
            await db.commit()
            return

        # Baseline is always executed before candidate for a given case.
        sides = (("baseline", base_v), ("candidate", cand_v))
        for case in cases:
            case_id = str(case.id)
            for side, ver in sides:
                run = await eval_repo.get_run(db, experiment_id, case_id, side)
                if not run:
                    run = await eval_repo.create_run(
                        db,
                        experiment_id=experiment_id,
                        case_id=case_id,
                        side=side,
                    )
                    await db.commit()
                await execute_eval_run(db, run=run, case=case, version=ver)

        await _finalize_experiment_gate(db, experiment_id)
|