"""执行单次评测 run 与整实验(供 Celery / 内联调试)。""" from __future__ import annotations from datetime import datetime, timezone from typing import Any from sqlalchemy.ext.asyncio import AsyncSession from app.core.config import settings from app.core.db import AsyncSessionLocal from app.core.dependencies import get_eval_judge_langchain_llm, get_llm_provider from app.core.logging import get_logger from app.features.evaluation import repo as eval_repo from app.features.evaluation.candidate_runner import ( EvalCandidateRunner, simple_memoir_from_transcript, ) from app.features.evaluation.eval_trace_service import EvalTraceService from app.features.evaluation.gate_report_service import gate_result_to_details from app.features.evaluation.gating_service import compute_gate from app.features.evaluation.judge_service import EvalJudgeService from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion from app.features.evaluation.transcript_for_judge import ( assistant_text_for_eval_display, format_eval_turn_block, ) logger = get_logger(__name__) _MAX_JUDGE_MARKDOWN_CHARS = 20_000 _MAX_EVAL_CHAPTERS = 30 _MAX_EVAL_STORIES = 40 _MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000 def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str: s = (text or "").strip() if len(s) <= max_chars: return s return f"{s[:max_chars]}\n\n…(已截断供评审)" def _composite( conv: float | None, mem: float | None, weights: dict[str, Any] | None ) -> float | None: """合成总分;缺失的一侧不计为 0,避免把评审失败误标为极差。 仅一侧有分:返回该侧原始分(不乘权重),表示当前 run 仅完成了部分评审维度。 """ w = weights or {} wc = float(w.get("conversation", 0.5)) wm = float(w.get("memoir", 0.5)) has_c = conv is not None has_m = mem is not None if not has_c and not has_m: return None if has_c and has_m: return float(wc) * float(conv) + float(wm) * float(mem) if has_c: return float(conv) return float(mem) def _utterances_for_case(case: EvalCase) -> list[str]: raw = case.user_utterances or [] return [str(u).strip() for u in raw if str(u).strip()] def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str: s = (text or "").strip() if len(s) <= max_chars: return s return f"{s[:max_chars]}\n\n…(访谈证据已截断)" async def execute_eval_run( db: AsyncSession, *, run: EvalRun, case: EvalCase, version: EvalVersion, ) -> None: fresh_run = await db.get(EvalRun, run.id) if not fresh_run: return if (fresh_run.status or "").lower() == "completed": logger.info("eval run skip already completed run_id={}", fresh_run.id) return run = fresh_run if not settings.eval_execution_enabled: await eval_repo.update_run( db, run, status="failed", error_message="EVAL_EXECUTION_ENABLED=false", completed_at=datetime.now(timezone.utc), ) return utterances = _utterances_for_case(case) if not utterances: await eval_repo.update_run( db, run, status="failed", error_message="empty user_utterances", completed_at=datetime.now(timezone.utc), ) return await eval_repo.update_run( db, run, status="running", started_at=datetime.now(timezone.utc), error_message=None, ) await db.commit() provider_llm = getattr(get_llm_provider(), "langchain_llm", None) if provider_llm is None: await eval_repo.update_run( db, run, status="failed", error_message="生产 LLM 未配置", completed_at=datetime.now(timezone.utc), ) await db.commit() return judge_llm = get_eval_judge_langchain_llm() judge = EvalJudgeService(judge_llm) runner = EvalCandidateRunner(provider_llm) cfg = version.config_json if isinstance(version.config_json, dict) else None try: replies, latencies = await runner.replay_utterances( utterances, version_config=cfg, temperature=settings.eval_candidate_temperature, ) except Exception as e: logger.exception("eval replay failed: {}", e) await eval_repo.update_run( db, run, status="failed", error_message=str(e)[:2000], completed_at=datetime.now(timezone.utc), ) await db.commit() return transcript_parts: list[str] = [] for i, u in enumerate(utterances): if i >= len(replies): break transcript_parts.append( format_eval_turn_block(i, u, assistant_text_for_eval_display(replies[i])) ) prior_blocks: list[str] = [] for idx, u in enumerate(utterances): if idx >= len(replies): break reply = assistant_text_for_eval_display(replies[idx]) lat = latencies[idx] if idx < len(latencies) else None prior = "\n\n".join(prior_blocks) if len(prior) > 8000: prior = prior[-8000:] tj = await judge.judge_turn( prior_transcript=prior, user_utterance=u, assistant_reply=reply, turn_index_0=idx, ) scores = tj.model_dump() if tj else None rationale = tj.rationale if tj else None await eval_repo.add_turn( db, run_id=str(run.id), turn_index=idx, user_utterance=u, assistant_reply=replies[idx], duration_ms=lat, judge_scores_json=scores, judge_rationale=rationale, ) await db.commit() prior_blocks.append(format_eval_turn_block(idx, u, reply)) full_transcript = "\n\n".join(transcript_parts) conv_out = await judge.judge_conversation(full_transcript=full_transcript) conv_total = conv_out.total_score if conv_out else None memoir_md = simple_memoir_from_transcript(utterances, replies) source_transcript = _trim_evidence_text(full_transcript) reference_memoir = (case.reference_memoir_markdown or "").strip() synthetic_notes = ( "本项为 replay 合成的短 memoir:证据闭包仅为重放对话 transcript(无 library artifact lineage)。" f" turns={len(utterances)}" ) mem_out = await judge.judge_memoir( memoir_markdown=memoir_md, source_transcript=source_transcript, structured_evidence=( "(结构化记忆证据:自动化 replay 路径未绑定用户 memory chunk/fact/timeline/summary。)" ), reference_memoir_markdown=reference_memoir, evidence_notes=synthetic_notes, ) chapter_entries: list[dict[str, Any]] = [] story_entries: list[dict[str, Any]] = [] uid = (case.source_user_id or "").strip() trace_svc = EvalTraceService(db) def _library_evidence_notes( lineage_tier: str, evidence_summary: str, truncated: bool, dropped: list[str], ) -> str: drops = ",".join(dropped[:12]) if dropped else "" return ( "library artifact 评审:以证据闭包为准;若 lineage 为 fallback 或不足须保守打分。" f" lineage_tier={lineage_tier};summary={evidence_summary};" f" prompt_truncated={truncated};dropped_sections={drops or 'none'}。" " 单章节/单故事节选;跨篇上下文不足写入 insufficient_evidence。" ) if uid: from app.features.memoir.repo import get_chapters_for_memoir_list from app.features.story.repo import get_stories_for_user try: chapters = await get_chapters_for_memoir_list( uid, db, active_only=True, is_new_only=None ) for ch in chapters[:_MAX_EVAL_CHAPTERS]: body = (ch.canonical_markdown or "").strip() if not body: continue md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}" cb = await trace_svc.build_chapter_bundle(uid, ch) formatted, cb2 = await trace_svc.format_chapter_bundle(cb) fm = formatted.format_meta cj = await judge.judge_memoir( memoir_markdown=md, source_transcript=formatted.source_transcript, structured_evidence=formatted.structured_evidence, reference_memoir_markdown=reference_memoir, evidence_notes=_library_evidence_notes( cb2.lineage_tier, formatted.evidence_summary, fm.truncated, fm.dropped_sections, ), ) chapter_entries.append( { "id": ch.id, "title": ch.title, "order_index": ch.order_index, "lineage_tier": cb2.lineage_tier, "evidence_summary": formatted.evidence_summary, "evidence_trace": cb2.model_dump(), "format_meta": fm.model_dump(), "judge": cj.model_dump() if cj else None, } ) except Exception as e: logger.warning("eval chapter judges skipped: {}", e) try: stories = await get_stories_for_user(db, uid, status="active") for st in stories[:_MAX_EVAL_STORIES]: body = (st.canonical_markdown or "").strip() if not body: continue md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}" sb = await trace_svc.build_story_bundle(uid, str(st.id)) formatted, sb2 = await trace_svc.format_story_bundle(sb) fm = formatted.format_meta sj = await judge.judge_memoir( memoir_markdown=md, source_transcript=formatted.source_transcript, structured_evidence=formatted.structured_evidence, reference_memoir_markdown=reference_memoir, evidence_notes=_library_evidence_notes( sb2.lineage_tier, formatted.evidence_summary, fm.truncated, fm.dropped_sections, ), ) story_entries.append( { "id": st.id, "title": st.title, "stage": st.stage, "lineage_tier": sb2.lineage_tier, "evidence_summary": formatted.evidence_summary, "evidence_trace": sb2.model_dump(), "format_meta": fm.model_dump(), "judge": sj.model_dump() if sj else None, } ) except Exception as e: logger.warning("eval story judges skipped: {}", e) synth_scores: list[float] = [] if mem_out is not None: synth_scores.append(float(mem_out.total_score)) library_scores: list[float] = [] for row in chapter_entries: j = row.get("judge") if isinstance(j, dict) and j.get("total_score") is not None: library_scores.append(float(j["total_score"])) for row in story_entries: j = row.get("judge") if isinstance(j, dict) and j.get("total_score") is not None: library_scores.append(float(j["total_score"])) def _mean(xs: list[float]) -> float: return sum(xs) / len(xs) if xs else 0.0 if synth_scores and library_scores: mem_total = 0.5 * _mean(synth_scores) + 0.5 * _mean(library_scores) elif synth_scores: mem_total = _mean(synth_scores) elif library_scores: mem_total = _mean(library_scores) else: mem_total = None exp = await eval_repo.get_experiment(db, str(run.experiment_id)) weights = ( exp.composite_weights_json if exp and isinstance(exp.composite_weights_json, dict) else None ) comp = _composite(conv_total, mem_total, weights) bundle: dict[str, Any] = { "conversation_judge": conv_out.model_dump() if conv_out else None, "memoir_judge": mem_out.model_dump() if mem_out else None, "synthetic_memoir_judge": mem_out.model_dump() if mem_out else None, "chapters": chapter_entries, "stories": story_entries, "judge_meta": { "conversation_judge_ok": conv_out is not None, "memoir_synthetic_ok": mem_out is not None, "memoir_synth_scores_n": len(synth_scores), "memoir_library_scores_n": len(library_scores), "synthetic_memoir_lineage_tier": "replay_transcript_only", "synthetic_memoir_evidence_summary": ( f"replay_turns={len(utterances)};structured_memory=unbound" ), "memoir_aggregate_rule": ( "synthetic_memoir_judge_plus_library_memoir_judge_weighted_mean" if synth_scores and library_scores else ("synthetic_memoir_only" if synth_scores else "library_memoir_only") ), }, } await eval_repo.update_run( db, run, status="completed", memoir_markdown=memoir_md, conversation_score_total=conv_total, memoir_score_total=mem_total, composite_score=comp, judge_bundle_json=bundle, completed_at=datetime.now(timezone.utc), ) await db.commit() async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> None: runs = await eval_repo.list_runs_for_experiment(db, experiment_id) exp = await eval_repo.get_experiment(db, experiment_id) if not exp: return cases = await eval_repo.list_cases(db, str(exp.regression_set_id)) incomplete = [r for r in runs if str(r.status) not in ("completed", "failed")] if incomplete: return failed = [r for r in runs if str(r.status) == "failed"] if failed: await eval_repo.update_experiment( db, exp, status="failed", error_message="部分 run 失败", completed_at=datetime.now(timezone.utc), ) await db.commit() return gr = compute_gate(cases=cases, runs=runs) await eval_repo.upsert_gate_verdict( db, experiment_id=experiment_id, passed=gr.passed, mean_composite_delta=gr.mean_delta, protected_regressions_json=gr.protected_regressions, details_json=gate_result_to_details(gr), ) await eval_repo.update_experiment( db, exp, status="completed", completed_at=datetime.now(timezone.utc), ) await db.commit() async def execute_experiment_full(experiment_id: str) -> None: from app.core.redis_lock import acquire_redis_lock, release_redis_lock lock_key = f"lock:eval_experiment:{experiment_id}" lock_handle = acquire_redis_lock(lock_key, ttl_seconds=7200) if lock_handle is None: logger.warning( "eval experiment already running or lock busy experiment_id={}", experiment_id, ) return try: async with AsyncSessionLocal() as db: exp = await eval_repo.get_experiment(db, experiment_id) if not exp: return await eval_repo.update_experiment(db, exp, status="running") await db.commit() cases = await eval_repo.list_cases(db, str(exp.regression_set_id)) base_v = await eval_repo.get_version(db, str(exp.baseline_version_id)) cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id)) if base_v is None or cand_v is None: await eval_repo.update_experiment( db, exp, status="failed", error_message="version 不存在", completed_at=datetime.now(timezone.utc), ) await db.commit() return for case in cases: for side, ver in ("baseline", base_v), ("candidate", cand_v): run = await eval_repo.get_run(db, experiment_id, str(case.id), side) if not run: run = await eval_repo.create_run( db, experiment_id=experiment_id, case_id=str(case.id), side=side, ) await db.commit() await execute_eval_run(db, run=run, case=case, version=ver) await _finalize_experiment_gate(db, experiment_id) finally: release_redis_lock(lock_handle)