refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步
2026-04-08 21:36:12 +08:00
parent 2a0c80987d
commit 064ad2161d
64 changed files with 3412 additions and 3068 deletions
--- a/api/app/features/evaluation/conversation_compare_summary.py
+++ b/api/app/features/evaluation/conversation_compare_summary.py
@@ -0,0 +1,176 @@
+"""Structured A/B compare summary for internal eval conversation judging."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.features.evaluation.judge_schemas import ConversationJudgeOutput
+
+# (attribute name, display label) pairs. Each attribute is read from both the
+# baseline and replay judge outputs via getattr to build the "group_deltas"
+# table of the A/B summary.
+_GROUP_KEYS: tuple[tuple[str, str], ...] = (
+    ("emotion_score", "情绪与陪伴"),
+    ("information_score", "信息挖掘"),
+    ("persona_score", "人物建模"),
+    ("structure_score", "结构引导"),
+    ("question_score", "提问质量"),
+)
+
+# Same (attribute name, display label) shape, used for the "leaf_deltas" table.
+# The parity/surpass gate and the key regression/gain lists are computed from
+# the deltas of these entries.
+_LEAF_KEYS: tuple[tuple[str, str], ...] = (
+    ("emotion_carry", "情绪承接"),
+    ("context_memory", "上下文记忆"),
+    ("rhythm_control", "节奏控制"),
+    ("persona_understanding", "人物理解"),
+    ("follow_up_depth", "追问深度"),
+    ("non_leading", "非引导性"),
+)
+
+# Substrings that _has_repeat_issue looks for inside a judge's major_issues
+# entries; a single hit on any entry marks the judge as having a repeat issue.
+_REPEAT_ISSUE_MARKERS = ("重复盘问", "重复询问", "已答", "忽略上文", "同义重问")
+
+
+def _round(x: float) -> float:
+    """Coerce *x* to ``float`` and round it to two decimal places."""
+    as_float = float(x)
+    return round(as_float, 2)
+
+
+def _issues_text(judge: ConversationJudgeOutput | None) -> list[str]:
+    """Return the non-blank ``major_issues`` entries of *judge* as stripped strings.
+
+    Each entry is converted with ``str`` and stripped; entries that end up
+    empty are dropped. A ``None`` judge yields an empty list.
+    """
+    cleaned: list[str] = []
+    if judge is not None:
+        for raw in judge.major_issues:
+            text = str(raw).strip()
+            if text:
+                cleaned.append(text)
+    return cleaned
+
+
+def _has_repeat_issue(judge: ConversationJudgeOutput | None) -> bool:
+    """Tell whether any major issue of *judge* contains a repeat-issue marker.
+
+    The issue texts come from ``_issues_text`` and each one is checked for any
+    of the ``_REPEAT_ISSUE_MARKERS`` substrings.
+    """
+    for issue in _issues_text(judge):
+        if any(marker in issue for marker in _REPEAT_ISSUE_MARKERS):
+            return True
+    return False
+
+
+def build_conversation_compare_summary(
+    *,
+    baseline_judge: ConversationJudgeOutput | None,
+    replay_judge: ConversationJudgeOutput | None,
+    baseline_transcript: str,
+    replay_transcript: str,
+    conv_cap: int,
+    compare_cap_each: int,
+    fixture_filename: str | None = None,
+) -> dict[str, Any]:
+    """Summarise how the replay judge result compares with the baseline one.
+
+    Args:
+        baseline_judge: Judge output for the baseline side, or ``None``.
+        replay_judge: Judge output for the replay side, or ``None``.
+        baseline_transcript: Baseline transcript; only its stripped length is used.
+        replay_transcript: Replay transcript; only its stripped length is used.
+        conv_cap: Character cap each length is compared against for the
+            ``*_truncated_for_conversation`` flags.
+        compare_cap_each: Character cap each length is compared against for the
+            ``*_truncated_for_compare`` flags.
+        fixture_filename: Echoed back unchanged in the result.
+
+    Returns:
+        A dict whose ``mode`` is ``"single"`` when either judge is falsy (gate
+        status ``"insufficient_data"`` without a replay judge,
+        ``"single_side_only"`` without a baseline judge) and ``"ab"`` otherwise.
+        The ``"ab"`` form carries both totals, the group and leaf delta tables,
+        the regressed and improved leaf labels, the repeat-issue flag, the
+        truncation info and a gate whose status is ``"surpass"``, ``"parity"``
+        or ``"regressed"``.
+    """
+    baseline_len = len((baseline_transcript or "").strip())
+    replay_len = len((replay_transcript or "").strip())
+    conversation_limit = int(conv_cap)
+    compare_limit = int(compare_cap_each)
+    truncation = {
+        "baseline_chars": baseline_len,
+        "replay_chars": replay_len,
+        "conversation_cap_chars": conversation_limit,
+        "compare_cap_each_chars": compare_limit,
+        "baseline_truncated_for_conversation": baseline_len > conversation_limit,
+        "replay_truncated_for_conversation": replay_len > conversation_limit,
+        "baseline_truncated_for_compare": baseline_len > compare_limit,
+        "replay_truncated_for_compare": replay_len > compare_limit,
+    }
+
+    # Without a replay score there is nothing to compare at all.
+    if not replay_judge:
+        gate = {
+            "status": "insufficient_data",
+            "reasons": ["缺少回放整体评分，无法判断是否追平或超过 A。"],
+        }
+        return {
+            "fixture_filename": fixture_filename,
+            "mode": "single",
+            "truncation": truncation,
+            "gate": gate,
+        }
+
+    # Replay-only: report its total, but no verdict against the baseline.
+    if not baseline_judge:
+        gate = {
+            "status": "single_side_only",
+            "reasons": ["当前只有新对话单侧评分，可用于优化，但不能判定是否超过 A。"],
+        }
+        return {
+            "fixture_filename": fixture_filename,
+            "mode": "single",
+            "replay_total": _round(replay_judge.total_score),
+            "truncation": truncation,
+            "gate": gate,
+        }
+
+    def delta_table(pairs: tuple[tuple[str, str], ...]) -> dict[str, dict[str, Any]]:
+        # One row per (attribute, label): both rounded scores plus replay - baseline.
+        table: dict[str, dict[str, Any]] = {}
+        for attr, label in pairs:
+            before = getattr(baseline_judge, attr)
+            after = getattr(replay_judge, attr)
+            table[attr] = {
+                "label": label,
+                "baseline": _round(before),
+                "replay": _round(after),
+                "delta": _round(after - before),
+            }
+        return table
+
+    group_deltas = delta_table(_GROUP_KEYS)
+    leaf_deltas = delta_table(_LEAF_KEYS)
+
+    # A leaf counts as a key regression / gain once it moves by 0.75 or more.
+    key_regressions: list[str] = []
+    key_gains: list[str] = []
+    for row in leaf_deltas.values():
+        change = float(row["delta"])
+        if change <= -0.75:
+            key_regressions.append(row["label"])
+        elif change >= 0.75:
+            key_gains.append(row["label"])
+
+    total_delta = _round(replay_judge.total_score - baseline_judge.total_score)
+    has_repeat_regression = _has_repeat_issue(replay_judge)
+
+    memory_change = float(leaf_deltas["context_memory"]["delta"])
+    emotion_change = float(leaf_deltas["emotion_carry"]["delta"])
+    persona_change = float(leaf_deltas["persona_understanding"]["delta"])
+    rhythm_change = float(leaf_deltas["rhythm_control"]["delta"])
+
+    parity_passed = (
+        total_delta >= -1.0
+        and memory_change >= -0.5
+        and emotion_change >= -0.5
+        and not has_repeat_regression
+    )
+    surpass_passed = (
+        total_delta >= 1.5
+        and memory_change >= 0
+        and persona_change >= 0
+        and rhythm_change >= -0.25
+        and not has_repeat_regression
+    )
+    # "surpass" takes precedence over "parity"; anything else is a regression.
+    status = "surpass" if surpass_passed else ("parity" if parity_passed else "regressed")
+
+    # The first reason always describes the total-score movement on its own,
+    # independently of the gate status.
+    if total_delta >= 1.5:
+        headline = "总分已显著超过基线。"
+    elif total_delta >= -1.0:
+        headline = "总分已基本追平基线。"
+    else:
+        headline = "总分仍明显落后基线。"
+    reasons: list[str] = [headline]
+    if has_repeat_regression:
+        reasons.append("回放侧仍出现重复盘问或忽略已知信息的风险。")
+    if key_regressions:
+        regressed_labels = "、".join(key_regressions[:4])
+        reasons.append(f"关键回落维度：{regressed_labels}。")
+    if key_gains:
+        gained_labels = "、".join(key_gains[:4])
+        reasons.append(f"关键提升维度：{gained_labels}。")
+    compare_was_truncated = (
+        truncation["baseline_truncated_for_compare"]
+        or truncation["replay_truncated_for_compare"]
+    )
+    if compare_was_truncated:
+        reasons.append("A/B 对比稿使用了截断 transcript，长对话结论需结合逐轮评分复核。")
+
+    gate = {
+        "status": status,
+        "parity_passed": parity_passed,
+        "surpass_passed": surpass_passed,
+        "reasons": reasons,
+        "golden_set_note": "建议在固定黄金样本集上复跑该口径，再决定是否发布。",
+    }
+    return {
+        "fixture_filename": fixture_filename,
+        "mode": "ab",
+        "baseline_total": _round(baseline_judge.total_score),
+        "replay_total": _round(replay_judge.total_score),
+        "total_delta": total_delta,
+        "group_deltas": group_deltas,
+        "leaf_deltas": leaf_deltas,
+        "key_regressions": key_regressions,
+        "key_gains": key_gains,
+        "repeat_issue_detected": has_repeat_regression,
+        "truncation": truncation,
+        "gate": gate,
+    }
+