"""Structured A/B compare summary for internal eval memoir chapter judging. Mirrors `conversation_compare_summary.py`: for each chapter, take the baseline judge and the new-chapter judge, compute group-level and leaf-level deltas, and produce a gate verdict. """ from __future__ import annotations from typing import Any from app.features.evaluation.judge_schemas import MemoirJudgeOutput _GROUP_KEYS: tuple[tuple[str, str, float], ...] = ( ("authenticity_score", "记忆与真实度", 23), ("information_score", "信息呈现", 14), ("narrative_score", "叙事结构", 14), ("language_score", "语言表达", 18), ("emotion_score", "情感", 9), ("character_score", "人物", 9), ("coherence_score", "连贯一致", 4), ("richness_score", "丰富度", 5), ("publish_ready_score", "出版就绪", 4), ) _LEAF_KEYS: tuple[tuple[str, str, float], ...] = ( ("mem_fidelity", "记忆忠实度", 9), ("mem_factual_accuracy", "事实准确性", 5), ("mem_factual_coverage", "事实覆盖率", 5), ("mem_traceability", "记忆可追溯性", 4), ("info_slot_coverage", "槽位覆盖度", 6), ("info_sufficiency", "信息充分性", 4), ("info_density", "信息密度", 4), ("narr_structure", "故事结构", 6), ("narr_paragraphs", "段落组织", 5), ("narr_pacing", "节奏控制", 3), ("lang_fluency", "语言流畅度", 3), ("lang_conciseness", "表达精炼度", 3), ("lang_literary", "文笔质量", 4), ("lang_controlled_expansion", "控制性扩写能力", 4), ("lang_detail", "细节还原与强化", 2), ("lang_style", "风格一致性", 2), ("emo_authenticity", "情感真实度", 5), ("emo_depth", "情感深度", 4), ("char_understanding", "人物理解", 4), ("char_consistency", "人物一致性", 3), ("char_integration", "人物融入度", 2), ("coh_timeline", "时间线一致性", 2), ("coh_cross_chapter", "跨章节关联", 2), ("rich_analogy", "类比与引用", 3), ("rich_diversity", "表达多样性", 2), ("pub_editorial_cost", "编辑成本", 2), ("pub_completeness", "完整度", 2), ) def _round(x: float) -> float: return round(float(x), 2) def build_memoir_compare_summary( *, baseline_judge: MemoirJudgeOutput | None, chapter_judge: MemoirJudgeOutput | None, ) -> dict[str, Any]: if not chapter_judge: return { "mode": "single", "gate": { "status": "insufficient_data", "reasons": ["缺少新稿评分,无法进行 A/B 对比。"], }, } if not baseline_judge: return { "mode": "single", "chapter_total": _round(chapter_judge.total_score), "gate": { "status": "single_side_only", "reasons": ["缺少基线评分,仅有新稿单侧分数。"], }, } group_deltas = { key: { "label": label, "max": mx, "baseline": _round(getattr(baseline_judge, key)), "chapter": _round(getattr(chapter_judge, key)), "delta": _round(getattr(chapter_judge, key) - getattr(baseline_judge, key)), } for key, label, mx in _GROUP_KEYS } leaf_deltas = { key: { "label": label, "max": mx, "baseline": _round(getattr(baseline_judge, key)), "chapter": _round(getattr(chapter_judge, key)), "delta": _round(getattr(chapter_judge, key) - getattr(baseline_judge, key)), } for key, label, mx in _LEAF_KEYS } total_delta = _round(chapter_judge.total_score - baseline_judge.total_score) key_regressions = [ v["label"] for v in leaf_deltas.values() if float(v["delta"]) <= -0.5 ] key_gains = [v["label"] for v in leaf_deltas.values() if float(v["delta"]) >= 0.5] parity_passed = total_delta >= -2.0 and len(key_regressions) <= 3 surpass_passed = total_delta >= 2.0 and len(key_regressions) <= 1 if surpass_passed: status = "surpass" elif parity_passed: status = "parity" else: status = "regressed" reasons: list[str] = [] if total_delta >= 2.0: reasons.append("总分显著超过基线。") elif total_delta >= -2.0: reasons.append("总分基本追平基线。") else: reasons.append("总分明显落后基线。") if key_regressions: reasons.append(f"回落项:{'、'.join(key_regressions[:6])}。") if key_gains: reasons.append(f"提升项:{'、'.join(key_gains[:6])}。") return { "mode": "ab", "baseline_total": _round(baseline_judge.total_score), "chapter_total": _round(chapter_judge.total_score), "total_delta": total_delta, "group_deltas": group_deltas, "leaf_deltas": leaf_deltas, "key_regressions": key_regressions, "key_gains": key_gains, "gate": { "status": status, "parity_passed": parity_passed, "surpass_passed": surpass_passed, "reasons": reasons, }, }