life-echo/api/app/features/evaluation/memoir_compare_summary.py

"""Structured A/B compare summary for internal eval memoir chapter judging.

Mirrors `conversation_compare_summary.py`: for each chapter, take the
baseline judge and the new-chapter judge, compute group-level and leaf-level
deltas, and produce a gate verdict.
"""

from __future__ import annotations

from typing import Any

from app.features.evaluation.judge_schemas import MemoirJudgeOutput

# Group-level rubric rows, one per top-level scoring dimension.
# Each entry is (attribute name read from the judge output via getattr,
# display label copied into the summary, value emitted under the "max" key).
# The third column sums to 100 across all groups; it is presumably the
# maximum achievable score for that group — confirm against the judge schema.
_GROUP_KEYS: tuple[tuple[str, str, float], ...] = (
    ("authenticity_score", "记忆与真实度", 23),
    ("information_score", "信息呈现", 14),
    ("narrative_score", "叙事结构", 14),
    ("language_score", "语言表达", 18),
    ("emotion_score", "情感", 9),
    ("character_score", "人物", 9),
    ("coherence_score", "连贯一致", 4),
    ("richness_score", "丰富度", 5),
    ("publish_ready_score", "出版就绪", 4),
)

# Leaf-level rubric rows, using the same (attribute name, display label,
# "max" value) layout as the group table. These rows drive the per-leaf
# deltas and the regression / gain lists in the compare summary.
# NOTE(review): the third column of the leaves sharing a prefix (mem_, info_,
# narr_, lang_, emo_, char_, coh_, rich_, pub_) adds up to 23/14/14/18/9/9/4/5/4,
# matching the group table values in order — presumably each prefix rolls up
# into the corresponding group; confirm against the judge schema.
_LEAF_KEYS: tuple[tuple[str, str, float], ...] = (
    ("mem_fidelity", "记忆忠实度", 9),
    ("mem_factual_accuracy", "事实准确性", 5),
    ("mem_factual_coverage", "事实覆盖率", 5),
    ("mem_traceability", "记忆可追溯性", 4),
    ("info_slot_coverage", "槽位覆盖度", 6),
    ("info_sufficiency", "信息充分性", 4),
    ("info_density", "信息密度", 4),
    ("narr_structure", "故事结构", 6),
    ("narr_paragraphs", "段落组织", 5),
    ("narr_pacing", "节奏控制", 3),
    ("lang_fluency", "语言流畅度", 3),
    ("lang_conciseness", "表达精炼度", 3),
    ("lang_literary", "文笔质量", 4),
    ("lang_controlled_expansion", "控制性扩写能力", 4),
    ("lang_detail", "细节还原与强化", 2),
    ("lang_style", "风格一致性", 2),
    ("emo_authenticity", "情感真实度", 5),
    ("emo_depth", "情感深度", 4),
    ("char_understanding", "人物理解", 4),
    ("char_consistency", "人物一致性", 3),
    ("char_integration", "人物融入度", 2),
    ("coh_timeline", "时间线一致性", 2),
    ("coh_cross_chapter", "跨章节关联", 2),
    ("rich_analogy", "类比与引用", 3),
    ("rich_diversity", "表达多样性", 2),
    ("pub_editorial_cost", "编辑成本", 2),
    ("pub_completeness", "完整度", 2),
)


def _round(x: float) -> float:
    return round(float(x), 2)


# A leaf counts as a key regression / key gain once its rounded delta moves by
# at least this many points in the respective direction.
_LEAF_SHIFT_THRESHOLD = 0.5
# Total-score movement (in points) separating "surpass" / "parity" / "regressed".
_TOTAL_SHIFT_THRESHOLD = 2.0
# Maximum number of key regressions tolerated by each gate level.
_PARITY_MAX_REGRESSIONS = 3
_SURPASS_MAX_REGRESSIONS = 1
# Cap on how many labels are spelled out in each human-readable reason line.
_MAX_LABELS_PER_REASON = 6


def _build_delta_table(
    keys: tuple[tuple[str, str, float], ...],
    baseline_judge: MemoirJudgeOutput,
    chapter_judge: MemoirJudgeOutput,
) -> dict[str, dict[str, Any]]:
    """Build one baseline / chapter / delta row per rubric key in *keys*."""
    table: dict[str, dict[str, Any]] = {}
    for key, label, max_score in keys:
        baseline_value = getattr(baseline_judge, key)
        chapter_value = getattr(chapter_judge, key)
        table[key] = {
            "label": label,
            "max": max_score,
            "baseline": _round(baseline_value),
            "chapter": _round(chapter_value),
            # The delta is taken on the raw scores and rounded afterwards, so
            # it may differ by 0.01 from the difference of the rounded columns.
            "delta": _round(chapter_value - baseline_value),
        }
    return table


def build_memoir_compare_summary(
    *,
    baseline_judge: MemoirJudgeOutput | None,
    chapter_judge: MemoirJudgeOutput | None,
) -> dict[str, Any]:
    """Compare a new chapter's judge output against the baseline's.

    Args:
        baseline_judge: Judge output for the baseline chapter, or ``None``
            when no baseline score is available.
        chapter_judge: Judge output for the new chapter, or ``None`` when the
            new chapter has not been scored.

    Returns:
        A dict whose shape depends on which inputs are present:

        * ``chapter_judge`` is ``None``: ``mode == "single"`` with gate status
          ``"insufficient_data"``.
        * only ``baseline_judge`` is ``None``: ``mode == "single"`` with
          ``chapter_total`` and gate status ``"single_side_only"``.
        * both present: ``mode == "ab"`` with both totals, ``total_delta``,
          ``group_deltas`` / ``leaf_deltas`` tables, the ``key_regressions``
          and ``key_gains`` label lists, and a ``gate`` verdict.

        Gate rules in A/B mode: ``surpass`` needs a total delta of at least
        +2.0 with at most one key regression; ``parity`` needs a total delta
        of at least -2.0 with at most three key regressions; anything else is
        ``regressed``. A key regression (gain) is a leaf whose rounded delta
        is <= -0.5 (>= +0.5).
    """
    # Explicit None checks: only a missing judge should take the single-side
    # paths, regardless of how the judge object evaluates in a boolean context.
    if chapter_judge is None:
        return {
            "mode": "single",
            "gate": {
                "status": "insufficient_data",
                "reasons": ["缺少新稿评分，无法进行 A/B 对比。"],
            },
        }
    if baseline_judge is None:
        return {
            "mode": "single",
            "chapter_total": _round(chapter_judge.total_score),
            "gate": {
                "status": "single_side_only",
                "reasons": ["缺少基线评分，仅有新稿单侧分数。"],
            },
        }

    group_deltas = _build_delta_table(_GROUP_KEYS, baseline_judge, chapter_judge)
    leaf_deltas = _build_delta_table(_LEAF_KEYS, baseline_judge, chapter_judge)

    total_delta = _round(chapter_judge.total_score - baseline_judge.total_score)
    key_regressions = [
        row["label"]
        for row in leaf_deltas.values()
        if row["delta"] <= -_LEAF_SHIFT_THRESHOLD
    ]
    key_gains = [
        row["label"]
        for row in leaf_deltas.values()
        if row["delta"] >= _LEAF_SHIFT_THRESHOLD
    ]

    parity_passed = (
        total_delta >= -_TOTAL_SHIFT_THRESHOLD
        and len(key_regressions) <= _PARITY_MAX_REGRESSIONS
    )
    surpass_passed = (
        total_delta >= _TOTAL_SHIFT_THRESHOLD
        and len(key_regressions) <= _SURPASS_MAX_REGRESSIONS
    )

    # Surpass is checked first because its conditions are stricter than parity's.
    if surpass_passed:
        status = "surpass"
    elif parity_passed:
        status = "parity"
    else:
        status = "regressed"

    # The first reason describes the total score only; it is independent of the
    # regression-count part of the gate, so it can read more favourably than
    # the final status when too many leaves regressed.
    reasons: list[str] = []
    if total_delta >= _TOTAL_SHIFT_THRESHOLD:
        reasons.append("总分显著超过基线。")
    elif total_delta >= -_TOTAL_SHIFT_THRESHOLD:
        reasons.append("总分基本追平基线。")
    else:
        reasons.append("总分明显落后基线。")
    if key_regressions:
        shown = key_regressions[:_MAX_LABELS_PER_REASON]
        reasons.append(f"回落项：{'、'.join(shown)}。")
    if key_gains:
        shown = key_gains[:_MAX_LABELS_PER_REASON]
        reasons.append(f"提升项：{'、'.join(shown)}。")

    return {
        "mode": "ab",
        "baseline_total": _round(baseline_judge.total_score),
        "chapter_total": _round(chapter_judge.total_score),
        "total_delta": total_delta,
        "group_deltas": group_deltas,
        "leaf_deltas": leaf_deltas,
        "key_regressions": key_regressions,
        "key_gains": key_gains,
        "gate": {
            "status": status,
            "parity_passed": parity_passed,
            "surpass_passed": surpass_passed,
            "reasons": reasons,
        },
    }