# api/app/features/evaluation/memoir_compare_summary.py

"""Structured A/B compare summary for internal eval memoir chapter judging.

Mirrors `conversation_compare_summary.py`: for each chapter, take the
baseline judge and the new-chapter judge, compute group-level and leaf-level
deltas, and produce a gate verdict.
"""

from __future__ import annotations

from typing import Any

from app.features.evaluation.judge_schemas import MemoirJudgeOutput

# Group-level score dimensions compared between the baseline and the new chapter.
# Each entry is (attribute name read from the judge output via getattr,
# display label, value reported under "max" in the summary).
# The nine "max" values add up to 100.
_GROUP_KEYS: tuple[tuple[str, str, float], ...] = (
    ("authenticity_score", "记忆与真实度", 23),  # memory & authenticity
    ("information_score", "信息呈现", 14),  # information presentation
    ("narrative_score", "叙事结构", 14),  # narrative structure
    ("language_score", "语言表达", 18),  # language expression
    ("emotion_score", "情感", 9),  # emotion
    ("character_score", "人物", 9),  # characters
    ("coherence_score", "连贯一致", 4),  # coherence / consistency
    ("richness_score", "丰富度", 5),  # richness
    ("publish_ready_score", "出版就绪", 4),  # publication readiness
)

# Leaf-level score dimensions, in the same (attribute name, display label,
# "max" value) layout as _GROUP_KEYS. Leaf deltas drive the key-gain /
# key-regression lists used by the gate.
# Summed per name prefix, the "max" values are mem_ 23, info_ 14, narr_ 14,
# lang_ 18, emo_ 9, char_ 9, coh_ 4, rich_ 5, pub_ 4 -- the same numbers as the
# nine group maxima, which suggests each prefix is the breakdown of one group.
# NOTE(review): the prefix-to-group mapping is inferred from those sums;
# confirm against the judge schema.
_LEAF_KEYS: tuple[tuple[str, str, float], ...] = (
    ("mem_fidelity", "记忆忠实度", 9),  # memory fidelity
    ("mem_factual_accuracy", "事实准确性", 5),  # factual accuracy
    ("mem_factual_coverage", "事实覆盖率", 5),  # factual coverage
    ("mem_traceability", "记忆可追溯性", 4),  # memory traceability
    ("info_slot_coverage", "槽位覆盖度", 6),  # slot coverage
    ("info_sufficiency", "信息充分性", 4),  # information sufficiency
    ("info_density", "信息密度", 4),  # information density
    ("narr_structure", "故事结构", 6),  # story structure
    ("narr_paragraphs", "段落组织", 5),  # paragraph organization
    ("narr_pacing", "节奏控制", 3),  # pacing control
    ("lang_fluency", "语言流畅度", 3),  # language fluency
    ("lang_conciseness", "表达精炼度", 3),  # conciseness of expression
    ("lang_literary", "文笔质量", 4),  # writing quality
    ("lang_controlled_expansion", "控制性扩写能力", 4),  # controlled expansion ability
    ("lang_detail", "细节还原与强化", 2),  # detail restoration and enhancement
    ("lang_style", "风格一致性", 2),  # style consistency
    ("emo_authenticity", "情感真实度", 5),  # emotional authenticity
    ("emo_depth", "情感深度", 4),  # emotional depth
    ("char_understanding", "人物理解", 4),  # character understanding
    ("char_consistency", "人物一致性", 3),  # character consistency
    ("char_integration", "人物融入度", 2),  # character integration
    ("coh_timeline", "时间线一致性", 2),  # timeline consistency
    ("coh_cross_chapter", "跨章节关联", 2),  # cross-chapter linkage
    ("rich_analogy", "类比与引用", 3),  # analogies and quotations
    ("rich_diversity", "表达多样性", 2),  # expressive variety
    ("pub_editorial_cost", "编辑成本", 2),  # editorial cost
    ("pub_completeness", "完整度", 2),  # completeness
)


def _round(x: float) -> float:
    return round(float(x), 2)


# Gate thresholds, expressed in score points (they are compared against deltas
# of judge scores).
# A leaf whose rounded delta moves at least this far counts as a key gain
# (upwards) or a key regression (downwards).
_LEAF_SHIFT_THRESHOLD = 0.5
# Total-score delta at or above +this is "surpass" territory; at or above
# -this is still "parity" territory.
_TOTAL_SHIFT_THRESHOLD = 2.0
# Maximum number of key regressions tolerated for each verdict.
_PARITY_MAX_REGRESSIONS = 3
_SURPASS_MAX_REGRESSIONS = 1
# At most this many labels are listed in each human-readable reason line.
_MAX_LABELS_PER_REASON = 6


def _build_delta_table(
    baseline_judge: MemoirJudgeOutput,
    chapter_judge: MemoirJudgeOutput,
    keys: tuple[tuple[str, str, float], ...],
) -> dict[str, dict[str, Any]]:
    """Build one ``{key: {label, max, baseline, chapter, delta}}`` table.

    For every ``(key, label, max)`` entry in *keys*, the attribute named *key*
    is read from both judges. ``delta`` is ``chapter - baseline`` computed on
    the raw values and then rounded, so it is not derived from the already
    rounded ``baseline`` / ``chapter`` figures.
    """
    table: dict[str, dict[str, Any]] = {}
    for key, label, max_score in keys:
        baseline_value = getattr(baseline_judge, key)
        chapter_value = getattr(chapter_judge, key)
        table[key] = {
            "label": label,
            "max": max_score,
            "baseline": _round(baseline_value),
            "chapter": _round(chapter_value),
            "delta": _round(chapter_value - baseline_value),
        }
    return table


def build_memoir_compare_summary(
    *,
    baseline_judge: MemoirJudgeOutput | None,
    chapter_judge: MemoirJudgeOutput | None,
) -> dict[str, Any]:
    """Compare a new chapter's judge scores against the baseline's and gate it.

    Args:
        baseline_judge: Judge output for the baseline text, or ``None`` when
            no baseline score is available.
        chapter_judge: Judge output for the new chapter, or ``None`` when the
            chapter has not been scored.

    Returns:
        A dict whose shape depends on which inputs are present:

        * *chapter_judge* missing: ``{"mode": "single", "gate": {...}}`` with
          gate status ``"insufficient_data"``.
        * only *baseline_judge* missing: additionally carries
          ``"chapter_total"``; gate status is ``"single_side_only"``.
        * both present: ``"mode": "ab"`` with ``baseline_total``,
          ``chapter_total``, ``total_delta``, ``group_deltas``,
          ``leaf_deltas``, ``key_regressions``, ``key_gains`` and a ``gate``
          holding ``status`` (``"surpass"``, ``"parity"`` or ``"regressed"``),
          ``parity_passed``, ``surpass_passed`` and ``reasons``.

    Presence is checked with ``is None`` rather than truthiness so that a real
    judge object is never mistaken for a missing one.
    """
    if chapter_judge is None:
        return {
            "mode": "single",
            "gate": {
                "status": "insufficient_data",
                # "New-chapter score missing; A/B comparison is not possible."
                "reasons": ["缺少新稿评分，无法进行 A/B 对比。"],
            },
        }
    if baseline_judge is None:
        return {
            "mode": "single",
            "chapter_total": _round(chapter_judge.total_score),
            "gate": {
                "status": "single_side_only",
                # "Baseline score missing; only the new chapter's score exists."
                "reasons": ["缺少基线评分，仅有新稿单侧分数。"],
            },
        }

    group_deltas = _build_delta_table(baseline_judge, chapter_judge, _GROUP_KEYS)
    leaf_deltas = _build_delta_table(baseline_judge, chapter_judge, _LEAF_KEYS)

    total_delta = _round(chapter_judge.total_score - baseline_judge.total_score)
    # Key gains / regressions are decided on the rounded leaf deltas.
    key_regressions = [
        row["label"]
        for row in leaf_deltas.values()
        if row["delta"] <= -_LEAF_SHIFT_THRESHOLD
    ]
    key_gains = [
        row["label"]
        for row in leaf_deltas.values()
        if row["delta"] >= _LEAF_SHIFT_THRESHOLD
    ]

    regression_count = len(key_regressions)
    parity_passed = (
        total_delta >= -_TOTAL_SHIFT_THRESHOLD
        and regression_count <= _PARITY_MAX_REGRESSIONS
    )
    surpass_passed = (
        total_delta >= _TOTAL_SHIFT_THRESHOLD
        and regression_count <= _SURPASS_MAX_REGRESSIONS
    )

    # "surpass" is the stricter verdict, so it is checked before "parity".
    if surpass_passed:
        status = "surpass"
    elif parity_passed:
        status = "parity"
    else:
        status = "regressed"

    # The first reason describes the total score only; it does not depend on
    # the regression count, so it can read "ahead of baseline" even when the
    # status is not "surpass".
    reasons: list[str] = []
    if total_delta >= _TOTAL_SHIFT_THRESHOLD:
        reasons.append("总分显著超过基线。")  # "Total clearly exceeds the baseline."
    elif total_delta >= -_TOTAL_SHIFT_THRESHOLD:
        reasons.append("总分基本追平基线。")  # "Total roughly matches the baseline."
    else:
        reasons.append("总分明显落后基线。")  # "Total clearly trails the baseline."
    if key_regressions:
        # "Regressed items: ..." -- only the first few labels are listed.
        reasons.append(f"回落项：{'、'.join(key_regressions[:_MAX_LABELS_PER_REASON])}。")
    if key_gains:
        # "Improved items: ..." -- only the first few labels are listed.
        reasons.append(f"提升项：{'、'.join(key_gains[:_MAX_LABELS_PER_REASON])}。")

    return {
        "mode": "ab",
        "baseline_total": _round(baseline_judge.total_score),
        "chapter_total": _round(chapter_judge.total_score),
        "total_delta": total_delta,
        "group_deltas": group_deltas,
        "leaf_deltas": leaf_deltas,
        "key_regressions": key_regressions,
        "key_gains": key_gains,
        "gate": {
            "status": status,
            "parity_passed": parity_passed,
            "surpass_passed": surpass_passed,
            "reasons": reasons,
        },
    }
feat(eval): memoir A/B chapter judging and eval-web parity with dialogue - Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas. - Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error. - MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings. - app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS. - Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014. - Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples. 2026-04-10 10:23:43 +08:00			`"""Structured A/B compare summary for internal eval memoir chapter judging.`

			Mirrors `conversation_compare_summary.py`: for each chapter, take the
			`baseline judge and the new-chapter judge, compute group-level and leaf-level`
			`deltas, and produce a gate verdict.`
			`"""`

			`from __future__ import annotations`

			`from typing import Any`

			`from app.features.evaluation.judge_schemas import MemoirJudgeOutput`

			`_GROUP_KEYS: tuple[tuple[str, str, float], ...] = (`
			`("authenticity_score", "记忆与真实度", 23),`
			`("information_score", "信息呈现", 14),`
			`("narrative_score", "叙事结构", 14),`
			`("language_score", "语言表达", 18),`
			`("emotion_score", "情感", 9),`
			`("character_score", "人物", 9),`
			`("coherence_score", "连贯一致", 4),`
			`("richness_score", "丰富度", 5),`
			`("publish_ready_score", "出版就绪", 4),`
			`)`

			`_LEAF_KEYS: tuple[tuple[str, str, float], ...] = (`
			`("mem_fidelity", "记忆忠实度", 9),`
			`("mem_factual_accuracy", "事实准确性", 5),`
			`("mem_factual_coverage", "事实覆盖率", 5),`
			`("mem_traceability", "记忆可追溯性", 4),`
			`("info_slot_coverage", "槽位覆盖度", 6),`
			`("info_sufficiency", "信息充分性", 4),`
			`("info_density", "信息密度", 4),`
			`("narr_structure", "故事结构", 6),`
			`("narr_paragraphs", "段落组织", 5),`
			`("narr_pacing", "节奏控制", 3),`
			`("lang_fluency", "语言流畅度", 3),`
			`("lang_conciseness", "表达精炼度", 3),`
			`("lang_literary", "文笔质量", 4),`
			`("lang_controlled_expansion", "控制性扩写能力", 4),`
			`("lang_detail", "细节还原与强化", 2),`
			`("lang_style", "风格一致性", 2),`
			`("emo_authenticity", "情感真实度", 5),`
			`("emo_depth", "情感深度", 4),`
			`("char_understanding", "人物理解", 4),`
			`("char_consistency", "人物一致性", 3),`
			`("char_integration", "人物融入度", 2),`
			`("coh_timeline", "时间线一致性", 2),`
			`("coh_cross_chapter", "跨章节关联", 2),`
			`("rich_analogy", "类比与引用", 3),`
			`("rich_diversity", "表达多样性", 2),`
			`("pub_editorial_cost", "编辑成本", 2),`
			`("pub_completeness", "完整度", 2),`
			`)`


			`def _round(x: float) -> float:`
			`return round(float(x), 2)`


			`def build_memoir_compare_summary(`
			`*,`
			`baseline_judge: MemoirJudgeOutput \| None,`
			`chapter_judge: MemoirJudgeOutput \| None,`
			`) -> dict[str, Any]:`
			`if not chapter_judge:`
			`return {`
			`"mode": "single",`
			`"gate": {`
			`"status": "insufficient_data",`
			`"reasons": ["缺少新稿评分，无法进行 A/B 对比。"],`
			`},`
			`}`
			`if not baseline_judge:`
			`return {`
			`"mode": "single",`
			`"chapter_total": _round(chapter_judge.total_score),`
			`"gate": {`
			`"status": "single_side_only",`
			`"reasons": ["缺少基线评分，仅有新稿单侧分数。"],`
			`},`
			`}`

			`group_deltas = {`
			`key: {`
			`"label": label,`
			`"max": mx,`
			`"baseline": _round(getattr(baseline_judge, key)),`
			`"chapter": _round(getattr(chapter_judge, key)),`
			`"delta": _round(getattr(chapter_judge, key) - getattr(baseline_judge, key)),`
			`}`
			`for key, label, mx in _GROUP_KEYS`
			`}`
			`leaf_deltas = {`
			`key: {`
			`"label": label,`
			`"max": mx,`
			`"baseline": _round(getattr(baseline_judge, key)),`
			`"chapter": _round(getattr(chapter_judge, key)),`
			`"delta": _round(getattr(chapter_judge, key) - getattr(baseline_judge, key)),`
			`}`
			`for key, label, mx in _LEAF_KEYS`
			`}`

			`total_delta = _round(chapter_judge.total_score - baseline_judge.total_score)`
			`key_regressions = [`
			`v["label"] for v in leaf_deltas.values() if float(v["delta"]) <= -0.5`
			`]`
fix: 1. 修复登录界面文字被遮挡问题 2. 大字模式关闭后显示异常问题 3. 重新调整大字模式是否开启时的字体显示效果 2026-04-10 20:35:57 +08:00			`key_gains = [v["label"] for v in leaf_deltas.values() if float(v["delta"]) >= 0.5]`
feat(eval): memoir A/B chapter judging and eval-web parity with dialogue - Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas. - Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error. - MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings. - app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS. - Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014. - Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples. 2026-04-10 10:23:43 +08:00
			`parity_passed = total_delta >= -2.0 and len(key_regressions) <= 3`
			`surpass_passed = total_delta >= 2.0 and len(key_regressions) <= 1`

			`if surpass_passed:`
			`status = "surpass"`
			`elif parity_passed:`
			`status = "parity"`
			`else:`
			`status = "regressed"`

			`reasons: list[str] = []`
			`if total_delta >= 2.0:`
			`reasons.append("总分显著超过基线。")`
			`elif total_delta >= -2.0:`
			`reasons.append("总分基本追平基线。")`
			`else:`
			`reasons.append("总分明显落后基线。")`
			`if key_regressions:`
			`reasons.append(f"回落项：{'、'.join(key_regressions[:6])}。")`
			`if key_gains:`
			`reasons.append(f"提升项：{'、'.join(key_gains[:6])}。")`

			`return {`
			`"mode": "ab",`
			`"baseline_total": _round(baseline_judge.total_score),`
			`"chapter_total": _round(chapter_judge.total_score),`
			`"total_delta": total_delta,`
			`"group_deltas": group_deltas,`
			`"leaf_deltas": leaf_deltas,`
			`"key_regressions": key_regressions,`
			`"key_gains": key_gains,`
			`"gate": {`
			`"status": status,`
			`"parity_passed": parity_passed,`
			`"surpass_passed": surpass_passed,`
			`"reasons": reasons,`
			`},`
			`}`