# Source: life-echo/api/app/features/evaluation/memoir_compare_summary.py

"""Structured A/B compare summary for internal eval memoir chapter judging.
Mirrors `conversation_compare_summary.py`: for each chapter, take the
baseline judge and the new-chapter judge, compute group-level and leaf-level
deltas, and produce a gate verdict.
"""
from __future__ import annotations
from typing import Any
from app.features.evaluation.judge_schemas import MemoirJudgeOutput
# Top-level score groups that are compared between the baseline and the new
# chapter. Each entry is ``(judge attribute name, display label, max points)``;
# the attribute name is read off the judge objects with ``getattr`` and the
# max points of all groups add up to 100.
_GROUP_KEYS: tuple[tuple[str, str, float], ...] = (
    ("authenticity_score", "记忆与真实度", 23),
    ("information_score", "信息呈现", 14),
    ("narrative_score", "叙事结构", 14),
    ("language_score", "语言表达", 18),
    ("emotion_score", "情感", 9),
    ("character_score", "人物", 9),
    ("coherence_score", "连贯一致", 4),
    ("richness_score", "丰富度", 5),
    ("publish_ready_score", "出版就绪", 4),
)
# Fine-grained (leaf) scores compared between the baseline and the new chapter.
# Same layout as the group table: ``(judge attribute name, display label,
# max points)``. Entries are listed by key prefix; the max points under each
# prefix add up to one group maximum in ``_GROUP_KEYS`` (e.g. ``mem_*`` = 23),
# so each prefix presumably holds the leaves of that group.
_LEAF_KEYS: tuple[tuple[str, str, float], ...] = (
    # mem_* (23 points)
    ("mem_fidelity", "记忆忠实度", 9),
    ("mem_factual_accuracy", "事实准确性", 5),
    ("mem_factual_coverage", "事实覆盖率", 5),
    ("mem_traceability", "记忆可追溯性", 4),
    # info_* (14 points)
    ("info_slot_coverage", "槽位覆盖度", 6),
    ("info_sufficiency", "信息充分性", 4),
    ("info_density", "信息密度", 4),
    # narr_* (14 points)
    ("narr_structure", "故事结构", 6),
    ("narr_paragraphs", "段落组织", 5),
    ("narr_pacing", "节奏控制", 3),
    # lang_* (18 points)
    ("lang_fluency", "语言流畅度", 3),
    ("lang_conciseness", "表达精炼度", 3),
    ("lang_literary", "文笔质量", 4),
    ("lang_controlled_expansion", "控制性扩写能力", 4),
    ("lang_detail", "细节还原与强化", 2),
    ("lang_style", "风格一致性", 2),
    # emo_* (9 points)
    ("emo_authenticity", "情感真实度", 5),
    ("emo_depth", "情感深度", 4),
    # char_* (9 points)
    ("char_understanding", "人物理解", 4),
    ("char_consistency", "人物一致性", 3),
    ("char_integration", "人物融入度", 2),
    # coh_* (4 points)
    ("coh_timeline", "时间线一致性", 2),
    ("coh_cross_chapter", "跨章节关联", 2),
    # rich_* (5 points)
    ("rich_analogy", "类比与引用", 3),
    ("rich_diversity", "表达多样性", 2),
    # pub_* (4 points)
    ("pub_editorial_cost", "编辑成本", 2),
    ("pub_completeness", "完整度", 2),
)
def _round(x: float) -> float:
    """Coerce *x* to ``float`` and return it rounded to two decimal places."""
    as_float = float(x)
    return round(as_float, 2)
# A leaf counts as a gain/regression once its rounded delta moves this far.
_LEAF_SHIFT_THRESHOLD = 0.5
# Total-score delta needed to "surpass"; its negation is the parity floor.
_TOTAL_SHIFT_THRESHOLD = 2.0
# Maximum number of regressed leaves tolerated by each gate.
_MAX_PARITY_REGRESSIONS = 3
_MAX_SURPASS_REGRESSIONS = 1
# At most this many labels are spelled out in a human-readable reason line.
_MAX_LISTED_LABELS = 6
# Separator between labels in a reason line (Chinese enumeration comma).
_LABEL_SEPARATOR = "、"


def _score_deltas(
    baseline_judge: MemoirJudgeOutput,
    chapter_judge: MemoirJudgeOutput,
    keys: tuple[tuple[str, str, float], ...],
) -> dict[str, dict[str, Any]]:
    """Build the per-key baseline/chapter/delta table for one key list.

    Args:
        baseline_judge: Judge output for the baseline chapter.
        chapter_judge: Judge output for the new chapter.
        keys: ``(attribute name, label, max points)`` triples; each attribute
            is read from both judge objects.

    Returns:
        A dict keyed by attribute name. Each value holds ``label``, ``max``,
        the two rounded scores, and the rounded ``delta``
        (chapter minus baseline).
    """
    table: dict[str, dict[str, Any]] = {}
    for key, label, max_points in keys:
        baseline_score = getattr(baseline_judge, key)
        chapter_score = getattr(chapter_judge, key)
        table[key] = {
            "label": label,
            "max": max_points,
            "baseline": _round(baseline_score),
            "chapter": _round(chapter_score),
            # Round the raw difference, not the difference of rounded values.
            "delta": _round(chapter_score - baseline_score),
        }
    return table


def build_memoir_compare_summary(
    *,
    baseline_judge: MemoirJudgeOutput | None,
    chapter_judge: MemoirJudgeOutput | None,
) -> dict[str, Any]:
    """Compare a new chapter's judge scores against the baseline's.

    Args:
        baseline_judge: Judge output for the baseline chapter, or ``None``
            when no baseline score is available.
        chapter_judge: Judge output for the new chapter, or ``None`` when the
            new chapter has not been scored.

    Returns:
        A summary dict whose shape depends on which inputs are present:

        * ``chapter_judge`` is ``None``: ``mode`` is ``"single"`` and the gate
          status is ``"insufficient_data"``.
        * only ``baseline_judge`` is ``None``: ``mode`` is ``"single"``, the
          rounded ``chapter_total`` is included, and the gate status is
          ``"single_side_only"``.
        * both present: ``mode`` is ``"ab"`` with both totals, ``total_delta``,
          ``group_deltas``, ``leaf_deltas``, ``key_regressions``, ``key_gains``
          and a ``gate`` verdict.

        The gate status is ``"surpass"`` when the total delta is at least
        +2.0 with at most one regressed leaf, ``"parity"`` when the total
        delta is at least -2.0 with at most three regressed leaves, and
        ``"regressed"`` otherwise. A leaf is regressed (or gained) when its
        rounded delta is at most -0.5 (or at least +0.5).
    """
    if chapter_judge is None:
        return {
            "mode": "single",
            "gate": {
                "status": "insufficient_data",
                "reasons": ["缺少新稿评分,无法进行 A/B 对比。"],
            },
        }
    if baseline_judge is None:
        return {
            "mode": "single",
            "chapter_total": _round(chapter_judge.total_score),
            "gate": {
                "status": "single_side_only",
                "reasons": ["缺少基线评分,仅有新稿单侧分数。"],
            },
        }

    group_deltas = _score_deltas(baseline_judge, chapter_judge, _GROUP_KEYS)
    leaf_deltas = _score_deltas(baseline_judge, chapter_judge, _LEAF_KEYS)
    total_delta = _round(chapter_judge.total_score - baseline_judge.total_score)

    # Gains and regressions are judged on the already-rounded leaf deltas.
    key_regressions = [
        leaf["label"]
        for leaf in leaf_deltas.values()
        if leaf["delta"] <= -_LEAF_SHIFT_THRESHOLD
    ]
    key_gains = [
        leaf["label"]
        for leaf in leaf_deltas.values()
        if leaf["delta"] >= _LEAF_SHIFT_THRESHOLD
    ]

    parity_passed = (
        total_delta >= -_TOTAL_SHIFT_THRESHOLD
        and len(key_regressions) <= _MAX_PARITY_REGRESSIONS
    )
    surpass_passed = (
        total_delta >= _TOTAL_SHIFT_THRESHOLD
        and len(key_regressions) <= _MAX_SURPASS_REGRESSIONS
    )
    if surpass_passed:
        status = "surpass"
    elif parity_passed:
        status = "parity"
    else:
        status = "regressed"

    # The first reason describes the total score only; the gate status above
    # additionally depends on how many leaves regressed.
    reasons: list[str] = []
    if total_delta >= _TOTAL_SHIFT_THRESHOLD:
        reasons.append("总分显著超过基线。")
    elif total_delta >= -_TOTAL_SHIFT_THRESHOLD:
        reasons.append("总分基本追平基线。")
    else:
        reasons.append("总分明显落后基线。")
    if key_regressions:
        listed = _LABEL_SEPARATOR.join(key_regressions[:_MAX_LISTED_LABELS])
        reasons.append(f"回落项:{listed}")
    if key_gains:
        listed = _LABEL_SEPARATOR.join(key_gains[:_MAX_LISTED_LABELS])
        reasons.append(f"提升项:{listed}")

    return {
        "mode": "ab",
        "baseline_total": _round(baseline_judge.total_score),
        "chapter_total": _round(chapter_judge.total_score),
        "total_delta": total_delta,
        "group_deltas": group_deltas,
        "leaf_deltas": leaf_deltas,
        "key_regressions": key_regressions,
        "key_gains": key_gains,
        "gate": {
            "status": status,
            "parity_passed": parity_passed,
            "surpass_passed": surpass_passed,
            "reasons": reasons,
        },
    }