149 lines
5.1 KiB
Python
149 lines
5.1 KiB
Python
"""Structured A/B compare summary for internal eval memoir chapter judging.

Mirrors `conversation_compare_summary.py`: for each chapter, take the
baseline judge and the new-chapter judge, compute group-level and leaf-level
deltas, and produce a gate verdict.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Any
|
|
|
|
from app.features.evaluation.judge_schemas import MemoirJudgeOutput
|
|
|
|
# Group-level score table. Each row is
# (judge attribute name, display label, maximum score for that group).
# The nine maxima add up to 100.
_GROUP_KEYS: tuple[tuple[str, str, float], ...] = (
    ("authenticity_score", "记忆与真实度", 23),
    ("information_score", "信息呈现", 14),
    ("narrative_score", "叙事结构", 14),
    ("language_score", "语言表达", 18),
    ("emotion_score", "情感", 9),
    ("character_score", "人物", 9),
    ("coherence_score", "连贯一致", 4),
    ("richness_score", "丰富度", 5),
    ("publish_ready_score", "出版就绪", 4),
)
|
|
|
|
# Leaf-level score table. Each row is
# (judge attribute name, display label, maximum score for that leaf).
# Rows are grouped below by key prefix; the per-prefix subtotal of the
# maxima is noted on each group, and all 27 maxima add up to 100.
_LEAF_KEYS: tuple[tuple[str, str, float], ...] = (
    # mem_* (subtotal 23)
    ("mem_fidelity", "记忆忠实度", 9),
    ("mem_factual_accuracy", "事实准确性", 5),
    ("mem_factual_coverage", "事实覆盖率", 5),
    ("mem_traceability", "记忆可追溯性", 4),
    # info_* (subtotal 14)
    ("info_slot_coverage", "槽位覆盖度", 6),
    ("info_sufficiency", "信息充分性", 4),
    ("info_density", "信息密度", 4),
    # narr_* (subtotal 14)
    ("narr_structure", "故事结构", 6),
    ("narr_paragraphs", "段落组织", 5),
    ("narr_pacing", "节奏控制", 3),
    # lang_* (subtotal 18)
    ("lang_fluency", "语言流畅度", 3),
    ("lang_conciseness", "表达精炼度", 3),
    ("lang_literary", "文笔质量", 4),
    ("lang_controlled_expansion", "控制性扩写能力", 4),
    ("lang_detail", "细节还原与强化", 2),
    ("lang_style", "风格一致性", 2),
    # emo_* (subtotal 9)
    ("emo_authenticity", "情感真实度", 5),
    ("emo_depth", "情感深度", 4),
    # char_* (subtotal 9)
    ("char_understanding", "人物理解", 4),
    ("char_consistency", "人物一致性", 3),
    ("char_integration", "人物融入度", 2),
    # coh_* (subtotal 4)
    ("coh_timeline", "时间线一致性", 2),
    ("coh_cross_chapter", "跨章节关联", 2),
    # rich_* (subtotal 5)
    ("rich_analogy", "类比与引用", 3),
    ("rich_diversity", "表达多样性", 2),
    # pub_* (subtotal 4)
    ("pub_editorial_cost", "编辑成本", 2),
    ("pub_completeness", "完整度", 2),
)
|
|
|
|
|
|
def _round(x: float) -> float:
    """Coerce *x* to ``float`` and return it rounded to two decimal places."""
    as_float = float(x)
    return round(as_float, 2)
|
|
|
|
|
|
def _build_delta_table(
    keys: tuple[tuple[str, str, float], ...],
    baseline_judge: Any,
    chapter_judge: Any,
) -> dict[str, dict[str, Any]]:
    """Build one baseline/chapter/delta row per entry of *keys*.

    Each entry of *keys* is ``(attribute name, label, max)``. The attribute
    is read once from each judge; the row stores both values and their
    difference (chapter minus baseline), all rounded via ``_round``.
    """
    table: dict[str, dict[str, Any]] = {}
    for key, label, mx in keys:
        baseline_value = getattr(baseline_judge, key)
        chapter_value = getattr(chapter_judge, key)
        table[key] = {
            "label": label,
            "max": mx,
            "baseline": _round(baseline_value),
            "chapter": _round(chapter_value),
            "delta": _round(chapter_value - baseline_value),
        }
    return table


def build_memoir_compare_summary(
    *,
    baseline_judge: MemoirJudgeOutput | None,
    chapter_judge: MemoirJudgeOutput | None,
) -> dict[str, Any]:
    """Compare a new chapter's judge scores against the baseline's.

    Args:
        baseline_judge: Judge output for the baseline, or ``None`` if absent.
        chapter_judge: Judge output for the new chapter, or ``None`` if absent.

    Returns:
        A summary dict.

        * If ``chapter_judge`` is missing: ``mode == "single"`` and a gate
          with status ``"insufficient_data"``.
        * If only ``baseline_judge`` is missing: ``mode == "single"``, the
          rounded ``chapter_total`` and a gate with status
          ``"single_side_only"``.
        * Otherwise: ``mode == "ab"`` with both totals, ``total_delta``,
          the group and leaf delta tables, the labels of leaf regressions
          and gains, and a gate whose status is ``"surpass"``, ``"parity"``
          or ``"regressed"``.
    """
    if not chapter_judge:
        return {
            "mode": "single",
            "gate": {
                "status": "insufficient_data",
                "reasons": ["缺少新稿评分,无法进行 A/B 对比。"],
            },
        }
    if not baseline_judge:
        return {
            "mode": "single",
            "chapter_total": _round(chapter_judge.total_score),
            "gate": {
                "status": "single_side_only",
                "reasons": ["缺少基线评分,仅有新稿单侧分数。"],
            },
        }

    # Gate thresholds, applied to the already-rounded deltas.
    total_margin = 2.0  # total-score swing that counts as ahead / behind
    leaf_margin = 0.5  # leaf-score swing that counts as a gain / regression
    max_regressions_for_parity = 3
    max_regressions_for_surpass = 1
    max_labels_in_reason = 6

    group_deltas = _build_delta_table(_GROUP_KEYS, baseline_judge, chapter_judge)
    leaf_deltas = _build_delta_table(_LEAF_KEYS, baseline_judge, chapter_judge)

    total_delta = _round(chapter_judge.total_score - baseline_judge.total_score)
    key_regressions = [
        row["label"] for row in leaf_deltas.values() if row["delta"] <= -leaf_margin
    ]
    key_gains = [
        row["label"] for row in leaf_deltas.values() if row["delta"] >= leaf_margin
    ]

    parity_passed = (
        total_delta >= -total_margin
        and len(key_regressions) <= max_regressions_for_parity
    )
    surpass_passed = (
        total_delta >= total_margin
        and len(key_regressions) <= max_regressions_for_surpass
    )

    # Surpass is checked first: its conditions are strictly tighter than
    # parity's, so any result that surpasses also passes parity.
    if surpass_passed:
        status = "surpass"
    elif parity_passed:
        status = "parity"
    else:
        status = "regressed"

    # The first reason describes the total-score delta only; it does not
    # depend on the regression count, so it can differ from the gate status.
    reasons: list[str] = []
    if total_delta >= total_margin:
        reasons.append("总分显著超过基线。")
    elif total_delta >= -total_margin:
        reasons.append("总分基本追平基线。")
    else:
        reasons.append("总分明显落后基线。")
    if key_regressions:
        reasons.append(f"回落项:{'、'.join(key_regressions[:max_labels_in_reason])}。")
    if key_gains:
        reasons.append(f"提升项:{'、'.join(key_gains[:max_labels_in_reason])}。")

    return {
        "mode": "ab",
        "baseline_total": _round(baseline_judge.total_score),
        "chapter_total": _round(chapter_judge.total_score),
        "total_delta": total_delta,
        "group_deltas": group_deltas,
        "leaf_deltas": leaf_deltas,
        "key_regressions": key_regressions,
        "key_gains": key_gains,
        "gate": {
            "status": status,
            "parity_passed": parity_passed,
            "surpass_passed": surpass_passed,
            "reasons": reasons,
        },
    }
|