Files
life-echo/api/app/features/evaluation/conversation_compare_summary.py
Kevin 064ad2161d refactor(eval+memoir):精简内部评测路由与服务,composite/对话摘要与 judge 能力补强
- 访谈:新增 interview_state_hints,联动 orchestrator 与提示词
- 回忆录:story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建:开发用 celery broker、compose/development 脚本、依赖注入
- eval-web:移除数据集/实验/版本等页面与流式轮询,突出 Playground
- 文档与单测同步
2026-04-08 21:36:12 +08:00

177 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Structured A/B compare summary for internal eval conversation judging."""
from __future__ import annotations
from typing import Any
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
_GROUP_KEYS: tuple[tuple[str, str], ...] = (
("emotion_score", "情绪与陪伴"),
("information_score", "信息挖掘"),
("persona_score", "人物建模"),
("structure_score", "结构引导"),
("question_score", "提问质量"),
)
_LEAF_KEYS: tuple[tuple[str, str], ...] = (
("emotion_carry", "情绪承接"),
("context_memory", "上下文记忆"),
("rhythm_control", "节奏控制"),
("persona_understanding", "人物理解"),
("follow_up_depth", "追问深度"),
("non_leading", "非引导性"),
)
_REPEAT_ISSUE_MARKERS = ("重复盘问", "重复询问", "已答", "忽略上文", "同义重问")
def _round(x: float) -> float:
return round(float(x), 2)
def _issues_text(judge: ConversationJudgeOutput | None) -> list[str]:
if judge is None:
return []
return [str(x).strip() for x in judge.major_issues if str(x).strip()]
def _has_repeat_issue(judge: ConversationJudgeOutput | None) -> bool:
    """Tell whether any major issue of *judge* contains a repeat-issue marker.

    Returns ``True`` as soon as one cleaned issue text contains any substring
    from ``_REPEAT_ISSUE_MARKERS``; returns ``False`` otherwise, including
    when *judge* is ``None`` or has no usable issues.
    """
    for issue_text in _issues_text(judge):
        if any(marker in issue_text for marker in _REPEAT_ISSUE_MARKERS):
            return True
    return False
def build_conversation_compare_summary(
    *,
    baseline_judge: ConversationJudgeOutput | None,
    replay_judge: ConversationJudgeOutput | None,
    baseline_transcript: str,
    replay_transcript: str,
    conv_cap: int,
    compare_cap_each: int,
    fixture_filename: str | None = None,
) -> dict[str, Any]:
    """Build the structured compare summary for a baseline (A) vs. replay judging.

    Args:
        baseline_judge: Judge output for the baseline conversation, or ``None``.
        replay_judge: Judge output for the replayed conversation, or ``None``.
        baseline_transcript: Baseline transcript text; falsy values count as "".
        replay_transcript: Replay transcript text; falsy values count as "".
        conv_cap: Character cap used to flag "truncated for conversation".
        compare_cap_each: Per-side character cap used to flag "truncated for
            compare".
        fixture_filename: Optional fixture name echoed back in the summary.

    Returns:
        A dict that always carries ``fixture_filename``, ``mode``,
        ``truncation`` and ``gate``:

        * no replay judge -> ``mode="single"``, gate ``insufficient_data``;
        * replay judge only -> ``mode="single"`` plus ``replay_total``, gate
          ``single_side_only``;
        * both judges -> ``mode="ab"`` with totals, per-group and per-leaf
          deltas, key regressions/gains, the repeat-issue flag, and a gate
          whose status is ``surpass``, ``parity`` or ``regressed``.
    """
    # Measure each stripped transcript once; every truncation flag reuses it.
    baseline_chars = len((baseline_transcript or "").strip())
    replay_chars = len((replay_transcript or "").strip())
    conversation_cap = int(conv_cap)
    compare_cap = int(compare_cap_each)
    truncation = {
        "baseline_chars": baseline_chars,
        "replay_chars": replay_chars,
        "conversation_cap_chars": conversation_cap,
        "compare_cap_each_chars": compare_cap,
        "baseline_truncated_for_conversation": baseline_chars > conversation_cap,
        "replay_truncated_for_conversation": replay_chars > conversation_cap,
        "baseline_truncated_for_compare": baseline_chars > compare_cap,
        "replay_truncated_for_compare": replay_chars > compare_cap,
    }

    # Without a replay score nothing can be compared at all.
    if not replay_judge:
        return {
            "fixture_filename": fixture_filename,
            "mode": "single",
            "truncation": truncation,
            "gate": {
                "status": "insufficient_data",
                "reasons": ["缺少回放整体评分,无法判断是否追平或超过 A。"],
            },
        }

    # Replay score only: report it, but no A/B verdict is possible.
    if not baseline_judge:
        return {
            "fixture_filename": fixture_filename,
            "mode": "single",
            "replay_total": _round(replay_judge.total_score),
            "truncation": truncation,
            "gate": {
                "status": "single_side_only",
                "reasons": ["当前只有新对话单侧评分,可用于优化,但不能判定是否超过 A。"],
            },
        }

    def _delta_table(
        keys: tuple[tuple[str, str], ...],
    ) -> dict[str, dict[str, Any]]:
        # One entry per score attribute: label, both sides, and replay - baseline.
        table: dict[str, dict[str, Any]] = {}
        for key, label in keys:
            baseline_value = getattr(baseline_judge, key)
            replay_value = getattr(replay_judge, key)
            table[key] = {
                "label": label,
                "baseline": _round(baseline_value),
                "replay": _round(replay_value),
                "delta": _round(replay_value - baseline_value),
            }
        return table

    group_deltas = _delta_table(_GROUP_KEYS)
    leaf_deltas = _delta_table(_LEAF_KEYS)

    # A leaf dimension counts as a key regression/gain once it moves by 0.75+.
    key_regressions = [
        entry["label"] for entry in leaf_deltas.values() if entry["delta"] <= -0.75
    ]
    key_gains = [
        entry["label"] for entry in leaf_deltas.values() if entry["delta"] >= 0.75
    ]

    total_delta = _round(replay_judge.total_score - baseline_judge.total_score)
    # Only the replay side's major issues are inspected for repeat markers.
    has_repeat_regression = _has_repeat_issue(replay_judge)

    context_delta = leaf_deltas["context_memory"]["delta"]
    emotion_delta = leaf_deltas["emotion_carry"]["delta"]
    persona_delta = leaf_deltas["persona_understanding"]["delta"]
    rhythm_delta = leaf_deltas["rhythm_control"]["delta"]

    parity_passed = (
        total_delta >= -1.0
        and context_delta >= -0.5
        and emotion_delta >= -0.5
        and not has_repeat_regression
    )
    surpass_passed = (
        total_delta >= 1.5
        and context_delta >= 0
        and persona_delta >= 0
        and rhythm_delta >= -0.25
        and not has_repeat_regression
    )

    # "surpass" is checked first, so it wins whenever its own conditions hold.
    if surpass_passed:
        status = "surpass"
    elif parity_passed:
        status = "parity"
    else:
        status = "regressed"

    reasons: list[str] = []
    if total_delta >= 1.5:
        reasons.append("总分已显著超过基线。")
    elif total_delta >= -1.0:
        reasons.append("总分已基本追平基线。")
    else:
        reasons.append("总分仍明显落后基线。")
    if has_repeat_regression:
        reasons.append("回放侧仍出现重复盘问或忽略已知信息的风险。")

    # Separate the listed labels explicitly; joining on "" fused them into a
    # single unreadable run of characters. At most four labels are listed.
    label_sep = ";"
    if key_regressions:
        reasons.append(f"关键回落维度:{label_sep.join(key_regressions[:4])}")
    if key_gains:
        reasons.append(f"关键提升维度:{label_sep.join(key_gains[:4])}")
    if (
        truncation["baseline_truncated_for_compare"]
        or truncation["replay_truncated_for_compare"]
    ):
        reasons.append("A/B 对比稿使用了截断 transcript;长对话结论需结合逐轮评分复核。")

    return {
        "fixture_filename": fixture_filename,
        "mode": "ab",
        "baseline_total": _round(baseline_judge.total_score),
        "replay_total": _round(replay_judge.total_score),
        "total_delta": total_delta,
        "group_deltas": group_deltas,
        "leaf_deltas": leaf_deltas,
        "key_regressions": key_regressions,
        "key_gains": key_gains,
        "repeat_issue_detected": has_repeat_regression,
        "truncation": truncation,
        "gate": {
            "status": status,
            "parity_passed": parity_passed,
            "surpass_passed": surpass_passed,
            "reasons": reasons,
            "golden_set_note": "建议在固定黄金样本集上复跑该口径,再决定是否发布。",
        },
    }