2026-04-08 21:36:12 +08:00
|
|
|
|
"""Structured A/B compare summary for internal eval conversation judging."""
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
|
|
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
|
2026-04-09 15:32:35 +08:00
|
|
|
|
from app.features.evaluation.judge_service import trim_compare_transcript_pair
|
2026-04-08 21:36:12 +08:00
|
|
|
|
|
|
|
|
|
|
# (attribute name on the judge output, display label) pairs for the grouped
# scores. Each attribute is read from both judges via ``getattr`` when the
# A/B summary is built.
_GROUP_KEYS: tuple[tuple[str, str], ...] = (
    ("emotion_score", "情绪与陪伴"),
    ("information_score", "信息挖掘"),
    ("persona_score", "人物建模"),
    ("structure_score", "结构引导"),
    ("question_score", "提问质量"),
)

# (attribute name, display label) pairs for the fine-grained leaf scores.
# Several of these keys are referenced by name in the parity/surpass gate.
_LEAF_KEYS: tuple[tuple[str, str], ...] = (
    ("emotion_carry", "情绪承接"),
    ("context_memory", "上下文记忆"),
    ("rhythm_control", "节奏控制"),
    ("persona_understanding", "人物理解"),
    ("follow_up_depth", "追问深度"),
    ("non_leading", "非引导性"),
)

# Substrings that, when found in a judge's major-issue text, flag the
# conversation as having a "repeated questioning" problem.
_REPEAT_ISSUE_MARKERS = (
    "重复盘问",
    "重复询问",
    "已答",
    "忽略上文",
    "同义重问",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _round(x: float) -> float:
    """Coerce *x* to ``float`` and round it to two decimal places."""
    as_float = float(x)
    return round(as_float, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _issues_text(judge: ConversationJudgeOutput | None) -> list[str]:
    """Return the judge's major issues as stripped, non-empty strings.

    Each entry of ``judge.major_issues`` is converted with ``str`` and
    stripped; entries that end up empty are dropped. A missing judge
    (``None``) yields an empty list.
    """
    if judge is None:
        return []
    stripped = (str(item).strip() for item in judge.major_issues)
    return [text for text in stripped if text]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _has_repeat_issue(judge: ConversationJudgeOutput | None) -> bool:
    """Tell whether any major issue of *judge* contains a repeat-issue marker.

    The check is a plain substring match of every entry in
    ``_REPEAT_ISSUE_MARKERS`` against every cleaned issue text. A missing
    judge, or a judge without issues, yields ``False``.
    """
    for issue in _issues_text(judge):
        for marker in _REPEAT_ISSUE_MARKERS:
            if marker in issue:
                return True
    return False
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-09 15:32:35 +08:00
|
|
|
|
def _evidence_quality(truncation: dict[str, Any]) -> dict[str, Any]:
    """Describe how much of the full conversations the scores and A/B text cover.

    Args:
        truncation: Mapping that must contain the four flags
            ``baseline_truncated_for_conversation``,
            ``replay_truncated_for_conversation``,
            ``baseline_truncated_for_compare`` and
            ``replay_truncated_for_compare``.

    Returns:
        A dict with ``scope`` (``"full"`` only when none of the four flags is
        set, otherwise ``"partial"``), three per-aspect coverage booleans and
        a human-readable ``note_zh`` explanation.

    Raises:
        KeyError: If one of the required flags is missing from *truncation*.
    """
    baseline_full = not truncation["baseline_truncated_for_conversation"]
    replay_full = not truncation["replay_truncated_for_conversation"]
    compare_full = (
        not truncation["baseline_truncated_for_compare"]
        and not truncation["replay_truncated_for_compare"]
    )

    anything_cut = not (baseline_full and replay_full and compare_full)
    if anything_cut:
        scope = "partial"
        note = (
            "存在整段或对比环节截断:分数与流式结论仅反映已提交片段;"
            "评审侧已注入截断边界说明,长程细项应保守。发布决策请结合逐轮分、人工抽查或更高上下文预算。"
        )
    else:
        scope = "full"
        note = "评分与 A/B 对比均基于当前注入的全文(在模型上下文内未再裁对话正文)。"

    report: dict[str, Any] = {"scope": scope}
    report["baseline_holistic_covers_full_text"] = baseline_full
    report["replay_holistic_covers_full_text"] = replay_full
    report["ab_compare_covers_full_transcripts"] = compare_full
    report["note_zh"] = note
    return report
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 21:36:12 +08:00
|
|
|
|
def build_conversation_compare_summary(
    *,
    baseline_judge: ConversationJudgeOutput | None,
    replay_judge: ConversationJudgeOutput | None,
    baseline_transcript: str,
    replay_transcript: str,
    conv_cap: int,
    compare_cap_total: int,
    compare_per_side_cap: int | None = None,
    fixture_filename: str | None = None,
) -> dict[str, Any]:
    """Build the structured baseline-vs-replay summary and its release gate.

    Args:
        baseline_judge: Whole-conversation judgement of the baseline (A) side,
            or ``None`` when it is unavailable.
        replay_judge: Whole-conversation judgement of the replay side, or
            ``None`` when it is unavailable.
        baseline_transcript: Baseline transcript text; only its stripped
            length and the compare-truncation flag are used here.
        replay_transcript: Replay transcript text, used the same way.
        conv_cap: Character cap used to decide whether a transcript counts as
            truncated for whole-conversation judging.
        compare_cap_total: Total character budget forwarded to
            ``trim_compare_transcript_pair``.
        compare_per_side_cap: Optional per-side cap forwarded to the same
            helper; when positive it is also reported as the per-side hint,
            otherwise half of the total budget (at least 1) is reported.
        fixture_filename: Optional label echoed back in the result.

    Returns:
        A dict whose ``mode`` is ``"single"`` when either judge is missing
        (gate status ``insufficient_data`` without a replay judge,
        ``single_side_only`` without a baseline judge) and ``"ab"`` otherwise.
        In ``"ab"`` mode it carries totals, per-key deltas, key regressions
        and gains, and a gate with status ``surpass`` / ``parity`` /
        ``regressed``. Every variant includes ``truncation`` and
        ``evidence_quality``.
    """
    # Only the two truncation flags are needed; the first two returned values
    # are discarded.
    trimmed = trim_compare_transcript_pair(
        baseline_transcript,
        replay_transcript,
        total_max_chars=int(compare_cap_total),
        per_side_max_chars=compare_per_side_cap,
    )
    _, _, baseline_cmp_trunc, replay_cmp_trunc = trimmed

    per_side_given = bool(compare_per_side_cap and compare_per_side_cap > 0)
    each_hint = (
        int(compare_per_side_cap)
        if per_side_given
        else max(1, int(compare_cap_total) // 2)
    )

    baseline_len = len((baseline_transcript or "").strip())
    replay_len = len((replay_transcript or "").strip())
    conv_limit = int(conv_cap)
    truncation = {
        "baseline_chars": baseline_len,
        "replay_chars": replay_len,
        "conversation_cap_chars": conv_limit,
        "compare_cap_total_chars": int(compare_cap_total),
        "compare_cap_each_chars": each_hint,
        "baseline_truncated_for_conversation": baseline_len > conv_limit,
        "replay_truncated_for_conversation": replay_len > conv_limit,
        "baseline_truncated_for_compare": baseline_cmp_trunc,
        "replay_truncated_for_compare": replay_cmp_trunc,
    }
    evidence_quality = _evidence_quality(truncation)

    # Without a replay judgement there is nothing to compare at all.
    if not replay_judge:
        return {
            "fixture_filename": fixture_filename,
            "mode": "single",
            "truncation": truncation,
            "evidence_quality": evidence_quality,
            "gate": {
                "status": "insufficient_data",
                "reasons": ["缺少回放整体评分,无法判断是否追平或超过 A。"],
            },
        }

    # A replay judgement alone can be reported but not gated against A.
    if not baseline_judge:
        return {
            "fixture_filename": fixture_filename,
            "mode": "single",
            "replay_total": _round(replay_judge.total_score),
            "truncation": truncation,
            "evidence_quality": evidence_quality,
            "gate": {
                "status": "single_side_only",
                "reasons": [
                    "当前只有新对话单侧评分,可用于优化,但不能判定是否超过 A。"
                ],
            },
        }

    def _delta_table(fields: tuple[tuple[str, str], ...]) -> dict[str, dict[str, Any]]:
        """Map each score attribute to its label, both values and the delta."""
        table: dict[str, dict[str, Any]] = {}
        for attr, label in fields:
            before = getattr(baseline_judge, attr)
            after = getattr(replay_judge, attr)
            table[attr] = {
                "label": label,
                "baseline": _round(before),
                "replay": _round(after),
                "delta": _round(after - before),
            }
        return table

    group_deltas = _delta_table(_GROUP_KEYS)
    leaf_deltas = _delta_table(_LEAF_KEYS)

    # A leaf moves into "key" territory once it shifts by 0.75 either way.
    key_regressions: list[str] = []
    key_gains: list[str] = []
    for entry in leaf_deltas.values():
        shift = float(entry["delta"])
        if shift <= -0.75:
            key_regressions.append(entry["label"])
        elif shift >= 0.75:
            key_gains.append(entry["label"])

    total_delta = _round(replay_judge.total_score - baseline_judge.total_score)
    has_repeat_regression = _has_repeat_issue(replay_judge)

    memory_shift = float(leaf_deltas["context_memory"]["delta"])
    emotion_shift = float(leaf_deltas["emotion_carry"]["delta"])
    persona_shift = float(leaf_deltas["persona_understanding"]["delta"])
    rhythm_shift = float(leaf_deltas["rhythm_control"]["delta"])

    parity_passed = (
        total_delta >= -1.0
        and memory_shift >= -0.5
        and emotion_shift >= -0.5
        and not has_repeat_regression
    )
    surpass_passed = (
        total_delta >= 1.5
        and memory_shift >= 0
        and persona_shift >= 0
        and rhythm_shift >= -0.25
        and not has_repeat_regression
    )
    # "surpass" takes precedence over "parity"; anything else is a regression.
    status = "regressed"
    if parity_passed:
        status = "parity"
    if surpass_passed:
        status = "surpass"

    if total_delta >= 1.5:
        headline = "总分已显著超过基线。"
    elif total_delta >= -1.0:
        headline = "总分已基本追平基线。"
    else:
        headline = "总分仍明显落后基线。"
    reasons: list[str] = [headline]

    if has_repeat_regression:
        reasons.append("回放侧仍出现重复盘问或忽略已知信息的风险。")
    if key_regressions:
        # At most four labels are listed to keep the reason short.
        listed_regressions = "、".join(key_regressions[:4])
        reasons.append(f"关键回落维度:{listed_regressions}。")
    if key_gains:
        listed_gains = "、".join(key_gains[:4])
        reasons.append(f"关键提升维度:{listed_gains}。")

    compare_cut = (
        truncation["baseline_truncated_for_compare"]
        or truncation["replay_truncated_for_compare"]
    )
    if compare_cut:
        reasons.append(
            "A/B 对比稿使用了截断 transcript,长对话结论需结合逐轮评分复核。"
        )
    conversation_cut = (
        truncation["baseline_truncated_for_conversation"]
        or truncation["replay_truncated_for_conversation"]
    )
    if conversation_cut:
        reasons.append(
            "整段评分可能仅见 transcript 前缀;长程维度已在评审边界下保守处理,请结合逐轮分或全文重跑交叉验证。"
        )

    gate = {
        "status": status,
        "parity_passed": parity_passed,
        "surpass_passed": surpass_passed,
        "reasons": reasons,
        "golden_set_note": "建议在固定黄金样本集上复跑该口径,再决定是否发布。",
    }
    return {
        "fixture_filename": fixture_filename,
        "mode": "ab",
        "baseline_total": _round(baseline_judge.total_score),
        "replay_total": _round(replay_judge.total_score),
        "total_delta": total_delta,
        "group_deltas": group_deltas,
        "leaf_deltas": leaf_deltas,
        "key_regressions": key_regressions,
        "key_gains": key_gains,
        "repeat_issue_detected": has_repeat_regression,
        "truncation": truncation,
        "evidence_quality": evidence_quality,
        "gate": gate,
    }
|