"""Structured A/B compare summary for internal eval conversation judging.""" from __future__ import annotations from typing import Any from app.features.evaluation.judge_schemas import ConversationJudgeOutput from app.features.evaluation.judge_service import trim_compare_transcript_pair _GROUP_KEYS: tuple[tuple[str, str], ...] = ( ("emotion_score", "情绪与陪伴"), ("information_score", "信息挖掘"), ("persona_score", "人物建模"), ("structure_score", "结构引导"), ("question_score", "提问质量"), ) _LEAF_KEYS: tuple[tuple[str, str], ...] = ( ("emotion_carry", "情绪承接"), ("context_memory", "上下文记忆"), ("rhythm_control", "节奏控制"), ("persona_understanding", "人物理解"), ("follow_up_depth", "追问深度"), ("non_leading", "非引导性"), ) _REPEAT_ISSUE_MARKERS = ("重复盘问", "重复询问", "已答", "忽略上文", "同义重问") def _round(x: float) -> float: return round(float(x), 2) def _issues_text(judge: ConversationJudgeOutput | None) -> list[str]: if judge is None: return [] return [str(x).strip() for x in judge.major_issues if str(x).strip()] def _has_repeat_issue(judge: ConversationJudgeOutput | None) -> bool: return any( marker in issue for issue in _issues_text(judge) for marker in _REPEAT_ISSUE_MARKERS ) def _evidence_quality(truncation: dict[str, Any]) -> dict[str, Any]: """结构化说明「分数/对比文在多大程度上覆盖全量对话」,便于客观解读。""" b_h = not truncation["baseline_truncated_for_conversation"] r_h = not truncation["replay_truncated_for_conversation"] pair_full = not ( truncation["baseline_truncated_for_compare"] or truncation["replay_truncated_for_compare"] ) if b_h and r_h and pair_full: scope = "full" note = "评分与 A/B 对比均基于当前注入的全文(在模型上下文内未再裁对话正文)。" else: scope = "partial" note = ( "存在整段或对比环节截断:分数与流式结论仅反映已提交片段;" "评审侧已注入截断边界说明,长程细项应保守。发布决策请结合逐轮分、人工抽查或更高上下文预算。" ) return { "scope": scope, "baseline_holistic_covers_full_text": b_h, "replay_holistic_covers_full_text": r_h, "ab_compare_covers_full_transcripts": pair_full, "note_zh": note, } def build_conversation_compare_summary( *, baseline_judge: ConversationJudgeOutput | None, replay_judge: ConversationJudgeOutput | None, baseline_transcript: str, replay_transcript: str, conv_cap: int, compare_cap_total: int, compare_per_side_cap: int | None = None, fixture_filename: str | None = None, ) -> dict[str, Any]: _, _, baseline_cmp_trunc, replay_cmp_trunc = trim_compare_transcript_pair( baseline_transcript, replay_transcript, total_max_chars=int(compare_cap_total), per_side_max_chars=compare_per_side_cap, ) if compare_per_side_cap and compare_per_side_cap > 0: each_hint = int(compare_per_side_cap) else: each_hint = max(1, int(compare_cap_total) // 2) truncation = { "baseline_chars": len((baseline_transcript or "").strip()), "replay_chars": len((replay_transcript or "").strip()), "conversation_cap_chars": int(conv_cap), "compare_cap_total_chars": int(compare_cap_total), "compare_cap_each_chars": each_hint, "baseline_truncated_for_conversation": len((baseline_transcript or "").strip()) > int(conv_cap), "replay_truncated_for_conversation": len((replay_transcript or "").strip()) > int(conv_cap), "baseline_truncated_for_compare": baseline_cmp_trunc, "replay_truncated_for_compare": replay_cmp_trunc, } evidence_quality = _evidence_quality(truncation) if not replay_judge: return { "fixture_filename": fixture_filename, "mode": "single", "truncation": truncation, "evidence_quality": evidence_quality, "gate": { "status": "insufficient_data", "reasons": ["缺少回放整体评分,无法判断是否追平或超过 A。"], }, } if not baseline_judge: return { "fixture_filename": fixture_filename, "mode": "single", "replay_total": _round(replay_judge.total_score), "truncation": truncation, "evidence_quality": evidence_quality, "gate": { "status": "single_side_only", "reasons": [ "当前只有新对话单侧评分,可用于优化,但不能判定是否超过 A。" ], }, } group_deltas = { key: { "label": label, "baseline": _round(getattr(baseline_judge, key)), "replay": _round(getattr(replay_judge, key)), "delta": _round(getattr(replay_judge, key) - getattr(baseline_judge, key)), } for key, label in _GROUP_KEYS } leaf_deltas = { key: { "label": label, "baseline": _round(getattr(baseline_judge, key)), "replay": _round(getattr(replay_judge, key)), "delta": _round(getattr(replay_judge, key) - getattr(baseline_judge, key)), } for key, label in _LEAF_KEYS } key_regressions = [ v["label"] for v in leaf_deltas.values() if float(v["delta"]) <= -0.75 ] key_gains = [v["label"] for v in leaf_deltas.values() if float(v["delta"]) >= 0.75] total_delta = _round(replay_judge.total_score - baseline_judge.total_score) has_repeat_regression = _has_repeat_issue(replay_judge) parity_passed = ( total_delta >= -1.0 and float(leaf_deltas["context_memory"]["delta"]) >= -0.5 and float(leaf_deltas["emotion_carry"]["delta"]) >= -0.5 and not has_repeat_regression ) surpass_passed = ( total_delta >= 1.5 and float(leaf_deltas["context_memory"]["delta"]) >= 0 and float(leaf_deltas["persona_understanding"]["delta"]) >= 0 and float(leaf_deltas["rhythm_control"]["delta"]) >= -0.25 and not has_repeat_regression ) if surpass_passed: status = "surpass" elif parity_passed: status = "parity" else: status = "regressed" reasons: list[str] = [] if total_delta >= 1.5: reasons.append("总分已显著超过基线。") elif total_delta >= -1.0: reasons.append("总分已基本追平基线。") else: reasons.append("总分仍明显落后基线。") if has_repeat_regression: reasons.append("回放侧仍出现重复盘问或忽略已知信息的风险。") if key_regressions: reasons.append(f"关键回落维度:{'、'.join(key_regressions[:4])}。") if key_gains: reasons.append(f"关键提升维度:{'、'.join(key_gains[:4])}。") if ( truncation["baseline_truncated_for_compare"] or truncation["replay_truncated_for_compare"] ): reasons.append( "A/B 对比稿使用了截断 transcript,长对话结论需结合逐轮评分复核。" ) if ( truncation["baseline_truncated_for_conversation"] or truncation["replay_truncated_for_conversation"] ): reasons.append( "整段评分可能仅见 transcript 前缀;长程维度已在评审边界下保守处理,请结合逐轮分或全文重跑交叉验证。" ) return { "fixture_filename": fixture_filename, "mode": "ab", "baseline_total": _round(baseline_judge.total_score), "replay_total": _round(replay_judge.total_score), "total_delta": total_delta, "group_deltas": group_deltas, "leaf_deltas": leaf_deltas, "key_regressions": key_regressions, "key_gains": key_gains, "repeat_issue_detected": has_repeat_regression, "truncation": truncation, "evidence_quality": evidence_quality, "gate": { "status": status, "parity_passed": parity_passed, "surpass_passed": surpass_passed, "reasons": reasons, "golden_set_note": "建议在固定黄金样本集上复跑该口径,再决定是否发布。", }, }