Files
life-echo/api/app/features/evaluation/conversation_compare_summary.py
yangshilin e1341c6d18 feat:
1. 建立问题库大纲,对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性
2026-04-09 15:32:35 +08:00

232 lines
8.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Structured A/B compare summary for internal eval conversation judging."""
from __future__ import annotations
from typing import Any
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
from app.features.evaluation.judge_service import trim_compare_transcript_pair
# (attribute name, Chinese display label) pairs for the group-level scores.
# Each attribute name is read from both judge outputs via getattr() when the
# "group_deltas" section of the A/B summary is built.
_GROUP_KEYS: tuple[tuple[str, str], ...] = (
("emotion_score", "情绪与陪伴"),
("information_score", "信息挖掘"),
("persona_score", "人物建模"),
("structure_score", "结构引导"),
("question_score", "提问质量"),
)
# (attribute name, Chinese display label) pairs for the fine-grained scores.
# These feed "leaf_deltas"; several of them (context_memory, emotion_carry,
# persona_understanding, rhythm_control) are also looked up by key in the
# parity/surpass gate, so renaming a key here affects the gate logic.
_LEAF_KEYS: tuple[tuple[str, str], ...] = (
("emotion_carry", "情绪承接"),
("context_memory", "上下文记忆"),
("rhythm_control", "节奏控制"),
("persona_understanding", "人物理解"),
("follow_up_depth", "追问深度"),
("non_leading", "非引导性"),
)
# Substrings searched for inside a judge's major-issue texts; any hit marks the
# judged conversation as having a "repeat questioning" issue.
_REPEAT_ISSUE_MARKERS = ("重复盘问", "重复询问", "已答", "忽略上文", "同义重问")
def _round(x: float) -> float:
return round(float(x), 2)
def _issues_text(judge: ConversationJudgeOutput | None) -> list[str]:
if judge is None:
return []
return [str(x).strip() for x in judge.major_issues if str(x).strip()]
def _has_repeat_issue(judge: ConversationJudgeOutput | None) -> bool:
    """Tell whether any major issue text contains a repeat-questioning marker.

    Returns ``True`` as soon as one cleaned issue string contains one of the
    substrings in ``_REPEAT_ISSUE_MARKERS``; ``False`` when none match or when
    there are no issues at all (including ``judge is None``).
    """
    for issue in _issues_text(judge):
        if any(marker in issue for marker in _REPEAT_ISSUE_MARKERS):
            return True
    return False
def _evidence_quality(truncation: dict[str, Any]) -> dict[str, Any]:
"""结构化说明「分数/对比文在多大程度上覆盖全量对话」,便于客观解读。"""
b_h = not truncation["baseline_truncated_for_conversation"]
r_h = not truncation["replay_truncated_for_conversation"]
pair_full = not (
truncation["baseline_truncated_for_compare"]
or truncation["replay_truncated_for_compare"]
)
if b_h and r_h and pair_full:
scope = "full"
note = "评分与 A/B 对比均基于当前注入的全文(在模型上下文内未再裁对话正文)。"
else:
scope = "partial"
note = (
"存在整段或对比环节截断:分数与流式结论仅反映已提交片段;"
"评审侧已注入截断边界说明,长程细项应保守。发布决策请结合逐轮分、人工抽查或更高上下文预算。"
)
return {
"scope": scope,
"baseline_holistic_covers_full_text": b_h,
"replay_holistic_covers_full_text": r_h,
"ab_compare_covers_full_transcripts": pair_full,
"note_zh": note,
}
def _score_deltas(
    baseline_judge: ConversationJudgeOutput,
    replay_judge: ConversationJudgeOutput,
    keys: tuple[tuple[str, str], ...],
) -> dict[str, dict[str, Any]]:
    """Build ``{key: {label, baseline, replay, delta}}`` for each score key."""
    deltas: dict[str, dict[str, Any]] = {}
    for key, label in keys:
        baseline_value = getattr(baseline_judge, key)
        replay_value = getattr(replay_judge, key)
        deltas[key] = {
            "label": label,
            "baseline": _round(baseline_value),
            "replay": _round(replay_value),
            # Delta is replay minus baseline: positive means the replay improved.
            "delta": _round(replay_value - baseline_value),
        }
    return deltas


def build_conversation_compare_summary(
    *,
    baseline_judge: ConversationJudgeOutput | None,
    replay_judge: ConversationJudgeOutput | None,
    baseline_transcript: str,
    replay_transcript: str,
    conv_cap: int,
    compare_cap_total: int,
    compare_per_side_cap: int | None = None,
    fixture_filename: str | None = None,
) -> dict[str, Any]:
    """Build the structured baseline-vs-replay (A/B) comparison summary.

    Args:
        baseline_judge: Holistic judge output for the baseline ("A") side, or
            ``None`` when unavailable.
        replay_judge: Holistic judge output for the replay side, or ``None``
            when unavailable.
        baseline_transcript: Baseline transcript text; only its stripped length
            and the truncation flags derived from it are used here.
        replay_transcript: Replay transcript text, used the same way.
        conv_cap: Character budget; a stripped transcript longer than this is
            flagged as truncated for the whole-conversation scoring.
        compare_cap_total: Total character budget passed to
            ``trim_compare_transcript_pair``.
        compare_per_side_cap: Optional per-side budget passed to
            ``trim_compare_transcript_pair``; when positive it is also reported
            as the per-side hint, otherwise half of the total budget is.
        fixture_filename: Echoed back unchanged in the summary.

    Returns:
        A dict that always carries ``fixture_filename``, ``mode``,
        ``truncation``, ``evidence_quality`` and ``gate``.

        * No replay judge: ``mode="single"``, gate ``insufficient_data``.
        * Replay judge only: ``mode="single"``, gate ``single_side_only``,
          plus ``replay_total``.
        * Both judges: ``mode="ab"`` with totals, group/leaf deltas, key
          regressions/gains, and a gate whose status is ``surpass``,
          ``parity`` or ``regressed``.
    """
    compare_cap_chars = int(compare_cap_total)
    # Only the two truncation flags are needed here; the first two elements of
    # the returned tuple (presumably the trimmed transcripts) are discarded.
    _, _, baseline_cmp_trunc, replay_cmp_trunc = trim_compare_transcript_pair(
        baseline_transcript,
        replay_transcript,
        total_max_chars=compare_cap_chars,
        per_side_max_chars=compare_per_side_cap,
    )

    if compare_per_side_cap and compare_per_side_cap > 0:
        each_hint = int(compare_per_side_cap)
    else:
        each_hint = max(1, compare_cap_chars // 2)

    conv_cap_chars = int(conv_cap)
    baseline_chars = len((baseline_transcript or "").strip())
    replay_chars = len((replay_transcript or "").strip())
    baseline_conv_trunc = baseline_chars > conv_cap_chars
    replay_conv_trunc = replay_chars > conv_cap_chars

    truncation = {
        "baseline_chars": baseline_chars,
        "replay_chars": replay_chars,
        "conversation_cap_chars": conv_cap_chars,
        "compare_cap_total_chars": compare_cap_chars,
        "compare_cap_each_chars": each_hint,
        "baseline_truncated_for_conversation": baseline_conv_trunc,
        "replay_truncated_for_conversation": replay_conv_trunc,
        "baseline_truncated_for_compare": baseline_cmp_trunc,
        "replay_truncated_for_compare": replay_cmp_trunc,
    }
    evidence_quality = _evidence_quality(truncation)

    # Without a replay score there is nothing to compare at all.
    if not replay_judge:
        return {
            "fixture_filename": fixture_filename,
            "mode": "single",
            "truncation": truncation,
            "evidence_quality": evidence_quality,
            "gate": {
                "status": "insufficient_data",
                "reasons": ["缺少回放整体评分,无法判断是否追平或超过 A。"],
            },
        }

    # Replay score only: report it, but no A/B verdict is possible.
    if not baseline_judge:
        return {
            "fixture_filename": fixture_filename,
            "mode": "single",
            "replay_total": _round(replay_judge.total_score),
            "truncation": truncation,
            "evidence_quality": evidence_quality,
            "gate": {
                "status": "single_side_only",
                "reasons": [
                    "当前只有新对话单侧评分,可用于优化,但不能判定是否超过 A。"
                ],
            },
        }

    group_deltas = _score_deltas(baseline_judge, replay_judge, _GROUP_KEYS)
    leaf_deltas = _score_deltas(baseline_judge, replay_judge, _LEAF_KEYS)

    # A leaf dimension counts as a key regression/gain when it moved by at
    # least 0.75 (after rounding) in the respective direction.
    key_regressions = [
        entry["label"] for entry in leaf_deltas.values() if entry["delta"] <= -0.75
    ]
    key_gains = [
        entry["label"] for entry in leaf_deltas.values() if entry["delta"] >= 0.75
    ]

    total_delta = _round(replay_judge.total_score - baseline_judge.total_score)
    has_repeat_regression = _has_repeat_issue(replay_judge)

    context_memory_delta = leaf_deltas["context_memory"]["delta"]
    parity_passed = (
        total_delta >= -1.0
        and context_memory_delta >= -0.5
        and leaf_deltas["emotion_carry"]["delta"] >= -0.5
        and not has_repeat_regression
    )
    surpass_passed = (
        total_delta >= 1.5
        and context_memory_delta >= 0
        and leaf_deltas["persona_understanding"]["delta"] >= 0
        and leaf_deltas["rhythm_control"]["delta"] >= -0.25
        and not has_repeat_regression
    )

    # "surpass" takes precedence over "parity" when both gates pass.
    if surpass_passed:
        status = "surpass"
    elif parity_passed:
        status = "parity"
    else:
        status = "regressed"

    reasons: list[str] = []
    if total_delta >= 1.5:
        reasons.append("总分已显著超过基线。")
    elif total_delta >= -1.0:
        reasons.append("总分已基本追平基线。")
    else:
        reasons.append("总分仍明显落后基线。")
    if has_repeat_regression:
        reasons.append("回放侧仍出现重复盘问或忽略已知信息的风险。")
    # At most four labels are listed, separated by the enumeration comma so the
    # individual dimension names stay distinguishable in the message.
    if key_regressions:
        regressed_labels = "、".join(key_regressions[:4])
        reasons.append(f"关键回落维度:{regressed_labels}")
    if key_gains:
        gained_labels = "、".join(key_gains[:4])
        reasons.append(f"关键提升维度:{gained_labels}")
    if baseline_cmp_trunc or replay_cmp_trunc:
        reasons.append(
            "A/B 对比稿使用了截断 transcript长对话结论需结合逐轮评分复核。"
        )
    if baseline_conv_trunc or replay_conv_trunc:
        reasons.append(
            "整段评分可能仅见 transcript 前缀;长程维度已在评审边界下保守处理,请结合逐轮分或全文重跑交叉验证。"
        )

    return {
        "fixture_filename": fixture_filename,
        "mode": "ab",
        "baseline_total": _round(baseline_judge.total_score),
        "replay_total": _round(replay_judge.total_score),
        "total_delta": total_delta,
        "group_deltas": group_deltas,
        "leaf_deltas": leaf_deltas,
        "key_regressions": key_regressions,
        "key_gains": key_gains,
        "repeat_issue_detected": has_repeat_regression,
        "truncation": truncation,
        "evidence_quality": evidence_quality,
        "gate": {
            "status": status,
            "parity_passed": parity_passed,
            "surpass_passed": surpass_passed,
            "reasons": reasons,
            "golden_set_note": "建议在固定黄金样本集上复跑该口径,再决定是否发布。",
        },
    }