feat:
1. 建立问题库大纲,对应每个人生阶段槽位 2. 鼓励使用更生活化的交流语言共情与总结 3. 降低评审模型可能发生截断的概率 4. 成稿质量维度强化情感表达和上下文连贯性
This commit is contained in:
@@ -5,6 +5,7 @@ from __future__ import annotations
|
||||
from typing import Any
|
||||
|
||||
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
|
||||
from app.features.evaluation.judge_service import trim_compare_transcript_pair
|
||||
|
||||
_GROUP_KEYS: tuple[tuple[str, str], ...] = (
|
||||
("emotion_score", "情绪与陪伴"),
|
||||
@@ -44,6 +45,32 @@ def _has_repeat_issue(judge: ConversationJudgeOutput | None) -> bool:
|
||||
)
|
||||
|
||||
|
||||
def _evidence_quality(truncation: dict[str, Any]) -> dict[str, Any]:
|
||||
"""结构化说明「分数/对比文在多大程度上覆盖全量对话」,便于客观解读。"""
|
||||
b_h = not truncation["baseline_truncated_for_conversation"]
|
||||
r_h = not truncation["replay_truncated_for_conversation"]
|
||||
pair_full = not (
|
||||
truncation["baseline_truncated_for_compare"]
|
||||
or truncation["replay_truncated_for_compare"]
|
||||
)
|
||||
if b_h and r_h and pair_full:
|
||||
scope = "full"
|
||||
note = "评分与 A/B 对比均基于当前注入的全文(在模型上下文内未再裁对话正文)。"
|
||||
else:
|
||||
scope = "partial"
|
||||
note = (
|
||||
"存在整段或对比环节截断:分数与流式结论仅反映已提交片段;"
|
||||
"评审侧已注入截断边界说明,长程细项应保守。发布决策请结合逐轮分、人工抽查或更高上下文预算。"
|
||||
)
|
||||
return {
|
||||
"scope": scope,
|
||||
"baseline_holistic_covers_full_text": b_h,
|
||||
"replay_holistic_covers_full_text": r_h,
|
||||
"ab_compare_covers_full_transcripts": pair_full,
|
||||
"note_zh": note,
|
||||
}
|
||||
|
||||
|
||||
def build_conversation_compare_summary(
|
||||
*,
|
||||
baseline_judge: ConversationJudgeOutput | None,
|
||||
@@ -51,29 +78,42 @@ def build_conversation_compare_summary(
|
||||
baseline_transcript: str,
|
||||
replay_transcript: str,
|
||||
conv_cap: int,
|
||||
compare_cap_each: int,
|
||||
compare_cap_total: int,
|
||||
compare_per_side_cap: int | None = None,
|
||||
fixture_filename: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
_, _, baseline_cmp_trunc, replay_cmp_trunc = trim_compare_transcript_pair(
|
||||
baseline_transcript,
|
||||
replay_transcript,
|
||||
total_max_chars=int(compare_cap_total),
|
||||
per_side_max_chars=compare_per_side_cap,
|
||||
)
|
||||
if compare_per_side_cap and compare_per_side_cap > 0:
|
||||
each_hint = int(compare_per_side_cap)
|
||||
else:
|
||||
each_hint = max(1, int(compare_cap_total) // 2)
|
||||
|
||||
truncation = {
|
||||
"baseline_chars": len((baseline_transcript or "").strip()),
|
||||
"replay_chars": len((replay_transcript or "").strip()),
|
||||
"conversation_cap_chars": int(conv_cap),
|
||||
"compare_cap_each_chars": int(compare_cap_each),
|
||||
"compare_cap_total_chars": int(compare_cap_total),
|
||||
"compare_cap_each_chars": each_hint,
|
||||
"baseline_truncated_for_conversation": len((baseline_transcript or "").strip())
|
||||
> int(conv_cap),
|
||||
"replay_truncated_for_conversation": len((replay_transcript or "").strip())
|
||||
> int(conv_cap),
|
||||
"baseline_truncated_for_compare": len((baseline_transcript or "").strip())
|
||||
> int(compare_cap_each),
|
||||
"replay_truncated_for_compare": len((replay_transcript or "").strip())
|
||||
> int(compare_cap_each),
|
||||
"baseline_truncated_for_compare": baseline_cmp_trunc,
|
||||
"replay_truncated_for_compare": replay_cmp_trunc,
|
||||
}
|
||||
evidence_quality = _evidence_quality(truncation)
|
||||
|
||||
if not replay_judge:
|
||||
return {
|
||||
"fixture_filename": fixture_filename,
|
||||
"mode": "single",
|
||||
"truncation": truncation,
|
||||
"evidence_quality": evidence_quality,
|
||||
"gate": {
|
||||
"status": "insufficient_data",
|
||||
"reasons": ["缺少回放整体评分,无法判断是否追平或超过 A。"],
|
||||
@@ -86,9 +126,12 @@ def build_conversation_compare_summary(
|
||||
"mode": "single",
|
||||
"replay_total": _round(replay_judge.total_score),
|
||||
"truncation": truncation,
|
||||
"evidence_quality": evidence_quality,
|
||||
"gate": {
|
||||
"status": "single_side_only",
|
||||
"reasons": ["当前只有新对话单侧评分,可用于优化,但不能判定是否超过 A。"],
|
||||
"reasons": [
|
||||
"当前只有新对话单侧评分,可用于优化,但不能判定是否超过 A。"
|
||||
],
|
||||
},
|
||||
}
|
||||
|
||||
@@ -150,8 +193,20 @@ def build_conversation_compare_summary(
|
||||
reasons.append(f"关键回落维度:{'、'.join(key_regressions[:4])}。")
|
||||
if key_gains:
|
||||
reasons.append(f"关键提升维度:{'、'.join(key_gains[:4])}。")
|
||||
if truncation["baseline_truncated_for_compare"] or truncation["replay_truncated_for_compare"]:
|
||||
reasons.append("A/B 对比稿使用了截断 transcript,长对话结论需结合逐轮评分复核。")
|
||||
if (
|
||||
truncation["baseline_truncated_for_compare"]
|
||||
or truncation["replay_truncated_for_compare"]
|
||||
):
|
||||
reasons.append(
|
||||
"A/B 对比稿使用了截断 transcript,长对话结论需结合逐轮评分复核。"
|
||||
)
|
||||
if (
|
||||
truncation["baseline_truncated_for_conversation"]
|
||||
or truncation["replay_truncated_for_conversation"]
|
||||
):
|
||||
reasons.append(
|
||||
"整段评分可能仅见 transcript 前缀;长程维度已在评审边界下保守处理,请结合逐轮分或全文重跑交叉验证。"
|
||||
)
|
||||
|
||||
return {
|
||||
"fixture_filename": fixture_filename,
|
||||
@@ -165,6 +220,7 @@ def build_conversation_compare_summary(
|
||||
"key_gains": key_gains,
|
||||
"repeat_issue_detected": has_repeat_regression,
|
||||
"truncation": truncation,
|
||||
"evidence_quality": evidence_quality,
|
||||
"gate": {
|
||||
"status": status,
|
||||
"parity_passed": parity_passed,
|
||||
@@ -173,4 +229,3 @@ def build_conversation_compare_summary(
|
||||
"golden_set_note": "建议在固定黄金样本集上复跑该口径,再决定是否发布。",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user