refactor(eval+memoir):精简内部评测路由与服务,composite/对话摘要与 judge 能力补强

- 访谈:新增 interview_state_hints,联动 orchestrator 与提示词
- 回忆录:story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建:开发用 celery broker、compose/development 脚本、依赖注入
- eval-web:移除数据集/实验/版本等页面与流式轮询,突出 Playground
- 文档与单测同步
This commit is contained in:
Kevin
2026-04-08 21:36:12 +08:00
parent 2a0c80987d
commit 064ad2161d
64 changed files with 3412 additions and 3068 deletions

View File

@@ -0,0 +1,176 @@
"""Structured A/B compare summary for internal eval conversation judging."""
from __future__ import annotations
from typing import Any
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
# (attribute name, display label) pairs for the group-level scores.
# build_conversation_compare_summary reads each attribute name from both judge
# outputs with getattr() and reports the label alongside the score delta.
_GROUP_KEYS: tuple[tuple[str, str], ...] = (
    ("emotion_score", "情绪与陪伴"),
    ("information_score", "信息挖掘"),
    ("persona_score", "人物建模"),
    ("structure_score", "结构引导"),
    ("question_score", "提问质量"),
)

# (attribute name, display label) pairs for the leaf-level scores. These are
# the dimensions that feed the key-regression / key-gain lists and the
# parity / surpass gate checks.
_LEAF_KEYS: tuple[tuple[str, str], ...] = (
    ("emotion_carry", "情绪承接"),
    ("context_memory", "上下文记忆"),
    ("rhythm_control", "节奏控制"),
    ("persona_understanding", "人物理解"),
    ("follow_up_depth", "追问深度"),
    ("non_leading", "非引导性"),
)

# Substrings searched for inside a judge's major-issue texts; any hit marks
# the conversation as having a repeated-questioning problem.
_REPEAT_ISSUE_MARKERS = (
    "重复盘问",
    "重复询问",
    "已答",
    "忽略上文",
    "同义重问",
)
def _round(x: float) -> float:
    """Coerce *x* to ``float`` and return it rounded to two decimal places."""
    as_float = float(x)
    return round(as_float, 2)
def _issues_text(judge: ConversationJudgeOutput | None) -> list[str]:
    """Return the judge's major issues as stripped, non-empty strings.

    Each entry of ``judge.major_issues`` is converted with ``str()`` and
    whitespace-stripped; entries that end up empty are dropped. A ``None``
    judge yields an empty list.
    """
    if judge is None:
        return []
    cleaned: list[str] = []
    for raw_issue in judge.major_issues:
        text = str(raw_issue).strip()
        if text:
            cleaned.append(text)
    return cleaned
def _has_repeat_issue(judge: ConversationJudgeOutput | None) -> bool:
    """Tell whether any major issue of *judge* mentions a repeat-question marker.

    Returns ``True`` as soon as one of ``_REPEAT_ISSUE_MARKERS`` occurs as a
    substring of one of the cleaned issue texts, and ``False`` when none does
    (including when *judge* is ``None`` or has no issues).
    """
    for issue in _issues_text(judge):
        for marker in _REPEAT_ISSUE_MARKERS:
            if marker in issue:
                return True
    return False
def _transcript_truncation(
    baseline_transcript: str,
    replay_transcript: str,
    conv_cap: int,
    compare_cap_each: int,
) -> dict[str, Any]:
    """Report stripped transcript lengths and whether each exceeds the caps.

    Nothing is truncated here; the result only records the lengths, the two
    caps, and the four "length > cap" comparisons.
    """
    # ``or ""`` makes a missing (None / empty) transcript count as 0 characters.
    baseline_chars = len((baseline_transcript or "").strip())
    replay_chars = len((replay_transcript or "").strip())
    conversation_cap = int(conv_cap)
    compare_cap = int(compare_cap_each)
    return {
        "baseline_chars": baseline_chars,
        "replay_chars": replay_chars,
        "conversation_cap_chars": conversation_cap,
        "compare_cap_each_chars": compare_cap,
        "baseline_truncated_for_conversation": baseline_chars > conversation_cap,
        "replay_truncated_for_conversation": replay_chars > conversation_cap,
        "baseline_truncated_for_compare": baseline_chars > compare_cap,
        "replay_truncated_for_compare": replay_chars > compare_cap,
    }


def _score_deltas(
    keys: tuple[tuple[str, str], ...],
    baseline_judge: ConversationJudgeOutput,
    replay_judge: ConversationJudgeOutput,
) -> dict[str, dict[str, Any]]:
    """Build ``{key: {label, baseline, replay, delta}}`` for each scored key."""
    deltas: dict[str, dict[str, Any]] = {}
    for key, label in keys:
        baseline_value = getattr(baseline_judge, key)
        replay_value = getattr(replay_judge, key)
        deltas[key] = {
            "label": label,
            "baseline": _round(baseline_value),
            "replay": _round(replay_value),
            # The delta is taken on the raw values and rounded afterwards.
            "delta": _round(replay_value - baseline_value),
        }
    return deltas


def build_conversation_compare_summary(
    *,
    baseline_judge: ConversationJudgeOutput | None,
    replay_judge: ConversationJudgeOutput | None,
    baseline_transcript: str,
    replay_transcript: str,
    conv_cap: int,
    compare_cap_each: int,
    fixture_filename: str | None = None,
) -> dict[str, Any]:
    """Summarise a baseline (A) vs. replay (B) conversation judging as a dict.

    Args:
        baseline_judge: Judge output for the baseline conversation, or ``None``.
        replay_judge: Judge output for the replayed conversation, or ``None``.
        baseline_transcript: Baseline transcript text; only its stripped
            length is used.
        replay_transcript: Replay transcript text; only its stripped length
            is used.
        conv_cap: Character cap used for the ``*_truncated_for_conversation``
            flags.
        compare_cap_each: Per-side character cap used for the
            ``*_truncated_for_compare`` flags.
        fixture_filename: Optional name echoed back in the result.

    Returns:
        A dict that always contains ``fixture_filename``, ``mode``,
        ``truncation`` and ``gate``:

        * ``replay_judge`` missing: ``mode="single"`` with gate status
          ``"insufficient_data"``.
        * only ``baseline_judge`` missing: ``mode="single"`` with
          ``replay_total`` and gate status ``"single_side_only"``.
        * both present: ``mode="ab"`` with totals, per-group and per-leaf
          deltas, key regressions / gains, the repeat-issue flag and a gate
          whose status is ``"surpass"``, ``"parity"`` or ``"regressed"``.

    Raises:
        AttributeError: If a judge object lacks one of the score attributes
            listed in ``_GROUP_KEYS`` / ``_LEAF_KEYS`` or ``total_score``.
    """
    truncation = _transcript_truncation(
        baseline_transcript, replay_transcript, conv_cap, compare_cap_each
    )

    if replay_judge is None:
        return {
            "fixture_filename": fixture_filename,
            "mode": "single",
            "truncation": truncation,
            "gate": {
                "status": "insufficient_data",
                "reasons": ["缺少回放整体评分,无法判断是否追平或超过 A。"],
            },
        }

    if baseline_judge is None:
        return {
            "fixture_filename": fixture_filename,
            "mode": "single",
            "replay_total": _round(replay_judge.total_score),
            "truncation": truncation,
            "gate": {
                "status": "single_side_only",
                "reasons": ["当前只有新对话单侧评分,可用于优化,但不能判定是否超过 A。"],
            },
        }

    group_deltas = _score_deltas(_GROUP_KEYS, baseline_judge, replay_judge)
    leaf_deltas = _score_deltas(_LEAF_KEYS, baseline_judge, replay_judge)

    # A leaf dimension counts as a key regression / gain once it moves by at
    # least 0.75 in either direction.
    key_regressions = [v["label"] for v in leaf_deltas.values() if v["delta"] <= -0.75]
    key_gains = [v["label"] for v in leaf_deltas.values() if v["delta"] >= 0.75]

    total_delta = _round(replay_judge.total_score - baseline_judge.total_score)
    # Only the replay side is inspected for repeat-question markers.
    has_repeat_regression = _has_repeat_issue(replay_judge)

    context_memory_delta = leaf_deltas["context_memory"]["delta"]
    # Parity: total may trail by at most 1.0, the two guarded leaves by at
    # most 0.5, and the replay must not show a repeat-question issue.
    parity_passed = (
        total_delta >= -1.0
        and context_memory_delta >= -0.5
        and leaf_deltas["emotion_carry"]["delta"] >= -0.5
        and not has_repeat_regression
    )
    # Surpass: total must lead by at least 1.5 with no loss on context memory
    # or persona understanding and at most a 0.25 loss on rhythm control.
    surpass_passed = (
        total_delta >= 1.5
        and context_memory_delta >= 0
        and leaf_deltas["persona_understanding"]["delta"] >= 0
        and leaf_deltas["rhythm_control"]["delta"] >= -0.25
        and not has_repeat_regression
    )

    if surpass_passed:
        status = "surpass"
    elif parity_passed:
        status = "parity"
    else:
        status = "regressed"

    reasons: list[str] = []
    if total_delta >= 1.5:
        reasons.append("总分已显著超过基线。")
    elif total_delta >= -1.0:
        reasons.append("总分已基本追平基线。")
    else:
        reasons.append("总分仍明显落后基线。")
    if has_repeat_regression:
        reasons.append("回放侧仍出现重复盘问或忽略已知信息的风险。")
    # At most four labels are listed; they are separated with "、" so that
    # adjacent dimension names do not fuse into one unreadable run.
    if key_regressions:
        reasons.append(f"关键回落维度:{'、'.join(key_regressions[:4])}")
    if key_gains:
        reasons.append(f"关键提升维度:{'、'.join(key_gains[:4])}")
    if (
        truncation["baseline_truncated_for_compare"]
        or truncation["replay_truncated_for_compare"]
    ):
        reasons.append("A/B 对比稿使用了截断 transcript;长对话结论需结合逐轮评分复核。")

    return {
        "fixture_filename": fixture_filename,
        "mode": "ab",
        "baseline_total": _round(baseline_judge.total_score),
        "replay_total": _round(replay_judge.total_score),
        "total_delta": total_delta,
        "group_deltas": group_deltas,
        "leaf_deltas": leaf_deltas,
        "key_regressions": key_regressions,
        "key_gains": key_gains,
        "repeat_issue_detected": has_repeat_regression,
        "truncation": truncation,
        "gate": {
            "status": status,
            "parity_passed": parity_passed,
            "surpass_passed": surpass_passed,
            "reasons": reasons,
            "golden_set_note": "建议在固定黄金样本集上复跑该口径,再决定是否发布。",
        },
    }