Files
life-echo/api/tests/test_judge_service.py
Kevin 309a051038 feat: 回忆录证据血缘与内部评测可追溯,顺带对齐本地评测台与 CI
数据库与模型:新增多版迁移(章节证据快照、对话血缘、记忆事实/时间线 lineage 等),把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路:会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照;新增章节证据快照与评测侧 EvalTraceService 等模块,方便组评审用的证据包。
内部评测:自动化 run 与手工 memoir 评审共用可追溯证据;rubric/judge 相关脚本与文档有配套调整。
app-eval-web:Memoir/实验详情里能展开看证据摘要与 evidence_trace(含对话轮次 id);Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致,避免改端口后页面连错服务。
工程杂项:GitHub Actions / 仓库说明有更新;各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾;新增/扩充了?
2026-04-08 15:37:09 +08:00

119 lines
4.0 KiB
Python

"""评审服务:保留真实失败原因,便于 internal eval 页面排障。"""
import pytest
from app.core.config import settings
from app.core.llm_call import LLMCallError
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
from app.features.evaluation.judge_service import (
EvalJudgeService,
_build_memoir_judge_prompt,
eval_judge_conversation_transcript_max_chars,
eval_judge_compare_transcript_each_max_chars,
)
def _conversation_payload() -> dict:
return {
"emotion_carry": 10,
"empathy_depth": 8,
"emotion_safety": 6,
"emotion_guidance": 6,
"fact_mining": 8,
"info_completeness_guide": 8,
"info_depth_mining": 9,
"persona_understanding": 7,
"persona_consistency_verify": 4,
"persona_expression_guide": 4,
"interview_structure": 6,
"context_memory": 5,
"rhythm_control": 4,
"question_quality": 7,
"follow_up_depth": 5,
"non_leading": 3,
"total_score": 100.0,
"rationale": "整体表现稳定。",
}
@pytest.mark.asyncio
async def test_judge_conversation_result_preserves_validation_error(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A failing LLM call must yield no output and a descriptive error.

    ``allm_json_call`` is replaced with a stub that always raises
    ``LLMCallError``; the result is expected to carry both the service's
    validation-failure label and the original error detail.
    """

    async def _raise_validation_error(*args, **kwargs):
        # Stand-in for the real LLM call: fails unconditionally.
        raise LLMCallError(
            "validation",
            "pydantic validation failed: total_score mismatch",
        )

    monkeypatch.setattr(
        "app.features.evaluation.judge_service.allm_json_call",
        _raise_validation_error,
    )
    transcript = "[Turn 1]\n用户: hi\nAI: hello"

    # The constructor dependency is never exercised here, so a bare object suffices.
    judge = EvalJudgeService(object())
    verdict = await judge.judge_conversation_result(full_transcript=transcript)

    assert verdict.output is None
    assert verdict.error is not None
    # Both the service-side label and the underlying cause must be kept.
    assert "结果校验失败" in verdict.error
    assert "total_score mismatch" in verdict.error
@pytest.mark.asyncio
async def test_judge_conversation_wrapper_keeps_legacy_shape(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``judge_conversation`` hands back the value produced by the LLM call.

    ``allm_json_call`` is stubbed to return a pre-validated
    ``ConversationJudgeOutput``; the wrapper's return value must compare
    equal to it.
    """
    canned_output = ConversationJudgeOutput.model_validate(_conversation_payload())

    async def _return_canned(*args, **kwargs):
        # Stand-in for the real LLM call: always succeeds with the canned model.
        return canned_output

    monkeypatch.setattr(
        "app.features.evaluation.judge_service.allm_json_call",
        _return_canned,
    )
    transcript = "[Turn 1]\n用户: hi\nAI: hello"

    judge = EvalJudgeService(object())
    actual = await judge.judge_conversation(full_transcript=transcript)

    assert actual == canned_output
def test_eval_judge_transcript_budget_exceeds_legacy_8192(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With a 200k-token window and the cap set to 0, budgets are large.

    Checks that the single-transcript budget is above 90,000 characters and
    the per-transcript compare budget is above 40,000 characters.
    """
    # NOTE(review): a cap of 0 presumably means "no explicit cap" - confirm
    # against the settings definition.
    overrides = {
        "eval_judge_max_transcript_chars": 0,
        "eval_judge_context_window_tokens": 200_000,
    }
    for attr_name, attr_value in overrides.items():
        monkeypatch.setattr(settings, attr_name, attr_value)

    single_budget = eval_judge_conversation_transcript_max_chars()
    assert single_budget > 90_000

    per_side_budget = eval_judge_compare_transcript_each_max_chars()
    assert per_side_budget > 40_000
def test_eval_judge_transcript_budget_respects_explicit_cap(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An explicit ``eval_judge_max_transcript_chars`` is returned unchanged."""
    explicit_cap = 12_000
    monkeypatch.setattr(settings, "eval_judge_max_transcript_chars", explicit_cap)

    assert eval_judge_conversation_transcript_max_chars() == explicit_cap
def test_build_memoir_prompt_includes_source_and_reference_evidence() -> None:
    """The memoir judge prompt contains every evidence section header.

    Builds a prompt with a source transcript, a reference memoir and
    evidence notes, then checks for each expected section marker and for
    the transcript text itself.
    """
    transcript = "用户: 我后来去了深圳工作。"
    prompt = _build_memoir_judge_prompt(
        memoir_markdown="# 当前正文\n他后来去了南方。",
        source_transcript=transcript,
        reference_memoir_markdown="# 导出基线\n他去了深圳。",
        evidence_notes="必须严格核对真实性。",
    )

    # Checked in order, so the first missing fragment is the one reported.
    expected_fragments = (
        "【评审说明】",
        "【原始访谈/对话证据】",
        transcript,
        "【结构化记忆证据】",
        "【参考基线/导出成稿】",
        "【当前回忆录正文】",
    )
    for fragment in expected_fragments:
        assert fragment in prompt
def test_build_memoir_prompt_requires_conservative_scoring_without_evidence() -> None:
    """With only the memoir text supplied, the prompt asks for cautious scoring.

    The prompt must state that no local conversation evidence is available,
    demand conservative scoring, and still include the structured-memory
    evidence section header.
    """
    prompt = _build_memoir_judge_prompt(
        memoir_markdown="# 当前正文\n他后来去了南方。",
    )

    # Checked in order, so the first missing fragment is the one reported.
    expected_fragments = (
        "无可用局部对话证据",
        "必须保守打分",
        "【结构化记忆证据】",
    )
    for fragment in expected_fragments:
        assert fragment in prompt