219 lines
7.3 KiB
Python
219 lines
7.3 KiB
Python
"""评审服务:保留真实失败原因,便于 internal eval 页面排障。"""
|
|
|
|
import pytest
|
|
|
|
from app.core.config import settings
|
|
from app.core.llm_call import LLMCallError
|
|
from app.features.evaluation.conversation_compare_summary import (
|
|
build_conversation_compare_summary,
|
|
)
|
|
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
|
|
from app.features.evaluation.judge_service import (
|
|
EvalJudgeService,
|
|
_build_memoir_judge_prompt,
|
|
conversation_judge_transcript_excerpt,
|
|
eval_judge_compare_transcript_each_max_chars,
|
|
eval_judge_conversation_transcript_max_chars,
|
|
trim_compare_transcript_pair,
|
|
turn_judge_prior_excerpt,
|
|
)
|
|
|
|
|
|
def _conversation_payload() -> dict:
|
|
return {
|
|
"emotion_carry": 10,
|
|
"empathy_depth": 8,
|
|
"emotion_safety": 6,
|
|
"emotion_guidance": 6,
|
|
"fact_mining": 8,
|
|
"info_completeness_guide": 8,
|
|
"info_depth_mining": 9,
|
|
"persona_understanding": 7,
|
|
"persona_consistency_verify": 4,
|
|
"persona_expression_guide": 4,
|
|
"interview_structure": 6,
|
|
"context_memory": 5,
|
|
"rhythm_control": 4,
|
|
"question_quality": 7,
|
|
"follow_up_depth": 5,
|
|
"non_leading": 3,
|
|
"total_score": 100.0,
|
|
"rationale": "整体表现稳定。",
|
|
}
|
|
|
|
|
|
def _conversation_payload_variant(**overrides: object) -> dict:
    """Return the default conversation payload with selected fields replaced.

    Args:
        **overrides: Payload keys mapped to replacement values. Keys absent
            from the default payload are added as new entries. Values are
            annotated as ``object`` because callers in this module pass
            numbers, strings and lists (e.g. ``major_issues=[...]``).

    Returns:
        A new dict: the result of ``_conversation_payload()`` updated with
        ``overrides``.
    """
    # _conversation_payload() builds a new dict per call, so merging into a
    # fresh mapping never leaks overrides between tests.
    return {**_conversation_payload(), **overrides}
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_judge_conversation_result_preserves_validation_error(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A validation-type ``LLMCallError`` must be reported in ``result.error``.

    ``allm_json_call`` inside the judge service module is replaced with a stub
    that always raises. The result is expected to carry no output, and an error
    string holding both the "结果校验失败" marker and the original failure detail.
    """

    async def _raise_validation_error(*args, **kwargs):
        raise LLMCallError(
            "validation",
            "pydantic validation failed: total_score mismatch",
        )

    monkeypatch.setattr(
        "app.features.evaluation.judge_service.allm_json_call",
        _raise_validation_error,
    )

    transcript = "[Turn 1]\n用户: hi\nAI: hello"
    result = await EvalJudgeService(object()).judge_conversation_result(
        full_transcript=transcript
    )

    assert result.output is None
    assert result.error is not None
    # Both the failure marker and the underlying detail must survive.
    for fragment in ("结果校验失败", "total_score mismatch"):
        assert fragment in result.error
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_judge_conversation_wrapper_keeps_legacy_shape(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``judge_conversation`` returns the judge output object itself.

    ``allm_json_call`` inside the judge service module is stubbed to return a
    pre-validated ``ConversationJudgeOutput``; the wrapper's return value must
    compare equal to that object.
    """
    canned_output = ConversationJudgeOutput.model_validate(_conversation_payload())

    async def _return_canned(*args, **kwargs):
        return canned_output

    monkeypatch.setattr(
        "app.features.evaluation.judge_service.allm_json_call",
        _return_canned,
    )

    judge = EvalJudgeService(object())
    actual = await judge.judge_conversation(
        full_transcript="[Turn 1]\n用户: hi\nAI: hello"
    )

    assert actual == canned_output
|
|
|
|
|
|
def test_eval_judge_transcript_budget_exceeds_legacy_8192(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Transcript budgets are large when the explicit cap is 0.

    With ``eval_judge_max_transcript_chars`` set to 0 and a 200_000-token
    context window, the conversation budget must exceed 90_000 characters and
    the per-side compare budget must exceed 85_000.
    NOTE(review): presumably 0 means "no explicit cap, derive the budget from
    the context window" -- confirm against the settings documentation.
    """
    patched_settings = {
        "eval_judge_max_transcript_chars": 0,
        "eval_judge_context_window_tokens": 200_000,
    }
    for attr_name, attr_value in patched_settings.items():
        monkeypatch.setattr(settings, attr_name, attr_value)

    conversation_budget = eval_judge_conversation_transcript_max_chars()
    assert conversation_budget > 90_000

    per_side_budget = eval_judge_compare_transcript_each_max_chars()
    assert per_side_budget > 85_000
|
|
|
|
|
|
def test_conversation_judge_excerpt_appends_boundary_when_clipped() -> None:
    """A clipped conversation excerpt keeps its head and gains boundary text.

    A 26-character input with ``cap=10`` must start with the first ten
    characters and contain the "评审边界" and "截断稿" markers.
    """
    full_text = "abcdefghijklmnopqrstuvwxyz"
    excerpt = conversation_judge_transcript_excerpt(full_text, cap=10)

    assert excerpt.startswith("abcdefghij")
    for marker in ("评审边界", "截断稿"):
        assert marker in excerpt
|
|
|
|
|
|
def test_turn_judge_prior_excerpt_appends_boundary_when_clipped() -> None:
    """A clipped prior-context excerpt keeps its head and gains a notice.

    A 16-character input with ``cap=8`` must start with the first eight
    characters and contain the "上文节选已截断" marker.
    """
    prior_text = "0123456789abcdef"
    excerpt = turn_judge_prior_excerpt(prior_text, cap=8)

    assert excerpt.startswith("01234567")
    assert "上文节选已截断" in excerpt
|
|
|
|
|
|
def test_trim_compare_transcript_pair_prefers_trimming_longer_side() -> None:
    """Only the longer transcript is shortened to meet the combined budget.

    Inputs of 100 and 900 characters with a total budget of 950 (and no
    per-side cap) must come back as 100 and 850 characters, with only the
    second side flagged as truncated.
    """
    short_side = "a" * 100
    long_side = "b" * 900

    trimmed = trim_compare_transcript_pair(
        short_side,
        long_side,
        total_max_chars=950,
        per_side_max_chars=None,
    )
    first_out, second_out, first_truncated, second_truncated = trimmed

    assert len(first_out) == 100
    assert len(second_out) == 850
    assert first_truncated is False
    assert second_truncated is True
|
|
|
|
|
|
def test_eval_judge_transcript_budget_respects_explicit_cap(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An explicit ``eval_judge_max_transcript_chars`` is returned unchanged."""
    explicit_cap = 12_000
    monkeypatch.setattr(settings, "eval_judge_max_transcript_chars", explicit_cap)

    assert eval_judge_conversation_transcript_max_chars() == explicit_cap
|
|
|
|
|
|
def test_build_memoir_prompt_includes_source_and_reference_evidence() -> None:
    """The memoir judge prompt carries every evidence section that is supplied.

    When a source transcript, a reference memoir and evidence notes are all
    passed in, the prompt must contain each section header checked below and
    the source transcript text verbatim.
    """
    source_line = "用户: 我后来去了深圳工作。"
    prompt = _build_memoir_judge_prompt(
        memoir_markdown="# 当前正文\n他后来去了南方。",
        source_transcript=source_line,
        reference_memoir_markdown="# 导出基线\n他去了深圳。",
        evidence_notes="必须严格核对真实性。",
    )

    required_fragments = (
        "【评审说明】",
        "【原始访谈/对话证据】",
        source_line,
        "【结构化记忆证据】",
        "【参考基线/导出成稿】",
        "【当前回忆录正文】",
    )
    for fragment in required_fragments:
        assert fragment in prompt
|
|
|
|
|
|
def test_build_memoir_prompt_requires_conservative_scoring_without_evidence() -> None:
    """With only the memoir text supplied, the prompt asks for cautious scoring.

    The prompt must state that no local conversation evidence is available,
    require conservative scoring, and still include the structured-memory
    evidence section header.
    """
    memoir_only = "# 当前正文\n他后来去了南方。"
    prompt = _build_memoir_judge_prompt(memoir_markdown=memoir_only)

    required_fragments = (
        "无可用局部对话证据",
        "必须保守打分",
        "【结构化记忆证据】",
    )
    for fragment in required_fragments:
        assert fragment in prompt
|
|
|
|
|
|
def test_compare_summary_surpass_gate_and_truncation_flags() -> None:
    """Equal scores give a non-regressed gate; an oversized replay is flagged.

    The replay overrides repeat the default payload's dimension scores and
    change only the rationale. The replay transcript (1200 chars) exceeds the
    1000-char caps, so the summary must report it as truncated for compare and
    mark the evidence scope as partial.
    """
    baseline = ConversationJudgeOutput.model_validate(_conversation_payload())

    # Spelled out explicitly so the replay scores are pinned in this test.
    replay_scores = {
        "emotion_carry": 10,
        "empathy_depth": 8,
        "emotion_safety": 6,
        "emotion_guidance": 6,
        "fact_mining": 8,
        "info_completeness_guide": 8,
        "info_depth_mining": 9,
        "persona_understanding": 7,
        "persona_consistency_verify": 4,
        "persona_expression_guide": 4,
        "interview_structure": 6,
        "context_memory": 5,
        "rhythm_control": 4,
        "question_quality": 7,
        "follow_up_depth": 5,
        "non_leading": 3,
    }
    replay_payload = _conversation_payload_variant(
        **replay_scores,
        rationale="更稳定。",
    )
    replay = ConversationJudgeOutput.model_validate(replay_payload)

    summary = build_conversation_compare_summary(
        baseline_judge=baseline,
        replay_judge=replay,
        baseline_transcript="A" * 400,
        replay_transcript="B" * 1200,
        conv_cap=1000,
        compare_cap_total=1000,
        compare_per_side_cap=None,
        fixture_filename="golden.md",
    )

    assert summary["mode"] == "ab"
    assert summary["gate"]["status"] in {"parity", "surpass"}
    assert summary["truncation"]["replay_truncated_for_compare"] is True
    assert summary["evidence_quality"]["scope"] == "partial"
    assert "group_deltas" in summary
|
|
|
|
|
|
def test_compare_summary_flags_repeat_issue_as_regression() -> None:
    """A replay with a repeated-questioning issue is gated as regressed.

    The replay lowers ``context_memory`` and ``rhythm_control`` to 3 and lists
    a major issue about repeated questioning that ignores earlier answers. The
    summary must set ``repeat_issue_detected`` and report a "regressed" gate.
    """
    baseline = ConversationJudgeOutput.model_validate(_conversation_payload())

    # NOTE(review): total_score=0 does not equal the sum of the dimension
    # scores; presumably the schema recomputes or tolerates it -- confirm.
    degraded_payload = _conversation_payload_variant(
        context_memory=3,
        rhythm_control=3,
        total_score=0,
        major_issues=["存在重复盘问,忽略已答信息"],
    )
    replay = ConversationJudgeOutput.model_validate(degraded_payload)

    shared_transcript = "[Turn 1]"
    summary = build_conversation_compare_summary(
        baseline_judge=baseline,
        replay_judge=replay,
        baseline_transcript=shared_transcript,
        replay_transcript=shared_transcript,
        conv_cap=1000,
        compare_cap_total=1000,
        compare_per_side_cap=None,
    )

    assert summary["repeat_issue_detected"] is True
    assert summary["gate"]["status"] == "regressed"
|