"""Judge service: preserve the real failure reason to ease troubleshooting on the internal eval page."""

import pytest

from app.core.config import settings
from app.core.llm_call import LLMCallError
from app.features.evaluation.conversation_compare_summary import (
    build_conversation_compare_summary,
)
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
from app.features.evaluation.judge_service import (
    EvalJudgeService,
    _build_memoir_judge_prompt,
    eval_judge_conversation_transcript_max_chars,
    eval_judge_compare_transcript_each_max_chars,
)


def _conversation_payload() -> dict:
    """Return a fresh baseline payload for ``ConversationJudgeOutput``.

    The sixteen per-dimension scores add up to 100, matching ``total_score``.
    A new dict is built on every call, so callers may mutate the result.
    """
    return {
        "emotion_carry": 10,
        "empathy_depth": 8,
        "emotion_safety": 6,
        "emotion_guidance": 6,
        "fact_mining": 8,
        "info_completeness_guide": 8,
        "info_depth_mining": 9,
        "persona_understanding": 7,
        "persona_consistency_verify": 4,
        "persona_expression_guide": 4,
        "interview_structure": 6,
        "context_memory": 5,
        "rhythm_control": 4,
        "question_quality": 7,
        "follow_up_depth": 5,
        "non_leading": 3,
        "total_score": 100.0,
        "rationale": "整体表现稳定。",
    }


def _conversation_payload_variant(**overrides: float | str | list[str]) -> dict:
    """Return the baseline payload with ``overrides`` applied on top.

    Overrides may replace existing keys or add new ones (for example
    ``major_issues``, which is passed as a list of strings below).
    """
    return _conversation_payload() | overrides


@pytest.mark.asyncio
async def test_judge_conversation_result_preserves_validation_error(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A validation ``LLMCallError`` is surfaced in ``result.error``, not swallowed."""

    async def _boom(*args, **kwargs):
        raise LLMCallError(
            "validation",
            "pydantic validation failed: total_score mismatch",
        )

    # Patch the name as looked up inside judge_service so the service hits the stub.
    monkeypatch.setattr("app.features.evaluation.judge_service.allm_json_call", _boom)
    svc = EvalJudgeService(object())

    result = await svc.judge_conversation_result(
        full_transcript="[Turn 1]\n用户: hi\nAI: hello"
    )

    assert result.output is None
    assert result.error is not None
    # Both the user-facing category and the underlying detail must be present.
    assert "结果校验失败" in result.error
    assert "total_score mismatch" in result.error


@pytest.mark.asyncio
async def test_judge_conversation_wrapper_keeps_legacy_shape(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``judge_conversation`` returns the judge output object itself."""
    expected = ConversationJudgeOutput.model_validate(_conversation_payload())

    async def _ok(*args, **kwargs):
        return expected

    monkeypatch.setattr("app.features.evaluation.judge_service.allm_json_call", _ok)
    svc = EvalJudgeService(object())

    out = await svc.judge_conversation(full_transcript="[Turn 1]\n用户: hi\nAI: hello")

    assert out == expected


def test_eval_judge_transcript_budget_exceeds_legacy_8192(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With the explicit cap at 0 and a 200k-token window, budgets are large."""
    # NOTE(review): presumably 0 means "no explicit cap, derive from the context
    # window" - confirm against judge_service.
    monkeypatch.setattr(settings, "eval_judge_max_transcript_chars", 0)
    monkeypatch.setattr(settings, "eval_judge_context_window_tokens", 200_000)

    n = eval_judge_conversation_transcript_max_chars()
    assert n > 90_000

    each = eval_judge_compare_transcript_each_max_chars()
    assert each > 40_000


def test_eval_judge_transcript_budget_respects_explicit_cap(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An explicit non-zero cap is returned unchanged as the transcript budget."""
    monkeypatch.setattr(settings, "eval_judge_max_transcript_chars", 12_000)

    assert eval_judge_conversation_transcript_max_chars() == 12_000


def test_build_memoir_prompt_includes_source_and_reference_evidence() -> None:
    """The memoir prompt contains every evidence section header and the transcript text."""
    prompt = _build_memoir_judge_prompt(
        memoir_markdown="# 当前正文\n他后来去了南方。",
        source_transcript="用户: 我后来去了深圳工作。",
        reference_memoir_markdown="# 导出基线\n他去了深圳。",
        evidence_notes="必须严格核对真实性。",
    )

    assert "【评审说明】" in prompt
    assert "【原始访谈/对话证据】" in prompt
    assert "用户: 我后来去了深圳工作。" in prompt
    assert "【结构化记忆证据】" in prompt
    assert "【参考基线/导出成稿】" in prompt
    assert "【当前回忆录正文】" in prompt


def test_build_memoir_prompt_requires_conservative_scoring_without_evidence() -> None:
    """With only the memoir text supplied, the prompt demands conservative scoring."""
    prompt = _build_memoir_judge_prompt(
        memoir_markdown="# 当前正文\n他后来去了南方。"
    )

    assert "无可用局部对话证据" in prompt
    assert "必须保守打分" in prompt
    assert "【结构化记忆证据】" in prompt


def test_compare_summary_surpass_gate_and_truncation_flags() -> None:
    """Equal scores give a parity/surpass gate; the long replay is flagged truncated."""
    baseline = ConversationJudgeOutput.model_validate(_conversation_payload())
    # Every score below equals the baseline payload; only the rationale differs.
    replay = ConversationJudgeOutput.model_validate(
        _conversation_payload_variant(
            emotion_carry=10,
            empathy_depth=8,
            emotion_safety=6,
            emotion_guidance=6,
            fact_mining=8,
            info_completeness_guide=8,
            info_depth_mining=9,
            persona_understanding=7,
            persona_consistency_verify=4,
            persona_expression_guide=4,
            interview_structure=6,
            context_memory=5,
            rhythm_control=4,
            question_quality=7,
            follow_up_depth=5,
            non_leading=3,
            rationale="更稳定。",
        )
    )

    # The baseline (400 chars) is below both caps; the replay (1200 chars)
    # exceeds both the conversation cap and the per-side compare cap.
    summary = build_conversation_compare_summary(
        baseline_judge=baseline,
        replay_judge=replay,
        baseline_transcript="A" * 400,
        replay_transcript="B" * 1200,
        conv_cap=1000,
        compare_cap_each=500,
        fixture_filename="golden.md",
    )

    assert summary["mode"] == "ab"
    assert summary["gate"]["status"] in {"parity", "surpass"}
    assert summary["truncation"]["replay_truncated_for_compare"] is True
    assert "group_deltas" in summary


def test_compare_summary_flags_repeat_issue_as_regression() -> None:
    """A replay reporting a repeated-questioning issue is gated as ``regressed``."""
    baseline = ConversationJudgeOutput.model_validate(_conversation_payload())
    # NOTE(review): total_score=0 alongside non-zero dimension scores presumably
    # asks the schema to derive the total - confirm against judge_schemas.
    replay = ConversationJudgeOutput.model_validate(
        _conversation_payload_variant(
            context_memory=3,
            rhythm_control=3,
            total_score=0,
            major_issues=["存在重复盘问,忽略已答信息"],
        )
    )

    summary = build_conversation_compare_summary(
        baseline_judge=baseline,
        replay_judge=replay,
        baseline_transcript="[Turn 1]",
        replay_transcript="[Turn 1]",
        conv_cap=1000,
        compare_cap_each=500,
    )

    assert summary["repeat_issue_detected"] is True
    assert summary["gate"]["status"] == "regressed"