"""评审输出 schema:文档级配分与合计校验。""" import pytest from app.features.evaluation.judge_schemas import MemoirJudgeOutput, TurnJudgeOutput def _full_memoir_leaves_max() -> dict: return { "mem_fidelity": 9, "mem_factual_accuracy": 5, "mem_factual_coverage": 5, "mem_traceability": 4, "info_slot_coverage": 6, "info_sufficiency": 4, "info_density": 4, "narr_structure": 6, "narr_paragraphs": 5, "narr_pacing": 3, "lang_fluency": 3, "lang_conciseness": 3, "lang_literary": 4, "lang_controlled_expansion": 4, "lang_detail": 2, "lang_style": 2, "emo_authenticity": 5, "emo_depth": 4, "char_understanding": 4, "char_consistency": 3, "char_integration": 2, "coh_timeline": 2, "coh_cross_chapter": 2, "rich_analogy": 3, "rich_diversity": 2, "pub_editorial_cost": 2, "pub_completeness": 2, } def test_memoir_full_marks_sum_to_100() -> None: leaves = _full_memoir_leaves_max() m = MemoirJudgeOutput( **leaves, total_score=100.0, rationale="", ) assert m.authenticity_score == 23.0 assert m.language_score == 18.0 assert m.total_score == 100.0 def test_memoir_total_score_derived_from_leaves() -> None: """模型传入的 total_score 与细项之和不一致时,以细项和为准。""" leaves = _full_memoir_leaves_max() m = MemoirJudgeOutput( **leaves, total_score=50.0, rationale="", ) assert m.total_score == 100.0 def _full_turn_leaves_max() -> dict: return { "emotion_carry": 10, "empathy_depth": 8, "emotion_safety": 6, "emotion_guidance": 6, "fact_mining": 8, "info_completeness_guide": 8, "info_depth_mining": 9, "persona_understanding": 7, "persona_consistency_verify": 4, "persona_expression_guide": 4, "interview_structure": 6, "context_memory": 5, "rhythm_control": 4, "question_quality": 7, "follow_up_depth": 5, "non_leading": 3, } def test_turn_judge_total_score_synced_from_leaves_like_glm5() -> None: """情绪块 9+8+6+6=29,其余打满时常被误写 total_score=100。""" t = TurnJudgeOutput( emotion_carry=9, empathy_depth=8, emotion_safety=6, emotion_guidance=6, fact_mining=8, info_completeness_guide=8, info_depth_mining=9, persona_understanding=7, persona_consistency_verify=4, persona_expression_guide=4, interview_structure=6, context_memory=5, rhythm_control=4, question_quality=7, follow_up_depth=5, non_leading=3, total_score=100.0, rationale="x", ) assert t.total_score == 99.0 assert t.emotion_score == 29.0 def test_turn_judge_coerces_string_lists_from_llm() -> None: """GLM-5 常把 major_* / insufficient_evidence 写成字符串而非 JSON 数组。""" leaves = _full_turn_leaves_max() t = TurnJudgeOutput( **leaves, total_score=100.0, rationale="x", major_strengths="Strong empathy and flow.", major_issues="None identified.", insufficient_evidence="None identified.", ) assert t.major_strengths == ["Strong empathy and flow."] assert t.major_issues == [] assert t.insufficient_evidence == [] def test_memoir_judge_coerces_string_lists_from_llm() -> None: leaves = _full_memoir_leaves_max() m = MemoirJudgeOutput( **leaves, total_score=100.0, rationale="", major_strengths="Solid structure.", major_issues="n/a", insufficient_evidence="", ) assert m.major_strengths == ["Solid structure."] assert m.major_issues == [] assert m.insufficient_evidence == [] def test_memoir_judge_clamps_leaf_scores_over_max_from_llm() -> None: """细项略超满分(如 rich_diversity=2.5)时钳制到 rubric 上限,避免 validation 整单失败。""" leaves = _full_memoir_leaves_max() leaves["rich_diversity"] = 2.5 m = MemoirJudgeOutput.model_validate( { **leaves, "total_score": 100.0, "rationale": "", "major_strengths": [], "major_issues": [], "insufficient_evidence": [], "evidence_refs": [], } ) assert m.rich_diversity == 2.0 def test_conversation_judge_meta_fields_default() -> None: leaves = { "emotion_carry": 10, "empathy_depth": 8, "emotion_safety": 6, "emotion_guidance": 6, "fact_mining": 8, "info_completeness_guide": 8, "info_depth_mining": 9, "persona_understanding": 7, "persona_consistency_verify": 4, "persona_expression_guide": 4, "interview_structure": 6, "context_memory": 5, "rhythm_control": 4, "question_quality": 7, "follow_up_depth": 5, "non_leading": 3, } t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="x") assert t.major_strengths == [] assert t.major_issues == [] assert t.insufficient_evidence == [] assert t.evidence_refs == [] assert t.confidence == 0.75 def test_conversation_total_must_match_sum() -> None: leaves = { "emotion_carry": 10, "empathy_depth": 8, "emotion_safety": 6, "emotion_guidance": 6, "fact_mining": 8, "info_completeness_guide": 8, "info_depth_mining": 9, "persona_understanding": 7, "persona_consistency_verify": 4, "persona_expression_guide": 4, "interview_structure": 6, "context_memory": 5, "rhythm_control": 4, "question_quality": 7, "follow_up_depth": 5, "non_leading": 3, } t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="") assert t.emotion_score == 30.0 assert t.information_score == 25.0 assert t.total_score == 100.0