# api/tests/test_judge_schemas.py

"""评审输出 schema：文档级配分与合计校验。"""

import pytest

from app.features.evaluation.judge_schemas import MemoirJudgeOutput, TurnJudgeOutput


def _full_memoir_leaves_max() -> dict:
    """Build the memoir fixture with every rubric leaf at the top score used in these tests.

    A new dict is created on each call, so callers may mutate the result
    freely. The leaf values add up to 100.

    Returns:
        Mapping of memoir leaf field name to its integer score.
    """
    return dict(
        # mem_* leaves: 9 + 5 + 5 + 4 = 23
        mem_fidelity=9,
        mem_factual_accuracy=5,
        mem_factual_coverage=5,
        mem_traceability=4,
        # info_* leaves: 6 + 4 + 4 = 14
        info_slot_coverage=6,
        info_sufficiency=4,
        info_density=4,
        # narr_* leaves: 6 + 5 + 3 = 14
        narr_structure=6,
        narr_paragraphs=5,
        narr_pacing=3,
        # lang_* leaves: 3 + 3 + 4 + 4 + 2 + 2 = 18
        lang_fluency=3,
        lang_conciseness=3,
        lang_literary=4,
        lang_controlled_expansion=4,
        lang_detail=2,
        lang_style=2,
        # emo_* leaves: 5 + 4 = 9
        emo_authenticity=5,
        emo_depth=4,
        # char_* leaves: 4 + 3 + 2 = 9
        char_understanding=4,
        char_consistency=3,
        char_integration=2,
        # coh_* leaves: 2 + 2 = 4
        coh_timeline=2,
        coh_cross_chapter=2,
        # rich_* leaves: 3 + 2 = 5
        rich_analogy=3,
        rich_diversity=2,
        # pub_* leaves: 2 + 2 = 4
        pub_editorial_cost=2,
        pub_completeness=2,
    )


def test_memoir_full_marks_sum_to_100() -> None:
    """A memoir judgement built from the full-marks fixture totals 100.

    Two block subtotals are checked as well: 23 for ``authenticity_score``
    and 18 for ``language_score``. Those numbers equal the sums of the
    fixture's ``mem_*`` and ``lang_*`` leaves respectively (presumably how
    the schema derives them; confirm in ``judge_schemas``).
    """
    payload = dict(_full_memoir_leaves_max(), total_score=100.0, rationale="")
    judged = MemoirJudgeOutput(**payload)

    assert judged.authenticity_score == 23.0
    assert judged.language_score == 18.0
    assert judged.total_score == 100.0


def test_memoir_total_score_derived_from_leaves() -> None:
    """When the supplied total_score disagrees with the leaf sum, the leaf sum wins."""
    # The fixture leaves add up to 100, so the deliberately wrong 50.0 must be replaced.
    payload = dict(_full_memoir_leaves_max(), total_score=50.0, rationale="")
    judged = MemoirJudgeOutput(**payload)

    assert judged.total_score == 100.0


def _full_turn_leaves_max() -> dict:
    """Build the turn/conversation fixture with every leaf at the top score used in these tests.

    A new dict is created on each call, so callers may mutate the result
    freely. The leaf values add up to 100.

    Returns:
        Mapping of turn-judge leaf field name to its integer score.
    """
    return dict(
        # 10 + 8 + 6 + 6 = 30
        emotion_carry=10,
        empathy_depth=8,
        emotion_safety=6,
        emotion_guidance=6,
        # 8 + 8 + 9 = 25
        fact_mining=8,
        info_completeness_guide=8,
        info_depth_mining=9,
        # 7 + 4 + 4 = 15
        persona_understanding=7,
        persona_consistency_verify=4,
        persona_expression_guide=4,
        # 6 + 5 + 4 = 15
        interview_structure=6,
        context_memory=5,
        rhythm_control=4,
        # 7 + 5 + 3 = 15
        question_quality=7,
        follow_up_depth=5,
        non_leading=3,
    )


def test_turn_judge_total_score_synced_from_leaves_like_glm5() -> None:
    """total_score is re-synced to the leaf sum when the judge overstates it.

    Scenario: the emotion block scores 9+8+6+6=29 while every other leaf is at
    full marks, yet the judge still reports total_score=100 (a common slip,
    attributed to GLM-5 by the test name).
    """
    # Start from the shared full-marks fixture and drop a single point from
    # emotion_carry, so the true leaf sum is 99 rather than the claimed 100.
    leaves = {**_full_turn_leaves_max(), "emotion_carry": 9}
    t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="x")
    assert t.total_score == 99.0
    assert t.emotion_score == 29.0


def test_turn_judge_coerces_string_lists_from_llm() -> None:
    """GLM-5 often emits major_* / insufficient_evidence as strings instead of JSON arrays.

    A meaningful string is expected to come back as a one-element list, while
    the placeholder "None identified." is expected to come back as an empty list.
    """
    string_valued_lists = {
        "major_strengths": "Strong empathy and flow.",
        "major_issues": "None identified.",
        "insufficient_evidence": "None identified.",
    }
    judged = TurnJudgeOutput(
        **_full_turn_leaves_max(),
        total_score=100.0,
        rationale="x",
        **string_valued_lists,
    )

    assert judged.major_strengths == ["Strong empathy and flow."]
    assert judged.major_issues == []
    assert judged.insufficient_evidence == []


def test_memoir_judge_coerces_string_lists_from_llm() -> None:
    """String-valued list fields on the memoir judgement are turned into lists.

    A meaningful string is expected to come back as a one-element list; the
    placeholder "n/a" and the empty string are expected to come back as
    empty lists.
    """
    string_valued_lists = {
        "major_strengths": "Solid structure.",
        "major_issues": "n/a",
        "insufficient_evidence": "",
    }
    judged = MemoirJudgeOutput(
        **_full_memoir_leaves_max(),
        total_score=100.0,
        rationale="",
        **string_valued_lists,
    )

    assert judged.major_strengths == ["Solid structure."]
    assert judged.major_issues == []
    assert judged.insufficient_evidence == []


def test_memoir_judge_clamps_leaf_scores_over_max_from_llm() -> None:
    """A leaf slightly above its maximum (e.g. rich_diversity=2.5) is clamped to the rubric cap.

    Clamping keeps a small overshoot from failing validation of the whole record.
    """
    payload = {
        **_full_memoir_leaves_max(),
        # Overshoot the fixture's top value of 2 for this leaf.
        "rich_diversity": 2.5,
        "total_score": 100.0,
        "rationale": "",
        "major_strengths": [],
        "major_issues": [],
        "insufficient_evidence": [],
        "evidence_refs": [],
    }
    judged = MemoirJudgeOutput.model_validate(payload)

    assert judged.rich_diversity == 2.0


def test_conversation_judge_meta_fields_default() -> None:
    """Meta fields omitted by the judge fall back to their defaults.

    Only the leaves, total_score and rationale are supplied; the list-valued
    meta fields are expected to default to empty lists and ``confidence`` to
    0.75.
    """
    # Reuse the shared full-marks fixture rather than a hand-copied duplicate,
    # so the leaf set cannot drift between tests.
    leaves = _full_turn_leaves_max()
    t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="x")
    assert t.major_strengths == []
    assert t.major_issues == []
    assert t.insufficient_evidence == []
    assert t.evidence_refs == []
    assert t.confidence == 0.75


def test_conversation_total_must_match_sum() -> None:
    """With every turn leaf at full marks, the subtotals and total line up.

    Expected values: 30 for ``emotion_score`` (10+8+6+6), 25 for
    ``information_score`` (8+8+9) and 100 for ``total_score``. The leaf
    groupings are inferred from the fixture values; confirm against
    ``judge_schemas``.
    """
    # Reuse the shared full-marks fixture rather than a hand-copied duplicate,
    # so the leaf set cannot drift between tests.
    leaves = _full_turn_leaves_max()
    t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="")
    assert t.emotion_score == 30.0
    assert t.information_score == 25.0
    assert t.total_score == 100.0