85 lines
2.3 KiB
Python
85 lines
2.3 KiB
Python
|
|
"""评审输出 schema:文档级配分与合计校验。"""
|
|||
|
|
|
|||
|
|
import pytest
|
|||
|
|
|
|||
|
|
from app.features.evaluation.judge_schemas import MemoirJudgeOutput, TurnJudgeOutput
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _full_memoir_leaves_max() -> dict:
|
|||
|
|
return {
|
|||
|
|
"mem_fidelity": 9,
|
|||
|
|
"mem_factual_accuracy": 5,
|
|||
|
|
"mem_factual_coverage": 5,
|
|||
|
|
"mem_traceability": 4,
|
|||
|
|
"info_slot_coverage": 6,
|
|||
|
|
"info_sufficiency": 4,
|
|||
|
|
"info_density": 4,
|
|||
|
|
"narr_structure": 6,
|
|||
|
|
"narr_paragraphs": 5,
|
|||
|
|
"narr_pacing": 3,
|
|||
|
|
"lang_fluency": 3,
|
|||
|
|
"lang_conciseness": 3,
|
|||
|
|
"lang_literary": 4,
|
|||
|
|
"lang_controlled_expansion": 4,
|
|||
|
|
"lang_detail": 2,
|
|||
|
|
"lang_style": 2,
|
|||
|
|
"emo_authenticity": 5,
|
|||
|
|
"emo_depth": 4,
|
|||
|
|
"char_understanding": 4,
|
|||
|
|
"char_consistency": 3,
|
|||
|
|
"char_integration": 2,
|
|||
|
|
"coh_timeline": 2,
|
|||
|
|
"coh_cross_chapter": 2,
|
|||
|
|
"rich_analogy": 3,
|
|||
|
|
"rich_diversity": 2,
|
|||
|
|
"pub_editorial_cost": 2,
|
|||
|
|
"pub_completeness": 2,
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def test_memoir_full_marks_sum_to_100() -> None:
    """A memoir output built from the full leaf fixture validates at total 100.

    Constructs ``MemoirJudgeOutput`` from ``_full_memoir_leaves_max()`` with a
    declared ``total_score`` of 100.0 and an empty rationale, then checks two
    aggregate scores and the total on the resulting object.
    """
    full_marks = _full_memoir_leaves_max()
    judged = MemoirJudgeOutput(
        **full_marks,
        total_score=100.0,
        rationale="",
    )
    # 23 and 18 equal the sums of the fixture's mem_* and lang_* leaves;
    # presumably the schema derives these aggregates that way -- confirm
    # against judge_schemas.
    assert judged.authenticity_score == 23.0
    assert judged.language_score == 18.0
    assert judged.total_score == 100.0
|
|||
|
|
|
|||
|
|
|
|||
|
|
def test_memoir_total_must_match_sum() -> None:
    """A declared total that disagrees with the leaf scores must be rejected.

    The leaf fixture adds up to 100, so constructing ``MemoirJudgeOutput``
    with ``total_score=50.0`` is expected to raise ``ValueError`` whose
    message contains the text "不一致" ("inconsistent").
    """
    full_marks = _full_memoir_leaves_max()
    # The object is never bound: construction itself must fail.
    with pytest.raises(ValueError, match="不一致"):
        MemoirJudgeOutput(
            **full_marks,
            total_score=50.0,
            rationale="",
        )
|
|||
|
|
|
|||
|
|
|
|||
|
|
def test_conversation_total_must_match_sum() -> None:
    """A turn-level output whose leaves add up to 100 validates at total 100.

    Builds ``TurnJudgeOutput`` from sixteen leaf scores plus a declared
    ``total_score`` of 100.0 and an empty rationale, then checks two aggregate
    scores and the total on the resulting object.

    NOTE(review): despite the name, only the consistent-total path is
    exercised here; there is no ``pytest.raises`` case for a mismatched total
    like the memoir counterpart has. Confirm whether ``TurnJudgeOutput``
    enforces the same check before adding one.
    """
    turn_leaves = {
        # 10 + 8 + 6 + 6 = 30, the value asserted for emotion_score below.
        "emotion_carry": 10,
        "empathy_depth": 8,
        "emotion_safety": 6,
        "emotion_guidance": 6,
        # 8 + 8 + 9 = 25, the value asserted for information_score below.
        "fact_mining": 8,
        "info_completeness_guide": 8,
        "info_depth_mining": 9,
        # persona_*: 7 + 4 + 4 = 15
        "persona_understanding": 7,
        "persona_consistency_verify": 4,
        "persona_expression_guide": 4,
        # 6 + 5 + 4 = 15
        "interview_structure": 6,
        "context_memory": 5,
        "rhythm_control": 4,
        # 7 + 5 + 3 = 15
        "question_quality": 7,
        "follow_up_depth": 5,
        "non_leading": 3,
    }
    judged = TurnJudgeOutput(**turn_leaves, total_score=100.0, rationale="")
    assert judged.emotion_score == 30.0
    assert judged.information_score == 25.0
    assert judged.total_score == 100.0
|