Files
life-echo/api/tests/test_judge_schemas.py
Kevin 6772e1269c feat(evaluation): memoir readiness, judge/replay updates, eval web playground
Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.
2026-04-08 09:43:34 +08:00

191 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""评审输出 schema文档级配分与合计校验。"""
import pytest
from app.features.evaluation.judge_schemas import MemoirJudgeOutput, TurnJudgeOutput
def _full_memoir_leaves_max() -> dict:
return {
"mem_fidelity": 9,
"mem_factual_accuracy": 5,
"mem_factual_coverage": 5,
"mem_traceability": 4,
"info_slot_coverage": 6,
"info_sufficiency": 4,
"info_density": 4,
"narr_structure": 6,
"narr_paragraphs": 5,
"narr_pacing": 3,
"lang_fluency": 3,
"lang_conciseness": 3,
"lang_literary": 4,
"lang_controlled_expansion": 4,
"lang_detail": 2,
"lang_style": 2,
"emo_authenticity": 5,
"emo_depth": 4,
"char_understanding": 4,
"char_consistency": 3,
"char_integration": 2,
"coh_timeline": 2,
"coh_cross_chapter": 2,
"rich_analogy": 3,
"rich_diversity": 2,
"pub_editorial_cost": 2,
"pub_completeness": 2,
}
def test_memoir_full_marks_sum_to_100() -> None:
    """Full-marks memoir leaves give authenticity 23, language 18, total 100."""
    payload = {
        **_full_memoir_leaves_max(),
        "total_score": 100.0,
        "rationale": "",
    }
    judged = MemoirJudgeOutput(**payload)

    assert judged.authenticity_score == 23.0
    assert judged.language_score == 18.0
    assert judged.total_score == 100.0
def test_memoir_total_score_derived_from_leaves() -> None:
    """When the supplied total_score disagrees with the leaf sum, the leaf sum wins."""
    # Deliberately pass a total (50) that contradicts the full-marks leaves.
    payload = {
        **_full_memoir_leaves_max(),
        "total_score": 50.0,
        "rationale": "",
    }
    judged = MemoirJudgeOutput(**payload)

    assert judged.total_score == 100.0
def _full_turn_leaves_max() -> dict:
return {
"emotion_carry": 10,
"empathy_depth": 8,
"emotion_safety": 6,
"emotion_guidance": 6,
"fact_mining": 8,
"info_completeness_guide": 8,
"info_depth_mining": 9,
"persona_understanding": 7,
"persona_consistency_verify": 4,
"persona_expression_guide": 4,
"interview_structure": 6,
"context_memory": 5,
"rhythm_control": 4,
"question_quality": 7,
"follow_up_depth": 5,
"non_leading": 3,
}
def test_turn_judge_total_score_synced_from_leaves_like_glm5() -> None:
    """A wrong model-supplied total is replaced by the sum of the leaves.

    The emotion block scores 9 + 8 + 6 + 6 = 29 while every other leaf is at
    full marks, so the leaves add up to 99; in this situation the total is
    often mistakenly reported as ``total_score=100``.
    """
    # Start from the shared full-marks fixture and drop a single emotion
    # point instead of re-listing all sixteen leaves by hand.
    leaves = {**_full_turn_leaves_max(), "emotion_carry": 9}
    t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="x")

    assert t.total_score == 99.0
    assert t.emotion_score == 29.0
def test_turn_judge_coerces_string_lists_from_llm() -> None:
    """GLM-5 often writes major_* / insufficient_evidence as strings, not JSON arrays.

    The test expects a plain sentence to become a one-item list and the
    placeholder "None identified." to become an empty list.
    """
    string_fields = {
        "major_strengths": "Strong empathy and flow.",
        "major_issues": "None identified.",
        "insufficient_evidence": "None identified.",
    }
    judged = TurnJudgeOutput(
        **_full_turn_leaves_max(),
        total_score=100.0,
        rationale="x",
        **string_fields,
    )

    assert judged.major_strengths == ["Strong empathy and flow."]
    assert judged.major_issues == []
    assert judged.insufficient_evidence == []
def test_memoir_judge_coerces_string_lists_from_llm() -> None:
    """String values for the memoir list fields are coerced to lists.

    The test expects a plain sentence to become a one-item list, and both
    "n/a" and the empty string to become an empty list.
    """
    string_fields = {
        "major_strengths": "Solid structure.",
        "major_issues": "n/a",
        "insufficient_evidence": "",
    }
    judged = MemoirJudgeOutput(
        **_full_memoir_leaves_max(),
        total_score=100.0,
        rationale="",
        **string_fields,
    )

    assert judged.major_strengths == ["Solid structure."]
    assert judged.major_issues == []
    assert judged.insufficient_evidence == []
def test_conversation_judge_meta_fields_default() -> None:
    """Omitted meta fields fall back to their defaults.

    With only the leaves, ``total_score`` and ``rationale`` supplied, the
    list fields are expected to be empty lists and ``confidence`` 0.75.
    """
    # Reuse the shared full-marks fixture; the previously inlined dict was an
    # exact copy of it.
    leaves = _full_turn_leaves_max()
    t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="x")

    assert t.major_strengths == []
    assert t.major_issues == []
    assert t.insufficient_evidence == []
    assert t.evidence_refs == []
    assert t.confidence == 0.75
def test_conversation_total_must_match_sum() -> None:
    """Full-marks turn leaves give emotion 30, information 25 and total 100."""
    # Reuse the shared full-marks fixture; the previously inlined dict was an
    # exact copy of it.
    leaves = _full_turn_leaves_max()
    t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="")

    assert t.emotion_score == 30.0
    assert t.information_score == 25.0
    assert t.total_score == 100.0