Files
life-echo/api/tests/test_judge_schemas.py
Kevin ac49bc7f23 feat(eval): memoir A/B chapter judging and eval-web parity with dialogue
- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
2026-04-10 10:25:15 +08:00

209 lines
6.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""评审输出 schema文档级配分与合计校验。"""
import pytest
from app.features.evaluation.judge_schemas import MemoirJudgeOutput, TurnJudgeOutput
def _full_memoir_leaves_max() -> dict:
return {
"mem_fidelity": 9,
"mem_factual_accuracy": 5,
"mem_factual_coverage": 5,
"mem_traceability": 4,
"info_slot_coverage": 6,
"info_sufficiency": 4,
"info_density": 4,
"narr_structure": 6,
"narr_paragraphs": 5,
"narr_pacing": 3,
"lang_fluency": 3,
"lang_conciseness": 3,
"lang_literary": 4,
"lang_controlled_expansion": 4,
"lang_detail": 2,
"lang_style": 2,
"emo_authenticity": 5,
"emo_depth": 4,
"char_understanding": 4,
"char_consistency": 3,
"char_integration": 2,
"coh_timeline": 2,
"coh_cross_chapter": 2,
"rich_analogy": 3,
"rich_diversity": 2,
"pub_editorial_cost": 2,
"pub_completeness": 2,
}
def test_memoir_full_marks_sum_to_100() -> None:
    """Full-mark memoir leaves give authenticity 23, language 18 and total 100."""
    payload = {
        **_full_memoir_leaves_max(),
        "total_score": 100.0,
        "rationale": "",
    }
    judged = MemoirJudgeOutput(**payload)

    assert judged.authenticity_score == 23.0
    assert judged.language_score == 18.0
    assert judged.total_score == 100.0
def test_memoir_total_score_derived_from_leaves() -> None:
    """A model-supplied total_score that disagrees with the leaf sum is overridden by the leaf sum."""
    payload = dict(_full_memoir_leaves_max())
    # Deliberately wrong total: the leaves add up to 100, not 50.
    payload["total_score"] = 50.0
    payload["rationale"] = ""

    judged = MemoirJudgeOutput(**payload)

    assert judged.total_score == 100.0
def _full_turn_leaves_max() -> dict:
return {
"emotion_carry": 10,
"empathy_depth": 8,
"emotion_safety": 6,
"emotion_guidance": 6,
"fact_mining": 8,
"info_completeness_guide": 8,
"info_depth_mining": 9,
"persona_understanding": 7,
"persona_consistency_verify": 4,
"persona_expression_guide": 4,
"interview_structure": 6,
"context_memory": 5,
"rhythm_control": 4,
"question_quality": 7,
"follow_up_depth": 5,
"non_leading": 3,
}
def test_turn_judge_total_score_synced_from_leaves_like_glm5() -> None:
    """total_score is re-derived from the leaves when the model over-reports it.

    The emotion block here is 9+8+6+6=29 with every other leaf at full marks;
    models often still report total_score=100 in that situation. The schema is
    expected to replace it with the real leaf sum (99).
    """
    leaves = _full_turn_leaves_max()
    # Only deviation from the full-marks fixture: one point short on emotion_carry.
    leaves["emotion_carry"] = 9

    t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="x")

    assert t.total_score == 99.0
    assert t.emotion_score == 29.0
def test_turn_judge_coerces_string_lists_from_llm() -> None:
    """String-valued list fields from the LLM are coerced into lists.

    GLM-5 often emits major_* / insufficient_evidence as a plain string rather
    than a JSON array. A real sentence must become a one-element list, while
    the "None identified." placeholder must become an empty list.
    """
    string_valued_lists = {
        "major_strengths": "Strong empathy and flow.",
        "major_issues": "None identified.",
        "insufficient_evidence": "None identified.",
    }
    judged = TurnJudgeOutput(
        **_full_turn_leaves_max(),
        total_score=100.0,
        rationale="x",
        **string_valued_lists,
    )

    assert judged.major_strengths == ["Strong empathy and flow."]
    assert judged.major_issues == []
    assert judged.insufficient_evidence == []
def test_memoir_judge_coerces_string_lists_from_llm() -> None:
    """String-valued list fields on the memoir judge output are coerced into lists.

    A real sentence must become a one-element list; the "n/a" placeholder and
    the empty string must both become an empty list.
    """
    string_valued_lists = {
        "major_strengths": "Solid structure.",
        "major_issues": "n/a",
        "insufficient_evidence": "",
    }
    judged = MemoirJudgeOutput(
        **_full_memoir_leaves_max(),
        total_score=100.0,
        rationale="",
        **string_valued_lists,
    )

    assert judged.major_strengths == ["Solid structure."]
    assert judged.major_issues == []
    assert judged.insufficient_evidence == []
def test_memoir_judge_clamps_leaf_scores_over_max_from_llm() -> None:
    """A leaf slightly above its cap is clamped instead of failing validation.

    Example: rich_diversity=2.5 must come back as 2.0, so that one overshoot
    from the LLM does not reject the whole payload.
    """
    payload = _full_memoir_leaves_max()
    payload.update(
        {
            "rich_diversity": 2.5,  # overshoots the 2-point value used as full marks
            "total_score": 100.0,
            "rationale": "",
            "major_strengths": [],
            "major_issues": [],
            "insufficient_evidence": [],
            "evidence_refs": [],
        }
    )

    judged = MemoirJudgeOutput.model_validate(payload)

    assert judged.rich_diversity == 2.0
def test_conversation_judge_meta_fields_default() -> None:
    """Meta fields omitted from the input fall back to their defaults.

    Only the leaves, total_score and rationale are supplied; the list-valued
    meta fields must default to empty lists and confidence to 0.75.
    """
    # Reuse the shared full-marks fixture rather than repeating the 16 leaves inline.
    t = TurnJudgeOutput(**_full_turn_leaves_max(), total_score=100.0, rationale="x")

    assert t.major_strengths == []
    assert t.major_issues == []
    assert t.insufficient_evidence == []
    assert t.evidence_refs == []
    assert t.confidence == 0.75
def test_conversation_total_must_match_sum() -> None:
    """Full-mark turn leaves give emotion 30, information 25 and total 100."""
    # Reuse the shared full-marks fixture rather than repeating the 16 leaves inline.
    t = TurnJudgeOutput(**_full_turn_leaves_max(), total_score=100.0, rationale="")

    assert t.emotion_score == 30.0
    assert t.information_score == 25.0
    assert t.total_score == 100.0