- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.
- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.
- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.
- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.
- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.
- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
209 lines
6.0 KiB
Python
209 lines
6.0 KiB
Python
"""评审输出 schema:文档级配分与合计校验。"""
|
||
|
||
import pytest
|
||
|
||
from app.features.evaluation.judge_schemas import MemoirJudgeOutput, TurnJudgeOutput
|
||
|
||
|
||
def _full_memoir_leaves_max() -> dict:
|
||
return {
|
||
"mem_fidelity": 9,
|
||
"mem_factual_accuracy": 5,
|
||
"mem_factual_coverage": 5,
|
||
"mem_traceability": 4,
|
||
"info_slot_coverage": 6,
|
||
"info_sufficiency": 4,
|
||
"info_density": 4,
|
||
"narr_structure": 6,
|
||
"narr_paragraphs": 5,
|
||
"narr_pacing": 3,
|
||
"lang_fluency": 3,
|
||
"lang_conciseness": 3,
|
||
"lang_literary": 4,
|
||
"lang_controlled_expansion": 4,
|
||
"lang_detail": 2,
|
||
"lang_style": 2,
|
||
"emo_authenticity": 5,
|
||
"emo_depth": 4,
|
||
"char_understanding": 4,
|
||
"char_consistency": 3,
|
||
"char_integration": 2,
|
||
"coh_timeline": 2,
|
||
"coh_cross_chapter": 2,
|
||
"rich_analogy": 3,
|
||
"rich_diversity": 2,
|
||
"pub_editorial_cost": 2,
|
||
"pub_completeness": 2,
|
||
}
|
||
|
||
|
||
def test_memoir_full_marks_sum_to_100() -> None:
    """Full marks on every memoir leaf give the expected subtotals and a total of 100."""
    payload = {
        **_full_memoir_leaves_max(),
        "total_score": 100.0,
        "rationale": "",
    }
    judged = MemoirJudgeOutput(**payload)

    # Spot-check two aggregated scores plus the grand total.
    assert judged.authenticity_score == 23.0
    assert judged.language_score == 18.0
    assert judged.total_score == 100.0
|
||
|
||
|
||
def test_memoir_total_score_derived_from_leaves() -> None:
    """When the supplied total_score disagrees with the leaf sum, the leaf sum wins."""
    payload = {
        **_full_memoir_leaves_max(),
        "total_score": 50.0,  # deliberately wrong: the leaves add up to 100
        "rationale": "",
    }
    judged = MemoirJudgeOutput(**payload)

    assert judged.total_score == 100.0
|
||
|
||
|
||
def _full_turn_leaves_max() -> dict:
|
||
return {
|
||
"emotion_carry": 10,
|
||
"empathy_depth": 8,
|
||
"emotion_safety": 6,
|
||
"emotion_guidance": 6,
|
||
"fact_mining": 8,
|
||
"info_completeness_guide": 8,
|
||
"info_depth_mining": 9,
|
||
"persona_understanding": 7,
|
||
"persona_consistency_verify": 4,
|
||
"persona_expression_guide": 4,
|
||
"interview_structure": 6,
|
||
"context_memory": 5,
|
||
"rhythm_control": 4,
|
||
"question_quality": 7,
|
||
"follow_up_depth": 5,
|
||
"non_leading": 3,
|
||
}
|
||
|
||
|
||
def test_turn_judge_total_score_synced_from_leaves_like_glm5() -> None:
    """total_score is re-derived from the leaves when the judge over-reports it.

    The emotion block is 9 + 8 + 6 + 6 = 29 while every other leaf is at full
    marks; judges often still write total_score=100 in that situation.
    """
    # Start from the shared full-marks fixture and drop a single point, so the
    # leaf sum (99) disagrees with the reported total (100).
    leaves = {**_full_turn_leaves_max(), "emotion_carry": 9}
    t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="x")

    assert t.total_score == 99.0
    assert t.emotion_score == 29.0
|
||
|
||
|
||
def test_turn_judge_coerces_string_lists_from_llm() -> None:
    """GLM-5 often writes major_* / insufficient_evidence as a string, not a JSON array."""
    string_valued_lists = {
        "major_strengths": "Strong empathy and flow.",
        "major_issues": "None identified.",
        "insufficient_evidence": "None identified.",
    }
    judged = TurnJudgeOutput(
        **_full_turn_leaves_max(),
        total_score=100.0,
        rationale="x",
        **string_valued_lists,
    )

    # A real sentence becomes a one-element list; the "None identified."
    # placeholder becomes an empty list.
    assert judged.major_strengths == ["Strong empathy and flow."]
    assert judged.major_issues == []
    assert judged.insufficient_evidence == []
|
||
|
||
|
||
def test_memoir_judge_coerces_string_lists_from_llm() -> None:
    """String values for the memoir list fields are coerced to lists."""
    string_valued_lists = {
        "major_strengths": "Solid structure.",
        "major_issues": "n/a",
        "insufficient_evidence": "",
    }
    judged = MemoirJudgeOutput(
        **_full_memoir_leaves_max(),
        total_score=100.0,
        rationale="",
        **string_valued_lists,
    )

    # A real sentence becomes a one-element list; "n/a" and the empty string
    # both become an empty list.
    assert judged.major_strengths == ["Solid structure."]
    assert judged.major_issues == []
    assert judged.insufficient_evidence == []
|
||
|
||
|
||
def test_memoir_judge_clamps_leaf_scores_over_max_from_llm() -> None:
    """A leaf slightly over full marks (e.g. rich_diversity=2.5) is clamped to the
    rubric cap, so validation of the whole payload does not fail."""
    payload = {
        **_full_memoir_leaves_max(),
        "rich_diversity": 2.5,  # above the full mark of 2 used by the fixture
        "total_score": 100.0,
        "rationale": "",
        "major_strengths": [],
        "major_issues": [],
        "insufficient_evidence": [],
        "evidence_refs": [],
    }
    judged = MemoirJudgeOutput.model_validate(payload)

    assert judged.rich_diversity == 2.0
|
||
|
||
|
||
def test_conversation_judge_meta_fields_default() -> None:
    """Meta fields omitted by the judge fall back to their defaults.

    Only the leaf scores, total_score and rationale are supplied; the list
    fields are expected to default to empty lists and confidence to 0.75.
    """
    # Reuse the shared full-marks fixture instead of repeating the 16 leaves.
    leaves = _full_turn_leaves_max()
    t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="x")

    assert t.major_strengths == []
    assert t.major_issues == []
    assert t.insufficient_evidence == []
    assert t.evidence_refs == []
    assert t.confidence == 0.75
|
||
|
||
|
||
def test_conversation_total_must_match_sum() -> None:
    """Full marks on every turn leaf give the expected subtotals and a total of 100.

    Checks the emotion subtotal (10 + 8 + 6 + 6 = 30), the information
    subtotal (8 + 8 + 9 = 25) and the overall total.
    """
    # Reuse the shared full-marks fixture instead of repeating the 16 leaves.
    leaves = _full_turn_leaves_max()
    t = TurnJudgeOutput(**leaves, total_score=100.0, rationale="")

    assert t.emotion_score == 30.0
    assert t.information_score == 25.0
    assert t.total_score == 100.0
|