feat:
1. 建立问题库大纲,对应每个人生阶段槽位 2. 鼓励使用更生活化的交流语言共情与总结 3. 降低评审模型可能发生截断的概率 4. 成稿质量维度强化情感表达和上下文连贯性
This commit is contained in:
@@ -11,8 +11,11 @@ from app.features.evaluation.judge_schemas import ConversationJudgeOutput
|
||||
from app.features.evaluation.judge_service import (
|
||||
EvalJudgeService,
|
||||
_build_memoir_judge_prompt,
|
||||
eval_judge_conversation_transcript_max_chars,
|
||||
conversation_judge_transcript_excerpt,
|
||||
eval_judge_compare_transcript_each_max_chars,
|
||||
eval_judge_conversation_transcript_max_chars,
|
||||
trim_compare_transcript_pair,
|
||||
turn_judge_prior_excerpt,
|
||||
)
|
||||
|
||||
|
||||
@@ -58,7 +61,9 @@ async def test_judge_conversation_result_preserves_validation_error(
|
||||
monkeypatch.setattr("app.features.evaluation.judge_service.allm_json_call", _boom)
|
||||
|
||||
svc = EvalJudgeService(object())
|
||||
result = await svc.judge_conversation_result(full_transcript="[Turn 1]\n用户: hi\nAI: hello")
|
||||
result = await svc.judge_conversation_result(
|
||||
full_transcript="[Turn 1]\n用户: hi\nAI: hello"
|
||||
)
|
||||
|
||||
assert result.output is None
|
||||
assert result.error is not None
|
||||
@@ -91,7 +96,32 @@ def test_eval_judge_transcript_budget_exceeds_legacy_8192(
|
||||
n = eval_judge_conversation_transcript_max_chars()
|
||||
assert n > 90_000
|
||||
each = eval_judge_compare_transcript_each_max_chars()
|
||||
assert each > 40_000
|
||||
assert each > 85_000
|
||||
|
||||
|
||||
def test_conversation_judge_excerpt_appends_boundary_when_clipped() -> None:
    """Check the clipped conversation excerpt keeps its prefix and adds boundary text.

    A 26-character input is passed with ``cap=10``; the result must start
    with the first ten characters and contain both boundary marker phrases.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    excerpt = conversation_judge_transcript_excerpt(alphabet, cap=10)

    # The retained prefix is exactly the first `cap` characters of the input.
    assert excerpt.startswith("abcdefghij")
    # Both marker phrases must appear somewhere in the returned text.
    for marker in ("评审边界", "截断稿"):
        assert marker in excerpt
|
||||
|
||||
|
||||
def test_turn_judge_prior_excerpt_appends_boundary_when_clipped() -> None:
    """Check the clipped prior-turn excerpt keeps its prefix and adds a truncation note.

    A 16-character input is passed with ``cap=8``; the result must start with
    the first eight characters and contain the truncation marker phrase.
    """
    prior_text = "0123456789abcdef"
    excerpt = turn_judge_prior_excerpt(prior_text, cap=8)

    kept_prefix = "01234567"
    truncation_marker = "上文节选已截断"
    assert excerpt.startswith(kept_prefix)
    assert truncation_marker in excerpt
|
||||
|
||||
|
||||
def test_trim_compare_transcript_pair_prefers_trimming_longer_side() -> None:
    """Check that only the longer transcript is shortened to meet the total budget.

    Inputs are 100 and 900 characters with ``total_max_chars=950`` and no
    per-side cap. The expected result leaves the first side at 100 characters,
    cuts the second to 850, and flags only the second as truncated.
    """
    short_side = "a" * 100
    long_side = "b" * 900

    trimmed = trim_compare_transcript_pair(
        short_side,
        long_side,
        total_max_chars=950,
        per_side_max_chars=None,
    )
    first_text, second_text, first_cut, second_cut = trimmed

    # 100 + 850 == 950: the whole reduction comes out of the longer input.
    assert (len(first_text), len(second_text)) == (100, 850)
    assert first_cut is False
    assert second_cut is True
|
||||
|
||||
|
||||
def test_eval_judge_transcript_budget_respects_explicit_cap(
|
||||
@@ -118,9 +148,7 @@ def test_build_memoir_prompt_includes_source_and_reference_evidence() -> None:
|
||||
|
||||
|
||||
def test_build_memoir_prompt_requires_conservative_scoring_without_evidence() -> None:
|
||||
prompt = _build_memoir_judge_prompt(
|
||||
memoir_markdown="# 当前正文\n他后来去了南方。"
|
||||
)
|
||||
prompt = _build_memoir_judge_prompt(memoir_markdown="# 当前正文\n他后来去了南方。")
|
||||
|
||||
assert "无可用局部对话证据" in prompt
|
||||
assert "必须保守打分" in prompt
|
||||
@@ -156,12 +184,14 @@ def test_compare_summary_surpass_gate_and_truncation_flags() -> None:
|
||||
baseline_transcript="A" * 400,
|
||||
replay_transcript="B" * 1200,
|
||||
conv_cap=1000,
|
||||
compare_cap_each=500,
|
||||
compare_cap_total=1000,
|
||||
compare_per_side_cap=None,
|
||||
fixture_filename="golden.md",
|
||||
)
|
||||
assert summary["mode"] == "ab"
|
||||
assert summary["gate"]["status"] in {"parity", "surpass"}
|
||||
assert summary["truncation"]["replay_truncated_for_compare"] is True
|
||||
assert summary["evidence_quality"]["scope"] == "partial"
|
||||
assert "group_deltas" in summary
|
||||
|
||||
|
||||
@@ -181,7 +211,8 @@ def test_compare_summary_flags_repeat_issue_as_regression() -> None:
|
||||
baseline_transcript="[Turn 1]",
|
||||
replay_transcript="[Turn 1]",
|
||||
conv_cap=1000,
|
||||
compare_cap_each=500,
|
||||
compare_cap_total=1000,
|
||||
compare_per_side_cap=None,
|
||||
)
|
||||
assert summary["repeat_issue_detected"] is True
|
||||
assert summary["gate"]["status"] == "regressed"
|
||||
|
||||
Reference in New Issue
Block a user