1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性
This commit is contained in:
yangshilin
2026-04-09 15:32:35 +08:00
parent 064ad2161d
commit e1341c6d18
49 changed files with 938 additions and 271 deletions

View File

@@ -27,7 +27,7 @@ from app.features.evaluation.eval_trace_service import EvalTraceService
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
from app.features.evaluation.judge_service import (
EvalJudgeService,
eval_judge_compare_transcript_each_max_chars_for_context,
eval_judge_compare_bundle_caps,
eval_judge_conversation_transcript_max_chars_for_context,
)
from app.features.evaluation.schemas import MemoirSectionBaselineOut
@@ -234,6 +234,7 @@ class EvalJudgeManualService:
f"replay_glm5_failed: {replay_result.error or 'unknown error'}"
)
_cmp_total, _cmp_per_side = eval_judge_compare_bundle_caps(judge._ctx_tokens)
bundle: dict[str, Any] = {
"version": 1,
"judged_at": datetime.now(timezone.utc).isoformat(),
@@ -250,9 +251,8 @@ class EvalJudgeManualService:
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_total=_cmp_total,
compare_per_side_cap=_cmp_per_side,
fixture_filename=fn,
),
"compare_markdown": "",
@@ -363,6 +363,7 @@ class EvalJudgeManualService:
acc["options"]["judge_model"] = resolved_model
acc["fixture_filename"] = fn
_sse_cmp_total, _sse_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
persist = True
try:
yield {
@@ -435,9 +436,7 @@ class EvalJudgeManualService:
full_transcript=replay_transcript
)
replay_judge = replay_result.output
acc["replay_judge"] = (
replay_judge.model_dump() if replay_judge else None
)
acc["replay_judge"] = replay_judge.model_dump() if replay_judge else None
acc["compare_summary"] = build_conversation_compare_summary(
baseline_judge=baseline_judge,
replay_judge=replay_judge,
@@ -446,9 +445,8 @@ class EvalJudgeManualService:
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_total=_sse_cmp_total,
compare_per_side_cap=_sse_cmp_per,
fixture_filename=fn,
)
yield {
@@ -532,7 +530,9 @@ class EvalJudgeManualService:
fn = (fixture_filename or "").strip() or None
if not fn:
raise EvaluationBadRequestError("请选择基线 MDfixture_filename后再重试基准分")
raise EvaluationBadRequestError(
"请选择基线 MDfixture_filename后再重试基准分"
)
try:
turns, _ = read_user_export_fixture(fn)
@@ -568,6 +568,7 @@ class EvalJudgeManualService:
judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
if not judge:
raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)
_rt_cmp_total, _rt_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
baseline_result = await judge.judge_conversation_result(
full_transcript=baseline_transcript
)
@@ -590,9 +591,8 @@ class EvalJudgeManualService:
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_total=_rt_cmp_total,
compare_per_side_cap=_rt_cmp_per,
fixture_filename=fn,
),
"compare_markdown": "",
@@ -619,10 +619,7 @@ class EvalJudgeManualService:
sse_event="baseline_turn_judge",
):
idx = row.get("turn_index")
if (
isinstance(idx, (int, float))
and row.get("judge") is not None
):
if isinstance(idx, (int, float)) and row.get("judge") is not None:
acc["baseline_turn_judges"][str(int(idx))] = row["judge"]
acc["compare_markdown"] = ""
@@ -634,9 +631,8 @@ class EvalJudgeManualService:
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_total=_rt_cmp_total,
compare_per_side_cap=_rt_cmp_per,
fixture_filename=fn,
)
async for piece in judge.stream_conversation_compare(
@@ -682,7 +678,10 @@ class EvalJudgeManualService:
trace_svc = EvalTraceService(self._db)
def _chapter_evidence_notes(
lineage_tier: str, evidence_summary: str, truncated: bool, dropped: list[str]
lineage_tier: str,
evidence_summary: str,
truncated: bool,
dropped: list[str],
) -> str:
drops = ",".join(dropped[:12]) if dropped else ""
return (