feat:
1. 建立问题库大纲,对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言进行共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性
This commit is contained in:
@@ -27,7 +27,7 @@ from app.features.evaluation.eval_trace_service import EvalTraceService
|
||||
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
|
||||
from app.features.evaluation.judge_service import (
|
||||
EvalJudgeService,
|
||||
eval_judge_compare_transcript_each_max_chars_for_context,
|
||||
eval_judge_compare_bundle_caps,
|
||||
eval_judge_conversation_transcript_max_chars_for_context,
|
||||
)
|
||||
from app.features.evaluation.schemas import MemoirSectionBaselineOut
|
||||
@@ -234,6 +234,7 @@ class EvalJudgeManualService:
|
||||
f"replay_glm5_failed: {replay_result.error or 'unknown error'}"
|
||||
)
|
||||
|
||||
_cmp_total, _cmp_per_side = eval_judge_compare_bundle_caps(judge._ctx_tokens)
|
||||
bundle: dict[str, Any] = {
|
||||
"version": 1,
|
||||
"judged_at": datetime.now(timezone.utc).isoformat(),
|
||||
@@ -250,9 +251,8 @@ class EvalJudgeManualService:
|
||||
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
|
||||
judge._ctx_tokens
|
||||
),
|
||||
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
|
||||
judge._ctx_tokens
|
||||
),
|
||||
compare_cap_total=_cmp_total,
|
||||
compare_per_side_cap=_cmp_per_side,
|
||||
fixture_filename=fn,
|
||||
),
|
||||
"compare_markdown": "",
|
||||
@@ -363,6 +363,7 @@ class EvalJudgeManualService:
|
||||
|
||||
acc["options"]["judge_model"] = resolved_model
|
||||
acc["fixture_filename"] = fn
|
||||
_sse_cmp_total, _sse_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
|
||||
persist = True
|
||||
try:
|
||||
yield {
|
||||
@@ -435,9 +436,7 @@ class EvalJudgeManualService:
|
||||
full_transcript=replay_transcript
|
||||
)
|
||||
replay_judge = replay_result.output
|
||||
acc["replay_judge"] = (
|
||||
replay_judge.model_dump() if replay_judge else None
|
||||
)
|
||||
acc["replay_judge"] = replay_judge.model_dump() if replay_judge else None
|
||||
acc["compare_summary"] = build_conversation_compare_summary(
|
||||
baseline_judge=baseline_judge,
|
||||
replay_judge=replay_judge,
|
||||
@@ -446,9 +445,8 @@ class EvalJudgeManualService:
|
||||
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
|
||||
judge._ctx_tokens
|
||||
),
|
||||
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
|
||||
judge._ctx_tokens
|
||||
),
|
||||
compare_cap_total=_sse_cmp_total,
|
||||
compare_per_side_cap=_sse_cmp_per,
|
||||
fixture_filename=fn,
|
||||
)
|
||||
yield {
|
||||
@@ -532,7 +530,9 @@ class EvalJudgeManualService:
|
||||
|
||||
fn = (fixture_filename or "").strip() or None
|
||||
if not fn:
|
||||
raise EvaluationBadRequestError("请选择基线 MD(fixture_filename)后再重试基准分")
|
||||
raise EvaluationBadRequestError(
|
||||
"请选择基线 MD(fixture_filename)后再重试基准分"
|
||||
)
|
||||
|
||||
try:
|
||||
turns, _ = read_user_export_fixture(fn)
|
||||
@@ -568,6 +568,7 @@ class EvalJudgeManualService:
|
||||
judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
|
||||
if not judge:
|
||||
raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)
|
||||
_rt_cmp_total, _rt_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
|
||||
baseline_result = await judge.judge_conversation_result(
|
||||
full_transcript=baseline_transcript
|
||||
)
|
||||
@@ -590,9 +591,8 @@ class EvalJudgeManualService:
|
||||
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
|
||||
judge._ctx_tokens
|
||||
),
|
||||
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
|
||||
judge._ctx_tokens
|
||||
),
|
||||
compare_cap_total=_rt_cmp_total,
|
||||
compare_per_side_cap=_rt_cmp_per,
|
||||
fixture_filename=fn,
|
||||
),
|
||||
"compare_markdown": "",
|
||||
@@ -619,10 +619,7 @@ class EvalJudgeManualService:
|
||||
sse_event="baseline_turn_judge",
|
||||
):
|
||||
idx = row.get("turn_index")
|
||||
if (
|
||||
isinstance(idx, (int, float))
|
||||
and row.get("judge") is not None
|
||||
):
|
||||
if isinstance(idx, (int, float)) and row.get("judge") is not None:
|
||||
acc["baseline_turn_judges"][str(int(idx))] = row["judge"]
|
||||
|
||||
acc["compare_markdown"] = ""
|
||||
@@ -634,9 +631,8 @@ class EvalJudgeManualService:
|
||||
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
|
||||
judge._ctx_tokens
|
||||
),
|
||||
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
|
||||
judge._ctx_tokens
|
||||
),
|
||||
compare_cap_total=_rt_cmp_total,
|
||||
compare_per_side_cap=_rt_cmp_per,
|
||||
fixture_filename=fn,
|
||||
)
|
||||
async for piece in judge.stream_conversation_compare(
|
||||
@@ -682,7 +678,10 @@ class EvalJudgeManualService:
|
||||
trace_svc = EvalTraceService(self._db)
|
||||
|
||||
def _chapter_evidence_notes(
|
||||
lineage_tier: str, evidence_summary: str, truncated: bool, dropped: list[str]
|
||||
lineage_tier: str,
|
||||
evidence_summary: str,
|
||||
truncated: bool,
|
||||
dropped: list[str],
|
||||
) -> str:
|
||||
drops = ",".join(dropped[:12]) if dropped else ""
|
||||
return (
|
||||
|
||||
Reference in New Issue
Block a user