feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.
2026-04-08 09:38:07 +08:00
parent 99543d04c6
commit 6772e1269c
26 changed files with 1255 additions and 124 deletions
--- a/api/app/features/evaluation/judge_service.py
+++ b/api/app/features/evaluation/judge_service.py
@@ -1,4 +1,4 @@
-"""GLM 评审调用（结构化 JSON）。"""
+"""智谱 GLM-5 评审调用（结构化 JSON）。"""

 from __future__ import annotations

@@ -6,6 +6,7 @@ from collections.abc import AsyncIterator
 from dataclasses import dataclass
 from typing import Any, Generic, TypeVar

+from app.core.config import settings
 from app.core.llm_call import LLMCallError, allm_json_call
 from app.core.logging import get_logger
 from app.features.evaluation.judge_schemas import (
@@ -27,14 +28,49 @@ TJudgeOutput = TypeVar(
 )

 _TURN_MAX = 768
-_CONV_MAX = 8192
 _CONV_JUDGE_JSON_MAX = 2048
+_CONV_HEADER = "【完整对话】（每轮以 `[Turn k]` 开头）\n\n"
 _MEMOIR_MAX = 12000
 _MEMOIR_JSON_MAX = 1536
 _COMPARE_STREAM_MAX = 6144
 _MEMOIR_EVIDENCE_MAX = 12000


+def _eval_judge_prompt_char_pool() -> int:
+    """Return the character budget for a whole judge request, derived conservatively from the judge model's context window."""
+    toks = (
+        settings.eval_judge_context_window_tokens
+        - settings.eval_judge_completion_reserve_tokens
+        - settings.eval_judge_prompt_budget_safety_tokens
+    )
+    toks = max(1, toks)  # clamp so a non-positive remainder still yields at least 1 token
+    return max(1, int(toks / settings.eval_judge_approx_tokens_per_char))  # tokens -> chars; NOTE(review): a ratio of 0 would raise ZeroDivisionError — confirm config validation
+
+
+def eval_judge_conversation_transcript_max_chars() -> int:
+    """Return the maximum character count of the full-conversation transcript used for whole-conversation judging."""
+    if settings.eval_judge_max_transcript_chars > 0:  # a positive setting overrides the derived cap
+        return settings.eval_judge_max_transcript_chars
+    overhead = len(CONV_JUDGE_INSTRUCTIONS) + len(_CONV_HEADER) + 32  # instructions + section header + 32 extra chars (presumably slack for separators — confirm)
+    return max(1, _eval_judge_prompt_char_pool() - overhead)
+
+
+def eval_judge_turn_prior_transcript_max_chars() -> int:
+    """Return the cap for the up-to-previous-turn transcript excerpt in per-turn judging (accounts for the fixed overhead of the user/assistant body sections)."""
+    if settings.eval_judge_max_transcript_chars > 0:  # a positive setting overrides the derived cap
+        return settings.eval_judge_max_transcript_chars
+    static = len(TURN_JUDGE_INSTRUCTIONS) + 8800  # NOTE(review): 8800 presumably reserves room for the capped user/assistant utterances plus their headers — confirm
+    return max(1, _eval_judge_prompt_char_pool() - static)
+
+
+def eval_judge_compare_transcript_each_max_chars() -> int:
+    """Return the per-transcript cap when the A/B transcripts share one prompt (the character budget is split evenly)."""
+    if settings.eval_judge_max_compare_transcript_chars_each > 0:  # a positive setting overrides the derived cap
+        return settings.eval_judge_max_compare_transcript_chars_each
+    pool = _eval_judge_prompt_char_pool() - settings.eval_judge_compare_prompt_overhead_chars
+    return max(1, pool // 2)  # half of the remaining pool for each of the two transcripts
+
+
 @dataclass(slots=True)
 class JudgeCallResult(Generic[TJudgeOutput]):
    output: TJudgeOutput | None
@@ -106,7 +142,7 @@ class EvalJudgeService:
 【本轮位置】完整对话中当前轮次为 Turn {t + 1}（与下方节选及全量 transcript 的 `[Turn ...]` 编号一致）。evidence_refs.turn_index 请使用该编号。

 【截至上一轮的对话节选】（含 `[Turn k]` 标签）
-{prior_transcript[:_CONV_MAX]}
+{prior_transcript[: eval_judge_turn_prior_transcript_max_chars()]}

 【本轮用户】
 {user_utterance[:4000]}
@@ -134,7 +170,7 @@ class EvalJudgeService:
        prompt = f"""{CONV_JUDGE_INSTRUCTIONS}

 【完整对话】（每轮以 `[Turn k]` 开头）
-{full_transcript[:_CONV_MAX]}
+{full_transcript[: eval_judge_conversation_transcript_max_chars()]}
 """
        try:
            out = await allm_json_call(
@@ -168,8 +204,10 @@ class EvalJudgeService:
        if not self._llm:
            yield "[错误] 未配置评审模型 API Key（eval_judge_api_key / zhipu_api_key）"
            return
-        b_tr = (baseline_transcript or "").strip()[:_CONV_MAX]
-        r_tr = (replay_transcript or "").strip()[:_CONV_MAX]
+        cap_each = eval_judge_compare_transcript_each_max_chars()
+        cap_single = eval_judge_conversation_transcript_max_chars()
+        b_tr = (baseline_transcript or "").strip()[:cap_each]
+        r_tr = (replay_transcript or "").strip()[:cap_each]
        b_json = (
            baseline_judge.model_dump_json(ensure_ascii=False)
            if baseline_judge
@@ -200,10 +238,11 @@ class EvalJudgeService:

 笔调简洁、偏执行清单。"""
        elif replay_judge:
+            r_one = (replay_transcript or "").strip()[:cap_single]
            prompt = f"""{COMPARE_CONV_STREAM_HINT}

 【回放/新测 transcript】
-{r_tr}
+{r_one}

 【整体评分 JSON】
 {r_json}