feat(evaluation): memoir readiness, judge/replay updates, eval web playground
Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
"""GLM 评审调用(结构化 JSON)。"""
|
||||
"""智谱 GLM-5 评审调用(结构化 JSON)。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -6,6 +6,7 @@ from collections.abc import AsyncIterator
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Generic, TypeVar
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.llm_call import LLMCallError, allm_json_call
|
||||
from app.core.logging import get_logger
|
||||
from app.features.evaluation.judge_schemas import (
|
||||
@@ -27,14 +28,49 @@ TJudgeOutput = TypeVar(
|
||||
)
|
||||
|
||||
_TURN_MAX = 768
|
||||
_CONV_MAX = 8192
|
||||
_CONV_JUDGE_JSON_MAX = 2048
|
||||
_CONV_HEADER = "【完整对话】(每轮以 `[Turn k]` 开头)\n\n"
|
||||
_MEMOIR_MAX = 12000
|
||||
_MEMOIR_JSON_MAX = 1536
|
||||
_COMPARE_STREAM_MAX = 6144
|
||||
_MEMOIR_EVIDENCE_MAX = 12000
|
||||
|
||||
|
||||
def _eval_judge_prompt_char_pool() -> int:
    """Return the conservative character budget for one whole judge request.

    The budget is derived from the judge model's context window: the
    completion reserve and the prompt safety margin are subtracted from the
    window size, and the remaining tokens are converted to characters using
    the configured tokens-per-character ratio. Both the token count and the
    resulting character count are floored at 1.
    """
    prompt_tokens = max(
        1,
        settings.eval_judge_context_window_tokens
        - settings.eval_judge_completion_reserve_tokens
        - settings.eval_judge_prompt_budget_safety_tokens,
    )
    # NOTE(review): assumes the ratio is a positive number; a value of 0 would
    # raise ZeroDivisionError here -- confirm it is validated in settings.
    char_budget = int(prompt_tokens / settings.eval_judge_approx_tokens_per_char)
    return max(1, char_budget)
|
||||
|
||||
|
||||
def eval_judge_conversation_transcript_max_chars() -> int:
    """Return the max transcript length (chars) for a whole-conversation review.

    A positive ``settings.eval_judge_max_transcript_chars`` is returned
    unchanged. Otherwise the cap is the prompt character pool minus the
    length of the conversation judge instructions, the transcript header and
    32 further characters, floored at 1.
    """
    configured_cap = settings.eval_judge_max_transcript_chars
    if configured_cap > 0:
        return configured_cap
    # The trailing 32 is presumably slack for the newlines around the
    # transcript in the prompt template -- TODO confirm.
    fixed_chars = sum((len(CONV_JUDGE_INSTRUCTIONS), len(_CONV_HEADER), 32))
    return max(1, _eval_judge_prompt_char_pool() - fixed_chars)
|
||||
|
||||
|
||||
def eval_judge_turn_prior_transcript_max_chars() -> int:
    """Return the cap (chars) on the prior-turns excerpt in a per-turn review.

    A positive ``settings.eval_judge_max_transcript_chars`` is returned
    unchanged. Otherwise the cap is the prompt character pool minus the
    length of the turn judge instructions and a fixed 8800-character
    allowance, floored at 1.
    """
    configured_cap = settings.eval_judge_max_transcript_chars
    if configured_cap > 0:
        return configured_cap
    # NOTE(review): 8800 presumably reserves room for the current turn's user
    # and assistant text plus the section headers -- confirm against the
    # per-turn prompt template.
    reserved_chars = 8800 + len(TURN_JUDGE_INSTRUCTIONS)
    remaining = _eval_judge_prompt_char_pool() - reserved_chars
    return remaining if remaining > 1 else 1
|
||||
|
||||
|
||||
def eval_judge_compare_transcript_each_max_chars() -> int:
    """Return the per-transcript cap (chars) when A and B share one prompt.

    A positive ``settings.eval_judge_max_compare_transcript_chars_each`` is
    returned unchanged. Otherwise the prompt character pool, less the
    configured compare-prompt overhead, is split evenly between the two
    transcripts (integer division), floored at 1.
    """
    per_side_override = settings.eval_judge_max_compare_transcript_chars_each
    if per_side_override > 0:
        return per_side_override
    shared_chars = (
        _eval_judge_prompt_char_pool()
        - settings.eval_judge_compare_prompt_overhead_chars
    )
    half = shared_chars // 2
    return max(1, half)
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class JudgeCallResult(Generic[TJudgeOutput]):
|
||||
output: TJudgeOutput | None
|
||||
@@ -106,7 +142,7 @@ class EvalJudgeService:
|
||||
【本轮位置】完整对话中当前轮次为 Turn {t + 1}(与下方节选及全量 transcript 的 `[Turn ...]` 编号一致)。evidence_refs.turn_index 请使用该编号。
|
||||
|
||||
【截至上一轮的对话节选】(含 `[Turn k]` 标签)
|
||||
{prior_transcript[:_CONV_MAX]}
|
||||
{prior_transcript[: eval_judge_turn_prior_transcript_max_chars()]}
|
||||
|
||||
【本轮用户】
|
||||
{user_utterance[:4000]}
|
||||
@@ -134,7 +170,7 @@ class EvalJudgeService:
|
||||
prompt = f"""{CONV_JUDGE_INSTRUCTIONS}
|
||||
|
||||
【完整对话】(每轮以 `[Turn k]` 开头)
|
||||
{full_transcript[:_CONV_MAX]}
|
||||
{full_transcript[: eval_judge_conversation_transcript_max_chars()]}
|
||||
"""
|
||||
try:
|
||||
out = await allm_json_call(
|
||||
@@ -168,8 +204,10 @@ class EvalJudgeService:
|
||||
if not self._llm:
|
||||
yield "[错误] 未配置评审模型 API Key(eval_judge_api_key / zhipu_api_key)"
|
||||
return
|
||||
b_tr = (baseline_transcript or "").strip()[:_CONV_MAX]
|
||||
r_tr = (replay_transcript or "").strip()[:_CONV_MAX]
|
||||
cap_each = eval_judge_compare_transcript_each_max_chars()
|
||||
cap_single = eval_judge_conversation_transcript_max_chars()
|
||||
b_tr = (baseline_transcript or "").strip()[:cap_each]
|
||||
r_tr = (replay_transcript or "").strip()[:cap_each]
|
||||
b_json = (
|
||||
baseline_judge.model_dump_json(ensure_ascii=False)
|
||||
if baseline_judge
|
||||
@@ -200,10 +238,11 @@ class EvalJudgeService:
|
||||
|
||||
笔调简洁、偏执行清单。"""
|
||||
elif replay_judge:
|
||||
r_one = (replay_transcript or "").strip()[:cap_single]
|
||||
prompt = f"""{COMPARE_CONV_STREAM_HINT}
|
||||
|
||||
【回放/新测 transcript】
|
||||
{r_tr}
|
||||
{r_one}
|
||||
|
||||
【整体评分 JSON】
|
||||
{r_json}
|
||||
|
||||
Reference in New Issue
Block a user