feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.
This commit is contained in:
Kevin
2026-04-08 09:38:07 +08:00
parent 99543d04c6
commit 6772e1269c
26 changed files with 1255 additions and 124 deletions

View File

@@ -5,7 +5,7 @@ Agent / LLM 诊断日志:耗时、输入输出规模、截断预览。
- **摘要**(单行耗时、字符数、operation 名):当 ``LOG_AGENT_VERBOSE=1`` 时通过 ``logger.info`` 输出,
便于生产环境在不把全局日志调到 DEBUG 的情况下排查 Agent 性能与路径。
敏感内容:DEBUG 下会记录用户相关文本截断预览,生产环境请勿长期开启 DEBUG。
敏感内容:DEBUG 下会记录用户相关文本(``AGENT_LOG_MAX_CHARS=0`` 时记录全文),生产环境请勿长期开启 DEBUG。
配置(节选):``AGENT_LOG_OMIT_SYSTEM_MESSAGE_BODY``(默认 true)省略聊天 System 正文,仅打 len+sha12;
``AGENT_LOG_JSON_PROMPT_PREFIX_CHARS`` + ``AGENT_LOG_JSON_PROMPT_PREFIX_ONLY_IF_LEN_GT`` 在 DEBUG 下跳过
@@ -35,12 +35,12 @@ def agent_summary_enabled() -> bool:
def truncate_for_log(text: str | None, *, max_chars: int | None = None) -> str:
"""截断过长文本,避免日志爆量。"""
"""截断过长文本,避免日志爆量。max_chars / AGENT_LOG_MAX_CHARS 为 0 表示不截断。"""
if text is None:
return ""
max_c = max_chars if max_chars is not None else settings.agent_log_max_chars
s = str(text)
if len(s) <= max_c:
if max_c <= 0 or len(s) <= max_c:
return s
return s[:max_c] + f"... [truncated total_len={len(s)}]"

View File

@@ -163,8 +163,8 @@ class Settings(BaseSettings):
log_level: str = "INFO"
# LOG_AGENT_VERBOSE:为 True 时额外输出 Agent 单行 INFO 摘要(耗时、规模),无需全局 DEBUG
log_agent_verbose: bool = False
# AGENT_LOG_MAX_CHARS:DEBUG 下记录 prompt/响应预览时的最大字符数
agent_log_max_chars: int = Field(default=4096, ge=256, le=100_000)
# AGENT_LOG_MAX_CHARS:DEBUG 下记录 prompt/响应预览时的最大字符数;0=不截断(完整输出)
agent_log_max_chars: int = Field(default=0, ge=0, le=50_000_000)
# AGENT_LOG_OMIT_SYSTEM_MESSAGE_BODY:DEBUG 下访谈/资料聊天日志省略 System 正文(仅 len+sha12)
agent_log_omit_system_message_body: bool = True
# AGENT_LOG_JSON_PROMPT_PREFIX_CHARS:DEBUG 下 *.prompt 总长超过下项时再跳过前 N 字符后预览(0=不跳过)
@@ -225,14 +225,18 @@ class Settings(BaseSettings):
story_append_max_canonical_chars: int = Field(default=12000, ge=1000, le=500_000)
story_append_max_versions: int = Field(default=20, ge=1, le=500)
# StoryRouteAgent:候选 JSON 预算(保守默认,可调大)
story_route_candidate_body_max_chars: int = Field(default=1600, ge=200, le=8000)
story_route_candidate_body_max_chars: int = Field(default=2200, ge=200, le=8000)
story_route_candidate_total_max_chars: int = Field(
default=16_000, ge=2000, le=100_000
default=20_000, ge=2000, le=100_000
)
story_route_long_body_head_chars: int = Field(default=700, ge=100, le=4000)
story_route_long_body_tail_chars: int = Field(default=700, ge=100, le=4000)
story_route_summary_min_chars: int = Field(default=30, ge=0, le=500)
story_route_index_preview_chars: int = Field(default=80, ge=20, le=500)
story_route_index_preview_chars: int = Field(default=140, ge=20, le=500)
# 童年/求学/家庭:本批口述低于该字数且路由为 new 时,倾向续写到默认候选,减少碎篇
memoir_story_route_append_guardrail_oral_chars: int = Field(
default=1800, ge=0, le=50_000
)
# Evidence 检索 top_k:大批次 unit 时降低检索量
evidence_top_k_default: int = Field(default=10, ge=1, le=50)
evidence_top_k_large_batch: int = Field(default=5, ge=1, le=50)
@@ -317,11 +321,36 @@ class Settings(BaseSettings):
internal_eval_enable_docs: bool = False
# 逗号分隔;空则内部 API 不额外限制 Origin(仍可依赖 internal_eval_api_key)
internal_eval_cors_origins: str = ""
# GLM / 智谱评审模型(OpenAI 兼容 Chat Completions,与 langchain-openai 一致)
# 智谱 GLM-5 评审模型(OpenAI 兼容 Chat Completions,与 langchain-openai 一致)
eval_judge_api_key: str = ""
eval_judge_base_url: str = "https://open.bigmodel.cn/api/paas/v4"
eval_judge_model: str = "glm-4-flash"
eval_judge_model: str = "glm-5"
eval_judge_temperature: float = 0.3
# GLM-5 输入上下文 200K(https://docs.bigmodel.cn)
eval_judge_context_window_tokens: int = Field(
default=200_000, ge=4096, le=2_000_000
)
# 预留给完成 tokens(json 输出)及路由误差
eval_judge_completion_reserve_tokens: int = Field(
default=4096, ge=256, le=131_072
)
eval_judge_prompt_budget_safety_tokens: int = Field(
default=2048, ge=0, le=32_768
)
# transcript 混合中英文时 token/字 保守估值(估大以免超出真实上下文)
eval_judge_approx_tokens_per_char: float = Field(
default=1.2, ge=0.3, le=8.0
)
# 整段/逐轮节选 transcript 最大字符;0=按 eval_judge_context_window_tokens 自动扣 rubric 头
eval_judge_max_transcript_chars: int = Field(default=0, ge=0, le=2_000_000)
# 双 transcript 对比流每条对话上限;0=按上下文平分(扣 overhead)
eval_judge_max_compare_transcript_chars_each: int = Field(
default=0, ge=0, le=2_000_000
)
# 对比 prompt 固定开销(模板 + 两份评分 JSON)的字符估值,供自动平分使用
eval_judge_compare_prompt_overhead_chars: int = Field(
default=14_000, ge=500, le=500_000
)
# 候选对话回放:与生产访谈类似的温度
eval_candidate_temperature: float = 0.7
# 门禁:受保护 session 合成分数下跌超过该阈值视为回归(0–100 分制)

View File

@@ -186,7 +186,7 @@ async def get_current_user(
def get_eval_judge_langchain_llm():
"""智谱 GLM(OpenAI 兼容)用于评审 JSON,与访谈生产模型分离。"""
"""智谱 GLM-5(OpenAI 兼容 Chat Completions)用于评审 JSON,与访谈生产模型分离。"""
from langchain_openai import ChatOpenAI
api_key = (settings.eval_judge_api_key or settings.zhipu_api_key or "").strip()
@@ -199,7 +199,7 @@ def get_eval_judge_langchain_llm():
return ChatOpenAI(
api_key=api_key,
base_url=base or "https://open.bigmodel.cn/api/paas/v4",
model=settings.eval_judge_model or "glm-4-flash",
model=settings.eval_judge_model or "glm-5",
temperature=settings.eval_judge_temperature,
)