feat(evaluation): memoir readiness, judge/replay updates, eval web playground
Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.
This commit is contained in:
@@ -5,7 +5,7 @@ Agent / LLM 诊断日志:耗时、输入输出规模、截断预览。
|
||||
- **摘要**(单行:耗时、字符数、operation 名):当 ``LOG_AGENT_VERBOSE=1`` 时通过 ``logger.info`` 输出,
|
||||
便于生产环境在不把全局日志调到 DEBUG 的情况下排查 Agent 性能与路径。
|
||||
|
||||
敏感内容:DEBUG 下会记录用户相关文本截断预览,生产环境请勿长期开启 DEBUG。
|
||||
敏感内容:DEBUG 下会记录用户相关文本;``AGENT_LOG_MAX_CHARS=0`` 时记录全文,生产环境请勿长期开启 DEBUG。
|
||||
|
||||
配置(节选):``AGENT_LOG_OMIT_SYSTEM_MESSAGE_BODY``(默认 true)省略聊天 System 正文,仅打 len+sha12;
|
||||
``AGENT_LOG_JSON_PROMPT_PREFIX_CHARS`` + ``AGENT_LOG_JSON_PROMPT_PREFIX_ONLY_IF_LEN_GT`` 在 DEBUG 下跳过
|
||||
@@ -35,12 +35,12 @@ def agent_summary_enabled() -> bool:
|
||||
|
||||
|
||||
def truncate_for_log(text: str | None, *, max_chars: int | None = None) -> str:
|
||||
"""截断过长文本,避免日志爆量。"""
|
||||
"""截断过长文本,避免日志爆量。max_chars / AGENT_LOG_MAX_CHARS 为 0 表示不截断。"""
|
||||
if text is None:
|
||||
return ""
|
||||
max_c = max_chars if max_chars is not None else settings.agent_log_max_chars
|
||||
s = str(text)
|
||||
if len(s) <= max_c:
|
||||
if max_c <= 0 or len(s) <= max_c:
|
||||
return s
|
||||
return s[:max_c] + f"... [truncated total_len={len(s)}]"
|
||||
|
||||
|
||||
@@ -163,8 +163,8 @@ class Settings(BaseSettings):
|
||||
log_level: str = "INFO"
|
||||
# LOG_AGENT_VERBOSE:为 True 时额外输出 Agent 单行 INFO 摘要(耗时、规模),无需全局 DEBUG
|
||||
log_agent_verbose: bool = False
|
||||
# AGENT_LOG_MAX_CHARS:DEBUG 下记录 prompt/响应预览时的最大字符数
|
||||
agent_log_max_chars: int = Field(default=4096, ge=256, le=100_000)
|
||||
# AGENT_LOG_MAX_CHARS:DEBUG 下记录 prompt/响应预览时的最大字符数;0=不截断(完整输出)
|
||||
agent_log_max_chars: int = Field(default=0, ge=0, le=50_000_000)
|
||||
# AGENT_LOG_OMIT_SYSTEM_MESSAGE_BODY:DEBUG 下访谈/资料聊天日志省略 System 正文(仅 len+sha12)
|
||||
agent_log_omit_system_message_body: bool = True
|
||||
# AGENT_LOG_JSON_PROMPT_PREFIX_CHARS:DEBUG 下 *.prompt 总长超过下项时再跳过前 N 字符后预览(0=不跳过)
|
||||
@@ -225,14 +225,18 @@ class Settings(BaseSettings):
|
||||
story_append_max_canonical_chars: int = Field(default=12000, ge=1000, le=500_000)
|
||||
story_append_max_versions: int = Field(default=20, ge=1, le=500)
|
||||
# StoryRouteAgent:候选 JSON 预算(保守默认,可调大)
|
||||
story_route_candidate_body_max_chars: int = Field(default=1600, ge=200, le=8000)
|
||||
story_route_candidate_body_max_chars: int = Field(default=2200, ge=200, le=8000)
|
||||
story_route_candidate_total_max_chars: int = Field(
|
||||
default=16_000, ge=2000, le=100_000
|
||||
default=20_000, ge=2000, le=100_000
|
||||
)
|
||||
story_route_long_body_head_chars: int = Field(default=700, ge=100, le=4000)
|
||||
story_route_long_body_tail_chars: int = Field(default=700, ge=100, le=4000)
|
||||
story_route_summary_min_chars: int = Field(default=30, ge=0, le=500)
|
||||
story_route_index_preview_chars: int = Field(default=80, ge=20, le=500)
|
||||
story_route_index_preview_chars: int = Field(default=140, ge=20, le=500)
|
||||
# 童年/求学/家庭:本批口述低于该字数且路由为 new 时,倾向续写到默认候选,减少碎篇
|
||||
memoir_story_route_append_guardrail_oral_chars: int = Field(
|
||||
default=1800, ge=0, le=50_000
|
||||
)
|
||||
# Evidence 检索 top_k:大批次 unit 时降低检索量
|
||||
evidence_top_k_default: int = Field(default=10, ge=1, le=50)
|
||||
evidence_top_k_large_batch: int = Field(default=5, ge=1, le=50)
|
||||
@@ -317,11 +321,36 @@ class Settings(BaseSettings):
|
||||
internal_eval_enable_docs: bool = False
|
||||
# 逗号分隔;空则内部 API 不额外限制 Origin(仍可依赖 internal_eval_api_key)
|
||||
internal_eval_cors_origins: str = ""
|
||||
# GLM / 智谱:评审模型(OpenAI 兼容 Chat Completions,与 langchain-openai 一致)
|
||||
# 智谱 GLM-5:评审模型(OpenAI 兼容 Chat Completions,与 langchain-openai 一致)
|
||||
eval_judge_api_key: str = ""
|
||||
eval_judge_base_url: str = "https://open.bigmodel.cn/api/paas/v4"
|
||||
eval_judge_model: str = "glm-4-flash"
|
||||
eval_judge_model: str = "glm-5"
|
||||
eval_judge_temperature: float = 0.3
|
||||
# GLM-5 输入上下文 200K(https://docs.bigmodel.cn)
|
||||
eval_judge_context_window_tokens: int = Field(
|
||||
default=200_000, ge=4096, le=2_000_000
|
||||
)
|
||||
# 预留给完成 tokens(json 输出)及路由误差
|
||||
eval_judge_completion_reserve_tokens: int = Field(
|
||||
default=4096, ge=256, le=131_072
|
||||
)
|
||||
eval_judge_prompt_budget_safety_tokens: int = Field(
|
||||
default=2048, ge=0, le=32_768
|
||||
)
|
||||
# transcript 混合中英文时 token/字 保守估值(估大以免超出真实上下文)
|
||||
eval_judge_approx_tokens_per_char: float = Field(
|
||||
default=1.2, ge=0.3, le=8.0
|
||||
)
|
||||
# 整段/逐轮节选 transcript 最大字符;0=按 eval_judge_context_window_tokens 自动扣 rubric 头
|
||||
eval_judge_max_transcript_chars: int = Field(default=0, ge=0, le=2_000_000)
|
||||
# 双 transcript 对比流:每条对话上限;0=按上下文平分(扣 overhead)
|
||||
eval_judge_max_compare_transcript_chars_each: int = Field(
|
||||
default=0, ge=0, le=2_000_000
|
||||
)
|
||||
# 对比 prompt 固定开销(模板 + 两份评分 JSON)的字符估值,供自动平分使用
|
||||
eval_judge_compare_prompt_overhead_chars: int = Field(
|
||||
default=14_000, ge=500, le=500_000
|
||||
)
|
||||
# 候选对话回放:与生产访谈类似的温度
|
||||
eval_candidate_temperature: float = 0.7
|
||||
# 门禁:受保护 session 合成分数下跌超过该阈值视为回归(0–100 分制)
|
||||
|
||||
@@ -186,7 +186,7 @@ async def get_current_user(
|
||||
|
||||
|
||||
def get_eval_judge_langchain_llm():
|
||||
"""智谱 GLM(OpenAI 兼容)用于评审 JSON;与访谈生产模型分离。"""
|
||||
"""智谱 GLM-5(OpenAI 兼容 Chat Completions)用于评审 JSON;与访谈生产模型分离。"""
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
api_key = (settings.eval_judge_api_key or settings.zhipu_api_key or "").strip()
|
||||
@@ -199,7 +199,7 @@ def get_eval_judge_langchain_llm():
|
||||
return ChatOpenAI(
|
||||
api_key=api_key,
|
||||
base_url=base or "https://open.bigmodel.cn/api/paas/v4",
|
||||
model=settings.eval_judge_model or "glm-4-flash",
|
||||
model=settings.eval_judge_model or "glm-5",
|
||||
temperature=settings.eval_judge_temperature,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user