feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
This commit is contained in:
Kevin
2026-04-10 10:23:43 +08:00
parent b0251e5b26
commit ac49bc7f23
59 changed files with 4773 additions and 696 deletions

View File

@@ -42,6 +42,8 @@ class Settings(BaseSettings):
)
# 非 production 且为 True 时,在 main/internal_main 连接 Redis 后清空 Celery 队列(不 FLUSHDB,不影响会话键)
celery_purge_broker_on_startup: bool = False
# Memory LLM 富化任务路由队列;可与主 worker 分离(见 README / docker-compose)
celery_memory_enrichment_queue: str = "memory_idle"
# ── Auth / JWT ────────────────────────────────────────────
secret_key: str = Field(default_factory=lambda: secrets.token_urlsafe(32))
@@ -112,6 +114,8 @@ class Settings(BaseSettings):
memoir_phase1_batch_llm_max_tokens: int = Field(default=4096, ge=512, le=32_768)
#: Phase1 批处理 LLM:单次请求最多包含的 segment 数(多块合并,避免 completion 顶满截断)
memoir_phase1_batch_llm_chunk_size: int = Field(default=24, ge=1, le=500)
#: 回忆录流水线细粒度进度 Redis 快照 TTL(memoir_pipeline_run:*)
memoir_pipeline_run_ttl_seconds: int = Field(default=172_800, ge=3600, le=2_592_000)
# Memoir agents:`invoke_json_object` / `llm_json_call` 的 max_tokens(原硬编码,迁至配置)
memoir_extraction_max_tokens: int = Field(default=1024, ge=64, le=8192)
memoir_classification_max_tokens: int = Field(default=256, ge=32, le=4096)
@@ -188,9 +192,11 @@ class Settings(BaseSettings):
agent_log_prompt_mode: str = Field(default="preview")
# AGENT_LOG_PROMPT_DEDUP:DEBUG 下同一 label 连续相同全文时第二条起跳过(减重复模板噪音)
agent_log_prompt_dedup: bool = False
# 第三方 stdlib logging:空=自动:LOG_LEVEL 为 DEBUG/TRACE 时 Celery→INFO、httpx/httpcore→WARNING
# 第三方 stdlib logging:空=自动;DEBUG/TRACE 时 Celery→INFO;否则 Celery 与 httpx 默认 WARNING
celery_log_level: str = ""
httpx_log_level: str = ""
# 非空时额外写入 JSONL(serialize=True),便于 Loki/ELK;与 stderr 彩色控制台并存
log_json_file: str = ""
@field_validator("celery_purge_broker_on_startup", mode="before")
@classmethod
@@ -405,6 +411,31 @@ class Settings(BaseSettings):
eval_judge_compare_prompt_overhead_chars: int = Field(
default=14_000, ge=500, le=500_000
)
# 回忆录评审:章节 LLM 并发上限(仅评审请求;准备阶段仍串行访问 DB)
eval_judge_memoir_chapter_concurrency: int = Field(
default=4,
ge=1,
le=32,
)
# 回忆录评审 prompt 内粗截断(汉字计字符);万字级章节请保持 body ≥ 正文峰值
eval_judge_memoir_body_max_chars: int = Field(
default=36_000,
ge=8_000,
le=500_000,
description="【当前回忆录正文】注入评审 prompt 前的最大字符",
)
eval_judge_memoir_evidence_max_chars: int = Field(
default=32_000,
ge=8_000,
le=500_000,
description="对话证据 / 结构化证据 / 参考基线各块的最大字符(与 eval_trace_format 对齐)",
)
# json_object 完成预算:MemoirJudgeOutput 字段多,需预留足量 token
eval_judge_memoir_completion_max_tokens: int = Field(
default=3072,
ge=512,
le=16_384,
)
# 候选对话回放:与生产访谈类似的温度
eval_candidate_temperature: float = 0.7
# 门禁:受保护 session 合成份数下跌超过该阈值视为回归(0–100 分制)