feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.
- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.
- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.
- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.
- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.
- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
2026-04-10 10:23:43 +08:00
parent b0251e5b26
commit ac49bc7f23
59 changed files with 4773 additions and 696 deletions
--- a/api/app/core/config.py
+++ b/api/app/core/config.py
@@ -42,6 +42,8 @@ class Settings(BaseSettings):
    )
    # 非 production 且为 True 时，在 main/internal_main 连接 Redis 后清空 Celery 队列（不 FLUSHDB，不影响会话键）
    celery_purge_broker_on_startup: bool = False
+    # Memory LLM 富化任务路由队列；可与主 worker 分离（见 README / docker-compose）
+    celery_memory_enrichment_queue: str = "memory_idle"

    # ── Auth / JWT ────────────────────────────────────────────
    secret_key: str = Field(default_factory=lambda: secrets.token_urlsafe(32))
@@ -112,6 +114,8 @@ class Settings(BaseSettings):
    memoir_phase1_batch_llm_max_tokens: int = Field(default=4096, ge=512, le=32_768)
    #: Phase1 批处理 LLM：单次请求最多包含的 segment 数（多块合并，避免 completion 顶满截断）
    memoir_phase1_batch_llm_chunk_size: int = Field(default=24, ge=1, le=500)
+    #: 回忆录流水线细粒度进度 Redis 快照 TTL（memoir_pipeline_run:*）
+    memoir_pipeline_run_ttl_seconds: int = Field(default=172_800, ge=3600, le=2_592_000)
    # Memoir agents：`invoke_json_object` / `llm_json_call` 的 max_tokens（原硬编码迁至配置）
    memoir_extraction_max_tokens: int = Field(default=1024, ge=64, le=8192)
    memoir_classification_max_tokens: int = Field(default=256, ge=32, le=4096)
@@ -188,9 +192,11 @@ class Settings(BaseSettings):
    agent_log_prompt_mode: str = Field(default="preview")
    # AGENT_LOG_PROMPT_DEDUP：DEBUG 下同一 label 连续相同全文时第二条起跳过（减重复模板噪音）
    agent_log_prompt_dedup: bool = False
-    # 第三方 stdlib logging（空=自动：LOG_LEVEL 为 DEBUG/TRACE 时 Celery→INFO、httpx/httpcore→WARNING）
+    # 第三方 stdlib logging（空=自动：DEBUG/TRACE 时 Celery→INFO；否则 Celery 与 httpx 默认 WARNING）
    celery_log_level: str = ""
    httpx_log_level: str = ""
+    # 非空时额外写入 JSONL（serialize=True），便于 Loki/ELK；与 stderr 彩色控制台并存
+    log_json_file: str = ""

    @field_validator("celery_purge_broker_on_startup", mode="before")
    @classmethod
@@ -405,6 +411,31 @@ class Settings(BaseSettings):
    eval_judge_compare_prompt_overhead_chars: int = Field(
        default=14_000, ge=500, le=500_000
    )
+    # 回忆录评审：章节 LLM 并发上限（仅评审请求；准备阶段仍串行访问 DB）
+    eval_judge_memoir_chapter_concurrency: int = Field(
+        default=4,
+        ge=1,
+        le=32,
+    )
+    # 回忆录评审 prompt 内粗截断（汉字计字符）；万字级章节请保持 body ≥ 正文峰值
+    eval_judge_memoir_body_max_chars: int = Field(
+        default=36_000,
+        ge=8_000,
+        le=500_000,
+        description="【当前回忆录正文】注入评审 prompt 前的最大字符",
+    )
+    eval_judge_memoir_evidence_max_chars: int = Field(
+        default=32_000,
+        ge=8_000,
+        le=500_000,
+        description="对话证据 / 结构化证据 / 参考基线各块的最大字符（与 eval_trace_format 对齐）",
+    )
+    # json_object 完成预算；MemoirJudgeOutput 字段多，需预留足量 token
+    eval_judge_memoir_completion_max_tokens: int = Field(
+        default=3072,
+        ge=512,
+        le=16_384,
+    )
    # 候选对话回放：与生产访谈类似的温度
    eval_candidate_temperature: float = 0.7
    # 门禁：受保护 session 合成分数下跌超过该阈值视为回归（0–100 分制）