feat(eval): memoir A/B chapter judging and eval-web parity with dialogue
- Judge the baseline excerpt and the library chapter separately; add build_memoir_compare_summary for gate, nine-dimension, and leaf deltas.
- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.
- MemoirJudgeOutput: loose score coercion and post-validation clamp; memoir judge prompt caps are read from settings.
- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks, and CSS.
- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.
- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
This commit is contained in:
@@ -42,6 +42,8 @@ class Settings(BaseSettings):
|
||||
)
|
||||
# 非 production 且为 True 时,在 main/internal_main 连接 Redis 后清空 Celery 队列(不 FLUSHDB,不影响会话键)
|
||||
celery_purge_broker_on_startup: bool = False
|
||||
# Memory LLM 富化任务路由队列;可与主 worker 分离(见 README / docker-compose)
|
||||
celery_memory_enrichment_queue: str = "memory_idle"
|
||||
|
||||
# ── Auth / JWT ────────────────────────────────────────────
|
||||
secret_key: str = Field(default_factory=lambda: secrets.token_urlsafe(32))
|
||||
@@ -112,6 +114,8 @@ class Settings(BaseSettings):
|
||||
memoir_phase1_batch_llm_max_tokens: int = Field(default=4096, ge=512, le=32_768)
|
||||
#: Phase1 批处理 LLM:单次请求最多包含的 segment 数(多块合并,避免 completion 顶满截断)
|
||||
memoir_phase1_batch_llm_chunk_size: int = Field(default=24, ge=1, le=500)
|
||||
#: 回忆录流水线细粒度进度 Redis 快照 TTL(memoir_pipeline_run:*)
|
||||
memoir_pipeline_run_ttl_seconds: int = Field(default=172_800, ge=3600, le=2_592_000)
|
||||
# Memoir agents:`invoke_json_object` / `llm_json_call` 的 max_tokens(原硬编码迁至配置)
|
||||
memoir_extraction_max_tokens: int = Field(default=1024, ge=64, le=8192)
|
||||
memoir_classification_max_tokens: int = Field(default=256, ge=32, le=4096)
|
||||
@@ -188,9 +192,11 @@ class Settings(BaseSettings):
|
||||
agent_log_prompt_mode: str = Field(default="preview")
|
||||
# AGENT_LOG_PROMPT_DEDUP:DEBUG 下同一 label 连续相同全文时第二条起跳过(减重复模板噪音)
|
||||
agent_log_prompt_dedup: bool = False
|
||||
# 第三方 stdlib logging(空=自动:LOG_LEVEL 为 DEBUG/TRACE 时 Celery→INFO、httpx/httpcore→WARNING)
|
||||
# 第三方 stdlib logging(空=自动:DEBUG/TRACE 时 Celery→INFO;否则 Celery 与 httpx 默认 WARNING)
|
||||
celery_log_level: str = ""
|
||||
httpx_log_level: str = ""
|
||||
# 非空时额外写入 JSONL(serialize=True),便于 Loki/ELK;与 stderr 彩色控制台并存
|
||||
log_json_file: str = ""
|
||||
|
||||
@field_validator("celery_purge_broker_on_startup", mode="before")
|
||||
@classmethod
|
||||
@@ -405,6 +411,31 @@ class Settings(BaseSettings):
|
||||
eval_judge_compare_prompt_overhead_chars: int = Field(
|
||||
default=14_000, ge=500, le=500_000
|
||||
)
|
||||
# 回忆录评审:章节 LLM 并发上限(仅评审请求;准备阶段仍串行访问 DB)
|
||||
eval_judge_memoir_chapter_concurrency: int = Field(
|
||||
default=4,
|
||||
ge=1,
|
||||
le=32,
|
||||
)
|
||||
# 回忆录评审 prompt 内粗截断(汉字计字符);万字级章节请保持 body ≥ 正文峰值
|
||||
eval_judge_memoir_body_max_chars: int = Field(
|
||||
default=36_000,
|
||||
ge=8_000,
|
||||
le=500_000,
|
||||
description="【当前回忆录正文】注入评审 prompt 前的最大字符",
|
||||
)
|
||||
eval_judge_memoir_evidence_max_chars: int = Field(
|
||||
default=32_000,
|
||||
ge=8_000,
|
||||
le=500_000,
|
||||
description="对话证据 / 结构化证据 / 参考基线各块的最大字符(与 eval_trace_format 对齐)",
|
||||
)
|
||||
# json_object 完成预算;MemoirJudgeOutput 字段多,需预留足量 token
|
||||
eval_judge_memoir_completion_max_tokens: int = Field(
|
||||
default=3072,
|
||||
ge=512,
|
||||
le=16_384,
|
||||
)
|
||||
# 候选对话回放:与生产访谈类似的温度
|
||||
eval_candidate_temperature: float = 0.7
|
||||
# 门禁:受保护 session 合成分数下跌超过该阈值视为回归(0–100 分制)
|
||||
|
||||
Reference in New Issue
Block a user