feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.
- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.
- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.
- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.
- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.
- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
2026-04-10 10:23:43 +08:00
parent b0251e5b26
commit ac49bc7f23
59 changed files with 4773 additions and 696 deletions
--- a/api/app/agents/memoir/batch_phase1_prep.py
+++ b/api/app/agents/memoir/batch_phase1_prep.py
@@ -6,7 +6,7 @@ from __future__ import annotations

 import math
 from dataclasses import dataclass
-from typing import Any, Dict, List
+from typing import Any, Callable, Dict, List

 from app.agents.memoir.prompts import get_batch_memoir_phase1_prep_prompt
 from app.agents.memoir.schemas import BatchPhase1LLMOutput
@@ -135,7 +135,7 @@ def _run_batch_phase1_prep_chunk_with_bisect(
        if merged.keys() != expected:
            raise ValueError(
                "batch phase1 chunked bisect merge: segment ids do not match input"
-            )
+            ) from None
        return merged


@@ -145,6 +145,7 @@ def run_batch_phase1_prep_chunked(
    llm: Any,
    *,
    chunk_size: int,
+    on_chunk: Callable[[int, int], None] | None = None,
 ) -> Dict[str, BatchPhase1SegmentRow]:
    """
    将 segments 按 chunk_size 切片多次调用 Phase1 批处理 LLM，合并 by_id。
@@ -161,13 +162,16 @@ def run_batch_phase1_prep_chunked(
        chunk_idx = i // chunk_size + 1
        sub = segments[i : i + chunk_size]
        logger.info(
-            "event=batch_phase1_chunk chunk_idx={}/{} segment_count={} batch_path=chunked",
+            "event=batch_phase1_chunk chunk_idx={}/{} segment_count={} batch_path=chunked "
+            "msg=Phase1 批处理分块调用",
            chunk_idx,
            total_chunks,
            len(sub),
        )
        part = _run_batch_phase1_prep_chunk_with_bisect(sub, state, llm)
        merged.update(part)
+        if on_chunk is not None:
+            on_chunk(chunk_idx, total_chunks)
    expected = {str(s.id) for s in segments}
    if merged.keys() != expected:
        missing = expected - merged.keys()