feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.
- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.
- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.
- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.
- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.
- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
2026-04-10 10:23:43 +08:00
parent b0251e5b26
commit ac49bc7f23
59 changed files with 4773 additions and 696 deletions
--- a/api/app/features/evaluation/eval_trace_format.py
+++ b/api/app/features/evaluation/eval_trace_format.py
@@ -2,6 +2,7 @@

 from __future__ import annotations

+from app.core.config import settings
 from app.features.conversation.models import Segment
 from app.features.evaluation.eval_trace_schemas import (
    ChapterEvidenceBundle,
@@ -16,9 +17,10 @@ from app.features.memory.models import (
    TimelineEvent,
 )

-# 与 judge_service._MEMOIR_EVIDENCE_MAX 对齐：访谈与结构化证据分预算，避免总长失控
-_MEMOIR_TRANSCRIPT_CAP = 12_000
-_MEMOIR_STRUCTURED_CAP = 12_000
+
+def _memoir_evidence_char_cap() -> int:
+    """Return the evidence character cap read from ``Settings.eval_judge_memoir_evidence_max_chars``, floored at 1000."""
+    return max(1000, int(settings.eval_judge_memoir_evidence_max_chars))


 def _approx_tokens(chars: int) -> int:
@@ -75,11 +77,12 @@ def build_structured_evidence_text(
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
-    max_chars: int = _MEMOIR_STRUCTURED_CAP,
+    max_chars: int | None = None,
 ) -> tuple[str, bool, list[str]]:
    """
    结构化记忆证据块；返回 (text, truncated, dropped_section_tags)。
    """
+    cap = max_chars if max_chars is not None else _memoir_evidence_char_cap()
    parts: list[str] = []
    dropped: list[str] = []
    used = 0
@@ -90,7 +93,7 @@ def build_structured_evidence_text(
        block = f"{title}\n{body}".strip()
        if not block:
            return
-        if used + len(block) + 2 > max_chars:
+        if used + len(block) + 2 > cap:
            truncated = True
            dropped.append(title.strip("【】").split("·")[0].strip())
            return
@@ -172,23 +175,22 @@ def format_chapter_for_judge(
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
 ) -> FormattedMemoirEvidence:
-    t_cap = _MEMOIR_TRANSCRIPT_CAP
-    s_cap = _MEMOIR_STRUCTURED_CAP
+    ev_cap = _memoir_evidence_char_cap()
    dropped: list[str] = []
    truncated = False

    t_in = transcript.strip()
-    if len(t_in) > t_cap:
+    if len(t_in) > ev_cap:
        truncated = True
        dropped.append("source_transcript_tail")
-        t_in = t_in[:t_cap] + "\n\n…（原始对话证据已截断）"
+        t_in = t_in[:ev_cap] + "\n\n…（原始对话证据已截断）"

    struct, s_trunc, s_drop = build_structured_evidence_text(
        chunks=chunks,
        facts=facts,
        events=events,
        summaries=summaries,
-        max_chars=s_cap,
+        max_chars=ev_cap,
    )
    if s_trunc:
        truncated = True
@@ -228,23 +230,22 @@ def format_story_for_judge(
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
 ) -> FormattedMemoirEvidence:
-    t_cap = _MEMOIR_TRANSCRIPT_CAP
-    s_cap = _MEMOIR_STRUCTURED_CAP
+    ev_cap = _memoir_evidence_char_cap()
    dropped: list[str] = []
    truncated = False

    t_in = transcript.strip()
-    if len(t_in) > t_cap:
+    if len(t_in) > ev_cap:
        truncated = True
        dropped.append("source_transcript_tail")
-        t_in = t_in[:t_cap] + "\n\n…（原始对话证据已截断）"
+        t_in = t_in[:ev_cap] + "\n\n…（原始对话证据已截断）"

    struct, s_trunc, s_drop = build_structured_evidence_text(
        chunks=chunks,
        facts=facts,
        events=events,
        summaries=summaries,
-        max_chars=s_cap,
+        max_chars=ev_cap,
    )
    if s_trunc:
        truncated = True