feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
This commit is contained in:
Kevin
2026-04-10 10:23:43 +08:00
parent b0251e5b26
commit ac49bc7f23
59 changed files with 4773 additions and 696 deletions

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
from app.core.config import settings
from app.features.conversation.models import Segment
from app.features.evaluation.eval_trace_schemas import (
ChapterEvidenceBundle,
@@ -16,9 +17,10 @@ from app.features.memory.models import (
TimelineEvent,
)
# 与 judge_service._MEMOIR_EVIDENCE_MAX 对齐:访谈与结构化证据分预算,避免总长失控
_MEMOIR_TRANSCRIPT_CAP = 12_000
_MEMOIR_STRUCTURED_CAP = 12_000
def _memoir_evidence_char_cap() -> int:
    """Return the character cap applied to memoir judge evidence.

    The value is read from ``Settings.eval_judge_memoir_evidence_max_chars``
    on every call and is never allowed to drop below 1000.
    """
    configured = int(settings.eval_judge_memoir_evidence_max_chars)
    # Floor of 1000 guards against a too-small (or non-positive) setting.
    return configured if configured > 1000 else 1000
def _approx_tokens(chars: int) -> int:
@@ -75,11 +77,12 @@ def build_structured_evidence_text(
facts: list[MemoryFact],
events: list[TimelineEvent],
summaries: list[MemorySummary],
max_chars: int = _MEMOIR_STRUCTURED_CAP,
max_chars: int | None = None,
) -> tuple[str, bool, list[str]]:
"""
结构化记忆证据块;返回 (text, truncated, dropped_section_tags)。
"""
cap = max_chars if max_chars is not None else _memoir_evidence_char_cap()
parts: list[str] = []
dropped: list[str] = []
used = 0
@@ -90,7 +93,7 @@ def build_structured_evidence_text(
block = f"{title}\n{body}".strip()
if not block:
return
if used + len(block) + 2 > max_chars:
if used + len(block) + 2 > cap:
truncated = True
dropped.append(title.strip("【】").split("·")[0].strip())
return
@@ -172,23 +175,22 @@ def format_chapter_for_judge(
events: list[TimelineEvent],
summaries: list[MemorySummary],
) -> FormattedMemoirEvidence:
t_cap = _MEMOIR_TRANSCRIPT_CAP
s_cap = _MEMOIR_STRUCTURED_CAP
ev_cap = _memoir_evidence_char_cap()
dropped: list[str] = []
truncated = False
t_in = transcript.strip()
if len(t_in) > t_cap:
if len(t_in) > ev_cap:
truncated = True
dropped.append("source_transcript_tail")
t_in = t_in[:t_cap] + "\n\n…(原始对话证据已截断)"
t_in = t_in[:ev_cap] + "\n\n…(原始对话证据已截断)"
struct, s_trunc, s_drop = build_structured_evidence_text(
chunks=chunks,
facts=facts,
events=events,
summaries=summaries,
max_chars=s_cap,
max_chars=ev_cap,
)
if s_trunc:
truncated = True
@@ -228,23 +230,22 @@ def format_story_for_judge(
events: list[TimelineEvent],
summaries: list[MemorySummary],
) -> FormattedMemoirEvidence:
t_cap = _MEMOIR_TRANSCRIPT_CAP
s_cap = _MEMOIR_STRUCTURED_CAP
ev_cap = _memoir_evidence_char_cap()
dropped: list[str] = []
truncated = False
t_in = transcript.strip()
if len(t_in) > t_cap:
if len(t_in) > ev_cap:
truncated = True
dropped.append("source_transcript_tail")
t_in = t_in[:t_cap] + "\n\n…(原始对话证据已截断)"
t_in = t_in[:ev_cap] + "\n\n…(原始对话证据已截断)"
struct, s_trunc, s_drop = build_structured_evidence_text(
chunks=chunks,
facts=facts,
events=events,
summaries=summaries,
max_chars=s_cap,
max_chars=ev_cap,
)
if s_trunc:
truncated = True