feat(api): 访谈路径轻量门控、Memoir Phase1 批处理与叙事/记忆管线加固

- 新增 utterance_substance：短时/应答/元话语可跳过记忆检索、阶段 LLM 与资料抽取 LLM；可配置
- 输入归一化：LLM 模式默认仅语音/ASR；配置项写入 .env.example
- Memoir Phase1：可选 batch LLM 一次性抽取+分类（失败回退逐段）；Extraction 空槽位时阶段与 current_stage 对齐，prompt 约束收紧
- 叙事与忠实度：narrative_safety、证据重叠/场合锚点、标题 slots 与履历短语 grounded；fidelity 解析失败 fail-open 可配置
- 章节管线：锁 TTL 上调、锁竞争 Celery 重试、Phase2 immediate singleflight 等；story_pipeline_sync / chapter_compose / memoir_tasks 联动
- Memory：compaction / repo / summarizer / evidence 小修；事实 FTS 未命中是否回退最近事实可配置
- 新增 memoir_pipeline_trace；补充 memoir_reliability 文档与多项回归/门控测试
2026-04-03 10:12:59 +08:00
parent 6b930808a3
commit 07c6478742
49 changed files with 12258 additions and 57 deletions
--- a/api/app/features/memoir/story_pipeline_sync.py
+++ b/api/app/features/memoir/story_pipeline_sync.py
@@ -7,6 +7,7 @@ Celery 用：按批次将 transcript 写入 Story，并标记 Chapter 需物化
 from __future__ import annotations

 import json
+import re
 import time
 import uuid
 from typing import Any
@@ -37,6 +38,12 @@ from app.features.memoir.cover_eligibility import chapter_needs_cover_enqueue
 from app.features.memoir.memoir_images.settings import MemoirImageSettings
 from app.features.memoir.models import Chapter
 from app.features.memoir.narrative_to_markdown import narrative_to_markdown
+from app.features.memoir.narrative_safety import (
+    body_contains_prompt_artifact,
+    evidence_leakage_heuristic,
+    evidence_scene_anchor_leak,
+    strip_evidence_for_overlap_check,
+)
 from app.features.memoir.oral_normalize import (
    apply_oral_rules,
    normalize_oral_for_memoir,
@@ -57,6 +64,16 @@ from app.features.story.sync_write import (

 logger = get_logger(__name__)

+# Multi-character career phrases. If a title contains one, it must also appear verbatim in the hay (body + oral text + slots passed to the title model); otherwise that title segment is dropped or the title falls back to the placeholder.
+_MEMOIR_TITLE_HAY_GROUNDING_PHRASES: tuple[str, ...] = (
+    "晋升旅长",  # "promoted to brigade commander"
+    "晋升为旅长",  # "promoted to be brigade commander"
+    "晋升师长",  # "promoted to division commander"
+    "晋升军长",  # "promoted to army commander"
+    "旅长职务",  # "post of brigade commander"
+    "师长职务",  # "post of division commander"
+)
+
 # summary 章节跨阶段汇总 slots 时的上限（防叙事 prompt 膨胀）
 MAX_SUMMARY_SLOT_KEYS = 80
 MAX_SUMMARY_SLOT_CHARS = 12_000
@@ -127,6 +144,83 @@ def _placeholder_title(chapter_category: str) -> str:
    return CHAPTER_CATEGORIES.get(chapter_category, chapter_category)


+def _title_slots_filtered_for_generation(
+    slot_snippets: dict[str, str], *, md: str, oral_scope: str
+) -> dict[str, str]:
+    """Keep only slots whose text overlaps the body or this batch's oral text, so profile/historical slots are less likely to bleed into the title."""
+    if not settings.memoir_title_slots_require_body_or_oral_match:
+        return dict(slot_snippets)  # filter disabled: every slot passes through as a shallow copy
+    hay = f"{(md or '').strip()}\n{(oral_scope or '').strip()}"
+    if not hay.strip():
+        return {}  # nothing to match against, so no slot qualifies
+    out: dict[str, str] = {}
+    for k, v in (slot_snippets or {}).items():
+        if k == "content_excerpt":
+            continue  # never kept here; _maybe_generate_title adds an excerpt afterwards when the key is absent
+        s = (v or "").strip()
+        if len(s) < 2:
+            continue  # ignore empty and single-character values
+        if s in hay:
+            out[k] = s  # whole value appears verbatim in body/oral text
+            continue
+        prefix = s[: min(12, len(s))]  # otherwise fall back to the first (up to) 12 characters
+        if len(prefix) >= 4 and prefix in hay:  # prefixes shorter than 4 characters are not accepted as a match
+            out[k] = s
+    return out
+
+
+def _title_hay_for_grounding(
+    merged_slots: dict[str, str], md: str, oral_scope: str
+) -> str:
+    """Build the evidence string from the same material the title model sees (used for post-hoc verbatim grounding)."""
+    parts: list[str] = [(md or "").strip(), (oral_scope or "").strip()]
+    for k, v in (merged_slots or {}).items():
+        if k == "content_excerpt":
+            continue  # this key is left out of the hay
+        if (v or "").strip():
+            parts.append(str(v).strip())
+    return "\n".join(p for p in parts if p)  # empty parts are dropped before joining with newlines
+
+
+def _strip_ungrounded_title_segments(
+    title: str,
+    hay: str,
+    *,
+    chapter_category: str,
+) -> str:
+    """
+    Split the title on · / • and drop every segment that contains a career phrase absent from hay; return the placeholder title when no segment survives.
+    """
+    if not settings.memoir_title_hay_grounding_strict_phrases_enabled:
+        return (title or "").strip() or _placeholder_title(chapter_category)  # check disabled: stripped title, or placeholder when empty
+    t = (title or "").strip()
+    h = (hay or "").strip()
+    if not t:
+        return _placeholder_title(chapter_category)
+    segments = [s.strip() for s in re.split(r"\s*[·•]\s*", t) if s.strip()]
+    if not segments:
+        segments = [t]  # title consisted only of separators: treat it as a single segment
+    kept: list[str] = []
+    for seg in segments:
+        bad = any(
+            phrase in seg and phrase not in h  # phrase is used in the title segment but never appears in the hay
+            for phrase in _MEMOIR_TITLE_HAY_GROUNDING_PHRASES
+        )
+        if bad:
+            logger.info(
+                "event=memoir_title_segment_ungrounded segment_preview={} chapter_category={}",
+                seg[:40],  # only the first 40 characters of the segment are logged
+                chapter_category,
+            )
+            continue
+        kept.append(seg)
+    if not kept:
+        return _placeholder_title(chapter_category)
+    if len(kept) == 1:
+        return kept[0]
+    return " · ".join(kept)  # survivors are re-joined with " · " regardless of the separator used in the input
+
+
 def _maybe_generate_title(
    narrative_agent: "NarrativeAgent",
    *,
@@ -136,23 +230,33 @@ def _maybe_generate_title(
    user_profile: str,
    user_birth_year: int | None,
    llm: Any,
+    oral_scope: str = "",
+    narrow_profile_for_title: bool = True,
 ) -> str:
    """Generate a title only when body is long enough; otherwise return placeholder."""
    body_len = len((md or "").strip())
    if body_len < settings.story_title_min_body_chars:
        return _placeholder_title(chapter_category)
    content_excerpt = (md or "").strip()[:300]
-    merged_slots = dict(slot_snippets)
+    merged_slots = _title_slots_filtered_for_generation(
+        slot_snippets, md=md, oral_scope=oral_scope
+    )
    if content_excerpt and "content_excerpt" not in merged_slots:
        merged_slots["content_excerpt"] = content_excerpt
-    return narrative_agent.generate_title(
+    # 标题默认不注入完整档案，仅年龄提示仍可用（来自 birth_year）
+    profile_for_title = "" if narrow_profile_for_title else user_profile
+    raw_title = narrative_agent.generate_title(
        stage=chapter_category,
        emotion="neutral",
        slots=merged_slots,
-        user_profile=user_profile,
+        user_profile=profile_for_title,
        birth_year=user_birth_year,
        llm=llm,
    )
+    hay = _title_hay_for_grounding(merged_slots, md, oral_scope)
+    return _strip_ungrounded_title_segments(
+        raw_title, hay, chapter_category=chapter_category
+    )


 def _route_segment_texts(category_segments: list) -> list[tuple[str, str]]:
@@ -206,11 +310,13 @@ def _gate_narrative_fidelity(
        return narrative_raw, "none"
    agent = FidelityCheckAgent()
    ex = (existing_canonical or "").strip() or None
+    is_append = bool(ex)
    if agent.passes(
        oral_text=oral_text,
        narrative_json=narrative_raw,
        llm=llm,
        existing_canonical_markdown=ex,
+        is_append=is_append,
    ):
        return narrative_raw, "none"
    logger.warning(
@@ -224,6 +330,56 @@ def _gate_narrative_fidelity(
    return _fidelity_fallback_json(o, ex), "fidelity_failed"


+def _apply_narrative_body_safety(
+    md: str,
+    *,
+    oral: str,
+    existing_for_narrative: str,
+    evidence_text: str,
+    chapter_category: str,
+) -> tuple[str, str]:
+    """Return (body, reason): when prompt markers or evidence-excerpt text appear to have leaked into the body, fall back to _coalesce_story_markdown over the oral/existing text; reason is "none" when no check fires."""
+    m = (md or "").strip()
+    ex = (existing_for_narrative or "").strip()
+    o = (oral or "").strip()
+    min_len = int(settings.memoir_narrative_evidence_overlap_min_chars)
+    ev_plain = strip_evidence_for_overlap_check(evidence_text)
+    if m and body_contains_prompt_artifact(m):  # check 1: prompt artifact found in the body
+        logger.warning(
+            "event=narrative_invariant_failed reason=prompt_artifact chapter_category={}",
+            chapter_category,
+        )
+        return _coalesce_story_markdown("", oral, existing_for_narrative), (  # empty body passed so the generated text is discarded
+            "prompt_artifact_in_body"
+        )
+    if (  # check 2: evidence-leak heuristic, only evaluated when evidence text is non-blank
+        m
+        and evidence_text.strip()
+        and evidence_leakage_heuristic(m, ev_plain, o, ex, min_len)
+    ):
+        logger.warning(
+            "event=narrative_invariant_failed reason=evidence_leak chapter_category={}",
+            chapter_category,
+        )
+        return _coalesce_story_markdown("", oral, existing_for_narrative), (
+            "evidence_leak_heuristic"
+        )
+    if (  # check 3: scene-anchor leak, additionally gated by a settings flag
+        settings.memoir_evidence_scene_anchor_check_enabled
+        and m
+        and evidence_text.strip()
+        and evidence_scene_anchor_leak(m, ev_plain, o, ex)
+    ):
+        logger.warning(
+            "event=narrative_invariant_failed reason=evidence_scene_anchor chapter_category={}",
+            chapter_category,
+        )
+        return _coalesce_story_markdown("", oral, existing_for_narrative), (
+            "evidence_scene_anchor"
+        )
+    return m, "none"  # no check fired; note the body is returned stripped
+
+
 def _coalesce_story_markdown(
    md: str,
    oral: str,
@@ -375,6 +531,7 @@ def _run_batch_plan_writes(
    narrative_agent: NarrativeAgent,
    background_voice: str = "default",
    occupation: str = "",
+    memoir_correlation_id: str | None = None,
 ) -> set[str]:
    dispatch_ids: set[str] = set()
    max_chars = int(settings.story_append_max_canonical_chars)
@@ -426,6 +583,7 @@ def _run_batch_plan_writes(
            llm=llm,
            background_voice=background_voice,
            occupation=occupation,
+            fallback_plain_oral=ut_norm,
        )
        json_invalid = False
        s0 = (raw_gen or "").strip()
@@ -456,6 +614,17 @@ def _run_batch_plan_writes(
            oral_unit.strip(),
            existing_for_narrative or "",
        )
+        md, inv_fb = _apply_narrative_body_safety(
+            md,
+            oral=oral_unit,
+            existing_for_narrative=existing_for_narrative or "",
+            evidence_text=evidence_text,
+            chapter_category=chapter_category,
+        )
+        if inv_fb != "none":
+            fallback_type = (
+                inv_fb if fallback_type == "none" else f"{fallback_type}+{inv_fb}"
+            )

        if target_story_id:
            append_story_version_sync(session, str(target_story_id), md)
@@ -474,6 +643,7 @@ def _run_batch_plan_writes(
                user_profile=user_profile,
                user_birth_year=user_birth_year,
                llm=llm,
+                oral_scope=ut_norm,
            )
            st = create_story_with_version_sync(
                session,
@@ -491,10 +661,12 @@ def _run_batch_plan_writes(

        elapsed = time.perf_counter() - t0
        logger.info(
-            "event=story_generated route_type=batch decision_source={} route_decision={} "
+            "event=story_generated memoir_correlation_id={} route_type=batch "
+            "decision_source={} route_decision={} "
            "unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} "
            "fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} "
            "story_id={} seconds={:.3f} oral_normalize_changed={}",
+            memoir_correlation_id or "",
            decision_source,
            unit.decision,
            len(unit.segment_ids),
@@ -525,6 +697,7 @@ def run_story_pipeline_for_category_batch(
    llm: Any,
    background_voice: str = "default",
    occupation: str = "",
+    memoir_correlation_id: str | None = None,
 ) -> tuple[Chapter | None, bool, set[str]]:
    """
    返回 (chapter, needs_cover_enqueue, story_ids_to_dispatch_after_commit)。
@@ -564,6 +737,14 @@ def run_story_pipeline_for_category_batch(
            len(om_norm),
        )
    new_content_input = format_narrative_user_content(oral_for_memoir, evidence_text)
+    logger.info(
+        "event=memoir_story_pipeline_start memoir_correlation_id={} user_id={} "
+        "chapter_category={} segment_count={}",
+        memoir_correlation_id or "",
+        user_id,
+        chapter_category,
+        len(category_segments),
+    )

    stmt_chapter = (
        select(Chapter)
@@ -641,6 +822,7 @@ def run_story_pipeline_for_category_batch(
            narrative_agent=narrative_agent,
            background_voice=background_voice,
            occupation=occupation,
+            memoir_correlation_id=memoir_correlation_id,
        )
    else:
        route = route_agent.decide(
@@ -689,6 +871,7 @@ def run_story_pipeline_for_category_batch(
            llm=llm,
            background_voice=background_voice,
            occupation=occupation,
+            fallback_plain_oral=om_norm,
        )
        json_invalid = False
        s0 = (raw_gen or "").strip()
@@ -720,6 +903,17 @@ def run_story_pipeline_for_category_batch(
            oral_for_memoir.strip(),
            existing_for_narrative or "",
        )
+        md, inv_fb = _apply_narrative_body_safety(
+            md,
+            oral=oral_for_memoir,
+            existing_for_narrative=existing_for_narrative or "",
+            evidence_text=evidence_text,
+            chapter_category=chapter_category,
+        )
+        if inv_fb != "none":
+            fallback_type = (
+                inv_fb if fallback_type == "none" else f"{fallback_type}+{inv_fb}"
+            )

        do_append = target_story_id is not None

@@ -740,6 +934,7 @@ def run_story_pipeline_for_category_batch(
                user_profile=user_profile,
                user_birth_year=user_birth_year,
                llm=llm,
+                oral_scope=om_norm,
            )
            st = create_story_with_version_sync(
                session,
@@ -757,10 +952,12 @@ def run_story_pipeline_for_category_batch(

        elapsed = time.perf_counter() - t0
        logger.info(
-            "event=story_generated route_type=single decision_source={} route_decision={} "
+            "event=story_generated memoir_correlation_id={} route_type=single "
+            "decision_source={} route_decision={} "
            "unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} "
            "fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} "
            "story_id={} seconds={:.3f} oral_normalize_changed={}",
+            memoir_correlation_id or "",
            decision_source,
            route.decision,
            len(category_segments),