feat(api): 访谈人格/回复长度策略、口述归一、背景语气与输入净稿全链路

Chat 访谈
- 新增 persona 系统（default / warm_listener / curious_guide）与 background_voice 语气层
- 回复长度由 compute_reply_plan 统一决策（brief / standard / expanded），融合信息密度启发式
- 输入净稿（input_normalize）：编排层可选 rules/llm 归一用户口语后再喂模型与记忆检索
- 记忆证据注入：按用户话检索 memory evidence 并注入 prompt

Memoir 回忆录
- 口述归一（oral_normalize）：segment 原文保留，story 管线取派生净稿作叙事输入
- segment 入队批次门闸：累计字数 + 最长等待秒数，减少零碎提交
- fidelity_check / prompts / narrative_agent 微调
- Alembic 0005：清理跨章节 story 外键

Infra
- Dockerfile 加入 ffmpeg
- pyproject.toml 新增依赖并同步 uv.lock
- .env.example / .env.production 补全新配置项

Tests
- 新增 test_background_voice、test_chat_input_normalize、test_experience_regressions
- 扩展 test_interview_prompts、test_interview_reply_length、test_story_route_oral_invariant

Made-with: Cursor
2026-03-31 23:55:26 +08:00
parent 42ae2a5e91
commit 69a673e6c6
44 changed files with 2998 additions and 259 deletions
--- a/api/app/features/memoir/story_pipeline_sync.py
+++ b/api/app/features/memoir/story_pipeline_sync.py
@@ -32,6 +32,10 @@ from app.features.memoir.cover_eligibility import chapter_needs_cover_enqueue
 from app.features.memoir.memoir_images.settings import MemoirImageSettings
 from app.features.memoir.models import Chapter
 from app.features.memoir.narrative_to_markdown import narrative_to_markdown
+from app.features.memoir.oral_normalize import (
+    apply_oral_normalization_rules,
+    normalize_oral_for_memoir,
+)
 from app.features.memoir.repo import (
    mark_chapter_dirty_sync,
    reorder_chapter_story_links_by_life_order_sync,
@@ -49,6 +53,23 @@ from app.features.story.sync_write import (
 logger = get_logger(__name__)


+def _route_segment_texts(category_segments: list) -> list[tuple[str, str]]:
+    """Per-segment (id, text) pairs for plan_batch routing; rules only, avoiding N LLM calls."""
+    out: list[tuple[str, str]] = []
+    for seg in category_segments:
+        raw = seg.user_input_text or ""  # a missing/empty text is treated as ""
+        if (
+            settings.memoir_oral_normalize_enabled
+            and (settings.memoir_oral_normalize_mode or "rules").strip().lower()
+            != "off"  # a falsy mode is read as "rules"; only "off" disables normalization
+        ):
+            t = apply_oral_normalization_rules(raw)  # rule-based pass only, no LLM here
+        else:
+            t = raw  # normalization disabled: pass the text through unchanged
+        out.append((str(seg.id), t))  # segment ids are stringified for the router
+    return out
+
+
 def _fidelity_fallback_json(oral: str, existing_canonical: str | None) -> str:
    """忠实度未通过时的安全回退：续写场景保留旧文 + 本段口述，避免只剩一句。"""
    o = (oral or "").strip()[:15000]
@@ -102,7 +123,7 @@ def _gate_narrative_fidelity(


 def _should_fallback_to_transcript(md: str, oral: str) -> bool:
-    """模型输出相对口述明显过短时回退为口述原文（防「1999」类压缩）。"""
+    """模型输出相对口述极度过短时才回退（仅防极端压缩如「1999」）。"""
    o = (oral or "").strip()
    if not o:
        return False
@@ -165,7 +186,7 @@ def _apply_narrative_fallbacks(
    if existing_for_narrative and _is_json_narrative(narrative_raw):
        merged_md = narrative_to_markdown(narrative_raw).strip()
        ex = (existing_for_narrative or "").strip()
-        if ex and len(ex) > 400 and len(merged_md) < len(ex) * 0.35:
+        if ex and len(ex) > 400 and len(merged_md) < len(ex) * 0.25:
            logger.warning(
                "event=narrative_fallback reason=merge_shrink action=append_oral "
                "chapter_category={}",
@@ -176,7 +197,7 @@ def _apply_narrative_fallbacks(
    if (
        existing_for_narrative
        and not _is_json_narrative(narrative_raw)
-        and len(narrative_raw) < len(existing_for_narrative) * 0.8
+        and len(narrative_raw) < len(existing_for_narrative) * 0.5
    ):
        logger.warning(
            "event=narrative_fallback reason=length_anomaly action=append_raw "
@@ -290,6 +311,7 @@ def _run_batch_plan_writes(
    user_birth_year: int | None,
    llm: Any,
    narrative_agent: NarrativeAgent,
+    background_voice: str = "default",
 ) -> set[str]:
    dispatch_ids: set[str] = set()
    max_chars = int(settings.story_append_max_canonical_chars)
@@ -297,7 +319,16 @@ def _run_batch_plan_writes(
    for unit in plan.units:
        t0 = time.perf_counter()
        unit_text = _ordered_text_for_segment_ids(category_segments, unit.segment_ids)
-        new_content_input = format_narrative_user_content(unit_text, evidence_text)
+        oral_unit = normalize_oral_for_memoir(unit_text, llm=llm)
+        ut_raw = (unit_text or "").strip()
+        ut_norm = (oral_unit or "").strip()
+        if ut_raw != ut_norm:
+            logger.info(
+                "event=oral_normalized context=batch_unit raw_len={} norm_len={}",
+                len(ut_raw),
+                len(ut_norm),
+            )
+        new_content_input = format_narrative_user_content(oral_unit, evidence_text)

        target_story_id: str | None = None
        existing_for_narrative = ""
@@ -330,6 +361,7 @@ def _run_batch_plan_writes(
            user_profile=user_profile,
            birth_year=user_birth_year,
            llm=llm,
+            background_voice=background_voice,
        )
        json_invalid = False
        s0 = (raw_gen or "").strip()
@@ -340,14 +372,14 @@ def _run_batch_plan_writes(
                json_invalid = True

        narrative_raw, fb_gate = _gate_narrative_fidelity(
-            unit_text,
+            oral_unit,
            raw_gen,
            llm,
            existing_canonical=existing_for_narrative or None,
        )
        narrative_raw, fb_apply = _apply_narrative_fallbacks(
            narrative_raw,
-            unit_text,
+            oral_unit,
            existing_for_narrative,
            chapter_category=chapter_category,
        )
@@ -357,7 +389,7 @@ def _run_batch_plan_writes(

        md = _coalesce_story_markdown(
            narrative_to_markdown(narrative_raw).strip(),
-            unit_text.strip(),
+            oral_unit.strip(),
            existing_for_narrative or "",
        )

@@ -399,7 +431,7 @@ def _run_batch_plan_writes(
            "event=story_generated route_type=batch decision_source={} route_decision={} "
            "unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} "
            "fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} "
-            "story_id={} seconds={:.3f}",
+            "story_id={} seconds={:.3f} oral_normalize_changed={}",
            decision_source,
            unit.decision,
            len(unit.segment_ids),
@@ -407,12 +439,13 @@ def _run_batch_plan_writes(
            _is_json_narrative(raw_gen),
            fb_gate == "none",
            fallback_type,
-            len(unit_text.strip()),
+            len(ut_norm),
            len(md.strip()),
            chapter_category,
            is_append,
            sid_log,
            elapsed,
+            ut_raw != ut_norm,
        )
    return dispatch_ids

@@ -427,6 +460,7 @@ def run_story_pipeline_for_category_batch(
    user_profile: str,
    user_birth_year: int | None,
    llm: Any,
+    background_voice: str = "default",
 ) -> tuple[Chapter | None, bool, set[str]]:
    """
    返回 (chapter, needs_cover_enqueue, story_ids_to_dispatch_after_commit)。
@@ -456,7 +490,16 @@ def run_story_pipeline_for_category_batch(
        }

    evidence_text = format_evidence_chunks_for_prompt(evidence)
-    new_content_input = format_narrative_user_content(combined_text, evidence_text)
+    oral_for_memoir = normalize_oral_for_memoir(combined_text, llm=llm)
+    ct_raw = (combined_text or "").strip()
+    om_norm = (oral_for_memoir or "").strip()
+    if ct_raw != om_norm:
+        logger.info(
+            "event=oral_normalized context=category_batch raw_len={} norm_len={}",
+            len(ct_raw),
+            len(om_norm),
+        )
+    new_content_input = format_narrative_user_content(oral_for_memoir, evidence_text)

    stmt_chapter = (
        select(Chapter)
@@ -493,15 +536,14 @@ def run_story_pipeline_for_category_batch(
            llm=llm,
        )

-    candidates = list_active_stories_for_user_sync(session, user_id)
+    # 仅同 chapter_category（story.stage）的 Story 可作为 append 候选，避免跨章节链接导致多章内容相同
+    all_stories = list_active_stories_for_user_sync(session, user_id)
+    candidates = [s for s in all_stories if s.stage == chapter_category]
    valid_ids = {str(s.id) for s in candidates}
    story_meta = _story_meta_for_route(session, candidates)

-    batch_for_route = (
-        f"{combined_text}\n\n{evidence_text}"
-        if evidence_text.strip()
-        else combined_text
-    )
+    # Story route 仅依据本批用户口述；evidence 只进入叙事/合并，不参与 new/append 判定。
+    route_transcript = oral_for_memoir

    calculated_order_index = STAGE_TO_ORDER.get(chapter_category, 999)

@@ -512,7 +554,7 @@ def run_story_pipeline_for_category_batch(
    )
    plan: StoryBatchPlan | None = None
    if use_batch_plan:
-        segs = [(seg.id, seg.user_input_text or "") for seg in category_segments]
+        segs = _route_segment_texts(category_segments)
        plan = route_agent.plan_batch(
            chapter_category=chapter_category,
            chapter_title=title,
@@ -546,12 +588,13 @@ def run_story_pipeline_for_category_batch(
            user_birth_year=user_birth_year,
            llm=llm,
            narrative_agent=narrative_agent,
+            background_voice=background_voice,
        )
    else:
        route = route_agent.decide(
            chapter_category=chapter_category,
            chapter_title=title,
-            batch_transcript=batch_for_route,
+            batch_transcript=route_transcript,
            candidate_stories=candidates,
            llm=llm,
            valid_story_ids=valid_ids,
@@ -592,6 +635,7 @@ def run_story_pipeline_for_category_batch(
            user_profile=user_profile,
            birth_year=user_birth_year,
            llm=llm,
+            background_voice=background_voice,
        )
        json_invalid = False
        s0 = (raw_gen or "").strip()
@@ -602,7 +646,7 @@ def run_story_pipeline_for_category_batch(
                json_invalid = True

        narrative_raw, fb_gate = _gate_narrative_fidelity(
-            combined_text,
+            oral_for_memoir,
            raw_gen,
            llm,
            existing_canonical=existing_for_narrative or None,
@@ -610,7 +654,7 @@ def run_story_pipeline_for_category_batch(

        narrative_raw, fb_apply = _apply_narrative_fallbacks(
            narrative_raw,
-            combined_text,
+            oral_for_memoir,
            existing_for_narrative,
            chapter_category=chapter_category,
        )
@@ -620,7 +664,7 @@ def run_story_pipeline_for_category_batch(

        md = _coalesce_story_markdown(
            narrative_to_markdown(narrative_raw).strip(),
-            combined_text.strip(),
+            oral_for_memoir.strip(),
            existing_for_narrative or "",
        )

@@ -664,7 +708,7 @@ def run_story_pipeline_for_category_batch(
            "event=story_generated route_type=single decision_source={} route_decision={} "
            "unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} "
            "fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} "
-            "story_id={} seconds={:.3f}",
+            "story_id={} seconds={:.3f} oral_normalize_changed={}",
            decision_source,
            route.decision,
            len(category_segments),
@@ -672,12 +716,13 @@ def run_story_pipeline_for_category_batch(
            _is_json_narrative(raw_gen),
            fb_gate == "none",
            fallback_type,
-            len(combined_text.strip()),
+            len(om_norm),
            len(md.strip()),
            chapter_category,
            is_append,
            sid_log,
            elapsed,
+            ct_raw != om_norm,
        )

    reorder_chapter_story_links_by_life_order_sync(session, str(chapter.id))