feat(api): 访谈路径轻量门控、Memoir Phase1 批处理与叙事/记忆管线加固

- 新增 utterance_substance:短时/应答/元话语可跳过记忆检索、阶段 LLM 与资料抽取 LLM;可配置
- 输入归一化:LLM 模式默认仅语音/ASR;配置项写入 .env.example
- Memoir Phase1:可选 batch LLM 一次性抽取+分类(失败回退逐段);Extraction 空槽位时阶段与 current_stage 对齐,prompt 约束收紧
- 叙事与忠实度:narrative_safety、证据重叠/场合锚点、标题 slots 与履历短语 grounded;fidelity 解析失败 fail-open 可配置
- 章节管线:锁 TTL 上调、锁竞争 Celery 重试、Phase2 immediate singleflight 等;story_pipeline_sync / chapter_compose / memoir_tasks 联动
- Memory:compaction / repo / summarizer / evidence 小修;事实 FTS 未命中是否回退最近事实可配置
- 新增 memoir_pipeline_trace;补充 memoir_reliability 文档与多项回归/门控测试
This commit is contained in:
Kevin
2026-04-03 10:12:59 +08:00
parent 6b930808a3
commit 07c6478742
49 changed files with 12258 additions and 57 deletions

View File

@@ -7,6 +7,7 @@ Celery 用:按批次将 transcript 写入 Story并标记 Chapter 需物化
from __future__ import annotations
import json
import re
import time
import uuid
from typing import Any
@@ -37,6 +38,12 @@ from app.features.memoir.cover_eligibility import chapter_needs_cover_enqueue
from app.features.memoir.memoir_images.settings import MemoirImageSettings
from app.features.memoir.models import Chapter
from app.features.memoir.narrative_to_markdown import narrative_to_markdown
from app.features.memoir.narrative_safety import (
body_contains_prompt_artifact,
evidence_leakage_heuristic,
evidence_scene_anchor_leak,
strip_evidence_for_overlap_check,
)
from app.features.memoir.oral_normalize import (
apply_oral_rules,
normalize_oral_for_memoir,
@@ -57,6 +64,16 @@ from app.features.story.sync_write import (
logger = get_logger(__name__)
# Multi-character career/rank phrases that must be grounded: if a generated
# title contains one of them, the phrase has to appear verbatim in the "hay"
# (body text + oral text + the slots passed to the title model); otherwise the
# ungrounded title segment is dropped, or the title degrades to a placeholder.
_MEMOIR_TITLE_HAY_GROUNDING_PHRASES: tuple[str, ...] = (
    "晋升旅长",
    "晋升为旅长",
    "晋升师长",
    "晋升军长",
    "旅长职务",
    "师长职务",
)
# Upper bounds applied when a summary chapter aggregates slots across stages
# (guards against the narrative prompt growing too large).
MAX_SUMMARY_SLOT_KEYS = 80
MAX_SUMMARY_SLOT_CHARS = 12_000
@@ -127,6 +144,83 @@ def _placeholder_title(chapter_category: str) -> str:
return CHAPTER_CATEGORIES.get(chapter_category, chapter_category)
def _title_slots_filtered_for_generation(
    slot_snippets: dict[str, str], *, md: str, oral_scope: str
) -> dict[str, str]:
    """Keep only the slots whose text overlaps the body or this batch's oral text.

    This lowers the chance that archive/history slots bleed into the title.

    When ``settings.memoir_title_slots_require_body_or_oral_match`` is falsy
    the filter is disabled and a shallow copy of ``slot_snippets`` is returned
    (an empty dict when ``slot_snippets`` is ``None``).

    In filtering mode the haystack is the stripped ``md`` and ``oral_scope``
    joined by a newline. A slot is kept (with its stripped value) when:

    * the stripped value is at least 2 characters long and occurs verbatim in
      the haystack, or
    * its first 12 characters (at least 4 characters long) occur in the
      haystack.

    The ``"content_excerpt"`` key is never returned in filtering mode.
    """
    if not settings.memoir_title_slots_require_body_or_oral_match:
        # Tolerate None here the same way the filtering path below does.
        return dict(slot_snippets or {})
    hay = f"{(md or '').strip()}\n{(oral_scope or '').strip()}"
    if not hay.strip():
        # Nothing to match against: no slot can be grounded.
        return {}
    kept: dict[str, str] = {}
    for key, value in (slot_snippets or {}).items():
        if key == "content_excerpt":
            continue
        # Falsy values are skipped; truthy non-str values are coerced instead
        # of raising AttributeError on ``.strip()``.
        text = str(value).strip() if value else ""
        if len(text) < 2:
            continue
        if text in hay:
            kept[key] = text
            continue
        # Fuzzy fallback: a sufficiently long leading prefix is enough.
        prefix = text[:12]
        if len(prefix) >= 4 and prefix in hay:
            kept[key] = text
    return kept
def _title_hay_for_grounding(
merged_slots: dict[str, str], md: str, oral_scope: str
) -> str:
"""与标题模型可见材料一致的依据串(用于事后逐字 grounding"""
parts: list[str] = [(md or "").strip(), (oral_scope or "").strip()]
for k, v in (merged_slots or {}).items():
if k == "content_excerpt":
continue
if (v or "").strip():
parts.append(str(v).strip())
return "\n".join(p for p in parts if p)
def _strip_ungrounded_title_segments(
    title: str,
    hay: str,
    *,
    chapter_category: str,
) -> str:
    """Drop title segments that contain career phrases not found in ``hay``.

    The title is split on ``·`` / ``•``. A segment is discarded (and logged)
    when it contains any phrase from ``_MEMOIR_TITLE_HAY_GROUNDING_PHRASES``
    that does not occur in the stripped ``hay``. Surviving segments are
    re-joined with ``" · "``. If none survive, or the title is empty, the
    placeholder title for ``chapter_category`` is returned.

    When ``settings.memoir_title_hay_grounding_strict_phrases_enabled`` is
    falsy, the stripped title (or the placeholder if empty) is returned
    without any checking.
    """
    strict = settings.memoir_title_hay_grounding_strict_phrases_enabled
    cleaned_title = (title or "").strip()
    if not strict:
        return cleaned_title or _placeholder_title(chapter_category)

    grounding_text = (hay or "").strip()
    if not cleaned_title:
        return _placeholder_title(chapter_category)

    stripped_pieces = (raw.strip() for raw in re.split(r"\s*[·•]\s*", cleaned_title))
    # A title made only of separators yields no pieces; treat it as one segment.
    pieces = [piece for piece in stripped_pieces if piece] or [cleaned_title]

    grounded: list[str] = []
    for piece in pieces:
        ungrounded = False
        for phrase in _MEMOIR_TITLE_HAY_GROUNDING_PHRASES:
            if phrase in piece and phrase not in grounding_text:
                ungrounded = True
                break
        if not ungrounded:
            grounded.append(piece)
            continue
        logger.info(
            "event=memoir_title_segment_ungrounded segment_preview={} chapter_category={}",
            piece[:40],
            chapter_category,
        )

    if not grounded:
        return _placeholder_title(chapter_category)
    # Joining a single element returns that element unchanged.
    return " · ".join(grounded)
def _maybe_generate_title(
narrative_agent: "NarrativeAgent",
*,
@@ -136,23 +230,33 @@ def _maybe_generate_title(
user_profile: str,
user_birth_year: int | None,
llm: Any,
oral_scope: str = "",
narrow_profile_for_title: bool = True,
) -> str:
"""Generate a title only when body is long enough; otherwise return placeholder."""
body_len = len((md or "").strip())
if body_len < settings.story_title_min_body_chars:
return _placeholder_title(chapter_category)
content_excerpt = (md or "").strip()[:300]
merged_slots = dict(slot_snippets)
merged_slots = _title_slots_filtered_for_generation(
slot_snippets, md=md, oral_scope=oral_scope
)
if content_excerpt and "content_excerpt" not in merged_slots:
merged_slots["content_excerpt"] = content_excerpt
return narrative_agent.generate_title(
# 标题默认不注入完整档案,仅年龄提示仍可用(来自 birth_year
profile_for_title = "" if narrow_profile_for_title else user_profile
raw_title = narrative_agent.generate_title(
stage=chapter_category,
emotion="neutral",
slots=merged_slots,
user_profile=user_profile,
user_profile=profile_for_title,
birth_year=user_birth_year,
llm=llm,
)
hay = _title_hay_for_grounding(merged_slots, md, oral_scope)
return _strip_ungrounded_title_segments(
raw_title, hay, chapter_category=chapter_category
)
def _route_segment_texts(category_segments: list) -> list[tuple[str, str]]:
@@ -206,11 +310,13 @@ def _gate_narrative_fidelity(
return narrative_raw, "none"
agent = FidelityCheckAgent()
ex = (existing_canonical or "").strip() or None
is_append = bool(ex)
if agent.passes(
oral_text=oral_text,
narrative_json=narrative_raw,
llm=llm,
existing_canonical_markdown=ex,
is_append=is_append,
):
return narrative_raw, "none"
logger.warning(
@@ -224,6 +330,56 @@ def _gate_narrative_fidelity(
return _fidelity_fallback_json(o, ex), "fidelity_failed"
def _apply_narrative_body_safety(
    md: str,
    *,
    oral: str,
    existing_for_narrative: str,
    evidence_text: str,
    chapter_category: str,
) -> tuple[str, str]:
    """Reject a narrative body that appears to contain prompt or evidence leakage.

    Three checks run in order on the stripped ``md``; the first that fires wins:

    1. ``body_contains_prompt_artifact`` -> ``"prompt_artifact_in_body"``
    2. ``evidence_leakage_heuristic`` (only when ``evidence_text`` is
       non-blank) -> ``"evidence_leak_heuristic"``
    3. ``evidence_scene_anchor_leak`` (only when enabled via
       ``settings.memoir_evidence_scene_anchor_check_enabled`` and
       ``evidence_text`` is non-blank) -> ``"evidence_scene_anchor"``

    Returns:
        ``(markdown, fallback_type)``. When a check fires, a warning is logged
        and ``markdown`` is ``_coalesce_story_markdown("", oral,
        existing_for_narrative)``. Otherwise the stripped ``md`` is returned
        with ``"none"``.
    """
    body = (md or "").strip()
    prior_text = (existing_for_narrative or "").strip()
    spoken = (oral or "").strip()
    overlap_min_chars = int(settings.memoir_narrative_evidence_overlap_min_chars)
    evidence_plain = strip_evidence_for_overlap_check(evidence_text)

    # (log template, fallback_type) of the first failing check, if any.
    failure: tuple[str, str] | None = None
    if body and body_contains_prompt_artifact(body):
        failure = (
            "event=narrative_invariant_failed reason=prompt_artifact chapter_category={}",
            "prompt_artifact_in_body",
        )
    elif (
        body
        and evidence_text.strip()
        and evidence_leakage_heuristic(
            body, evidence_plain, spoken, prior_text, overlap_min_chars
        )
    ):
        failure = (
            "event=narrative_invariant_failed reason=evidence_leak chapter_category={}",
            "evidence_leak_heuristic",
        )
    elif (
        settings.memoir_evidence_scene_anchor_check_enabled
        and body
        and evidence_text.strip()
        and evidence_scene_anchor_leak(body, evidence_plain, spoken, prior_text)
    ):
        failure = (
            "event=narrative_invariant_failed reason=evidence_scene_anchor chapter_category={}",
            "evidence_scene_anchor",
        )

    if failure is None:
        return body, "none"

    log_template, fallback_type = failure
    logger.warning(log_template, chapter_category)
    # Discard the generated body; the fallback uses the unstripped oral and existing text.
    return _coalesce_story_markdown("", oral, existing_for_narrative), fallback_type
def _coalesce_story_markdown(
md: str,
oral: str,
@@ -375,6 +531,7 @@ def _run_batch_plan_writes(
narrative_agent: NarrativeAgent,
background_voice: str = "default",
occupation: str = "",
memoir_correlation_id: str | None = None,
) -> set[str]:
dispatch_ids: set[str] = set()
max_chars = int(settings.story_append_max_canonical_chars)
@@ -426,6 +583,7 @@ def _run_batch_plan_writes(
llm=llm,
background_voice=background_voice,
occupation=occupation,
fallback_plain_oral=ut_norm,
)
json_invalid = False
s0 = (raw_gen or "").strip()
@@ -456,6 +614,17 @@ def _run_batch_plan_writes(
oral_unit.strip(),
existing_for_narrative or "",
)
md, inv_fb = _apply_narrative_body_safety(
md,
oral=oral_unit,
existing_for_narrative=existing_for_narrative or "",
evidence_text=evidence_text,
chapter_category=chapter_category,
)
if inv_fb != "none":
fallback_type = (
inv_fb if fallback_type == "none" else f"{fallback_type}+{inv_fb}"
)
if target_story_id:
append_story_version_sync(session, str(target_story_id), md)
@@ -474,6 +643,7 @@ def _run_batch_plan_writes(
user_profile=user_profile,
user_birth_year=user_birth_year,
llm=llm,
oral_scope=ut_norm,
)
st = create_story_with_version_sync(
session,
@@ -491,10 +661,12 @@ def _run_batch_plan_writes(
elapsed = time.perf_counter() - t0
logger.info(
"event=story_generated route_type=batch decision_source={} route_decision={} "
"event=story_generated memoir_correlation_id={} route_type=batch "
"decision_source={} route_decision={} "
"unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} "
"fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} "
"story_id={} seconds={:.3f} oral_normalize_changed={}",
memoir_correlation_id or "",
decision_source,
unit.decision,
len(unit.segment_ids),
@@ -525,6 +697,7 @@ def run_story_pipeline_for_category_batch(
llm: Any,
background_voice: str = "default",
occupation: str = "",
memoir_correlation_id: str | None = None,
) -> tuple[Chapter | None, bool, set[str]]:
"""
返回 (chapter, needs_cover_enqueue, story_ids_to_dispatch_after_commit)。
@@ -564,6 +737,14 @@ def run_story_pipeline_for_category_batch(
len(om_norm),
)
new_content_input = format_narrative_user_content(oral_for_memoir, evidence_text)
logger.info(
"event=memoir_story_pipeline_start memoir_correlation_id={} user_id={} "
"chapter_category={} segment_count={}",
memoir_correlation_id or "",
user_id,
chapter_category,
len(category_segments),
)
stmt_chapter = (
select(Chapter)
@@ -641,6 +822,7 @@ def run_story_pipeline_for_category_batch(
narrative_agent=narrative_agent,
background_voice=background_voice,
occupation=occupation,
memoir_correlation_id=memoir_correlation_id,
)
else:
route = route_agent.decide(
@@ -689,6 +871,7 @@ def run_story_pipeline_for_category_batch(
llm=llm,
background_voice=background_voice,
occupation=occupation,
fallback_plain_oral=om_norm,
)
json_invalid = False
s0 = (raw_gen or "").strip()
@@ -720,6 +903,17 @@ def run_story_pipeline_for_category_batch(
oral_for_memoir.strip(),
existing_for_narrative or "",
)
md, inv_fb = _apply_narrative_body_safety(
md,
oral=oral_for_memoir,
existing_for_narrative=existing_for_narrative or "",
evidence_text=evidence_text,
chapter_category=chapter_category,
)
if inv_fb != "none":
fallback_type = (
inv_fb if fallback_type == "none" else f"{fallback_type}+{inv_fb}"
)
do_append = target_story_id is not None
@@ -740,6 +934,7 @@ def run_story_pipeline_for_category_batch(
user_profile=user_profile,
user_birth_year=user_birth_year,
llm=llm,
oral_scope=om_norm,
)
st = create_story_with_version_sync(
session,
@@ -757,10 +952,12 @@ def run_story_pipeline_for_category_batch(
elapsed = time.perf_counter() - t0
logger.info(
"event=story_generated route_type=single decision_source={} route_decision={} "
"event=story_generated memoir_correlation_id={} route_type=single "
"decision_source={} route_decision={} "
"unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} "
"fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} "
"story_id={} seconds={:.3f} oral_normalize_changed={}",
memoir_correlation_id or "",
decision_source,
route.decision,
len(category_segments),