feat(api): 访谈路径轻量门控、Memoir Phase1 批处理与叙事/记忆管线加固
- 新增 utterance_substance:短时/应答/元话语可跳过记忆检索、阶段 LLM 与资料抽取 LLM;可配置 - 输入归一化:LLM 模式默认仅语音/ASR;配置项写入 .env.example - Memoir Phase1:可选 batch LLM 一次性抽取+分类(失败回退逐段);Extraction 空槽位时阶段与 current_stage 对齐,prompt 约束收紧 - 叙事与忠实度:narrative_safety、证据重叠/场合锚点、标题 slots 与履历短语 grounded;fidelity 解析失败 fail-open 可配置 - 章节管线:锁 TTL 上调、锁竞争 Celery 重试、Phase2 immediate singleflight 等;story_pipeline_sync / chapter_compose / memoir_tasks 联动 - Memory:compaction / repo / summarizer / evidence 小修;事实 FTS 未命中是否回退最近事实可配置 - 新增 memoir_pipeline_trace;补充 memoir_reliability 文档与多项回归/门控测试
This commit is contained in:
@@ -7,6 +7,7 @@ Celery 用:按批次将 transcript 写入 Story,并标记 Chapter 需物化
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
from typing import Any
|
||||
@@ -37,6 +38,12 @@ from app.features.memoir.cover_eligibility import chapter_needs_cover_enqueue
|
||||
from app.features.memoir.memoir_images.settings import MemoirImageSettings
|
||||
from app.features.memoir.models import Chapter
|
||||
from app.features.memoir.narrative_to_markdown import narrative_to_markdown
|
||||
from app.features.memoir.narrative_safety import (
|
||||
body_contains_prompt_artifact,
|
||||
evidence_leakage_heuristic,
|
||||
evidence_scene_anchor_leak,
|
||||
strip_evidence_for_overlap_check,
|
||||
)
|
||||
from app.features.memoir.oral_normalize import (
|
||||
apply_oral_rules,
|
||||
normalize_oral_for_memoir,
|
||||
@@ -57,6 +64,16 @@ from app.features.story.sync_write import (
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Multi-character career/rank phrases. If one of them shows up in a generated
# title it must also occur verbatim in the hay (body + oral text + the slots
# passed to the title model); otherwise the ungrounded segment is dropped or
# the title is downgraded to the placeholder.
_MEMOIR_TITLE_HAY_GROUNDING_PHRASES: tuple[str, ...] = (
    "晋升旅长",
    "晋升为旅长",
    "晋升师长",
    "晋升军长",
    "旅长职务",
    "师长职务",
)

# Upper bounds applied when a summary chapter aggregates slots across stages
# (keeps the narrative prompt from ballooning).
MAX_SUMMARY_SLOT_KEYS = 80
MAX_SUMMARY_SLOT_CHARS = 12_000
|
||||
@@ -127,6 +144,83 @@ def _placeholder_title(chapter_category: str) -> str:
|
||||
return CHAPTER_CATEGORIES.get(chapter_category, chapter_category)
|
||||
|
||||
|
||||
def _title_slots_filtered_for_generation(
    slot_snippets: dict[str, str], *, md: str, oral_scope: str
) -> dict[str, str]:
    """Keep only the slots whose text overlaps the body or this batch's oral text.

    This lowers the chance that profile / historical slots bleed into the title.

    Args:
        slot_snippets: Slot name -> snippet text. ``None`` is treated as empty.
        md: Story body markdown.
        oral_scope: Oral text belonging to the current batch.

    Returns:
        A new dict. When
        ``settings.memoir_title_slots_require_body_or_oral_match`` is falsy it
        is a shallow copy of ``slot_snippets``. Otherwise it holds the stripped
        text of every slot (except ``content_excerpt``) that is at least two
        characters long and whose full text, or whose leading 12 characters
        (minimum 4), occur in the body/oral haystack. An empty haystack yields
        an empty dict.
    """
    # Normalise once so the "filter disabled" path tolerates None exactly like
    # the filtering loop below does.
    snippets = slot_snippets or {}
    if not settings.memoir_title_slots_require_body_or_oral_match:
        return dict(snippets)

    hay = f"{(md or '').strip()}\n{(oral_scope or '').strip()}"
    if not hay.strip():
        return {}

    kept: dict[str, str] = {}
    for key, value in snippets.items():
        if key == "content_excerpt":
            continue
        text = (value or "").strip()
        if len(text) < 2:
            continue
        # Accept a verbatim hit, or a hit on the leading characters so long
        # snippets still match when only their opening appears in the hay.
        prefix = text[:12]
        if text in hay or (len(prefix) >= 4 and prefix in hay):
            kept[key] = text
    return kept
|
||||
|
||||
|
||||
def _title_hay_for_grounding(
|
||||
merged_slots: dict[str, str], md: str, oral_scope: str
|
||||
) -> str:
|
||||
"""与标题模型可见材料一致的依据串(用于事后逐字 grounding)。"""
|
||||
parts: list[str] = [(md or "").strip(), (oral_scope or "").strip()]
|
||||
for k, v in (merged_slots or {}).items():
|
||||
if k == "content_excerpt":
|
||||
continue
|
||||
if (v or "").strip():
|
||||
parts.append(str(v).strip())
|
||||
return "\n".join(p for p in parts if p)
|
||||
|
||||
|
||||
def _strip_ungrounded_title_segments(
    title: str,
    hay: str,
    *,
    chapter_category: str,
) -> str:
    """Drop title sections that carry a career phrase missing from ``hay``.

    The title is split on ``·`` / ``•``. A section is discarded (and logged)
    when it contains an entry of ``_MEMOIR_TITLE_HAY_GROUNDING_PHRASES`` that
    does not occur verbatim in ``hay``. Surviving sections are re-joined with
    ``" · "``.

    Args:
        title: Raw title text; ``None`` / blank yields the placeholder.
        hay: Evidence text the phrases must appear in.
        chapter_category: Category used for the placeholder title and logging.

    Returns:
        The filtered title, or the category placeholder when the title is
        blank or every section was discarded. When
        ``settings.memoir_title_hay_grounding_strict_phrases_enabled`` is
        falsy the stripped title is returned without any filtering.
    """
    strict = settings.memoir_title_hay_grounding_strict_phrases_enabled
    cleaned = (title or "").strip()
    if not cleaned:
        return _placeholder_title(chapter_category)
    if not strict:
        return cleaned

    evidence = (hay or "").strip()
    stripped_pieces = (raw.strip() for raw in re.split(r"\s*[·•]\s*", cleaned))
    # A title made only of separators has no sections; check it as a whole.
    sections = [piece for piece in stripped_pieces if piece] or [cleaned]

    survivors: list[str] = []
    for section in sections:
        grounded = all(
            phrase not in section or phrase in evidence
            for phrase in _MEMOIR_TITLE_HAY_GROUNDING_PHRASES
        )
        if grounded:
            survivors.append(section)
            continue
        logger.info(
            "event=memoir_title_segment_ungrounded segment_preview={} chapter_category={}",
            section[:40],
            chapter_category,
        )

    if not survivors:
        return _placeholder_title(chapter_category)
    return " · ".join(survivors)
|
||||
|
||||
|
||||
def _maybe_generate_title(
|
||||
narrative_agent: "NarrativeAgent",
|
||||
*,
|
||||
@@ -136,23 +230,33 @@ def _maybe_generate_title(
|
||||
user_profile: str,
|
||||
user_birth_year: int | None,
|
||||
llm: Any,
|
||||
oral_scope: str = "",
|
||||
narrow_profile_for_title: bool = True,
|
||||
) -> str:
|
||||
"""Generate a title only when body is long enough; otherwise return placeholder."""
|
||||
body_len = len((md or "").strip())
|
||||
if body_len < settings.story_title_min_body_chars:
|
||||
return _placeholder_title(chapter_category)
|
||||
content_excerpt = (md or "").strip()[:300]
|
||||
merged_slots = dict(slot_snippets)
|
||||
merged_slots = _title_slots_filtered_for_generation(
|
||||
slot_snippets, md=md, oral_scope=oral_scope
|
||||
)
|
||||
if content_excerpt and "content_excerpt" not in merged_slots:
|
||||
merged_slots["content_excerpt"] = content_excerpt
|
||||
return narrative_agent.generate_title(
|
||||
# 标题默认不注入完整档案,仅年龄提示仍可用(来自 birth_year)
|
||||
profile_for_title = "" if narrow_profile_for_title else user_profile
|
||||
raw_title = narrative_agent.generate_title(
|
||||
stage=chapter_category,
|
||||
emotion="neutral",
|
||||
slots=merged_slots,
|
||||
user_profile=user_profile,
|
||||
user_profile=profile_for_title,
|
||||
birth_year=user_birth_year,
|
||||
llm=llm,
|
||||
)
|
||||
hay = _title_hay_for_grounding(merged_slots, md, oral_scope)
|
||||
return _strip_ungrounded_title_segments(
|
||||
raw_title, hay, chapter_category=chapter_category
|
||||
)
|
||||
|
||||
|
||||
def _route_segment_texts(category_segments: list) -> list[tuple[str, str]]:
|
||||
@@ -206,11 +310,13 @@ def _gate_narrative_fidelity(
|
||||
return narrative_raw, "none"
|
||||
agent = FidelityCheckAgent()
|
||||
ex = (existing_canonical or "").strip() or None
|
||||
is_append = bool(ex)
|
||||
if agent.passes(
|
||||
oral_text=oral_text,
|
||||
narrative_json=narrative_raw,
|
||||
llm=llm,
|
||||
existing_canonical_markdown=ex,
|
||||
is_append=is_append,
|
||||
):
|
||||
return narrative_raw, "none"
|
||||
logger.warning(
|
||||
@@ -224,6 +330,56 @@ def _gate_narrative_fidelity(
|
||||
return _fidelity_fallback_json(o, ex), "fidelity_failed"
|
||||
|
||||
|
||||
def _apply_narrative_body_safety(
    md: str,
    *,
    oral: str,
    existing_for_narrative: str,
    evidence_text: str,
    chapter_category: str,
) -> tuple[str, str]:
    """Fall back to oral / previous text when the body looks contaminated.

    Three checks run in order on the stripped body, and the first hit wins:

    1. ``body_contains_prompt_artifact`` -> ``"prompt_artifact_in_body"``
    2. ``evidence_leakage_heuristic`` (only with non-blank evidence)
       -> ``"evidence_leak_heuristic"``
    3. ``evidence_scene_anchor_leak`` (only with non-blank evidence and
       ``settings.memoir_evidence_scene_anchor_check_enabled``)
       -> ``"evidence_scene_anchor"``

    Args:
        md: Generated body markdown.
        oral: Oral text for this unit.
        existing_for_narrative: Previously stored text the narrative extends.
        evidence_text: Evidence excerpt that was supplied to the model.
        chapter_category: Category, used only for logging.

    Returns:
        ``(body, tag)``. On a hit, ``body`` is
        ``_coalesce_story_markdown("", oral, existing_for_narrative)`` and
        ``tag`` names the failed check. Otherwise the stripped input body and
        ``"none"``.
    """
    body = (md or "").strip()
    prior = (existing_for_narrative or "").strip()
    spoken = (oral or "").strip()
    overlap_floor = int(settings.memoir_narrative_evidence_overlap_min_chars)
    evidence_plain = strip_evidence_for_overlap_check(evidence_text)

    # Every check requires a non-empty body, so an empty one passes trivially.
    if not body:
        return body, "none"

    if body_contains_prompt_artifact(body):
        log_message = (
            "event=narrative_invariant_failed reason=prompt_artifact chapter_category={}"
        )
        fallback_tag = "prompt_artifact_in_body"
    elif evidence_text.strip() and evidence_leakage_heuristic(
        body, evidence_plain, spoken, prior, overlap_floor
    ):
        log_message = (
            "event=narrative_invariant_failed reason=evidence_leak chapter_category={}"
        )
        fallback_tag = "evidence_leak_heuristic"
    elif (
        settings.memoir_evidence_scene_anchor_check_enabled
        and evidence_text.strip()
        and evidence_scene_anchor_leak(body, evidence_plain, spoken, prior)
    ):
        log_message = (
            "event=narrative_invariant_failed reason=evidence_scene_anchor chapter_category={}"
        )
        fallback_tag = "evidence_scene_anchor"
    else:
        return body, "none"

    logger.warning(log_message, chapter_category)
    # The fallback is built from the raw (unstripped) oral / existing inputs.
    return _coalesce_story_markdown("", oral, existing_for_narrative), fallback_tag
|
||||
|
||||
|
||||
def _coalesce_story_markdown(
|
||||
md: str,
|
||||
oral: str,
|
||||
@@ -375,6 +531,7 @@ def _run_batch_plan_writes(
|
||||
narrative_agent: NarrativeAgent,
|
||||
background_voice: str = "default",
|
||||
occupation: str = "",
|
||||
memoir_correlation_id: str | None = None,
|
||||
) -> set[str]:
|
||||
dispatch_ids: set[str] = set()
|
||||
max_chars = int(settings.story_append_max_canonical_chars)
|
||||
@@ -426,6 +583,7 @@ def _run_batch_plan_writes(
|
||||
llm=llm,
|
||||
background_voice=background_voice,
|
||||
occupation=occupation,
|
||||
fallback_plain_oral=ut_norm,
|
||||
)
|
||||
json_invalid = False
|
||||
s0 = (raw_gen or "").strip()
|
||||
@@ -456,6 +614,17 @@ def _run_batch_plan_writes(
|
||||
oral_unit.strip(),
|
||||
existing_for_narrative or "",
|
||||
)
|
||||
md, inv_fb = _apply_narrative_body_safety(
|
||||
md,
|
||||
oral=oral_unit,
|
||||
existing_for_narrative=existing_for_narrative or "",
|
||||
evidence_text=evidence_text,
|
||||
chapter_category=chapter_category,
|
||||
)
|
||||
if inv_fb != "none":
|
||||
fallback_type = (
|
||||
inv_fb if fallback_type == "none" else f"{fallback_type}+{inv_fb}"
|
||||
)
|
||||
|
||||
if target_story_id:
|
||||
append_story_version_sync(session, str(target_story_id), md)
|
||||
@@ -474,6 +643,7 @@ def _run_batch_plan_writes(
|
||||
user_profile=user_profile,
|
||||
user_birth_year=user_birth_year,
|
||||
llm=llm,
|
||||
oral_scope=ut_norm,
|
||||
)
|
||||
st = create_story_with_version_sync(
|
||||
session,
|
||||
@@ -491,10 +661,12 @@ def _run_batch_plan_writes(
|
||||
|
||||
elapsed = time.perf_counter() - t0
|
||||
logger.info(
|
||||
"event=story_generated route_type=batch decision_source={} route_decision={} "
|
||||
"event=story_generated memoir_correlation_id={} route_type=batch "
|
||||
"decision_source={} route_decision={} "
|
||||
"unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} "
|
||||
"fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} "
|
||||
"story_id={} seconds={:.3f} oral_normalize_changed={}",
|
||||
memoir_correlation_id or "",
|
||||
decision_source,
|
||||
unit.decision,
|
||||
len(unit.segment_ids),
|
||||
@@ -525,6 +697,7 @@ def run_story_pipeline_for_category_batch(
|
||||
llm: Any,
|
||||
background_voice: str = "default",
|
||||
occupation: str = "",
|
||||
memoir_correlation_id: str | None = None,
|
||||
) -> tuple[Chapter | None, bool, set[str]]:
|
||||
"""
|
||||
返回 (chapter, needs_cover_enqueue, story_ids_to_dispatch_after_commit)。
|
||||
@@ -564,6 +737,14 @@ def run_story_pipeline_for_category_batch(
|
||||
len(om_norm),
|
||||
)
|
||||
new_content_input = format_narrative_user_content(oral_for_memoir, evidence_text)
|
||||
logger.info(
|
||||
"event=memoir_story_pipeline_start memoir_correlation_id={} user_id={} "
|
||||
"chapter_category={} segment_count={}",
|
||||
memoir_correlation_id or "",
|
||||
user_id,
|
||||
chapter_category,
|
||||
len(category_segments),
|
||||
)
|
||||
|
||||
stmt_chapter = (
|
||||
select(Chapter)
|
||||
@@ -641,6 +822,7 @@ def run_story_pipeline_for_category_batch(
|
||||
narrative_agent=narrative_agent,
|
||||
background_voice=background_voice,
|
||||
occupation=occupation,
|
||||
memoir_correlation_id=memoir_correlation_id,
|
||||
)
|
||||
else:
|
||||
route = route_agent.decide(
|
||||
@@ -689,6 +871,7 @@ def run_story_pipeline_for_category_batch(
|
||||
llm=llm,
|
||||
background_voice=background_voice,
|
||||
occupation=occupation,
|
||||
fallback_plain_oral=om_norm,
|
||||
)
|
||||
json_invalid = False
|
||||
s0 = (raw_gen or "").strip()
|
||||
@@ -720,6 +903,17 @@ def run_story_pipeline_for_category_batch(
|
||||
oral_for_memoir.strip(),
|
||||
existing_for_narrative or "",
|
||||
)
|
||||
md, inv_fb = _apply_narrative_body_safety(
|
||||
md,
|
||||
oral=oral_for_memoir,
|
||||
existing_for_narrative=existing_for_narrative or "",
|
||||
evidence_text=evidence_text,
|
||||
chapter_category=chapter_category,
|
||||
)
|
||||
if inv_fb != "none":
|
||||
fallback_type = (
|
||||
inv_fb if fallback_type == "none" else f"{fallback_type}+{inv_fb}"
|
||||
)
|
||||
|
||||
do_append = target_story_id is not None
|
||||
|
||||
@@ -740,6 +934,7 @@ def run_story_pipeline_for_category_batch(
|
||||
user_profile=user_profile,
|
||||
user_birth_year=user_birth_year,
|
||||
llm=llm,
|
||||
oral_scope=om_norm,
|
||||
)
|
||||
st = create_story_with_version_sync(
|
||||
session,
|
||||
@@ -757,10 +952,12 @@ def run_story_pipeline_for_category_batch(
|
||||
|
||||
elapsed = time.perf_counter() - t0
|
||||
logger.info(
|
||||
"event=story_generated route_type=single decision_source={} route_decision={} "
|
||||
"event=story_generated memoir_correlation_id={} route_type=single "
|
||||
"decision_source={} route_decision={} "
|
||||
"unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} "
|
||||
"fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} "
|
||||
"story_id={} seconds={:.3f} oral_normalize_changed={}",
|
||||
memoir_correlation_id or "",
|
||||
decision_source,
|
||||
route.decision,
|
||||
len(category_segments),
|
||||
|
||||
Reference in New Issue
Block a user