Files
life-echo/api/app/features/memoir/narrative_safety.py
Kevin 07c6478742 feat(api): 访谈路径轻量门控、Memoir Phase1 批处理与叙事/记忆管线加固
- 新增 utterance_substance:短时/应答/元话语可跳过记忆检索、阶段 LLM 与资料抽取 LLM;可配置
- 输入归一化:LLM 模式默认仅语音/ASR;配置项写入 .env.example
- Memoir Phase1:可选 batch LLM 一次性抽取+分类(失败回退逐段);Extraction 空槽位时阶段与 current_stage 对齐,prompt 约束收紧
- 叙事与忠实度:narrative_safety、证据重叠/场合锚点、标题 slots 与履历短语 grounded;fidelity 解析失败 fail-open 可配置
- 章节管线:锁 TTL 上调、锁竞争 Celery 重试、Phase2 immediate singleflight 等;story_pipeline_sync / chapter_compose / memoir_tasks 联动
- Memory:compaction / repo / summarizer / evidence 小修;事实 FTS 未命中是否回退最近事实可配置
- 新增 memoir_pipeline_trace;补充 memoir_reliability 文档与多项回归/门控测试
2026-04-03 10:12:59 +08:00

166 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""叙事落库前的确定性安检:防止 prompt 分区标记或摘录块泄漏进正文。"""
from __future__ import annotations
# Keep these markers in sync with
# app.agents.memoir.prompts.format_narrative_user_content.
ORAL_SECTION_MARKER = "【本段用户口述】"
EVIDENCE_SECTION_MARKER = "【仅供参考的相关记忆摘录"
# Fixed phrase taken from the excerpt lead-in text (used for a coarse check).
EVIDENCE_SECTION_TAIL = "不得把其中具体事实写成本轮亲历经历"
def body_contains_prompt_artifact(markdown_body: str) -> bool:
    """Report whether the body text contains leaked prompt scaffolding.

    Args:
        markdown_body: Narrative body to inspect. ``None`` or blank text is
            treated as clean.

    Returns:
        ``True`` when the stripped text contains the oral-section marker, the
        evidence-section marker, or the fixed evidence lead-in phrase;
        ``False`` otherwise.
    """
    text = (markdown_body or "").strip()
    artifacts = (
        ORAL_SECTION_MARKER,
        EVIDENCE_SECTION_MARKER,
        EVIDENCE_SECTION_TAIL,
    )
    # Blank text short-circuits to False before any marker is searched.
    return bool(text) and any(marker in text for marker in artifacts)
def longest_common_substring_len(a: str, b: str, min_len: int = 14) -> int:
    """Return the length of the longest substring shared by ``a`` and ``b``.

    Uses a row-by-row dynamic programme whose cost grows with
    ``len(a) * len(b)``, so it is meant for short evidence/body text only.

    Args:
        a: First text; ``None`` is treated as empty.
        b: Second text; ``None`` is treated as empty.
        min_len: Smallest length that counts as a match.

    Returns:
        The length of the longest common substring when it is at least
        ``min_len``; otherwise 0. Also 0 when either input is empty or longer
        than 8000 characters (such inputs are not compared at all).
    """
    left = a or ""
    right = b or ""
    if not left or not right:
        return 0
    if max(len(left), len(right)) > 8000:
        return 0

    width = len(right) + 1
    longest = 0
    previous_row = [0] * width
    for ch_left in left:
        # A fresh zeroed row means mismatching cells need no explicit reset.
        current_row = [0] * width
        for col, ch_right in enumerate(right, start=1):
            if ch_left != ch_right:
                continue
            # Extend the run that ended one character earlier in both texts.
            run = previous_row[col - 1] + 1
            current_row[col] = run
            if run > longest:
                longest = run
        previous_row = current_row

    return longest if longest >= min_len else 0
def evidence_substring_leak_score(
    body: str, evidence_plain: str, min_len: int = 14
) -> int:
    """Score how much evidence text reappears verbatim in the body.

    Both inputs are stripped, then the score is delegated to
    ``longest_common_substring_len``. This function only compares body and
    evidence; it receives no oral or previously stored text, so it cannot
    exclude matches that are also backed by those sources.

    Args:
        body: Narrative body text; ``None`` is treated as empty.
        evidence_plain: Plain evidence text; ``None`` is treated as empty.
        min_len: Smallest shared-substring length that counts as a leak.

    Returns:
        The length of the longest shared substring when it reaches
        ``min_len``; 0 when either text is blank, when the evidence is
        shorter than ``min_len``, or when no long enough match exists.
    """
    narrative = (body or "").strip()
    evidence = (evidence_plain or "").strip()
    comparable = (
        bool(narrative) and bool(evidence) and len(evidence) >= min_len
    )
    if not comparable:
        return 0
    return longest_common_substring_len(narrative, evidence, min_len=min_len)
def longest_common_substring(a: str, b: str) -> str:
    """Return the longest substring that occurs in both ``a`` and ``b``.

    When several substrings tie for the longest length, the one that ends
    earliest in ``a`` is returned.

    Args:
        a: First text; ``None`` is treated as empty.
        b: Second text; ``None`` is treated as empty.

    Returns:
        The shared substring (sliced from ``a``), or ``""`` when nothing is
        shared or when either input is longer than 8000 characters (the cap
        bounds the cost of the dynamic programme).
    """
    left = a or ""
    right = b or ""
    if max(len(left), len(right)) > 8000:
        return ""

    width = len(right) + 1
    longest = 0
    end_pos = 0
    previous_row = [0] * width
    for row, ch_left in enumerate(left, start=1):
        current_row = [0] * width
        for col, ch_right in enumerate(right, start=1):
            if ch_left == ch_right:
                run = previous_row[col - 1] + 1
                current_row[col] = run
                # Strictly-greater keeps the first longest match found.
                if run > longest:
                    longest, end_pos = run, row
        previous_row = current_row

    # With no match both values are 0, so the slice is the empty string.
    return left[end_pos - longest:end_pos]
# Concrete occasion/scene words: they easily seep from the "related excerpts"
# into the body, yet are too short for the longest-common-substring check to
# catch.
EVIDENCE_SCENE_ANCHOR_TOKENS: tuple[str, ...] = (
    "聚餐",
    "酒席",
    "酒桌",
    "宴会",
    "宴席",
    "当晚",
    "那晚",
    "昨夜",
    "前一晚",
    "前一天晚上",
)
def evidence_scene_anchor_leak(
    body: str,
    evidence_plain: str,
    oral: str,
    existing: str,
) -> bool:
    """Detect scene-anchor words that reached the body only via the evidence.

    A token from ``EVIDENCE_SCENE_ANCHOR_TOKENS`` counts as a leak when it
    appears in the body and in the evidence, but in neither the oral text nor
    the existing body. These tokens are short, so no length threshold applies.

    Args:
        body: Newly generated body text; ``None`` is treated as empty.
        evidence_plain: Plain evidence text; ``None`` is treated as empty.
        oral: The user's oral text; ``None`` is treated as empty.
        existing: Previously stored body text; ``None`` is treated as empty.

    Returns:
        ``True`` when at least one anchor token leaked; ``False`` otherwise,
        including when the body or the evidence is blank.
    """
    narrative = (body or "").strip()
    evidence = (evidence_plain or "").strip()
    if not (narrative and evidence):
        return False

    # Text the user actually supplied or that was already stored.
    user_backed = "\n".join(((oral or "").strip(), (existing or "").strip()))
    return any(
        token in narrative and token not in user_backed and token in evidence
        for token in EVIDENCE_SCENE_ANCHOR_TOKENS
    )
def evidence_leakage_heuristic(
    body: str,
    evidence_plain: str,
    oral: str,
    existing: str,
    min_len: int,
) -> bool:
    """Decide whether the body copied a long passage out of the evidence.

    The longest substring shared by the stripped body and evidence is taken
    from ``longest_common_substring``. It is a leak when it is at least
    ``min_len`` long and occurs in neither the oral text nor the existing
    body. Only that single longest match is examined.

    Args:
        body: Newly generated body text; ``None`` is treated as empty.
        evidence_plain: Plain evidence text; ``None`` is treated as empty.
        oral: The user's oral text; ``None`` is treated as empty.
        existing: Previously stored body text; ``None`` is treated as empty.
        min_len: Smallest shared-substring length that counts as a leak.

    Returns:
        ``True`` when a leak is detected; ``False`` otherwise, including when
        the body or the evidence is blank.
    """
    narrative = (body or "").strip()
    evidence = (evidence_plain or "").strip()
    if not (narrative and evidence):
        return False

    shared = longest_common_substring(narrative, evidence)
    if len(shared) < min_len:
        return False

    # A match that the user said or that was already stored is legitimate.
    trusted_sources = (oral or "", existing or "")
    return not any(shared in source for source in trusted_sources)
def strip_evidence_for_overlap_check(evidence_text: str) -> str:
    """Remove marker lines from evidence text before overlap checks.

    Args:
        evidence_text: Raw evidence text; ``None`` is treated as empty.

    Returns:
        The remaining lines joined with ``"\\n"`` and stripped at both ends.
        A line is dropped when, ignoring surrounding whitespace, it starts
        with ``[chunk_id=`` or ``[摘要:``; kept lines retain their own
        whitespace.
    """
    noise_prefixes = ("[chunk_id=", "[摘要:")
    kept = [
        raw
        for raw in (evidence_text or "").splitlines()
        if not raw.strip().startswith(noise_prefixes)
    ]
    return "\n".join(kept).strip()