feat(api): 访谈路径轻量门控、Memoir Phase1 批处理与叙事/记忆管线加固
- 新增 utterance_substance:短时/应答/元话语可跳过记忆检索、阶段 LLM 与资料抽取 LLM;可配置 - 输入归一化:LLM 模式默认仅语音/ASR;配置项写入 .env.example - Memoir Phase1:可选 batch LLM 一次性抽取+分类(失败回退逐段);Extraction 空槽位时阶段与 current_stage 对齐,prompt 约束收紧 - 叙事与忠实度:narrative_safety、证据重叠/场合锚点、标题 slots 与履历短语 grounded;fidelity 解析失败 fail-open 可配置 - 章节管线:锁 TTL 上调、锁竞争 Celery 重试、Phase2 immediate singleflight 等;story_pipeline_sync / chapter_compose / memoir_tasks 联动 - Memory:compaction / repo / summarizer / evidence 小修;事实 FTS 未命中是否回退最近事实可配置 - 新增 memoir_pipeline_trace;补充 memoir_reliability 文档与多项回归/门控测试
This commit is contained in:
165
api/app/features/memoir/narrative_safety.py
Normal file
165
api/app/features/memoir/narrative_safety.py
Normal file
@@ -0,0 +1,165 @@
|
||||
"""叙事落库前的确定性安检:防止 prompt 分区标记或摘录块泄漏进正文。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# Keep these in sync with app.agents.memoir.prompts.format_narrative_user_content.
ORAL_SECTION_MARKER = "【本段用户口述】"
EVIDENCE_SECTION_MARKER = "【仅供参考的相关记忆摘录"

# Fixed phrase taken from the excerpt lead-in text (used for a coarse check).
EVIDENCE_SECTION_TAIL = "不得把其中具体事实写成本轮亲历经历"


def body_contains_prompt_artifact(markdown_body: str) -> bool:
    """Report whether the narrative body contains a prompt artifact.

    The body is stripped first; an empty or ``None`` body is never flagged.
    Otherwise the result is True as soon as any of ``ORAL_SECTION_MARKER``,
    ``EVIDENCE_SECTION_MARKER`` or ``EVIDENCE_SECTION_TAIL`` occurs anywhere
    in the text.
    """
    text = (markdown_body or "").strip()
    if not text:
        return False
    artifacts = (
        ORAL_SECTION_MARKER,
        EVIDENCE_SECTION_MARKER,
        EVIDENCE_SECTION_TAIL,
    )
    return any(artifact in text for artifact in artifacts)
|
||||
|
||||
|
||||
def longest_common_substring_len(a: str, b: str, min_len: int = 14) -> int:
    """Return the length of the longest common substring, gated by ``min_len``.

    Args:
        a: First text; ``None`` or empty is treated as ``""``.
        b: Second text; ``None`` or empty is treated as ``""``.
        min_len: Minimum length that counts as a hit.

    Returns:
        The length of the longest substring shared by ``a`` and ``b`` when it
        is at least ``min_len``; otherwise 0. Also 0 when either input is
        empty or longer than 8000 characters (the size cap keeps the
        O(len(a) * len(b)) dynamic programme bounded).
    """
    a = a or ""
    b = b or ""
    if len(a) > 8000 or len(b) > 8000:
        return 0

    # A common substring cannot be longer than the shorter input, so the
    # quadratic scan is pointless when that input is empty or below min_len.
    shortest = min(len(a), len(b))
    if shortest == 0 or shortest < min_len:
        return 0

    # The result is symmetric in (a, b); keep the DP rows as short as possible.
    if len(b) > len(a):
        a, b = b, a

    best = 0
    width = len(b) + 1
    prev = [0] * width
    for ch_a in a:
        cur = [0] * width
        for j, ch_b in enumerate(b, start=1):
            if ch_a == ch_b:
                # Extend the common run that ended on the previous diagonal cell.
                run = prev[j - 1] + 1
                cur[j] = run
                if run > best:
                    best = run
        prev = cur
    return best if best >= min_len else 0
|
||||
|
||||
|
||||
def evidence_substring_leak_score(
    body: str, evidence_plain: str, min_len: int = 14
) -> int:
    """Score how much of the evidence text is reproduced verbatim in the body.

    Both inputs are stripped before comparison. This function looks only at
    ``body`` and ``evidence_plain``: it does NOT check whether the shared
    substring also occurs in the user's oral text or the existing body
    (``evidence_leakage_heuristic`` takes those as extra arguments).

    Args:
        body: Generated narrative text; ``None`` is treated as empty.
        evidence_plain: Plain evidence text; ``None`` is treated as empty.
        min_len: Minimum shared-substring length that counts as a leak risk.

    Returns:
        The length of the longest common substring when it is at least
        ``min_len``; otherwise 0 (including when either input is empty or
        shorter than ``min_len``).
    """
    body = (body or "").strip()
    ev = (evidence_plain or "").strip()
    if not body or not ev:
        return 0
    # Neither side can contain a min_len-long shared run if it is shorter
    # than min_len itself, so skip the quadratic comparison.
    if min(len(body), len(ev)) < min_len:
        return 0
    return longest_common_substring_len(body, ev, min_len=min_len)
|
||||
|
||||
|
||||
def longest_common_substring(a: str, b: str) -> str:
    """Return the longest substring shared by ``a`` and ``b``.

    ``None`` inputs are treated as empty strings. If either input exceeds
    8000 characters the comparison is skipped and ``""`` is returned, which
    bounds the cost of the dynamic programme. When several shared substrings
    have the same maximal length, the one that ends earliest in ``a`` wins.
    """
    left = a or ""
    right = b or ""
    if max(len(left), len(right)) > 8000:
        return ""

    width = len(right) + 1
    longest = 0
    end_in_left = 0
    previous_row = [0] * width
    for row, left_char in enumerate(left, start=1):
        current_row = [0] * width
        for col, right_char in enumerate(right, start=1):
            if left_char != right_char:
                continue
            # Extend the run that ended on the previous diagonal cell.
            run = previous_row[col - 1] + 1
            current_row[col] = run
            if run > longest:
                longest, end_in_left = run, row
        previous_row = current_row

    if longest <= 0:
        return ""
    return left[end_in_left - longest:end_in_left]
|
||||
|
||||
|
||||
# Concrete occasion / time-of-event words: they seep easily from the
# "related excerpts" into the body, but are too short for the long-LCS
# check to catch.
EVIDENCE_SCENE_ANCHOR_TOKENS: tuple[str, ...] = (
    "聚餐",
    "酒席",
    "酒桌",
    "宴会",
    "宴席",
    "当晚",
    "那晚",
    "昨夜",
    "前一晚",
    "前一天晚上",
)


def evidence_scene_anchor_leak(
    body: str,
    evidence_plain: str,
    oral: str,
    existing: str,
) -> bool:
    """Detect a scene-anchor word that leaked from the evidence into the body.

    All four inputs are stripped (``None`` counts as empty). The result is
    True when some token of ``EVIDENCE_SCENE_ANCHOR_TOKENS`` occurs in both
    ``body`` and ``evidence_plain`` while occurring in neither ``oral`` nor
    ``existing``. An empty body or empty evidence always yields False.
    These tokens are matched directly because they are too short to reach
    the LCS length threshold.
    """
    narrative = (body or "").strip()
    excerpts = (evidence_plain or "").strip()
    if not (narrative and excerpts):
        return False

    # Text the narrative is allowed to draw scene words from.
    grounded = "\n".join(((oral or "").strip(), (existing or "").strip()))
    return any(
        anchor in narrative and anchor not in grounded and anchor in excerpts
        for anchor in EVIDENCE_SCENE_ANCHOR_TOKENS
    )
|
||||
|
||||
|
||||
def _has_ungrounded_common_run(
    a: str, b: str, min_len: int, sources: tuple[str, ...]
) -> bool:
    """Return True if a maximal common run of ``a`` and ``b`` (>= ``min_len``
    characters) occurs in none of ``sources``."""

    def ungrounded(fragment: str) -> bool:
        return not any(fragment in source for source in sources)

    len_a, len_b = len(a), len(b)
    # Walk every diagonal (fixed i - j) of the comparison grid; a maximal
    # common substring is an unbroken run of matches along one diagonal.
    for offset in range(1 - len_b, len_a):
        start_a = max(offset, 0)
        start_b = start_a - offset
        span = min(len_a - start_a, len_b - start_b)
        if span < min_len:
            continue  # this diagonal is too short to hold a qualifying run
        seg_a = a[start_a:start_a + span]
        seg_b = b[start_b:start_b + span]
        run = 0
        for pos, (ch_a, ch_b) in enumerate(zip(seg_a, seg_b)):
            if ch_a == ch_b:
                run += 1
                continue
            if run >= min_len and ungrounded(seg_a[pos - run:pos]):
                return True
            run = 0
        # A run may extend to the end of the diagonal.
        if run >= min_len and ungrounded(seg_a[span - run:]):
            return True
    return False


def evidence_leakage_heuristic(
    body: str,
    evidence_plain: str,
    oral: str,
    existing: str,
    min_len: int,
) -> bool:
    """Decide whether the body copied a long passage from the evidence text.

    ``body`` and ``evidence_plain`` are stripped before comparison. Every
    maximal common substring of at least ``min_len`` characters is checked,
    not just the single longest one, so a long overlap that is legitimately
    present in ``oral`` / ``existing`` cannot hide a shorter leaked passage
    that still reaches the threshold.

    Args:
        body: Generated narrative text; ``None`` is treated as empty.
        evidence_plain: Plain evidence text; ``None`` is treated as empty.
        oral: The user's oral text; ``None`` is treated as empty.
        existing: The previously stored body; ``None`` is treated as empty.
        min_len: Minimum shared-substring length that counts as a leak.

    Returns:
        True when some shared substring of at least ``min_len`` characters
        occurs in neither ``oral`` nor ``existing``. False otherwise, and
        also False when either text is empty or longer than 8000 characters
        (the quadratic comparison is skipped for oversized inputs).
    """
    body = (body or "").strip()
    ev = (evidence_plain or "").strip()
    if not body or not ev:
        return False
    # Same size cap as the LCS helpers: oversized inputs are not compared.
    if len(body) > 8000 or len(ev) > 8000:
        return False
    # A non-positive threshold degenerates to "any shared character".
    threshold = max(min_len, 1)
    if min(len(body), len(ev)) < threshold:
        return False
    return _has_ungrounded_common_run(
        body, ev, threshold, (oral or "", existing or "")
    )
|
||||
|
||||
|
||||
def strip_evidence_for_overlap_check(evidence_text: str) -> str:
    """Drop marker lines from the evidence text before overlap checks.

    A line is discarded when, after stripping surrounding whitespace, it
    starts with ``[chunk_id=`` or ``[摘要:``. All other lines are kept
    unchanged, re-joined with newlines, and the final result is stripped.
    ``None`` or empty input yields ``""``.
    """
    noise_prefixes = ("[chunk_id=", "[摘要:")
    kept = [
        raw_line
        for raw_line in (evidence_text or "").splitlines()
        if not raw_line.strip().startswith(noise_prefixes)
    ]
    return "\n".join(kept).strip()
|
||||
Reference in New Issue
Block a user