Files
life-echo/api/app/agents/chat/utterance_substance.py
Kevin 07c6478742 feat(api): 访谈路径轻量门控、Memoir Phase1 批处理与叙事/记忆管线加固
- 新增 utterance_substance:短时/应答/元话语可跳过记忆检索、阶段 LLM 与资料抽取 LLM;可配置
- 输入归一化:LLM 模式默认仅语音/ASR;配置项写入 .env.example
- Memoir Phase1:可选 batch LLM 一次性抽取+分类(失败回退逐段);Extraction 空槽位时阶段与 current_stage 对齐,prompt 约束收紧
- 叙事与忠实度:narrative_safety、证据重叠/场合锚点、标题 slots 与履历短语 grounded;fidelity 解析失败 fail-open 可配置
- 章节管线:锁 TTL 上调、锁竞争 Celery 重试、Phase2 immediate singleflight 等;story_pipeline_sync / chapter_compose / memoir_tasks 联动
- Memory:compaction / repo / summarizer / evidence 小修;事实 FTS 未命中是否回退最近事实可配置
- 新增 memoir_pipeline_trace;补充 memoir_reliability 文档与多项回归/门控测试
2026-04-03 10:12:59 +08:00

74 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
启发式判断访谈「本轮」是否值得跑阶段 LLM / 记忆检索等高成本步骤。
短答、应答词、元话语(谈整理回忆本身而非人生经历)为 False;长文本或中等长度非常用词为 True。
与配置 `chat_substantive_*` 配合;关闭启发式时恒为 True。
"""
from __future__ import annotations
import re
from typing import Final
from app.core.config import settings
# Very short acknowledgements, matched exactly against the whole utterance.
# NOTE(review): the empty-string entries look like single-character replies
# whose text was lost in extraction -- confirm against the repository copy.
_SHORT_ACK_EXACT: Final[frozenset[str]] = frozenset(
    (
        "", "", "", "",
        "行的", "是的", "没有",
        "", "", "",
        "好吧", "嗯嗯", "对对", "好嘞", "对的", "没了",
        "可以", "就这样", "还行", "还好",
    )
)
# Meta-talk: remarks about the recollection process or the interview itself,
# which are not enough to switch life stage or pull memory evidence.
_META_PROCESS: Final[re.Pattern[str]] = re.compile(
    "|".join(
        (
            r"(回忆|想起).{0,20}(细节|收获|快忘|忘的|很多东西)",
            r"(整理|聊聊|谈到).{0,8}(回忆|访谈|记录)",
            r"最大的收获",
        )
    ),
    flags=re.UNICODE,
)
def should_run_chat_stage_memory_heavy_work(text: str) -> bool:
    """Decide whether this interview turn warrants the high-cost steps.

    Args:
        text: The user's utterance for the current turn. ``None`` or an
            empty value is treated as an empty string.

    Returns:
        True when the turn is worth calling the stage-detection LLM, memory
        retrieval (vector search) and similar work; False when the caller
        should rely on the keyword-based stage fallback and skip memory
        retrieval.

    Decision order:
        1. Heuristic disabled in settings -> always True.
        2. Blank after stripping -> False.
        3. Matches the meta-talk pattern -> False, whatever its length.
        4. At least ``settings.chat_substantive_min_chars`` long -> True.
        5. Exact short acknowledgement -> False.
        6. Otherwise True only when the text has 5 or more characters.
    """
    if not settings.chat_substantive_heuristic_enabled:
        return True

    stripped = (text or "").strip()
    if not stripped:
        return False

    # Meta-talk can be fairly long, so it has to be tested before the
    # "reached min_chars" shortcut below.
    if _META_PROCESS.search(stripped):
        return False

    if len(stripped) >= int(settings.chat_substantive_min_chars):
        return True

    if stripped in _SHORT_ACK_EXACT:
        return False

    # Short text that hit none of the noise rules: five characters is commonly
    # an informative short sentence (an earlier >= 6 cut-off dropped those).
    # Anything of four characters or fewer is rejected here, which is why the
    # former per-character filler scan for such strings was redundant and has
    # been removed.
    return len(stripped) >= 5