Files
life-echo/api/app/agents/chat/reply_limits.py
Kevin df6eafeae2 feat(chat): host-style memoir prompts and strip parenthetical stage directions
- Add strip_parenthetical_asides_for_chat in reply pipeline before [SPLIT]
- Expand output_rules bans (performance parens) and voice as warm host
- Refocus opening/guided prompts on pulling conversation toward memoir oral history
- Align interview opening fallbacks with memoir-first tone
- Add unit tests for parenthetical stripping
2026-04-10 13:55:56 +08:00

126 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""访谈/资料追问:回复条数与单条字数硬限制(不靠长 prompt"""
from __future__ import annotations
import re
def strip_markdown_for_chat(text: str) -> str:
    """
    Reduce common Markdown that the model occasionally emits to plain text.

    The result is meant for display in chat bubbles. Line breaks are kept, and
    no rule targets the literal ``[SPLIT]`` marker. This is a deliberately
    simple, predictable list of substitutions, not a full Markdown parser.

    Args:
        text: Raw model output. A falsy value is returned unchanged.

    Returns:
        The text with code fences, image/link syntax, ATX heading markers,
        list markers, bold/strikethrough/emphasis markers, inline backticks
        and horizontal rules removed.
    """
    if not text:
        return text
    s = text
    # Fenced code block (optionally with a language tag on the opening line):
    # keep the inner text, drop the fences.
    s = re.sub(r"```(?:[^\n`]*)\n([\s\S]*?)```", r"\1", s)
    # Any fence that did not form a complete block.
    s = s.replace("```", "")
    # Image ![alt](url) -> alt; link [label](url) -> label.
    s = re.sub(r"!\[([^\]]*)\]\([^)]*\)", r"\1", s)
    s = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", s)
    # ATX headings.
    s = re.sub(r"(?m)^#{1,6}\s+", "", s)
    # Unordered list markers: only -, * or + at line start followed by
    # whitespace. Leading indentation is limited to spaces/tabs; `\s*` would
    # also match the newline of a preceding blank line and silently merge
    # paragraphs.
    s = re.sub(r"(?m)^[ \t]*[-*+]\s+", "", s)
    # Ordered list markers ("1. ") at line start, same indentation rule.
    s = re.sub(r"(?m)^[ \t]*\d+\.\s+", "", s)
    # Bold / strikethrough markers.
    s = s.replace("**", "").replace("__", "")
    s = s.replace("~~", "")
    # Inline backticks.
    s = s.replace("`", "")
    # Remaining single-marker emphasis (*word* or _word_), never across lines.
    s = re.sub(r"(?<![*])\*([^*\n]+)\*(?![*])", r"\1", s)
    s = re.sub(r"(?<![_])_([^_\n]+)_(?![_])", r"\1", s)
    # Horizontal rules.
    s = re.sub(r"(?m)^\s*---+\s*$", "", s)
    return s
def strip_parenthetical_asides_for_chat(text: str) -> str:
    """
    Remove performative parenthetical asides from model output.

    Both full-width (U+FF08 ... U+FF09) and ASCII ``(...)`` pairs are deleted,
    repeating until a pass removes nothing more. Every parenthesised span is
    removed, including explanatory notes; per the original design note this is
    an intentional product trade-off, in line with banning stage directions in
    assistant replies.

    Intended to run after ``strip_markdown_for_chat`` so that the ``()`` of
    Markdown links has already been consumed.

    Args:
        text: Text to clean. A falsy value is returned unchanged.

    Returns:
        The text without parenthesised spans, with runs of spaces/tabs
        collapsed to one space and surrounding whitespace stripped.
    """
    if not text:
        return text
    s = text
    prev: str | None = None
    while prev != s:
        prev = s
        # Full-width pair, spelled with \u escapes so the pattern stays intact
        # even when tooling drops or flags the literal characters.
        s = re.sub(r"\uff08[^\uff09]*\uff09", "", s)
        # ASCII pair.
        s = re.sub(r"\([^)]*\)", "", s)
    # Deleting an aside usually leaves doubled spaces behind.
    s = re.sub(r"[ \t]{2,}", " ", s)
    return s.strip()
def segments_from_llm_response(
    response_text: str,
    *,
    max_segments: int = 3,
    min_paragraph_chars: int = 12,
) -> list[str]:
    """
    Split a model reply into chat segments.

    The literal ``[SPLIT]`` marker takes priority. If the model produced a
    single segment but separated paragraphs with blank lines, the text is
    split on those paragraphs instead, so a reply written as "two paragraphs
    plus a line break" without ``[SPLIT]`` still yields several segments.

    Args:
        response_text: Raw model output; ``None`` or empty yields ``[]``.
        max_segments: Upper bound on the number of returned segments.
        min_paragraph_chars: Paragraphs shorter than this are left out of a
            paragraph-based split. If fewer than two paragraphs remain, the
            whole text is returned as one segment instead.

    Returns:
        A list of stripped, non-empty segments; empty when the reply has no
        content once Markdown, parenthetical asides and markers are removed.
    """
    text = strip_markdown_for_chat((response_text or "").strip())
    text = strip_parenthetical_asides_for_chat(text)
    if not text:
        return []
    primary = [p.strip() for p in text.split("[SPLIT]") if p.strip()]
    if not primary:
        # The text consisted solely of [SPLIT] markers; returning it verbatim
        # would show the marker itself to the user.
        return []
    if len(primary) > 1:
        return primary[:max_segments]
    blob = primary[0]
    if "\n" not in blob:
        return [blob]
    # Blank-line separated paragraphs.
    paras = [p.strip() for p in re.split(r"\n\s*\n+", blob) if p.strip()]
    if len(paras) < 2:
        return [blob]
    paras = [p for p in paras if len(p) >= min_paragraph_chars]
    if len(paras) < 2:
        return [blob]
    return paras[:max_segments]
def nonempty_segments_or_fallback(
    segments: list[str],
    *,
    fallback: str,
) -> list[str]:
    """
    Drop blank segments, substituting a single fallback when none remain.

    Segments that are empty or whitespace-only are discarded; the surviving
    ones are returned as given (not stripped). When nothing survives, the
    result is a one-element list holding the stripped fallback, which is the
    empty string if the fallback itself is blank.
    """
    kept: list[str] = []
    for segment in segments:
        if (segment or "").strip():
            kept.append(segment)
    if kept:
        return kept
    # Always hand back exactly one element in the fallback case.
    return [(fallback or "").strip()]
def truncate_chat_segments(
    segments: list[str],
    *,
    max_segments: int,
    max_chars_per_segment: int,
) -> list[str]:
    """
    Keep the first ``max_segments`` segments and cap the length of each one.

    Length is counted in characters (code points), which suits Chinese text.
    A segment over the limit is cut and ends with a single ellipsis character
    (U+2026), so its total length does not exceed the limit.

    Args:
        segments: Candidate segments; blank entries among the first
            ``max_segments`` are skipped, not replaced by later ones.
        max_segments: Number of leading entries to consider.
        max_chars_per_segment: Maximum characters per returned segment.
            Values below 1 are treated as 1, since the ellipsis needs one
            character.

    Returns:
        The stripped, possibly truncated segments.
    """
    if not segments:
        return []
    # Clamp so the slice bound below can never go negative (which would cut
    # from the end of the string instead of limiting its length).
    limit = max(max_chars_per_segment, 1)
    out: list[str] = []
    for raw in segments[:max_segments]:
        s = (raw or "").strip()
        if not s:
            continue
        if len(s) > limit:
            # Reserve one character for the ellipsis.
            s = s[: limit - 1].rstrip() + "\u2026"
        out.append(s)
    return out