Files
life-echo/api/app/agents/chat/reply_limits.py

148 lines
5.0 KiB
Python
Raw Normal View History

"""访谈/资料追问:回复条数与单条字数硬限制(不靠长 prompt"""
from __future__ import annotations
import re
def strip_markdown_for_chat(text: str) -> str:
    """Strip common Markdown that the model occasionally emits, leaving plain text.

    Intended for chat-bubble display. Line breaks are preserved, and the
    literal ``[SPLIT]`` marker is left alone (unless it is directly followed
    by a parenthesised group, which would read as a link). This is a handful
    of targeted substitutions rather than a Markdown parser: simple and
    predictable is preferred over complete.

    Args:
        text: Raw model output.

    Returns:
        ``text`` with code fences, images/links, ATX headings, list markers,
        bold/strike/inline-code/emphasis markers and ``---`` rules removed.
        Falsy input is returned unchanged.
    """
    if not text:
        return text
    s = text
    # Fenced code block (optionally with a language tag on the opening line):
    # keep the inner text, drop the fences.
    s = re.sub(r"```(?:[^\n`]*)\n([\s\S]*?)```", r"\1", s)
    # Any unmatched fence that is left over.
    s = s.replace("```", "")
    # Image ![alt](url) -> alt; link [label](url) -> label.
    s = re.sub(r"!\[([^\]]*)\]\([^)]*\)", r"\1", s)
    s = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", s)
    # NOTE: the line-start patterns below deliberately use [ \t] instead of
    # \s. Under (?m), "^\s*" can start on an empty line and consume its
    # newline, which would swallow the blank line before a heading/list item
    # and merge two paragraphs into one.
    # ATX headings.
    s = re.sub(r"(?m)^#{1,6}[ \t]+", "", s)
    # Unordered list marker: only -, * or + at line start followed by a space
    # (so a dash used as punctuation is not touched).
    s = re.sub(r"(?m)^[ \t]*[-*+][ \t]+", "", s)
    # Ordered list marker "N. " at line start only.
    s = re.sub(r"(?m)^[ \t]*\d+\.[ \t]+", "", s)
    # Bold / strikethrough markers.
    s = s.replace("**", "").replace("__", "")
    s = s.replace("~~", "")
    # Inline code backticks.
    s = s.replace("`", "")
    # Remaining single-marker emphasis, *word* or _word_, within one line.
    s = re.sub(r"(?<![*])\*([^*\n]+)\*(?![*])", r"\1", s)
    s = re.sub(r"(?<![_])_([^_\n]+)_(?![_])", r"\1", s)
    # Horizontal rule.
    s = re.sub(r"(?m)^\s*---+\s*$", "", s)
    return s
def strip_parenthetical_asides_for_chat(text: str) -> str:
    """Remove parenthetical asides, both full-width and half-width.

    Removal is repeated until a pass changes nothing. In the oral-memoir
    setting the assistant's replies almost never need parentheticals, so even
    explanatory notes such as "(circa 1993)" are removed; that is a deliberate
    product trade-off, in line with banning stage directions like
    "(chuckles softly)".

    Call this after ``strip_markdown_for_chat`` so that the ``(url)`` part of
    Markdown links has already been dealt with.

    Args:
        text: Chat text, already stripped of Markdown.

    Returns:
        The text without parenthesised spans, with runs of spaces/tabs
        collapsed to one space and outer whitespace trimmed. Falsy input is
        returned unchanged.
    """
    if not text:
        return text
    s = text
    while True:
        before = s
        # Full-width pair U+FF08 ... U+FF09, written as escapes so the
        # pattern survives tools that mangle non-ASCII punctuation.
        s = re.sub(r"\uff08[^\uff09]*\uff09", "", s)
        # Half-width ASCII pair.
        s = re.sub(r"\([^)]*\)", "", s)
        # Removing an aside can leave doubled spaces behind.
        s = re.sub(r"[ \t]{2,}", " ", s)
        if s == before:
            break
    return s.strip()
def strip_leading_en_period_ack_for_chat(text: str) -> str:
    """Drop a stiff leading "嗯。" acknowledgement from the start of the text.

    The acknowledgement may be repeated ("嗯。嗯嗯。") and is removed even when
    real content follows. Only the very beginning of the string is touched;
    occurrences in the middle of a sentence are left alone. A bare "嗯" that
    is not followed by a period is kept.

    Args:
        text: One chat segment; ``None`` is treated as empty.

    Returns:
        The trimmed text without the leading acknowledgement(s).
    """
    s = (text or "").strip()
    if not s:
        return s
    # One or more "嗯", then one or more periods, then optional whitespace,
    # repeated. Accepted periods: ideographic U+3002, full-width U+FF0E and
    # ASCII "."; escapes are used so the pattern survives tools that mangle
    # non-ASCII punctuation.
    stripped = re.sub(r"^(?:\u55ef+[\u3002\uff0e.]+\s*)+", "", s)
    return stripped.strip()
def segments_from_llm_response(
    response_text: str,
    *,
    max_segments: int = 3,
    min_paragraph_chars: int = 12,
) -> list[str]:
    """Split one model response into chat segments.

    The literal ``[SPLIT]`` marker takes priority. If that yields at most one
    segment but the text contains blank-line-separated paragraphs, it is split
    by paragraph instead. This covers the case where the model wrote several
    paragraphs with line breaks but forgot the ``[SPLIT]`` marker.

    Args:
        response_text: Raw model output; ``None`` is treated as empty.
        max_segments: Upper bound on the number of segments returned when a
            split happens.
        min_paragraph_chars: Paragraphs shorter than this are discarded when
            splitting by blank lines; if fewer than two remain, the whole text
            is returned as a single segment.

    Returns:
        A list of segments; empty if the cleaned text is empty.
    """
    cleaned = strip_parenthetical_asides_for_chat(
        strip_markdown_for_chat((response_text or "").strip())
    )
    if not cleaned:
        return []

    # First choice: explicit [SPLIT] markers.
    marker_parts = []
    for piece in cleaned.split("[SPLIT]"):
        trimmed = strip_leading_en_period_ack_for_chat(piece)
        if trimmed.strip():
            marker_parts.append(trimmed)
    if len(marker_parts) > 1:
        return marker_parts[:max_segments]

    # Zero or one marker segment: treat the text as a single blob.
    if marker_parts:
        whole = marker_parts[0]
    else:
        whole = strip_leading_en_period_ack_for_chat(cleaned)
    whole = strip_leading_en_period_ack_for_chat(whole)
    if "\n" not in whole:
        return [whole]

    # Second choice: paragraphs separated by blank lines.
    paragraphs = []
    for chunk in re.split(r"\n\s*\n+", whole):
        trimmed = strip_leading_en_period_ack_for_chat(chunk)
        if trimmed.strip():
            paragraphs.append(trimmed)
    if len(paragraphs) < 2:
        return [whole]

    long_enough = [p for p in paragraphs if len(p) >= min_paragraph_chars]
    if len(long_enough) < 2:
        return [whole]
    return long_enough[:max_segments]
def nonempty_segments_or_fallback(
    segments: list[str],
    *,
    fallback: str,
) -> list[str]:
    """Drop blank segments; if nothing is left, return a single fallback entry.

    Args:
        segments: Candidate segments; ``None`` or whitespace-only entries are
            discarded, the others are kept as they are (not trimmed).
        fallback: Text to use when every segment is blank. It is trimmed; if
            it is empty as well, the result is ``[""]``.

    Returns:
        A new, never-empty list of segments.
    """
    kept = []
    for segment in segments:
        if segment and segment.strip():
            kept.append(segment)
    if kept:
        return kept
    # Everything was blank: fall back to the trimmed default, which may
    # itself be the empty string.
    default_text = fallback.strip() if fallback else ""
    return [default_text]
def truncate_chat_segments(
    segments: list[str],
    *,
    max_segments: int,
    max_chars_per_segment: int,
) -> list[str]:
    """Keep the first ``max_segments`` segments and cap each one's length.

    Length is counted in characters (code points), which suits Chinese text.
    A segment that is too long is cut and ends with a single ellipsis
    character, so the result never exceeds ``max_chars_per_segment``.

    Args:
        segments: Candidate segments; blank or ``None`` entries are skipped
            (they still count toward ``max_segments``).
        max_segments: Number of leading entries of ``segments`` to consider.
        max_chars_per_segment: Maximum length of each returned segment.

    Returns:
        The trimmed, possibly truncated segments. Empty when ``segments`` is
        empty or either limit is not positive.
    """
    # A non-positive limit would turn the slices below into negative slices
    # ("everything but the last N"), which is the opposite of a cap.
    if not segments or max_segments <= 0 or max_chars_per_segment <= 0:
        return []
    out: list[str] = []
    for raw in segments[:max_segments]:
        s = (raw or "").strip()
        if not s:
            continue
        if len(s) > max_chars_per_segment:
            # Reserve one character for the ellipsis (U+2026) so the total
            # length stays within the limit.
            s = s[: max_chars_per_segment - 1].rstrip() + "\u2026"
        out.append(s)
    return out