2026-03-26 12:13:36 +08:00
|
|
|
|
"""访谈/资料追问:回复条数与单条字数硬限制(不靠长 prompt)。"""
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
2026-03-27 16:01:28 +08:00
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-03 14:06:55 +08:00
|
|
|
|
def strip_markdown_for_chat(text: str) -> str:
    """Strip common Markdown markup so the text can be shown in a chat bubble.

    This is a deliberately simple, regex-based pass rather than a full
    Markdown parser. Line breaks (including the blank lines that separate
    paragraphs) and the literal ``[SPLIT]`` marker are preserved.

    Args:
        text: Raw model output. May be empty.

    Returns:
        ``text`` itself when it is falsy; otherwise the text with fences,
        links/images, headings, list markers, bold/strike/emphasis markers,
        backticks and ``---`` rules removed.
    """
    if not text:
        return text

    s = text

    # Fenced code blocks (optionally with a language tag on the opening
    # fence): keep the inner text, drop the fences. Any unmatched fence
    # that is left over is removed afterwards.
    s = re.sub(r"```(?:[^\n`]*)\n([\s\S]*?)```", r"\1", s)
    s = s.replace("```", "")

    # Images ![alt](url) -> alt, then links [label](url) -> label.
    # Images must go first, otherwise the link rule would leave a stray "!".
    s = re.sub(r"!\[([^\]]*)\]\([^)]*\)", r"\1", s)
    s = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", s)

    # The line-anchored rules below use ``[^\S\n]`` (whitespace except
    # newline) instead of ``\s``. With ``\s`` a match could start on a blank
    # line and run across the newline, silently deleting paragraph breaks.

    # ATX headings.
    s = re.sub(r"(?m)^#{1,6}[^\S\n]+", "", s)
    # Unordered list markers: only -, * or + at line start followed by
    # whitespace, so a dash used as punctuation is not touched.
    s = re.sub(r"(?m)^[^\S\n]*[-*+][^\S\n]+", "", s)
    # Ordered list markers ("1. ") at line start only.
    s = re.sub(r"(?m)^[^\S\n]*\d+\.[^\S\n]+", "", s)

    # Bold / strikethrough markers.
    s = s.replace("**", "").replace("__", "")
    s = s.replace("~~", "")
    # Inline code backticks.
    s = s.replace("`", "")

    # Single-line emphasis: *word* or _word_.
    # The asterisk form requires non-space text right inside both markers,
    # so spaced arithmetic such as "2 * 3 * 4" is left alone.
    s = re.sub(r"(?<!\*)\*(?=\S)([^*\n]+?)(?<=\S)\*(?!\*)", r"\1", s)
    # The underscore form must not sit next to an ASCII word character, so
    # snake_case identifiers keep their underscores. Only ASCII is checked
    # on purpose: emphasis embedded in CJK text is still stripped.
    s = re.sub(r"(?<![A-Za-z0-9_])_([^_\n]+)_(?![A-Za-z0-9_])", r"\1", s)

    # Horizontal rules made of three or more dashes; the line is emptied but
    # its newline stays, so it still acts as a paragraph break.
    s = re.sub(r"(?m)^[^\S\n]*---+[^\S\n]*$", "", s)
    return s
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-27 16:01:28 +08:00
|
|
|
|
def segments_from_llm_response(
    response_text: str,
    *,
    max_segments: int = 3,
    min_paragraph_chars: int = 12,
) -> list[str]:
    """Split a model reply into chat segments.

    The literal ``[SPLIT]`` marker takes priority. When the reply contains
    no usable marker but is written as several paragraphs separated by blank
    lines, it is split on those paragraphs instead, so that a reply made of
    "two paragraphs plus a newline" still yields separate bubbles / TTS
    segments.

    Args:
        response_text: Raw model reply; falsy values are treated as empty.
        max_segments: Upper bound on the number of segments returned when a
            split happens.
        min_paragraph_chars: Paragraphs shorter than this are discarded
            during the blank-line split; if fewer than two paragraphs remain
            the reply is returned as a single segment instead.

    Returns:
        An empty list for an empty reply, otherwise the list of segments.
    """
    cleaned = strip_markdown_for_chat((response_text or "").strip())
    if not cleaned:
        return []

    # Explicit split: keep only the non-blank pieces around [SPLIT].
    explicit: list[str] = []
    for piece in cleaned.split("[SPLIT]"):
        piece = piece.strip()
        if piece:
            explicit.append(piece)
    if len(explicit) > 1:
        return explicit[:max_segments]

    whole = explicit[0] if explicit else cleaned

    # Fallback split on blank lines; only used when it produces at least two
    # paragraphs that are each long enough to stand alone.
    if "\n" in whole:
        trimmed = (chunk.strip() for chunk in re.split(r"\n\s*\n+", whole))
        nonblank = [chunk for chunk in trimmed if chunk]
        if len(nonblank) >= 2:
            long_enough = [
                chunk for chunk in nonblank if len(chunk) >= min_paragraph_chars
            ]
            if len(long_enough) >= 2:
                return long_enough[:max_segments]

    return [whole]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def nonempty_segments_or_fallback(
    segments: list[str],
    *,
    fallback: str,
) -> list[str]:
    """Drop blank segments, substituting a fallback when nothing is left.

    Guarantees a non-empty result so that an empty text is never sent
    downstream.

    Args:
        segments: Candidate segments; falsy or whitespace-only entries are
            removed, the remaining entries are returned untouched.
        fallback: Text used when every segment is blank.

    Returns:
        The surviving segments, or a one-element list holding the stripped
        ``fallback`` (or "…" when the fallback is blank as well).
    """
    kept: list[str] = []
    for seg in segments:
        if seg and seg.strip():
            kept.append(seg)
    if kept:
        return kept

    default_text = fallback.strip() if fallback else ""
    if default_text:
        return [default_text]
    return ["…"]
|
|
|
|
|
|
|
2026-03-26 12:13:36 +08:00
|
|
|
|
|
|
|
|
|
|
def truncate_chat_segments(
    segments: list[str],
    *,
    max_segments: int,
    max_chars_per_segment: int,
) -> list[str]:
    """Cap the number of segments and the length of each one.

    Lengths are counted in characters (code points), which suits CJK text.
    Blank segments are skipped and do not count toward ``max_segments``.

    Args:
        segments: Candidate segments; falsy or whitespace-only entries are
            ignored.
        max_segments: Maximum number of segments to return.
        max_chars_per_segment: Maximum length of each returned segment,
            including the trailing ellipsis added on truncation.

    Returns:
        Up to ``max_segments`` stripped, non-empty segments, each at most
        ``max_chars_per_segment`` characters long. An empty list when there
        is no input or when either limit is not positive (nothing can fit).
    """
    # Non-positive limits would turn the slices below into "count from the
    # end" slices and return over-long text, so bail out explicitly.
    if not segments or max_segments <= 0 or max_chars_per_segment <= 0:
        return []

    out: list[str] = []
    for raw in segments:
        s = (raw or "").strip()
        if not s:
            continue
        if len(s) > max_chars_per_segment:
            # Reserve one character for the ellipsis so the total length
            # stays within the limit.
            s = s[: max_chars_per_segment - 1].rstrip() + "…"
        out.append(s)
        # Apply the cap after filtering so blanks do not use up the quota.
        if len(out) >= max_segments:
            break
    return out
|