"""访谈/资料追问:回复条数与单条字数硬限制(不靠长 prompt)。""" from __future__ import annotations import re def strip_markdown_for_chat(text: str) -> str: """ 将模型偶然输出的常见 Markdown 剥成纯文本,供 App 聊天气泡展示。 保留换行与字面量 [SPLIT];不做完整 MD 解析,以简单可预测为主。 """ if not text: return text s = text # 围栏代码块(含首行语言标记):整段替换为块内正文,去掉栅栏 s = re.sub( r"```(?:[^\n`]*)\n([\s\S]*?)```", r"\1", s, flags=re.MULTILINE, ) s = s.replace("```", "") # 图片 ![alt](url) → alt;链接 [label](url) → label s = re.sub(r"!\[([^\]]*)\]\([^)]*\)", r"\1", s) s = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", s) # ATX 标题 s = re.sub(r"(?m)^#{1,6}\s+", "", s) # 无序列表行首(仅限行首减号/星号/+ 后接空格,避免误判「—」) s = re.sub(r"(?m)^\s*[-*+]\s+", "", s) # 有序列表「数字. 」仅行首 s = re.sub(r"(?m)^\s*\d+\.\s+", "", s) # 粗体/删除线常见标记 s = s.replace("**", "").replace("__", "") s = s.replace("~~", "") # 行内反引号 s = s.replace("`", "") # 孤立 emphasis:*词* 或 _词_(不含跨行) s = re.sub(r"(? str: """ 去掉模型输出的表演性括注(全角「(…)」与半角「(...)」),迭代至不再有可删对。 口述回忆录场景下助理回复几乎不需要夹注;若写成「(约1993年)」等说明也会被删,属产品上有意识取舍, 与禁止「(轻轻笑)」类舞台说明一致。须在 strip_markdown_for_chat 之后调用(链接里的 () 已先处理)。 """ if not text: return text s = text prev: str | None = None while prev != s: prev = s s = re.sub(r"([^)]*)", "", s) s = re.sub(r"\([^)]*\)", "", s) s = re.sub(r"[ \t]{2,}", " ", s) return s.strip() def segments_from_llm_response( response_text: str, *, max_segments: int = 3, min_paragraph_chars: int = 12, ) -> list[str]: """ 优先按字面 [SPLIT] 拆段;若模型只输出一段、但用空行写了多段,再按段落拆。 解决「两段话 + 换行」却未写 [SPLIT] 时仍要拆气泡 / 多段 TTS 的情况。 """ text = strip_markdown_for_chat((response_text or "").strip()) text = strip_parenthetical_asides_for_chat(text) if not text: return [] primary = [p.strip() for p in text.split("[SPLIT]") if p.strip()] if len(primary) > 1: return primary[:max_segments] blob = primary[0] if primary else text if "\n" not in blob: return [blob] paras = [p.strip() for p in re.split(r"\n\s*\n+", blob) if p.strip()] if len(paras) < 2: return [blob] paras = [p for p in paras if len(p) >= min_paragraph_chars] if len(paras) < 2: return [blob] return paras[:max_segments] 
def nonempty_segments_or_fallback(
    segments: list[str],
    *,
    fallback: str,
) -> list[str]:
    """Drop blank segments, substituting one fallback when nothing is left.

    This guarantees the caller always has at least one non-empty text to send
    (the original note says this avoids pushing an empty text over the WS).

    Args:
        segments: Candidate segments; ``None`` or an empty list is accepted.
        fallback: Text used when every segment is empty or whitespace-only.

    Returns:
        The non-blank segments, untouched and in order. Otherwise a
        single-item list holding the stripped fallback, or an ellipsis when
        the fallback is blank too.
    """
    # `segments or []` mirrors truncate_chat_segments, which also accepts a
    # falsy `segments` value.
    cleaned = [seg for seg in (segments or []) if (seg or "").strip()]
    if cleaned:
        return cleaned
    fb = (fallback or "").strip()
    return [fb] if fb else ["…"]


def truncate_chat_segments(
    segments: list[str],
    *,
    max_segments: int,
    max_chars_per_segment: int,
) -> list[str]:
    """Keep the first ``max_segments`` segments and cap the length of each.

    Length is counted in characters, not bytes, so CJK text is measured the
    way a reader sees it.

    Args:
        segments: Candidate segments; ``None`` or an empty list yields ``[]``.
        max_segments: Number of leading segments to consider. Blank segments
            inside that window are skipped and not replaced by later ones.
        max_chars_per_segment: Maximum characters per returned segment,
            including the trailing ellipsis added on truncation.

    Returns:
        Stripped, non-empty segments no longer than ``max_chars_per_segment``.
        A non-positive limit of either kind yields an empty list.
    """
    # A non-positive limit would turn the slices below into negative slices
    # ("all but the last ..."), silently breaking both caps.
    if not segments or max_segments <= 0 or max_chars_per_segment <= 0:
        return []
    out: list[str] = []
    for raw in segments[:max_segments]:
        s = (raw or "").strip()
        if not s:
            continue
        if len(s) > max_chars_per_segment:
            # Reserve one character for the ellipsis so the total stays
            # within the cap.
            s = s[: max_chars_per_segment - 1].rstrip() + "…"
        out.append(s)
    return out