feat(i18n): persist language preference and thread through chat, memoir, TTS

- Add users.language_preference (Alembic 0018, default zh); capture at signup/SMS only; expose on auth and profile APIs
- Lite English prompts for chat and memoir; localized stage labels and agent names (Life Echo / 岁月知己)
- Tencent TTS: language-aware synthesis, ModelType=1 for 501004, English chunking
- WebSocket pipeline: emit all AGENT_RESPONSE segments when TTS cancels; INFO logs for tts_this_turn and TTS decisions; on-demand TTS logging
- Expo: device language on auth, i18n tiers/agent name, [SPLIT] streaming UX fixes
- Tests for migration, prompts, pipeline, router tts_this_turn, reply segments

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-11 16:16:49 +08:00
parent 5ce29aad64
commit ccdc4e4277
64 changed files with 3233 additions and 208 deletions
--- a/api/app/agents/chat/reply_limits.py
+++ b/api/app/agents/chat/reply_limits.py
@@ -4,11 +4,35 @@ from __future__ import annotations

 import re

+# Zero-width characters (ZWSP/ZWNJ/ZWJ/BOM) that the LLM occasionally injects around [SPLIT]; they must be removed before splitting
+_ZERO_WIDTH_RE = re.compile(r"[\u200B-\u200D\uFEFF]")
+
+# Kept aligned with the client's `message-split.ts`: lenient match for [SPLIT] / [ SPLIT ] / [split], etc.
+# Full-width bracket variants 【】 / ［］ are rewritten to ASCII by _normalize_split_markers before this regex is applied
+SPLIT_MARKER_RE = re.compile(r"\[\s*SPLIT\s*\]", re.IGNORECASE)
+
+
+def _normalize_split_markers(text: str) -> str:
+    """Rewrite tolerated [SPLIT] marker variants to the canonical ``[SPLIT]``.
+
+    Only the marker is touched: zero-width characters (ZWSP/ZWNJ/ZWJ/BOM) inside
+    or directly next to it are dropped and full-width brackets are folded to
+    ASCII. Text elsewhere is returned unchanged, so ZWJ emoji sequences and
+    ordinary full-width brackets in the reply survive. Empty input is returned as is.
+    """
+    if not text:
+        return text
+    zw = "\u200b-\u200d\ufeff"  # zero-width characters, used as a character-class body
+    word = f"[{zw}]*".join("SPLIT")  # tolerate zero-width characters between the letters
+    variant = rf"[{zw}]*[\[\uff3b\u3010][\s{zw}]*{word}[\s{zw}]*[\]\uff3d\u3011][{zw}]*"
+    return re.sub(variant, "[SPLIT]", text, flags=re.IGNORECASE)
+

 def strip_markdown_for_chat(text: str) -> str:
    """
    将模型偶然输出的常见 Markdown 剥成纯文本，供 App 聊天气泡展示。
-    保留换行与字面量 [SPLIT]；不做完整 MD 解析，以简单可预测为主。
+    保留换行与字面量 [SPLIT]（实际拆段由 `segments_from_llm_response` 用宽松正则完成，
+    支持 `[ SPLIT ]`、`[split]`、`【SPLIT】` 等变体）。不做完整 MD 解析，以简单可预测为主。
    """
    if not text:
        return text
@@ -82,21 +106,24 @@ def segments_from_llm_response(
    min_paragraph_chars: int = 12,
 ) -> list[str]:
    """
-    优先按字面 [SPLIT] 拆段；若模型只输出一段、但用空行写了多段，再按段落拆。
-    解决「两段话 + 换行」却未写 [SPLIT] 时仍要拆气泡 / 多段 TTS 的情况。
+    优先按 [SPLIT] 标记拆段（容错：大小写、内部空白、全角中括号、零宽字符均视作分隔符）；
+    若模型只输出一段、但用空行写了多段，再按段落拆。
+    解决「两段话 + 换行」却未写 [SPLIT] 时仍要拆气泡 / 多段 TTS 的情况，
+    并避免后端 literal split 与前端容错正则不一致时把字面 `[ SPLIT ]` 留在文本里。
    """
    text = strip_markdown_for_chat((response_text or "").strip())
    text = strip_parenthetical_asides_for_chat(text)
    if not text:
        return []
+    normalized = _normalize_split_markers(text)
    primary = [
        strip_leading_en_period_ack_for_chat(p)
-        for p in text.split("[SPLIT]")
+        for p in SPLIT_MARKER_RE.split(normalized)
        if strip_leading_en_period_ack_for_chat(p).strip()
    ]
    if len(primary) > 1:
        return primary[:max_segments]
-    blob = primary[0] if primary else strip_leading_en_period_ack_for_chat(text)
+    blob = primary[0] if primary else strip_leading_en_period_ack_for_chat(normalized)
    blob = strip_leading_en_period_ack_for_chat(blob)
    if "\n" not in blob:
        return [blob]