feat(i18n): persist language preference and thread through chat, memoir, TTS

- Add users.language_preference (Alembic 0018, default zh); capture at signup/SMS only; expose on auth and profile APIs
- Lite English prompts for chat and memoir; localized stage labels and agent names (Life Echo / 岁月知己)
- Tencent TTS: language-aware synthesis, ModelType=1 for 501004, English chunking
- WebSocket pipeline: emit all AGENT_RESPONSE segments when TTS cancels; INFO logs for tts_this_turn and TTS decisions; on-demand TTS logging
- Expo: device language on auth, i18n tiers/agent name, [SPLIT] streaming UX fixes
- Tests for migration, prompts, pipeline, router tts_this_turn, reply segments

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-11 16:16:49 +08:00
parent 5ce29aad64
commit ccdc4e4277
64 changed files with 3233 additions and 208 deletions
--- a/api/app/adapters/tts/tencent_tts.py
+++ b/api/app/adapters/tts/tencent_tts.py
@@ -19,11 +19,22 @@ VOICE_MAP: dict[str, int] = {
    "shimmer": 1006,
 }

-# 中文 150 字 / 英文 500 字母，取保守值
-MAX_CHARS_PER_REQUEST = 150
+# Tencent TTS API limit per request: ≤150 Chinese characters or ≤500 English letters (English capped at 480 to leave headroom)
+MAX_CHARS_PER_REQUEST_ZH = 150
+MAX_CHARS_PER_REQUEST_EN = 480
+
+# Tencent PrimaryLanguage: 1=中文（含中英混读），2=英文
+PRIMARY_LANGUAGE_ZH = 1
+PRIMARY_LANGUAGE_EN = 2
+
+# Tencent ModelType: 1=新模型（覆盖大模型音色 501xxx 系列与新版精品音色）。
+# 大模型音色（如 501004 月华）必须显式传 ModelType=1，否则可能被旧模型拒绝并返回空音频；
+# 老精品音色（如 1001/101050 等）也接受 ModelType=1，因此无条件设置不会破坏老链路。
+# 文档：https://cloud.tencent.com/document/api/1073/37995
+MODEL_TYPE_LLM = 1


-def _chunk_text(text: str, max_chars: int = MAX_CHARS_PER_REQUEST) -> list[str]:
+def _chunk_text(text: str, max_chars: int = MAX_CHARS_PER_REQUEST_ZH) -> list[str]:
    """Split text into chunks within API limit."""
    text = text.strip()
    if not text:
@@ -66,10 +77,15 @@ class TencentTTSProvider:
        secret_key: str,
        voice_type: int = 1001,
        codec: str = "mp3",
+        voice_type_en: int | None = None,
    ):
        self._secret_id = secret_id
        self._secret_key = secret_key
        self._voice_type = voice_type
+        # 英文音色未单独配置时回落到 501004（月华，腾讯云大模型音色，支持中英混合）。
+        # 大模型音色 501xxx 系列在 PrimaryLanguage=1/2 下均支持中英混读，不会被 Tencent
+        # 以 InvalidParameterValue.PrimaryLanguage 拒绝；与之对应必须配合 ModelType=1。
+        self._voice_type_en = voice_type_en if voice_type_en is not None else 501004
        self._codec = codec
        self._client = None

@@ -93,9 +109,18 @@ class TencentTTSProvider:
            logger.error("Tencent TTS client init failed: {}", e)
            return None

-    def _synthesize_sync(self, text: str, voice_type: int) -> bytes:
+    def _synthesize_sync(
+        self,
+        text: str,
+        voice_type: int,
+        primary_language: int = PRIMARY_LANGUAGE_ZH,
+    ) -> bytes:
        client = self._get_client()
        if not client:
+            logger.warning(
+                "tencent_tts._synthesize_sync no client provider=tencent voice_type={}",
+                voice_type,
+            )
            return b""
        try:
            from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
@@ -107,41 +132,142 @@ class TencentTTSProvider:
            req.Text = text
            req.SessionId = uuid.uuid4().hex
            req.VoiceType = voice_type
-            req.PrimaryLanguage = 1
+            req.PrimaryLanguage = primary_language
            req.SampleRate = 16000
            req.Codec = self._codec
+            # 显式声明使用新模型；大模型音色（501xxx）若不带该字段会被旧模型拒绝并静默返回空音频。
+            req.ModelType = MODEL_TYPE_LLM
+
+            # 长期保留 INFO：TTS 实际请求腾讯云 SDK 时的关键参数
+            logger.info(
+                "tencent_tts._synthesize_sync request voice_type={} primary_language={} "
+                "model_type={} sample_rate={} codec={} text_len={}",
+                voice_type,
+                primary_language,
+                MODEL_TYPE_LLM,
+                req.SampleRate,
+                self._codec,
+                len(text or ""),
+            )

            resp = client.TextToVoice(req)
-            if not resp or not resp.Audio:
+            request_id = getattr(resp, "RequestId", None) if resp is not None else None
+            audio_b64 = getattr(resp, "Audio", "") if resp is not None else ""
+            if not audio_b64:
+                logger.warning(
+                    "tencent_tts._synthesize_sync empty audio voice_type={} "
+                    "primary_language={} model_type={} request_id={}",
+                    voice_type,
+                    primary_language,
+                    MODEL_TYPE_LLM,
+                    request_id,
+                )
                return b""
-            return base64.b64decode(resp.Audio)
+            audio_bytes = base64.b64decode(audio_b64)
+            # 长期保留 INFO：腾讯云 SDK 返回的 request_id + 音频字节数（用户排查必需）
+            logger.info(
+                "tencent_tts._synthesize_sync response request_id={} audio_bytes_len={} "
+                "voice_type={} primary_language={}",
+                request_id,
+                len(audio_bytes),
+                voice_type,
+                primary_language,
+            )
+            return audio_bytes
        except TencentCloudSDKException as e:
-            logger.error("Tencent TTS SDK error: {}", e)
+            logger.error(
+                "Tencent TTS SDK error provider=tencent voice_type={} primary_language={} "
+                "model_type={} code={} message={} request_id={} raw={}",
+                voice_type,
+                primary_language,
+                MODEL_TYPE_LLM,
+                getattr(e, "code", None),
+                getattr(e, "message", None),
+                getattr(e, "requestId", None),
+                e,
+            )
            return b""
        except Exception as e:
-            logger.error("Tencent TTS synthesize failed: {}", e)
+            logger.error(
+                "Tencent TTS synthesize failed provider=tencent voice_type={} primary_language={}: {}",
+                voice_type,
+                primary_language,
+                e,
+            )
            return b""

-    async def synthesize(self, text: str, voice: str = "alloy") -> bytes:
+    async def synthesize(
+        self,
+        text: str,
+        voice: str = "alloy",
+        *,
+        language: str = "zh",
+    ) -> bytes:
        if not self._secret_id or not self._secret_key:
-            logger.error("Tencent TTS credentials not configured")
+            logger.error(
+                "Tencent TTS credentials not configured provider=tencent secret_id_set={} secret_key_set={}",
+                bool(self._secret_id),
+                bool(self._secret_key),
+            )
            return b""

-        # Default "alloy" aligns with OpenAI TTS naming; Tencent uses VoiceType IDs from settings.
+        is_en = (language or "zh").strip().lower() == "en"
+        primary_language = PRIMARY_LANGUAGE_EN if is_en else PRIMARY_LANGUAGE_ZH
+        default_voice = self._voice_type_en if is_en else self._voice_type
+        max_chars = MAX_CHARS_PER_REQUEST_EN if is_en else MAX_CHARS_PER_REQUEST_ZH
+
+        # Default "alloy" aligns with OpenAI TTS naming. Callers currently never pass a specific voice,
+        # so in practice only the default_voice branch runs (settings.tts_voice_type / tts_voice_type_en).
        v = voice.lower()
        if v == "alloy":
-            voice_type = self._voice_type
+            voice_type = default_voice
        else:
-            voice_type = VOICE_MAP.get(v, self._voice_type)
-        chunks = _chunk_text(text)
+            voice_type = VOICE_MAP.get(v, default_voice)
+        chunks = _chunk_text(text, max_chars=max_chars)
+        # 长期保留 INFO：adapter 入口的 language / voice_type / chunk_count（排查必需）
+        logger.info(
+            "tencent_tts.synthesize entry language={} voice_arg={} resolved_voice_type={} "
+            "primary_language={} max_chars={} text_len={} chunk_count={}",
+            language,
+            voice,
+            voice_type,
+            primary_language,
+            max_chars,
+            len(text or ""),
+            len(chunks),
+        )
        if not chunks:
            return b""

        results: list[bytes] = []
-        for chunk in chunks:
-            audio = await asyncio.to_thread(self._synthesize_sync, chunk, voice_type)
+        for idx, chunk in enumerate(chunks):
+            audio = await asyncio.to_thread(
+                self._synthesize_sync, chunk, voice_type, primary_language
+            )
            if not audio:
+                logger.warning(
+                    "tencent_tts.synthesize chunk failed chunk_index={} chunk_chars={} "
+                    "voice_type={} primary_language={}",
+                    idx,
+                    len(chunk),
+                    voice_type,
+                    primary_language,
+                )
                return b""
+            logger.debug(
+                "tencent_tts.synthesize chunk ok chunk_index={} chunk_chars={} audio_bytes_len={}",
+                idx,
+                len(chunk),
+                len(audio),
+            )
            results.append(audio)

-        return b"".join(results)
+        merged = b"".join(results)
+        logger.debug(
+            "tencent_tts.synthesize done language={} voice_type={} chunks={} total_bytes={}",
+            language,
+            voice_type,
+            len(chunks),
+            len(merged),
+        )
+        return merged