From 93be60f74ca943fafcad6e311bf56decc6d205ac Mon Sep 17 00:00:00 2001
From: Kevin <kevin@brighteng.org>
Date: Mon, 11 May 2026 17:15:02 +0800
Subject: [PATCH] fix(tts): gate auto reply by ENABLE_TTS; allow on-demand and
 manual playback

- Pipeline: skip _send_tts_audio only for non-manual when ENABLE_TTS=false;
  remove enable_tts early return from handle_tts_request_on_demand.
- Tencent TTS: comment-only change documenting that PrimaryLanguage follows the
  user's language preference, not the language of the reply text.
- Expo: let manual tts_audio bypass late-segment playback gate after interrupt.
- Docs: clarify ENABLE_TTS vs tts_request in api/.env.example and TTSProvider port.
- Tests: add manual-bypass cases; add a Tencent TTS test for language=en with
  Chinese reply text.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 api/.env.example                              |  5 +-
 api/app/adapters/tts/tencent_tts.py           |  7 +-
 api/app/features/conversation/ws/pipeline.py  | 11 +---
 api/app/ports/tts.py                          |  2 +-
 api/tests/test_pipeline_language_skip_tts.py  | 23 +++++++
 .../test_tts_manual_bypass_enable_tts.py      | 65 +++++++++++++++++++
 app-expo/src/app/(main)/conversation/[id].tsx |  5 +-
 7 files changed, 101 insertions(+), 17 deletions(-)
 create mode 100644 api/tests/test_tts_manual_bypass_enable_tts.py

diff --git a/api/.env.example b/api/.env.example
index 019462c..35cd342 100644
--- a/api/.env.example
+++ b/api/.env.example
@@ -239,9 +239,8 @@ TENCENT_SECRET_KEY=your_tencent_asr_secret_key
 # =============================================================================
 # TTS（文字转语音，Agent 回复朗读）— 与 ASR 独立
 # =============================================================================
-# ENABLE_TTS：是否启用「助手回复朗读」服务端能力（TTS 适配器与密钥配置）。关则永远不合成。
-# 每轮是否实际合成：由客户端在 WebSocket `text` / `audio_segment` / `audio_message` 的 `data.tts_this_turn` 控制（未传或 false 仅返回文字）。
-# 若 ENABLE_TTS=true 且该轮 `tts_this_turn=true`：每一段助手文案先下发 `tts_audio`，再下发对应段的 `agent_response`。
+# ENABLE_TTS：关闭时禁用「助手每轮自动生成 TTS」（tts_this_turn 链路）；不影响 WebSocket「按需朗读」tts_request。
+# 每轮是否自动合成：需客户端 `data.tts_this_turn=true` 且 ENABLE_TTS=true，并且未被 skeleton 的 skip_tts 阻止时才会合成。
 ENABLE_TTS=true
 TTS_PROVIDER=tencent
 # 仅 TTS_PROVIDER=openai 时需要
diff --git a/api/app/adapters/tts/tencent_tts.py b/api/app/adapters/tts/tencent_tts.py
index 39909d2..501cdd1 100644
--- a/api/app/adapters/tts/tencent_tts.py
+++ b/api/app/adapters/tts/tencent_tts.py
@@ -82,9 +82,8 @@ class TencentTTSProvider:
         self._secret_id = secret_id
         self._secret_key = secret_key
         self._voice_type = voice_type
-        # 英文音色未单独配置时回落到 501004（月华，腾讯云大模型音色，支持中英混合）。
-        # 大模型音色 501xxx 系列在 PrimaryLanguage=1/2 下均支持中英混读，不会被 Tencent
-        # 以 InvalidParameterValue.PrimaryLanguage 拒绝；与之对应必须配合 ModelType=1。
+        # 英文音色未单独配置时回落到 501004（月华，腾讯云大模型音色）。
+        # 大模型音色 501xxx 须配合 ModelType=1（见 Tencent TextToVoice 文档）。
         self._voice_type_en = voice_type_en if voice_type_en is not None else 501004
         self._codec = codec
         self._client = None
@@ -211,6 +210,8 @@ class TencentTTSProvider:
             )
             return b""
 
+        # ``language`` 由 pipeline 从用户 ``language_preference`` 解析（仅 'en' / 其它→中文路径），
+        # 与助手正文实际语种无关：产品规则是 TTS 主语言跟用户语言一致。
         is_en = (language or "zh").strip().lower() == "en"
         primary_language = PRIMARY_LANGUAGE_EN if is_en else PRIMARY_LANGUAGE_ZH
         default_voice = self._voice_type_en if is_en else self._voice_type
diff --git a/api/app/features/conversation/ws/pipeline.py b/api/app/features/conversation/ws/pipeline.py
index 8bd37f6..6270422 100644
--- a/api/app/features/conversation/ws/pipeline.py
+++ b/api/app/features/conversation/ws/pipeline.py
@@ -115,7 +115,8 @@ async def _send_tts_audio(
         settings.enable_tts,
         settings.tts_provider,
     )
-    if not settings.enable_tts:
+    # enable_tts：仅禁用「助手回复自动生成 TTS」（want_tts 路径）；用户点喇叭（manual=True）仍可合成。
+    if not manual and not settings.enable_tts:
         logger.info(
             "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
             "url_set=False audio_bytes_len=0 reason=enable_tts_false",
@@ -269,14 +270,6 @@ async def handle_tts_request_on_demand(
         settings.enable_tts,
         settings.tts_provider,
     )
-    if not settings.enable_tts:
-        logger.info(
-            "pipeline.handle_tts_request_on_demand result ok=False reason=未开启语音合成 "
-            "conversation_id={} assistant_message_id={}",
-            conversation_id,
-            assistant_message_id,
-        )
-        return False, "未开启语音合成"
 
     conv = await db.get(Conversation, conversation_id)
     if not conv or conv.user_id != user_id or conv.deleted_at is not None:
diff --git a/api/app/ports/tts.py b/api/app/ports/tts.py
index 75f49cb..abce49a 100644
--- a/api/app/ports/tts.py
+++ b/api/app/ports/tts.py
@@ -10,6 +10,6 @@ class TTSProvider(Protocol):
     ) -> bytes:
         """Convert text to speech audio bytes.
 
-        language: 'zh' or 'en'. Adapters that natively detect language may ignore it.
+        language: 'zh' or 'en' — 调用方应使用用户语言偏好（与正文语种无关）；各 adapter 按自身能力解释。
         """
         ...
diff --git a/api/tests/test_pipeline_language_skip_tts.py b/api/tests/test_pipeline_language_skip_tts.py
index de32d60..e2c1eb0 100644
--- a/api/tests/test_pipeline_language_skip_tts.py
+++ b/api/tests/test_pipeline_language_skip_tts.py
@@ -64,6 +64,29 @@ async def test_tencent_tts_zh_uses_primary_language_1_and_zh_voice() -> None:
     assert seen["voice_type"] == 501004
 
 
+@pytest.mark.asyncio
+async def test_tencent_tts_en_user_language_uses_primary_en_even_if_text_is_chinese() -> None:
+    """主语言与用户偏好一致：即使用户语言为 en 且正文为中文，也向 Tencent 提交 PrimaryLanguage=2。"""
+    provider = TencentTTSProvider(
+        secret_id="id",
+        secret_key="key",
+        voice_type=501004,
+        voice_type_en=501004,
+    )
+    seen: dict = {}
+
+    def fake_sync(text: str, voice_type: int, primary_language: int) -> bytes:
+        seen["primary_language"] = primary_language
+        seen["voice_type"] = voice_type
+        return b"OK"
+
+    with patch.object(provider, "_synthesize_sync", side_effect=fake_sync):
+        out = await provider.synthesize("这是中文回复。", language="en")
+
+    assert out == b"OK"
+    assert seen["primary_language"] == PRIMARY_LANGUAGE_EN
+
+
 @pytest.mark.asyncio
 async def test_tencent_tts_en_uses_primary_language_2_and_en_voice() -> None:
     provider = TencentTTSProvider(
diff --git a/api/tests/test_tts_manual_bypass_enable_tts.py b/api/tests/test_tts_manual_bypass_enable_tts.py
new file mode 100644
index 0000000..26ca661
--- /dev/null
+++ b/api/tests/test_tts_manual_bypass_enable_tts.py
@@ -0,0 +1,65 @@
+"""ENABLE_TTS=false 时仍可走喇叭按需合成；自动回复路径则被关闭。"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from app.features.conversation.ws import pipeline as pl
+
+
+@pytest.mark.asyncio
+async def test_send_tts_manual_bypasses_enable_tts_false(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(pl.settings, "enable_tts", False)
+
+    fake_tts = MagicMock()
+    fake_tts.synthesize = AsyncMock(return_value=b"\xff\xd3-mp3stub")
+    monkeypatch.setattr(pl, "get_tts_provider", lambda: fake_tts)
+
+    storage = MagicMock()
+    storage.upload.return_value = "https://example/public.wav"
+    storage.get_url.return_value = "https://example/signed.wav"
+    monkeypatch.setattr(pl, "get_object_storage", lambda: storage)
+
+    send_mock = AsyncMock()
+    monkeypatch.setattr(pl.manager, "send_message", send_mock)
+    monkeypatch.setattr(pl, "_tts_epoch_value", lambda _cid: 0)
+
+    cid = "c0000000-0000-4000-8000-000000000001"
+    out = await pl._send_tts_audio(
+        cid,
+        "hi",
+        chunk_index=0,
+        chunk_total=1,
+        assistant_message_id="m1",
+        tts_epoch_start=0,
+        manual=True,
+        language="en",
+    )
+    assert out == "https://example/public.wav"
+    fake_tts.synthesize.assert_awaited_once()
+    send_mock.assert_awaited_once()
+
+
+@pytest.mark.asyncio
+async def test_send_tts_auto_blocked_when_enable_tts_false(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(pl.settings, "enable_tts", False)
+
+    fake_tts = MagicMock()
+    fake_tts.synthesize = AsyncMock(return_value=b"audio")
+    monkeypatch.setattr(pl, "get_tts_provider", lambda: fake_tts)
+
+    cid = "c0000000-0000-4000-8000-000000000002"
+    out = await pl._send_tts_audio(
+        cid,
+        "hi",
+        chunk_index=0,
+        chunk_total=1,
+        assistant_message_id="m1",
+        tts_epoch_start=0,
+        manual=False,
+        language="en",
+    )
+    assert out is None
+    fake_tts.synthesize.assert_not_called()
diff --git a/app-expo/src/app/(main)/conversation/[id].tsx b/app-expo/src/app/(main)/conversation/[id].tsx
index 0593552..b460c9b 100644
--- a/app-expo/src/app/(main)/conversation/[id].tsx
+++ b/app-expo/src/app/(main)/conversation/[id].tsx
@@ -1238,7 +1238,10 @@ export default function ConversationScreen() {
 
   const handleTtsSegment = useCallback(
     (p: TtsSegmentPayload) => {
-      if (!ttsGate.current.shouldAcceptIncomingTts()) return;
+      // 闸门用于丢弃「用户已打断后」迟到的自动 TTS；按需朗读 (manual) 是当前明确操作，必须放行。
+      const allowByGate =
+        p.manual === true || ttsGate.current.shouldAcceptIncomingTts();
+      if (!allowByGate) return;
       const convId = id ?? '';
       const cosUrl = p.audioUrl?.trim();
       /**