fix(tts): gate auto reply by ENABLE_TTS; allow on-demand and manual playback

- Pipeline: skip _send_tts_audio only for non-manual calls when ENABLE_TTS=false; remove the enable_tts early return from handle_tts_request_on_demand.
- Tencent TTS: PrimaryLanguage/chunking follow user language preference only.
- Expo: let manual tts_audio bypass the late-segment playback gate after interrupt.
- Docs: clarify ENABLE_TTS vs tts_request in api/.env.example and TTSProvider port.
- Tests: add manual bypass cases; adjust pipeline language tests for en+Chinese text.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-11 17:15:02 +08:00
parent ccdc4e4277
commit 93be60f74c
7 changed files with 101 additions and 17 deletions
--- a/api/.env.example
+++ b/api/.env.example
@@ -239,9 +239,8 @@ TENCENT_SECRET_KEY=your_tencent_asr_secret_key
 # =============================================================================
 # TTS（文字转语音，Agent 回复朗读）— 与 ASR 独立
 # =============================================================================
-# ENABLE_TTS：是否启用「助手回复朗读」服务端能力（TTS 适配器与密钥配置）。关则永远不合成。
-# 每轮是否实际合成：由客户端在 WebSocket `text` / `audio_segment` / `audio_message` 的 `data.tts_this_turn` 控制（未传或 false 仅返回文字）。
-# 若 ENABLE_TTS=true 且该轮 `tts_this_turn=true`：每一段助手文案先下发 `tts_audio`，再下发对应段的 `agent_response`。
+# ENABLE_TTS：关闭时禁用「助手每轮自动生成 TTS」（tts_this_turn 链路）；不影响 WebSocket「按需朗读」tts_request。
+# 每轮是否自动生成：客户端 `data.tts_this_turn`，且 ENABLE_TTS=true、skeleton skip_tts 均未阻止时才会合成。
 ENABLE_TTS=true
 TTS_PROVIDER=tencent
 # 仅 TTS_PROVIDER=openai 时需要
--- a/api/app/adapters/tts/tencent_tts.py
+++ b/api/app/adapters/tts/tencent_tts.py
@@ -82,9 +82,8 @@ class TencentTTSProvider:
        self._secret_id = secret_id
        self._secret_key = secret_key
        self._voice_type = voice_type
-        # 英文音色未单独配置时回落到 501004（月华，腾讯云大模型音色，支持中英混合）。
-        # 大模型音色 501xxx 系列在 PrimaryLanguage=1/2 下均支持中英混读，不会被 Tencent
-        # 以 InvalidParameterValue.PrimaryLanguage 拒绝；与之对应必须配合 ModelType=1。
+        # 英文音色未单独配置时回落到 501004（月华，腾讯云大模型音色）。
+        # 大模型音色 501xxx 须配合 ModelType=1（见 Tencent TextToVoice 文档）。
        self._voice_type_en = voice_type_en if voice_type_en is not None else 501004
        self._codec = codec
        self._client = None
@@ -211,6 +210,8 @@ class TencentTTSProvider:
            )
            return b""

+        # ``language`` 由 pipeline 从用户 ``language_preference`` 解析（仅 'en' / 其它→中文路径），
+        # 与助手正文实际语种无关：产品规则是 TTS 主语言跟用户语言一致。
        is_en = (language or "zh").strip().lower() == "en"
        primary_language = PRIMARY_LANGUAGE_EN if is_en else PRIMARY_LANGUAGE_ZH
        default_voice = self._voice_type_en if is_en else self._voice_type
--- a/api/app/features/conversation/ws/pipeline.py
+++ b/api/app/features/conversation/ws/pipeline.py
@@ -115,7 +115,8 @@ async def _send_tts_audio(
        settings.enable_tts,
        settings.tts_provider,
    )
-    if not settings.enable_tts:
+    # enable_tts：仅禁用「助手回复自动生成 TTS」（want_tts 路径）；用户点喇叭（manual=True）仍可合成。
+    if not manual and not settings.enable_tts:
        logger.info(
            "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
            "url_set=False audio_bytes_len=0 reason=enable_tts_false",
@@ -269,14 +270,6 @@ async def handle_tts_request_on_demand(
        settings.enable_tts,
        settings.tts_provider,
    )
-    if not settings.enable_tts:
-        logger.info(
-            "pipeline.handle_tts_request_on_demand result ok=False reason=未开启语音合成 "
-            "conversation_id={} assistant_message_id={}",
-            conversation_id,
-            assistant_message_id,
-        )
-        return False, "未开启语音合成"

    conv = await db.get(Conversation, conversation_id)
    if not conv or conv.user_id != user_id or conv.deleted_at is not None:
--- a/api/app/ports/tts.py
+++ b/api/app/ports/tts.py
@@ -10,6 +10,6 @@ class TTSProvider(Protocol):
    ) -> bytes:
        """Convert text to speech audio bytes.

-        language: 'zh' or 'en'. Adapters that natively detect language may ignore it.
+        language: 'zh' or 'en' — 调用方应使用用户语言偏好（与正文语种无关）；各 adapter 按自身能力解释。
        """
        ...
--- a/api/tests/test_pipeline_language_skip_tts.py
+++ b/api/tests/test_pipeline_language_skip_tts.py
@@ -64,6 +64,29 @@ async def test_tencent_tts_zh_uses_primary_language_1_and_zh_voice() -> None:
    assert seen["voice_type"] == 501004


+@pytest.mark.asyncio
+async def test_tencent_tts_en_user_language_uses_primary_en_even_if_text_is_chinese() -> None:
+    """主语言与用户偏好一致：即使用户语言为 en 且正文为中文，也向 Tencent 提交 PrimaryLanguage=2。"""
+    provider = TencentTTSProvider(
+        secret_id="id",
+        secret_key="key",
+        voice_type=501004,
+        voice_type_en=501004,
+    )
+    seen: dict = {}
+
+    def fake_sync(text: str, voice_type: int, primary_language: int) -> bytes:
+        seen["primary_language"] = primary_language
+        seen["voice_type"] = voice_type
+        return b"OK"
+
+    with patch.object(provider, "_synthesize_sync", side_effect=fake_sync):
+        out = await provider.synthesize("这是中文回复。", language="en")
+
+    assert out == b"OK"
+    assert seen["primary_language"] == PRIMARY_LANGUAGE_EN
+
+
 @pytest.mark.asyncio
 async def test_tencent_tts_en_uses_primary_language_2_and_en_voice() -> None:
    provider = TencentTTSProvider(
--- a/api/tests/test_tts_manual_bypass_enable_tts.py
+++ b/api/tests/test_tts_manual_bypass_enable_tts.py
@@ -0,0 +1,65 @@
+"""ENABLE_TTS=false 时仍可走喇叭按需合成；自动回复路径则被关闭。"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from app.features.conversation.ws import pipeline as pl
+
+
+@pytest.mark.asyncio
+async def test_send_tts_manual_bypasses_enable_tts_false(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(pl.settings, "enable_tts", False)
+
+    fake_tts = MagicMock()
+    fake_tts.synthesize = AsyncMock(return_value=b"\xff\xd3-mp3stub")
+    monkeypatch.setattr(pl, "get_tts_provider", lambda: fake_tts)
+
+    storage = MagicMock()
+    storage.upload.return_value = "https://example/public.wav"
+    storage.get_url.return_value = "https://example/signed.wav"
+    monkeypatch.setattr(pl, "get_object_storage", lambda: storage)
+
+    send_mock = AsyncMock()
+    monkeypatch.setattr(pl.manager, "send_message", send_mock)
+    monkeypatch.setattr(pl, "_tts_epoch_value", lambda _cid: 0)
+
+    cid = "c0000000-0000-4000-8000-000000000001"
+    out = await pl._send_tts_audio(
+        cid,
+        "hi",
+        chunk_index=0,
+        chunk_total=1,
+        assistant_message_id="m1",
+        tts_epoch_start=0,
+        manual=True,
+        language="en",
+    )
+    assert out == "https://example/public.wav"
+    fake_tts.synthesize.assert_awaited_once()
+    send_mock.assert_awaited_once()
+
+
+@pytest.mark.asyncio
+async def test_send_tts_auto_blocked_when_enable_tts_false(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(pl.settings, "enable_tts", False)
+
+    fake_tts = MagicMock()
+    fake_tts.synthesize = AsyncMock(return_value=b"audio")
+    monkeypatch.setattr(pl, "get_tts_provider", lambda: fake_tts)
+
+    cid = "c0000000-0000-4000-8000-000000000002"
+    out = await pl._send_tts_audio(
+        cid,
+        "hi",
+        chunk_index=0,
+        chunk_total=1,
+        assistant_message_id="m1",
+        tts_epoch_start=0,
+        manual=False,
+        language="en",
+    )
+    assert out is None
+    fake_tts.synthesize.assert_not_called()
--- a/app-expo/src/app/(main)/conversation/[id].tsx
+++ b/app-expo/src/app/(main)/conversation/[id].tsx
@@ -1238,7 +1238,10 @@ export default function ConversationScreen() {

  const handleTtsSegment = useCallback(
    (p: TtsSegmentPayload) => {
-      if (!ttsGate.current.shouldAcceptIncomingTts()) return;
+      // 闸门用于丢弃「用户已打断后」迟到的自动 TTS；按需朗读 (manual) 是当前明确操作，必须放行。
+      const allowByGate =
+        p.manual === true || ttsGate.current.shouldAcceptIncomingTts();
+      if (!allowByGate) return;
      const convId = id ?? '';
      const cosUrl = p.audioUrl?.trim();
      /**