From 93be60f74ca943fafcad6e311bf56decc6d205ac Mon Sep 17 00:00:00 2001 From: Kevin Date: Mon, 11 May 2026 17:15:02 +0800 Subject: [PATCH] fix(tts): gate auto reply by ENABLE_TTS; allow on-demand and manual playback - Pipeline: skip _send_tts_audio only for non-manual when ENABLE_TTS=false; remove enable_tts early return from handle_tts_request_on_demand. - Tencent TTS: PrimaryLanguage/chunking follow user language preference only. - Expo: let manual tts_audio bypass late-segment playback gate after interrupt. - Docs: clarify ENABLE_TTS vs tts_request in api/.env.example and TTSProvider port. - Tests: add manual bypass cases; adjust pipeline language tests for en+Chinese text. Co-authored-by: Cursor --- api/.env.example | 5 +- api/app/adapters/tts/tencent_tts.py | 7 +- api/app/features/conversation/ws/pipeline.py | 11 +--- api/app/ports/tts.py | 2 +- api/tests/test_pipeline_language_skip_tts.py | 23 +++++++ .../test_tts_manual_bypass_enable_tts.py | 65 +++++++++++++++++++ app-expo/src/app/(main)/conversation/[id].tsx | 5 +- 7 files changed, 101 insertions(+), 17 deletions(-) create mode 100644 api/tests/test_tts_manual_bypass_enable_tts.py diff --git a/api/.env.example b/api/.env.example index 019462c..35cd342 100644 --- a/api/.env.example +++ b/api/.env.example @@ -239,9 +239,8 @@ TENCENT_SECRET_KEY=your_tencent_asr_secret_key # ============================================================================= # TTS(文字转语音,Agent 回复朗读)— 与 ASR 独立 # ============================================================================= -# ENABLE_TTS:是否启用「助手回复朗读」服务端能力(TTS 适配器与密钥配置)。关则永远不合成。 -# 每轮是否实际合成:由客户端在 WebSocket `text` / `audio_segment` / `audio_message` 的 `data.tts_this_turn` 控制(未传或 false 仅返回文字)。 -# 若 ENABLE_TTS=true 且该轮 `tts_this_turn=true`:每一段助手文案先下发 `tts_audio`,再下发对应段的 `agent_response`。 +# ENABLE_TTS:关闭时禁用「助手每轮自动生成 TTS」(tts_this_turn 链路);不影响 WebSocket「按需朗读」tts_request。 +# 每轮是否自动生成:客户端 `data.tts_this_turn`,且 ENABLE_TTS=true、skeleton skip_tts 均未阻止时才会合成。 ENABLE_TTS=true TTS_PROVIDER=tencent # 仅 TTS_PROVIDER=openai 时需要 diff --git a/api/app/adapters/tts/tencent_tts.py b/api/app/adapters/tts/tencent_tts.py index 39909d2..501cdd1 100644 --- a/api/app/adapters/tts/tencent_tts.py +++ b/api/app/adapters/tts/tencent_tts.py @@ -82,9 +82,8 @@ class TencentTTSProvider: self._secret_id = secret_id self._secret_key = secret_key self._voice_type = voice_type - # 英文音色未单独配置时回落到 501004(月华,腾讯云大模型音色,支持中英混合)。 - # 大模型音色 501xxx 系列在 PrimaryLanguage=1/2 下均支持中英混读,不会被 Tencent - # 以 InvalidParameterValue.PrimaryLanguage 拒绝;与之对应必须配合 ModelType=1。 + # 英文音色未单独配置时回落到 501004(月华,腾讯云大模型音色)。 + # 大模型音色 501xxx 须配合 ModelType=1(见 Tencent TextToVoice 文档)。 self._voice_type_en = voice_type_en if voice_type_en is not None else 501004 self._codec = codec self._client = None @@ -211,6 +210,8 @@ class TencentTTSProvider: ) return b"" + # ``language`` 由 pipeline 从用户 ``language_preference`` 解析(仅 'en' / 其它→中文路径), + # 与助手正文实际语种无关:产品规则是 TTS 主语言跟用户语言一致。 is_en = (language or "zh").strip().lower() == "en" primary_language = PRIMARY_LANGUAGE_EN if is_en else PRIMARY_LANGUAGE_ZH default_voice = self._voice_type_en if is_en else self._voice_type diff --git a/api/app/features/conversation/ws/pipeline.py b/api/app/features/conversation/ws/pipeline.py index 8bd37f6..6270422 100644 --- a/api/app/features/conversation/ws/pipeline.py +++ b/api/app/features/conversation/ws/pipeline.py @@ -115,7 +115,8 @@ async def _send_tts_audio( settings.enable_tts, settings.tts_provider, ) - if not settings.enable_tts: + # enable_tts:仅禁用「助手回复自动生成 TTS」(want_tts 路径);用户点喇叭(manual=True)仍可合成。 + if not manual and not settings.enable_tts: logger.info( "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False " "url_set=False audio_bytes_len=0 reason=enable_tts_false", @@ -269,14 +270,6 @@ async def handle_tts_request_on_demand( settings.enable_tts, settings.tts_provider, ) - if not settings.enable_tts: - logger.info( - "pipeline.handle_tts_request_on_demand result ok=False reason=未开启语音合成 " - "conversation_id={} assistant_message_id={}", - conversation_id, - assistant_message_id, - ) - return False, "未开启语音合成" conv = await db.get(Conversation, conversation_id) if not conv or conv.user_id != user_id or conv.deleted_at is not None: diff --git a/api/app/ports/tts.py b/api/app/ports/tts.py index 75f49cb..abce49a 100644 --- a/api/app/ports/tts.py +++ b/api/app/ports/tts.py @@ -10,6 +10,6 @@ class TTSProvider(Protocol): ) -> bytes: """Convert text to speech audio bytes. - language: 'zh' or 'en'. Adapters that natively detect language may ignore it. + language: 'zh' or 'en' — 调用方应使用用户语言偏好(与正文语种无关);各 adapter 按自身能力解释。 """ ... diff --git a/api/tests/test_pipeline_language_skip_tts.py b/api/tests/test_pipeline_language_skip_tts.py index de32d60..e2c1eb0 100644 --- a/api/tests/test_pipeline_language_skip_tts.py +++ b/api/tests/test_pipeline_language_skip_tts.py @@ -64,6 +64,29 @@ async def test_tencent_tts_zh_uses_primary_language_1_and_zh_voice() -> None: assert seen["voice_type"] == 501004 +@pytest.mark.asyncio +async def test_tencent_tts_en_user_language_uses_primary_en_even_if_text_is_chinese() -> None: + """主语言与用户偏好一致:即使用户语言为 en 且正文为中文,也向 Tencent 提交 PrimaryLanguage=2。""" + provider = TencentTTSProvider( + secret_id="id", + secret_key="key", + voice_type=501004, + voice_type_en=501004, + ) + seen: dict = {} + + def fake_sync(text: str, voice_type: int, primary_language: int) -> bytes: + seen["primary_language"] = primary_language + seen["voice_type"] = voice_type + return b"OK" + + with patch.object(provider, "_synthesize_sync", side_effect=fake_sync): + out = await provider.synthesize("这是中文回复。", language="en") + + assert out == b"OK" + assert seen["primary_language"] == PRIMARY_LANGUAGE_EN + + @pytest.mark.asyncio async def test_tencent_tts_en_uses_primary_language_2_and_en_voice() -> None: provider = TencentTTSProvider( diff --git a/api/tests/test_tts_manual_bypass_enable_tts.py b/api/tests/test_tts_manual_bypass_enable_tts.py new file mode 100644 index 0000000..26ca661 --- /dev/null +++ b/api/tests/test_tts_manual_bypass_enable_tts.py @@ -0,0 +1,65 @@ +"""ENABLE_TTS=false 时仍可走喇叭按需合成;自动回复路径则被关闭。""" + +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from app.features.conversation.ws import pipeline as pl + + +@pytest.mark.asyncio +async def test_send_tts_manual_bypasses_enable_tts_false(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(pl.settings, "enable_tts", False) + + fake_tts = MagicMock() + fake_tts.synthesize = AsyncMock(return_value=b"\xff\xd3-mp3stub") + monkeypatch.setattr(pl, "get_tts_provider", lambda: fake_tts) + + storage = MagicMock() + storage.upload.return_value = "https://example/public.wav" + storage.get_url.return_value = "https://example/signed.wav" + monkeypatch.setattr(pl, "get_object_storage", lambda: storage) + + send_mock = AsyncMock() + monkeypatch.setattr(pl.manager, "send_message", send_mock) + monkeypatch.setattr(pl, "_tts_epoch_value", lambda _cid: 0) + + cid = "c0000000-0000-4000-8000-000000000001" + out = await pl._send_tts_audio( + cid, + "hi", + chunk_index=0, + chunk_total=1, + assistant_message_id="m1", + tts_epoch_start=0, + manual=True, + language="en", + ) + assert out == "https://example/public.wav" + fake_tts.synthesize.assert_awaited_once() + send_mock.assert_awaited_once() + + +@pytest.mark.asyncio +async def test_send_tts_auto_blocked_when_enable_tts_false(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(pl.settings, "enable_tts", False) + + fake_tts = MagicMock() + fake_tts.synthesize = AsyncMock(return_value=b"audio") + monkeypatch.setattr(pl, "get_tts_provider", lambda: fake_tts) + + cid = "c0000000-0000-4000-8000-000000000002" + out = await pl._send_tts_audio( + cid, + "hi", + chunk_index=0, + chunk_total=1, + assistant_message_id="m1", + tts_epoch_start=0, + manual=False, + language="en", + ) + assert out is None + fake_tts.synthesize.assert_not_called() diff --git a/app-expo/src/app/(main)/conversation/[id].tsx b/app-expo/src/app/(main)/conversation/[id].tsx index 0593552..b460c9b 100644 --- a/app-expo/src/app/(main)/conversation/[id].tsx +++ b/app-expo/src/app/(main)/conversation/[id].tsx @@ -1238,7 +1238,10 @@ export default function ConversationScreen() { const handleTtsSegment = useCallback( (p: TtsSegmentPayload) => { - if (!ttsGate.current.shouldAcceptIncomingTts()) return; + // 闸门用于丢弃「用户已打断后」迟到的自动 TTS;按需朗读 (manual) 是当前明确操作,必须放行。 + const allowByGate = + p.manual === true || ttsGate.current.shouldAcceptIncomingTts(); + if (!allowByGate) return; const convId = id ?? ''; const cosUrl = p.audioUrl?.trim(); /**