fix(tts): gate auto reply by ENABLE_TTS; allow on-demand and manual playback

- Pipeline: skip _send_tts_audio only for non-manual when ENABLE_TTS=false;
  remove enable_tts early return from handle_tts_request_on_demand.
- Tencent TTS: PrimaryLanguage/chunking follow user language preference only.
- Expo: let manual tts_audio bypass late-segment playback gate after interrupt.
- Docs: clarify ENABLE_TTS vs tts_request in api/.env.example and TTSProvider port.
- Tests: add manual bypass cases; adjust pipeline language tests for en+Chinese text.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Kevin
2026-05-11 17:15:02 +08:00
parent ccdc4e4277
commit 93be60f74c
7 changed files with 101 additions and 17 deletions

View File

@@ -239,9 +239,8 @@ TENCENT_SECRET_KEY=your_tencent_asr_secret_key
# =============================================================================
# TTS(文字转语音,Agent 回复朗读)— 与 ASR 独立
# =============================================================================
# ENABLE_TTS:是否启用「助手回复朗读」服务端能力(TTS 适配器与密钥配置)。关则永远不合成。
# 每轮是否实际合成:客户端在 WebSocket `text` / `audio_segment` / `audio_message` 的 `data.tts_this_turn` 控制(未传或 false 仅返回文字)
# 若 ENABLE_TTS=true 且该轮 `tts_this_turn=true`:每一段助手文案先下发 `tts_audio`,再下发对应段的 `agent_response`。
# ENABLE_TTS:关闭时禁用「助手每轮自动生成 TTS」(tts_this_turn 链路);不影响 WebSocket「按需朗读」(tts_request)。
# 每轮是否自动生成:客户端 `data.tts_this_turn`,且 ENABLE_TTS=true、skeleton skip_tts 均未阻止时才会合成
ENABLE_TTS=true
TTS_PROVIDER=tencent
# 仅 TTS_PROVIDER=openai 时需要

View File

@@ -82,9 +82,8 @@ class TencentTTSProvider:
self._secret_id = secret_id
self._secret_key = secret_key
self._voice_type = voice_type
# 英文音色未单独配置时回落到 501004(月华,腾讯云大模型音色,支持中英混合)。
# 大模型音色 501xxx 系列在 PrimaryLanguage=1/2 下均支持中英混读,不会被 Tencent
# 以 InvalidParameterValue.PrimaryLanguage 拒绝;与之对应必须配合 ModelType=1。
# 英文音色未单独配置时回落到 501004(月华,腾讯云大模型音色)。
# 大模型音色 501xxx 须配合 ModelType=1(见 Tencent TextToVoice 文档)。
self._voice_type_en = voice_type_en if voice_type_en is not None else 501004
self._codec = codec
self._client = None
@@ -211,6 +210,8 @@ class TencentTTSProvider:
)
return b""
# ``language`` 由 pipeline 从用户 ``language_preference`` 解析(仅 'en' / 其它→中文路径),
# 与助手正文实际语种无关:产品规则是 TTS 主语言跟用户语言一致。
is_en = (language or "zh").strip().lower() == "en"
primary_language = PRIMARY_LANGUAGE_EN if is_en else PRIMARY_LANGUAGE_ZH
default_voice = self._voice_type_en if is_en else self._voice_type

View File

@@ -115,7 +115,8 @@ async def _send_tts_audio(
settings.enable_tts,
settings.tts_provider,
)
if not settings.enable_tts:
# enable_tts:仅禁用「助手回复自动生成 TTS」(want_tts 路径);用户点喇叭(manual=True)仍可合成。
if not manual and not settings.enable_tts:
logger.info(
"pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
"url_set=False audio_bytes_len=0 reason=enable_tts_false",
@@ -269,14 +270,6 @@ async def handle_tts_request_on_demand(
settings.enable_tts,
settings.tts_provider,
)
if not settings.enable_tts:
logger.info(
"pipeline.handle_tts_request_on_demand result ok=False reason=未开启语音合成 "
"conversation_id={} assistant_message_id={}",
conversation_id,
assistant_message_id,
)
return False, "未开启语音合成"
conv = await db.get(Conversation, conversation_id)
if not conv or conv.user_id != user_id or conv.deleted_at is not None:

View File

@@ -10,6 +10,6 @@ class TTSProvider(Protocol):
) -> bytes:
"""Convert text to speech audio bytes.
language: 'zh' or 'en'. Adapters that natively detect language may ignore it.
language: 'zh' or 'en' — 调用方应使用用户语言偏好(与正文语种无关);各 adapter 按自身能力解释。
"""
...

View File

@@ -64,6 +64,29 @@ async def test_tencent_tts_zh_uses_primary_language_1_and_zh_voice() -> None:
assert seen["voice_type"] == 501004
@pytest.mark.asyncio
async def test_tencent_tts_en_user_language_uses_primary_en_even_if_text_is_chinese() -> None:
    """Primary language follows the user's preference, not the text's language.

    With ``language="en"`` and a Chinese body, the provider must still hand
    ``PRIMARY_LANGUAGE_EN`` and the configured English voice to the
    synchronous Tencent call.
    """
    provider = TencentTTSProvider(
        secret_id="id",
        secret_key="key",
        voice_type=501004,
        voice_type_en=501004,
    )
    seen: dict = {}

    def fake_sync(text: str, voice_type: int, primary_language: int) -> bytes:
        # Record the arguments the provider would have sent to Tencent.
        seen["primary_language"] = primary_language
        seen["voice_type"] = voice_type
        return b"OK"

    with patch.object(provider, "_synthesize_sync", side_effect=fake_sync):
        out = await provider.synthesize("这是中文回复。", language="en")

    assert out == b"OK"
    assert seen["primary_language"] == PRIMARY_LANGUAGE_EN
    # The voice was captured by the fake but never verified before; pin it so
    # a regression in English voice selection is caught as well.
    assert seen["voice_type"] == 501004
@pytest.mark.asyncio
async def test_tencent_tts_en_uses_primary_language_2_and_en_voice() -> None:
provider = TencentTTSProvider(

View File

@@ -0,0 +1,65 @@
"""ENABLE_TTS=false 时仍可走喇叭按需合成;自动回复路径则被关闭。"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
from app.features.conversation.ws import pipeline as pl
@pytest.mark.asyncio
async def test_send_tts_manual_bypasses_enable_tts_false(monkeypatch: pytest.MonkeyPatch) -> None:
    """A manual request is still synthesized and sent when ``enable_tts`` is False.

    The TTS provider, object storage, connection manager and epoch lookup are
    all replaced with stubs; the test checks the returned URL, that synthesis
    was awaited once, and that exactly one message was sent.
    """
    conversation_id = "c0000000-0000-4000-8000-000000000001"

    # Stub collaborators.
    tts_stub = MagicMock()
    tts_stub.synthesize = AsyncMock(return_value=b"\xff\xd3-mp3stub")

    storage_stub = MagicMock()
    storage_stub.upload.return_value = "https://example/public.wav"
    storage_stub.get_url.return_value = "https://example/signed.wav"

    send_spy = AsyncMock()

    # Wire the stubs into the pipeline module.
    monkeypatch.setattr(pl.settings, "enable_tts", False)
    module_patches = (
        ("get_tts_provider", lambda: tts_stub),
        ("get_object_storage", lambda: storage_stub),
        ("_tts_epoch_value", lambda _cid: 0),
    )
    for attr_name, replacement in module_patches:
        monkeypatch.setattr(pl, attr_name, replacement)
    monkeypatch.setattr(pl.manager, "send_message", send_spy)

    call_kwargs = {
        "chunk_index": 0,
        "chunk_total": 1,
        "assistant_message_id": "m1",
        "tts_epoch_start": 0,
        "manual": True,
        "language": "en",
    }
    result = await pl._send_tts_audio(conversation_id, "hi", **call_kwargs)

    assert result == "https://example/public.wav"
    tts_stub.synthesize.assert_awaited_once()
    send_spy.assert_awaited_once()
@pytest.mark.asyncio
async def test_send_tts_auto_blocked_when_enable_tts_false(monkeypatch: pytest.MonkeyPatch) -> None:
    """A non-manual request returns ``None`` without synthesizing when ``enable_tts`` is False."""
    conversation_id = "c0000000-0000-4000-8000-000000000002"

    tts_stub = MagicMock()
    tts_stub.synthesize = AsyncMock(return_value=b"audio")

    monkeypatch.setattr(pl.settings, "enable_tts", False)
    monkeypatch.setattr(pl, "get_tts_provider", lambda: tts_stub)

    call_kwargs = {
        "chunk_index": 0,
        "chunk_total": 1,
        "assistant_message_id": "m1",
        "tts_epoch_start": 0,
        "manual": False,
        "language": "en",
    }
    result = await pl._send_tts_audio(conversation_id, "hi", **call_kwargs)

    assert result is None
    # The provider must never be reached on the blocked automatic path.
    tts_stub.synthesize.assert_not_called()

View File

@@ -1238,7 +1238,10 @@ export default function ConversationScreen() {
const handleTtsSegment = useCallback(
(p: TtsSegmentPayload) => {
if (!ttsGate.current.shouldAcceptIncomingTts()) return;
// 闸门用于丢弃「用户已打断后」迟到的自动 TTS;按需朗读 (manual) 是当前明确操作,必须放行。
const allowByGate =
p.manual === true || ttsGate.current.shouldAcceptIncomingTts();
if (!allowByGate) return;
const convId = id ?? '';
const cosUrl = p.audioUrl?.trim();
/**