fix(tts): gate auto reply by ENABLE_TTS; allow on-demand and manual playback
- Pipeline: skip _send_tts_audio only for non-manual requests when ENABLE_TTS=false; remove the enable_tts early return from handle_tts_request_on_demand.
- Tencent TTS: PrimaryLanguage and chunking follow the user's language preference only.
- Expo: let manual tts_audio bypass the late-segment playback gate after an interrupt.
- Docs: clarify ENABLE_TTS vs tts_request in api/.env.example and the TTSProvider port.
- Tests: add manual-bypass cases; adjust pipeline language tests for en + Chinese text.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -239,9 +239,8 @@ TENCENT_SECRET_KEY=your_tencent_asr_secret_key
|
||||
# =============================================================================
|
||||
# TTS(文字转语音,Agent 回复朗读)— 与 ASR 独立
|
||||
# =============================================================================
|
||||
# ENABLE_TTS:是否启用「助手回复朗读」服务端能力(TTS 适配器与密钥配置)。关则永远不合成。
|
||||
# 每轮是否实际合成:由客户端在 WebSocket `text` / `audio_segment` / `audio_message` 的 `data.tts_this_turn` 控制(未传或 false 仅返回文字)。
|
||||
# 若 ENABLE_TTS=true 且该轮 `tts_this_turn=true`:每一段助手文案先下发 `tts_audio`,再下发对应段的 `agent_response`。
|
||||
# ENABLE_TTS:关闭时禁用「助手每轮自动生成 TTS」(tts_this_turn 链路);不影响 WebSocket「按需朗读」tts_request。
|
||||
# 每轮是否自动生成:客户端 `data.tts_this_turn`,且 ENABLE_TTS=true、skeleton skip_tts 均未阻止时才会合成。
|
||||
ENABLE_TTS=true
|
||||
TTS_PROVIDER=tencent
|
||||
# 仅 TTS_PROVIDER=openai 时需要
|
||||
|
||||
@@ -82,9 +82,8 @@ class TencentTTSProvider:
|
||||
self._secret_id = secret_id
|
||||
self._secret_key = secret_key
|
||||
self._voice_type = voice_type
|
||||
# 英文音色未单独配置时回落到 501004(月华,腾讯云大模型音色,支持中英混合)。
|
||||
# 大模型音色 501xxx 系列在 PrimaryLanguage=1/2 下均支持中英混读,不会被 Tencent
|
||||
# 以 InvalidParameterValue.PrimaryLanguage 拒绝;与之对应必须配合 ModelType=1。
|
||||
# 英文音色未单独配置时回落到 501004(月华,腾讯云大模型音色)。
|
||||
# 大模型音色 501xxx 须配合 ModelType=1(见 Tencent TextToVoice 文档)。
|
||||
self._voice_type_en = voice_type_en if voice_type_en is not None else 501004
|
||||
self._codec = codec
|
||||
self._client = None
|
||||
@@ -211,6 +210,8 @@ class TencentTTSProvider:
|
||||
)
|
||||
return b""
|
||||
|
||||
# ``language`` 由 pipeline 从用户 ``language_preference`` 解析(仅 'en' / 其它→中文路径),
|
||||
# 与助手正文实际语种无关:产品规则是 TTS 主语言跟用户语言一致。
|
||||
is_en = (language or "zh").strip().lower() == "en"
|
||||
primary_language = PRIMARY_LANGUAGE_EN if is_en else PRIMARY_LANGUAGE_ZH
|
||||
default_voice = self._voice_type_en if is_en else self._voice_type
|
||||
|
||||
@@ -115,7 +115,8 @@ async def _send_tts_audio(
|
||||
settings.enable_tts,
|
||||
settings.tts_provider,
|
||||
)
|
||||
if not settings.enable_tts:
|
||||
# enable_tts:仅禁用「助手回复自动生成 TTS」(want_tts 路径);用户点喇叭(manual=True)仍可合成。
|
||||
if not manual and not settings.enable_tts:
|
||||
logger.info(
|
||||
"pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
|
||||
"url_set=False audio_bytes_len=0 reason=enable_tts_false",
|
||||
@@ -269,14 +270,6 @@ async def handle_tts_request_on_demand(
|
||||
settings.enable_tts,
|
||||
settings.tts_provider,
|
||||
)
|
||||
if not settings.enable_tts:
|
||||
logger.info(
|
||||
"pipeline.handle_tts_request_on_demand result ok=False reason=未开启语音合成 "
|
||||
"conversation_id={} assistant_message_id={}",
|
||||
conversation_id,
|
||||
assistant_message_id,
|
||||
)
|
||||
return False, "未开启语音合成"
|
||||
|
||||
conv = await db.get(Conversation, conversation_id)
|
||||
if not conv or conv.user_id != user_id or conv.deleted_at is not None:
|
||||
|
||||
@@ -10,6 +10,6 @@ class TTSProvider(Protocol):
|
||||
) -> bytes:
|
||||
"""Convert text to speech audio bytes.
|
||||
|
||||
language: 'zh' or 'en'. Adapters that natively detect language may ignore it.
|
||||
language: 'zh' or 'en' — 调用方应使用用户语言偏好(与正文语种无关);各 adapter 按自身能力解释。
|
||||
"""
|
||||
...
|
||||
|
||||
@@ -64,6 +64,29 @@ async def test_tencent_tts_zh_uses_primary_language_1_and_zh_voice() -> None:
|
||||
assert seen["voice_type"] == 501004
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tencent_tts_en_user_language_uses_primary_en_even_if_text_is_chinese() -> None:
|
||||
"""主语言与用户偏好一致:即使用户语言为 en 且正文为中文,也向 Tencent 提交 PrimaryLanguage=2。"""
|
||||
provider = TencentTTSProvider(
|
||||
secret_id="id",
|
||||
secret_key="key",
|
||||
voice_type=501004,
|
||||
voice_type_en=501004,
|
||||
)
|
||||
seen: dict = {}
|
||||
|
||||
def fake_sync(text: str, voice_type: int, primary_language: int) -> bytes:
|
||||
seen["primary_language"] = primary_language
|
||||
seen["voice_type"] = voice_type
|
||||
return b"OK"
|
||||
|
||||
with patch.object(provider, "_synthesize_sync", side_effect=fake_sync):
|
||||
out = await provider.synthesize("这是中文回复。", language="en")
|
||||
|
||||
assert out == b"OK"
|
||||
assert seen["primary_language"] == PRIMARY_LANGUAGE_EN
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tencent_tts_en_uses_primary_language_2_and_en_voice() -> None:
|
||||
provider = TencentTTSProvider(
|
||||
|
||||
65
api/tests/test_tts_manual_bypass_enable_tts.py
Normal file
65
api/tests/test_tts_manual_bypass_enable_tts.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""ENABLE_TTS=false 时仍可走喇叭按需合成;自动回复路径则被关闭。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from app.features.conversation.ws import pipeline as pl
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_send_tts_manual_bypasses_enable_tts_false(monkeypatch: pytest.MonkeyPatch) -> None:
    """Manual (speaker-button) TTS must still synthesize when ``enable_tts`` is False.

    Every collaborator of ``_send_tts_audio`` that this test touches is replaced
    with a mock, so the assertions only cover the gating decision: the provider
    is awaited once, one message is sent, and the uploaded URL is returned.
    """
    # Turn the server-side switch off; only the automatic path should be blocked.
    monkeypatch.setattr(pl.settings, "enable_tts", False)

    # Provider stub whose ``synthesize`` coroutine yields a fixed byte payload.
    tts_stub = MagicMock(synthesize=AsyncMock(return_value=b"\xff\xd3-mp3stub"))
    monkeypatch.setattr(pl, "get_tts_provider", lambda: tts_stub)

    # Object storage stub with distinct upload / signed URLs so the assertion
    # below can tell which one the pipeline returns.
    object_store = MagicMock()
    object_store.upload.return_value = "https://example/public.wav"
    object_store.get_url.return_value = "https://example/signed.wav"
    monkeypatch.setattr(pl, "get_object_storage", lambda: object_store)

    ws_send = AsyncMock()
    monkeypatch.setattr(pl.manager, "send_message", ws_send)
    # Epoch lookup is pinned to 0 to match ``tts_epoch_start=0`` below
    # (presumably so the segment is not treated as stale — confirm in pipeline).
    monkeypatch.setattr(pl, "_tts_epoch_value", lambda _cid: 0)

    conversation_id = "c0000000-0000-4000-8000-000000000001"
    result = await pl._send_tts_audio(
        conversation_id,
        "hi",
        chunk_index=0,
        chunk_total=1,
        assistant_message_id="m1",
        tts_epoch_start=0,
        manual=True,
        language="en",
    )

    assert result == "https://example/public.wav"
    tts_stub.synthesize.assert_awaited_once()
    ws_send.assert_awaited_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_send_tts_auto_blocked_when_enable_tts_false(monkeypatch: pytest.MonkeyPatch) -> None:
    """Automatic (non-manual) TTS must be skipped when ``enable_tts`` is False.

    The call is expected to return ``None`` without ever invoking the
    provider's ``synthesize``.
    """
    monkeypatch.setattr(pl.settings, "enable_tts", False)

    # Provider stub; it should stay untouched on the blocked path.
    tts_stub = MagicMock(synthesize=AsyncMock(return_value=b"audio"))
    monkeypatch.setattr(pl, "get_tts_provider", lambda: tts_stub)

    conversation_id = "c0000000-0000-4000-8000-000000000002"
    result = await pl._send_tts_audio(
        conversation_id,
        "hi",
        chunk_index=0,
        chunk_total=1,
        assistant_message_id="m1",
        tts_epoch_start=0,
        manual=False,
        language="en",
    )

    assert result is None
    tts_stub.synthesize.assert_not_called()
|
||||
@@ -1238,7 +1238,10 @@ export default function ConversationScreen() {
|
||||
|
||||
const handleTtsSegment = useCallback(
|
||||
(p: TtsSegmentPayload) => {
|
||||
if (!ttsGate.current.shouldAcceptIncomingTts()) return;
|
||||
// 闸门用于丢弃「用户已打断后」迟到的自动 TTS;按需朗读 (manual) 是当前明确操作,必须放行。
|
||||
const allowByGate =
|
||||
p.manual === true || ttsGate.current.shouldAcceptIncomingTts();
|
||||
if (!allowByGate) return;
|
||||
const convId = id ?? '';
|
||||
const cosUrl = p.audioUrl?.trim();
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user