- Pipeline: skip _send_tts_audio only for non-manual when ENABLE_TTS=false; remove enable_tts early return from handle_tts_request_on_demand. - Tencent TTS: PrimaryLanguage/chunking follow user language preference only. - Expo: let manual tts_audio bypass late-segment playback gate after interrupt. - Docs: clarify ENABLE_TTS vs tts_request in api/.env.example and TTSProvider port. - Tests: add manual bypass cases; adjust pipeline language tests for en+Chinese text. Co-authored-by: Cursor <cursoragent@cursor.com>
224 lines
8.1 KiB
Python
224 lines
8.1 KiB
Python
"""WS pipeline 语言解析与 Tencent TTS 英文合成参数。"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from types import SimpleNamespace
|
||
from unittest.mock import MagicMock, patch
|
||
|
||
import pytest
|
||
|
||
from app.adapters.tts.tencent_tts import (
|
||
MODEL_TYPE_LLM,
|
||
PRIMARY_LANGUAGE_EN,
|
||
PRIMARY_LANGUAGE_ZH,
|
||
TencentTTSProvider,
|
||
)
|
||
from app.features.conversation.ws.pipeline import _resolve_user_language
|
||
|
||
|
||
# ── pipeline._resolve_user_language ─────────────────────────────────
|
||
|
||
|
||
def test_resolve_user_language_zh_default_when_missing() -> None:
    """A missing user, a missing/None preference, and an explicit "zh" all resolve to "zh"."""
    candidates = (
        None,  # no user object at all
        SimpleNamespace(),  # user without a language_preference attribute
        SimpleNamespace(language_preference=None),
        SimpleNamespace(language_preference="zh"),
    )
    for user in candidates:
        assert _resolve_user_language(user) == "zh"
|
||
|
||
|
||
def test_resolve_user_language_en_only_for_en_token() -> None:
    """The "en" token resolves to "en" in lower case, upper case, and with surrounding spaces."""
    for raw_preference in ("en", "EN", " en "):
        user = SimpleNamespace(language_preference=raw_preference)
        assert _resolve_user_language(user) == "en"
|
||
|
||
|
||
def test_resolve_user_language_unknown_falls_back_to_zh() -> None:
    """An unrecognised token ("ja") and an empty string both resolve to "zh"."""
    for raw_preference in ("ja", ""):
        user = SimpleNamespace(language_preference=raw_preference)
        assert _resolve_user_language(user) == "zh"
|
||
|
||
|
||
# ── TencentTTSProvider language branches ─────────────────────────────
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_tencent_tts_zh_uses_primary_language_1_and_zh_voice() -> None:
    """synthesize(..., language="zh") passes PRIMARY_LANGUAGE_ZH and voice 501004 downstream.

    ``_synthesize_sync`` is replaced by a recorder so no real request is made;
    the test inspects the arguments the provider hands to it.
    """
    credentials = {"secret_id": "id", "secret_key": "key"}
    # NOTE(review): both voices are configured with the same id, so the
    # voice_type assertion below cannot tell which of the two was selected.
    voices = {"voice_type": 501004, "voice_type_en": 501004}
    provider = TencentTTSProvider(**credentials, **voices)

    recorded: dict = {}

    # Parameter names are kept as-is in case the provider calls with keywords.
    def _record_call(text: str, voice_type: int, primary_language: int) -> bytes:
        recorded.update(
            text=text,
            voice_type=voice_type,
            primary_language=primary_language,
        )
        return b"AUDIO"

    with patch.object(provider, "_synthesize_sync", side_effect=_record_call):
        audio = await provider.synthesize("你好", language="zh")

    assert audio == b"AUDIO"
    assert recorded["primary_language"] == PRIMARY_LANGUAGE_ZH
    assert recorded["voice_type"] == 501004
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_tencent_tts_en_user_language_uses_primary_en_even_if_text_is_chinese() -> None:
    """Primary language follows the user preference, not the text.

    Even when the user language is "en" and the text itself is Chinese, the
    provider must hand PRIMARY_LANGUAGE_EN to ``_synthesize_sync``.
    """
    credentials = {"secret_id": "id", "secret_key": "key"}
    voices = {"voice_type": 501004, "voice_type_en": 501004}
    provider = TencentTTSProvider(**credentials, **voices)

    recorded: dict = {}

    # Parameter names are kept as-is in case the provider calls with keywords.
    def _record_call(text: str, voice_type: int, primary_language: int) -> bytes:
        recorded["primary_language"] = primary_language
        # Captured for debugging only; this test does not assert on the voice.
        recorded["voice_type"] = voice_type
        return b"OK"

    with patch.object(provider, "_synthesize_sync", side_effect=_record_call):
        audio = await provider.synthesize("这是中文回复。", language="en")

    assert audio == b"OK"
    assert recorded["primary_language"] == PRIMARY_LANGUAGE_EN
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_tencent_tts_en_uses_primary_language_2_and_en_voice() -> None:
    """synthesize(..., language="en") passes PRIMARY_LANGUAGE_EN and voice 501004 downstream.

    ``_synthesize_sync`` is replaced by a recorder so no real request is made.
    """
    credentials = {"secret_id": "id", "secret_key": "key"}
    # NOTE(review): both voices are configured with the same id, so the
    # voice_type assertion below cannot tell which of the two was selected.
    voices = {"voice_type": 501004, "voice_type_en": 501004}
    provider = TencentTTSProvider(**credentials, **voices)

    recorded: dict = {}

    # Parameter names are kept as-is in case the provider calls with keywords.
    def _record_call(text: str, voice_type: int, primary_language: int) -> bytes:
        recorded.update(
            text=text,
            voice_type=voice_type,
            primary_language=primary_language,
        )
        return b"AUDIO_EN"

    with patch.object(provider, "_synthesize_sync", side_effect=_record_call):
        audio = await provider.synthesize("Hello there.", language="en")

    assert audio == b"AUDIO_EN"
    assert recorded["primary_language"] == PRIMARY_LANGUAGE_EN
    assert recorded["voice_type"] == 501004
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_tencent_tts_en_uses_relaxed_chunk_size() -> None:
    """The same long text is sent as one chunk for "en" but as several for "zh".

    The original note states that English text up to ~480 letters fits in a
    single chunk; that limit lives in the provider and is not visible here.
    """
    provider = TencentTTSProvider(
        secret_id="id",
        secret_key="key",
        voice_type=501004,
        voice_type_en=501004,
    )

    def _length_recorder(sink: list):
        """Build a ``_synthesize_sync`` stand-in that logs each chunk's length."""

        # Parameter names are kept as-is in case the provider calls with keywords.
        def _fake(text: str, voice_type: int, primary_language: int) -> bytes:
            sink.append(len(text))
            return b"X"

        return _fake

    # 80 four-letter words joined by 79 spaces: 399 chars, no sentence terminators.
    long_text = " ".join(["Word"] * 80)

    lengths_by_language: dict = {"en": [], "zh": []}
    for language in ("en", "zh"):
        recorder = _length_recorder(lengths_by_language[language])
        with patch.object(provider, "_synthesize_sync", side_effect=recorder):
            await provider.synthesize(long_text, language=language)

    # English sends the whole text in a single request; the Chinese path must split.
    assert lengths_by_language["en"] == [len(long_text)]
    assert len(lengths_by_language["zh"]) > 1
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_tencent_tts_returns_empty_when_credentials_missing() -> None:
    """A provider built with empty secret_id/secret_key yields empty bytes from synthesize."""
    unconfigured = TencentTTSProvider(secret_id="", secret_key="")
    audio = await unconfigured.synthesize("Hello", language="en")
    assert audio == b""
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_tencent_tts_voice_type_en_falls_back_to_english_voice_when_unset() -> None:
    """When voice_type_en is omitted, the English path falls back to 501004.

    Per the original note, 501004 is "Yuehua", an LLM voice with native
    Chinese/English mixing.
    """
    provider = TencentTTSProvider(secret_id="id", secret_key="key")
    recorded: dict = {}

    # Parameter names are kept as-is in case the provider calls with keywords.
    def _record_voice(text: str, voice_type: int, primary_language: int) -> bytes:
        recorded["voice_type"] = voice_type
        return b"X"

    with patch.object(provider, "_synthesize_sync", side_effect=_record_voice):
        await provider.synthesize("Hi", language="en")

    chosen_voice = recorded["voice_type"]
    assert chosen_voice == 501004
    # Regression guard: explicitly assert it is not one of the legacy Chinese
    # premium voices — falling back to 1001 / 1002 etc. is forbidden.
    assert chosen_voice not in (1001, 1002)
|
||
|
||
|
||
# ── Key regression: _synthesize_sync must set ModelType=1 on the request (required for LLM-voice routing) ──
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_tencent_tts_synthesize_sync_sets_model_type_1() -> None:
    """The TextToVoice request must carry ModelType=1.

    Per the original note, 501004 ("Yuehua") is an LLM voice: without an
    explicit ModelType=1 the legacy model rejects the request and silently
    returns empty audio. The SDK client is mocked here so the request object
    can be inspected as a regression guard.
    """
    from base64 import b64encode

    provider = TencentTTSProvider(
        secret_id="id",
        secret_key="key",
        voice_type=501004,
        voice_type_en=501004,
    )

    captured: dict = {}
    request_fields = (
        "VoiceType",
        "PrimaryLanguage",
        "ModelType",
        "Codec",
        "SampleRate",
        "Text",
    )

    def _fake_text_to_voice(req):
        # Snapshot the request fields, then answer with base64-encoded audio.
        for field in request_fields:
            captured[field] = getattr(req, field)
        return MagicMock(
            Audio=b64encode(b"AUDIO").decode("ascii"),
            RequestId="req-test",
        )

    sdk_client = MagicMock()
    sdk_client.TextToVoice.side_effect = _fake_text_to_voice

    with patch.object(provider, "_get_client", return_value=sdk_client):
        audio = await provider.synthesize("你好", language="zh")

    assert audio == b"AUDIO"
    assert captured["ModelType"] == MODEL_TYPE_LLM
    assert MODEL_TYPE_LLM == 1
    assert captured["VoiceType"] == 501004
    assert captured["PrimaryLanguage"] == PRIMARY_LANGUAGE_ZH
    assert captured["Text"] == "你好"
|
||
|
||
|
||
# ── Port compatibility: the OpenAI implementation accepts the language kwarg ──
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_openai_tts_accepts_language_kwarg() -> None:
    """Port signature compatibility: the OpenAI adapter must accept ``language`` (even if unused)."""
    from app.adapters.tts.openai_tts import OpenAITTSProvider

    # An empty api_key means no client is created, so synthesize returns b"".
    provider = OpenAITTSProvider(api_key="")

    for text, language in (("hi", "en"), ("你好", "zh")):
        audio = await provider.synthesize(text, language=language)
        assert audio == b""
|