Files
life-echo/api/app/adapters/tts/tencent_tts.py
Kevin ccdc4e4277 feat(i18n): persist language preference and thread through chat, memoir, TTS
- Add users.language_preference (Alembic 0018, default zh); capture at signup/SMS
  only; expose on auth and profile APIs
- Lite English prompts for chat and memoir; localized stage labels and agent
  names (Life Echo / 岁月知己)
- Tencent TTS: language-aware synthesis, ModelType=1 for 501004, English chunking
- WebSocket pipeline: emit all AGENT_RESPONSE segments when TTS cancels; INFO logs
  for tts_this_turn and TTS decisions; on-demand TTS logging
- Expo: device language on auth, i18n tiers/agent name, [SPLIT] streaming UX fixes
- Tests for migration, prompts, pipeline, router tts_this_turn, reply segments

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-11 16:16:49 +08:00

274 lines
9.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tencent Cloud TTS adapter — implements TTSProvider port."""
import asyncio
import base64
import re
import uuid
from app.core.logging import get_logger
logger = get_logger(__name__)
# Maps OpenAI-style voice names to Tencent VoiceType IDs.
VOICE_MAP: dict[str, int] = {
    "alloy": 1001,
    "echo": 1002,
    "fable": 1003,
    "onyx": 1004,
    "nova": 1005,
    "shimmer": 1006,
}

# Tencent TTS per-request text limit: <=150 Chinese characters or <=500 letters.
# The English cap is counted in letters and relaxed to ~480 to leave headroom.
MAX_CHARS_PER_REQUEST_ZH = 150
MAX_CHARS_PER_REQUEST_EN = 480

# Tencent PrimaryLanguage: 1 = Chinese (including mixed Chinese/English reading),
# 2 = English.
PRIMARY_LANGUAGE_ZH = 1
PRIMARY_LANGUAGE_EN = 2

# Tencent ModelType: 1 = new model (covers the 501xxx large-model voices and the
# newer premium voices).
# Large-model voices (e.g. 501004 "Yuehua") must pass ModelType=1 explicitly;
# otherwise the request may be rejected by the old model and return empty audio.
# Older premium voices (e.g. 1001 / 101050) also accept ModelType=1, so setting
# it unconditionally does not break the legacy path.
# Docs: https://cloud.tencent.com/document/api/1073/37995
MODEL_TYPE_LLM = 1
# Sentence terminators that close a chunk: CJK / ASCII stops and newlines.
# A "." directly followed by a digit is treated as a decimal point ("3.14"),
# not as a sentence end, so numbers are not torn apart before synthesis.
_SENTENCE_END_RE = re.compile(r"((?:[。！？!?\n]|\.(?!\d))+)")

# Softer boundaries preferred when a single sentence alone exceeds the limit:
# CJK clause punctuation and whitespace (the latter covers English word gaps).
_CLAUSE_BREAK_CHARS = "，、；： \t"


def _split_oversized(segment: str, max_chars: int) -> list[str]:
    """Cut ``segment`` into pieces of at most ``max_chars`` characters.

    Each cut is placed right after the last clause mark or whitespace found in
    the second half of the current window, so words and clauses stay intact
    and pieces do not become tiny. If no such boundary exists the window is
    cut at exactly ``max_chars``. ``max_chars`` must be >= 1.
    """
    pieces: list[str] = []
    rest = segment.strip()
    while len(rest) > max_chars:
        window = rest[:max_chars]
        last_break = max(
            window.rfind(mark, max_chars // 2) for mark in _CLAUSE_BREAK_CHARS
        )
        cut = last_break + 1 if last_break >= 0 else max_chars
        # ``rest`` never starts with whitespace, so this piece is non-empty.
        pieces.append(rest[:cut].rstrip())
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def _chunk_text(text: str, max_chars: int = MAX_CHARS_PER_REQUEST_ZH) -> list[str]:
    """Split ``text`` into chunks that each fit into one TTS request.

    Text that already fits is returned as a single chunk. Longer text is split
    at sentence terminators, one sentence per chunk with its terminator
    attached. A sentence that is itself longer than ``max_chars`` is further
    cut at clause/whitespace boundaries (hard cut as a last resort).

    Guarantees for the split path:
      * no chunk is longer than ``max_chars`` (trailing punctuation is
        truncated rather than allowed to overflow the limit);
      * no chunk consists only of terminators/whitespace, since such a chunk
        carries nothing to speak.

    Args:
        text: Text to synthesize; ``None`` is treated as empty.
        max_chars: Maximum number of characters per chunk.

    Returns:
        The chunks in reading order; an empty list for blank input.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1 and there is text to
            split (a non-positive limit could never make progress).
    """
    text = (text or "").strip()
    if not text:
        return []
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    if len(text) <= max_chars:
        return [text]

    # With one capturing group, split() yields
    # [body, terminator, body, terminator, ..., body].
    parts = _SENTENCE_END_RE.split(text)
    bodies = parts[0::2]
    terminators = parts[1::2] + [""]  # the final body has no terminator

    chunks: list[str] = []
    for body, terminator in zip(bodies, terminators):
        if not body.strip():
            # Terminators with no preceding content: nothing to speak.
            continue
        sentence = (body + terminator).strip()
        if len(sentence) <= max_chars:
            chunks.append(sentence)
            continue
        # The sentence alone is over the limit: cut the body, then re-attach
        # as much of the terminator as still fits on the last piece.
        pieces = _split_oversized(body, max_chars)
        room = max_chars - len(pieces[-1])
        pieces[-1] = (pieces[-1] + terminator.strip()[:room]).rstrip()
        chunks.extend(pieces)
    return chunks
class TencentTTSProvider:
    """Tencent Cloud text-to-speech adapter implementing the TTSProvider port.

    Text is split into request-sized chunks, each chunk is synthesized through
    the Tencent Cloud SDK in a worker thread, and the resulting audio bytes are
    concatenated. Every failure path logs and yields ``b""`` instead of
    raising.
    """

    def __init__(
        self,
        secret_id: str,
        secret_key: str,
        voice_type: int = 1001,
        codec: str = "mp3",
        voice_type_en: int | None = None,
    ):
        """Store credentials and voice settings; the SDK client is built lazily.

        Args:
            secret_id: Tencent Cloud API secret id.
            secret_key: Tencent Cloud API secret key.
            voice_type: Default Tencent VoiceType for non-English synthesis.
            codec: Audio codec sent with every request.
            voice_type_en: Default VoiceType for English; ``None`` selects
                501004.
        """
        self._secret_id = secret_id
        self._secret_key = secret_key
        self._voice_type = voice_type
        # When no English voice is configured, fall back to 501004 ("Yuehua"),
        # a Tencent Cloud large-model voice that supports mixed Chinese/English.
        # The 501xxx large-model voices support mixed reading under both
        # PrimaryLanguage=1 and 2, so Tencent does not reject them with
        # InvalidParameterValue.PrimaryLanguage; they must be paired with
        # ModelType=1.
        self._voice_type_en = voice_type_en if voice_type_en is not None else 501004
        self._codec = codec
        self._client = None

    def _get_client(self):
        """Return the cached SDK client, creating it on first use.

        Returns ``None`` (after logging the error) when the SDK cannot be
        imported or the client cannot be constructed.
        """
        if self._client is not None:
            return self._client
        try:
            # Imported lazily so the module loads even without the SDK.
            from tencentcloud.common import credential
            from tencentcloud.common.profile.client_profile import ClientProfile
            from tencentcloud.common.profile.http_profile import HttpProfile
            from tencentcloud.tts.v20190823 import tts_client

            cred = credential.Credential(self._secret_id, self._secret_key)
            http_profile = HttpProfile()
            http_profile.endpoint = "tts.tencentcloudapi.com"
            client_profile = ClientProfile()
            client_profile.httpProfile = http_profile
            self._client = tts_client.TtsClient(cred, "", client_profile)
            return self._client
        except Exception as e:
            logger.error("Tencent TTS client init failed: {}", e)
            return None

    def _synthesize_sync(
        self,
        text: str,
        voice_type: int,
        primary_language: int = PRIMARY_LANGUAGE_ZH,
    ) -> bytes:
        """Synthesize one chunk with a blocking SDK call.

        Args:
            text: Chunk text; expected to already respect the per-request limit.
            voice_type: Tencent VoiceType id.
            primary_language: Tencent PrimaryLanguage value.

        Returns:
            Decoded audio bytes, or ``b""`` when the client is unavailable, the
            SDK reports an error, or the response carries no audio.
        """
        client = self._get_client()
        if not client:
            logger.warning(
                "tencent_tts._synthesize_sync no client provider=tencent voice_type={}",
                voice_type,
            )
            return b""

        # Import outside the main ``try``: if this import failed inside it, the
        # ``except TencentCloudSDKException`` clause would reference an unbound
        # name and raise instead of returning b"".
        try:
            from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
                TencentCloudSDKException,
            )
            from tencentcloud.tts.v20190823 import models
        except ImportError as e:
            logger.error(
                "Tencent TTS synthesize failed provider=tencent voice_type={} primary_language={}: {}",
                voice_type,
                primary_language,
                e,
            )
            return b""

        try:
            req = models.TextToVoiceRequest()
            req.Text = text
            req.SessionId = uuid.uuid4().hex
            req.VoiceType = voice_type
            req.PrimaryLanguage = primary_language
            req.SampleRate = 16000
            req.Codec = self._codec
            # Explicitly select the new model: large-model voices (501xxx) sent
            # without this field are rejected by the old model and silently
            # return empty audio.
            req.ModelType = MODEL_TYPE_LLM
            # Keep at INFO long-term: the key parameters of the actual request
            # sent to the Tencent Cloud SDK.
            logger.info(
                "tencent_tts._synthesize_sync request voice_type={} primary_language={} "
                "model_type={} sample_rate={} codec={} text_len={}",
                voice_type,
                primary_language,
                MODEL_TYPE_LLM,
                req.SampleRate,
                self._codec,
                len(text or ""),
            )
            resp = client.TextToVoice(req)
            request_id = getattr(resp, "RequestId", None) if resp is not None else None
            audio_b64 = getattr(resp, "Audio", "") if resp is not None else ""
            if not audio_b64:
                logger.warning(
                    "tencent_tts._synthesize_sync empty audio voice_type={} "
                    "primary_language={} model_type={} request_id={}",
                    voice_type,
                    primary_language,
                    MODEL_TYPE_LLM,
                    request_id,
                )
                return b""
            audio_bytes = base64.b64decode(audio_b64)
            # Keep at INFO long-term: the request_id returned by the SDK plus
            # the audio byte count (required for user troubleshooting).
            logger.info(
                "tencent_tts._synthesize_sync response request_id={} audio_bytes_len={} "
                "voice_type={} primary_language={}",
                request_id,
                len(audio_bytes),
                voice_type,
                primary_language,
            )
            return audio_bytes
        except TencentCloudSDKException as e:
            logger.error(
                "Tencent TTS SDK error provider=tencent voice_type={} primary_language={} "
                "model_type={} code={} message={} request_id={} raw={}",
                voice_type,
                primary_language,
                MODEL_TYPE_LLM,
                getattr(e, "code", None),
                getattr(e, "message", None),
                getattr(e, "requestId", None),
                e,
            )
            return b""
        except Exception as e:
            logger.error(
                "Tencent TTS synthesize failed provider=tencent voice_type={} primary_language={}: {}",
                voice_type,
                primary_language,
                e,
            )
            return b""

    async def synthesize(
        self,
        text: str,
        voice: str = "alloy",
        *,
        language: str = "zh",
    ) -> bytes:
        """Synthesize ``text`` and return the concatenated audio of all chunks.

        Args:
            text: Text to speak; ``None`` or blank yields ``b""``.
            voice: OpenAI-style voice name. ``"alloy"`` (or ``None``) and
                unknown names resolve to the configured default voice for the
                language; other known names are looked up in ``VOICE_MAP``.
            language: Language tag. Tags whose primary subtag is ``en`` (for
                example ``"en"``, ``"en-US"``, ``"en_GB"``) select the English
                voice, PrimaryLanguage and chunk size; anything else, including
                ``None``, is treated as Chinese.

        Returns:
            Audio bytes, or ``b""`` if credentials are missing, there is
            nothing to synthesize, or any single chunk fails.
        """
        if not self._secret_id or not self._secret_key:
            logger.error(
                "Tencent TTS credentials not configured provider=tencent secret_id_set={} secret_key_set={}",
                bool(self._secret_id),
                bool(self._secret_key),
            )
            return b""

        # Compare only the primary subtag so regional variants count as English.
        lang_tag = (language or "zh").strip().lower().replace("_", "-")
        is_en = lang_tag.split("-", 1)[0] == "en"
        primary_language = PRIMARY_LANGUAGE_EN if is_en else PRIMARY_LANGUAGE_ZH
        default_voice = self._voice_type_en if is_en else self._voice_type
        max_chars = MAX_CHARS_PER_REQUEST_EN if is_en else MAX_CHARS_PER_REQUEST_ZH

        # The default "alloy" aligns with OpenAI TTS naming. Callers currently
        # do not pass a specific voice, so in practice only the default_voice
        # branch is taken, which corresponds to settings.tts_voice_type /
        # tts_voice_type_en.
        v = (voice or "alloy").strip().lower()
        voice_type = default_voice if v == "alloy" else VOICE_MAP.get(v, default_voice)

        chunks = _chunk_text(text or "", max_chars=max_chars)
        # Keep at INFO long-term: language / voice_type / chunk_count at the
        # adapter entry point (required for troubleshooting).
        logger.info(
            "tencent_tts.synthesize entry language={} voice_arg={} resolved_voice_type={} "
            "primary_language={} max_chars={} text_len={} chunk_count={}",
            language,
            voice,
            voice_type,
            primary_language,
            max_chars,
            len(text or ""),
            len(chunks),
        )
        if not chunks:
            return b""

        results: list[bytes] = []
        for idx, chunk in enumerate(chunks):
            # The SDK call blocks, so run it off the event loop.
            audio = await asyncio.to_thread(
                self._synthesize_sync, chunk, voice_type, primary_language
            )
            if not audio:
                # All-or-nothing: one failed chunk discards the whole result.
                logger.warning(
                    "tencent_tts.synthesize chunk failed chunk_index={} chunk_chars={} "
                    "voice_type={} primary_language={}",
                    idx,
                    len(chunk),
                    voice_type,
                    primary_language,
                )
                return b""
            logger.debug(
                "tencent_tts.synthesize chunk ok chunk_index={} chunk_chars={} audio_bytes_len={}",
                idx,
                len(chunk),
                len(audio),
            )
            results.append(audio)

        # NOTE(review): chunks are joined byte-wise; presumably fine for the
        # default mp3 codec, but container formats with per-file headers (e.g.
        # wav) may not survive plain concatenation - confirm before switching
        # codec.
        merged = b"".join(results)
        logger.debug(
            "tencent_tts.synthesize done language={} voice_type={} chunks={} total_bytes={}",
            language,
            voice_type,
            len(chunks),
            len(merged),
        )
        return merged