"""Tencent Cloud TTS adapter — implements TTSProvider port.""" import asyncio import base64 import re import uuid from app.core.logging import get_logger logger = get_logger(__name__) # OpenAI voice name -> Tencent VoiceType ID VOICE_MAP: dict[str, int] = { "alloy": 1001, "echo": 1002, "fable": 1003, "onyx": 1004, "nova": 1005, "shimmer": 1006, } # Tencent TTS API limit: ≤150 Chinese chars or ≤500 letters (英文按字母放宽到 ~480 留余量) MAX_CHARS_PER_REQUEST_ZH = 150 MAX_CHARS_PER_REQUEST_EN = 480 # Tencent PrimaryLanguage: 1=中文(含中英混读),2=英文 PRIMARY_LANGUAGE_ZH = 1 PRIMARY_LANGUAGE_EN = 2 # Tencent ModelType: 1=新模型(覆盖大模型音色 501xxx 系列与新版精品音色)。 # 大模型音色(如 501004 月华)必须显式传 ModelType=1,否则可能被旧模型拒绝并返回空音频; # 老精品音色(如 1001/101050 等)也接受 ModelType=1,因此无条件设置不会破坏老链路。 # 文档:https://cloud.tencent.com/document/api/1073/37995 MODEL_TYPE_LLM = 1 def _chunk_text(text: str, max_chars: int = MAX_CHARS_PER_REQUEST_ZH) -> list[str]: """Split text into chunks within API limit.""" text = text.strip() if not text: return [] if len(text) <= max_chars: return [text] chunks: list[str] = [] # Split by sentence boundaries first pattern = r"[。!?.!?\n]+" parts = re.split(f"({pattern})", text) current = "" for i, p in enumerate(parts): if re.match(pattern, p): current += p if current.strip(): chunks.append(current.strip()) current = "" else: if len(current) + len(p) <= max_chars: current += p else: if current.strip(): chunks.append(current.strip()) current = "" # Single part exceeds limit, split by length while p: chunk = p[:max_chars] p = p[max_chars:] chunks.append(chunk) if current.strip(): chunks.append(current.strip()) return chunks class TencentTTSProvider: def __init__( self, secret_id: str, secret_key: str, voice_type: int = 1001, codec: str = "mp3", voice_type_en: int | None = None, ): self._secret_id = secret_id self._secret_key = secret_key self._voice_type = voice_type # 英文音色未单独配置时回落到 501004(月华,腾讯云大模型音色,支持中英混合)。 # 大模型音色 501xxx 系列在 PrimaryLanguage=1/2 下均支持中英混读,不会被 Tencent # 以 InvalidParameterValue.PrimaryLanguage 拒绝;与之对应必须配合 ModelType=1。 self._voice_type_en = voice_type_en if voice_type_en is not None else 501004 self._codec = codec self._client = None def _get_client(self): if self._client is not None: return self._client try: from tencentcloud.common import credential from tencentcloud.common.profile.client_profile import ClientProfile from tencentcloud.common.profile.http_profile import HttpProfile from tencentcloud.tts.v20190823 import tts_client cred = credential.Credential(self._secret_id, self._secret_key) http_profile = HttpProfile() http_profile.endpoint = "tts.tencentcloudapi.com" client_profile = ClientProfile() client_profile.httpProfile = http_profile self._client = tts_client.TtsClient(cred, "", client_profile) return self._client except Exception as e: logger.error("Tencent TTS client init failed: {}", e) return None def _synthesize_sync( self, text: str, voice_type: int, primary_language: int = PRIMARY_LANGUAGE_ZH, ) -> bytes: client = self._get_client() if not client: logger.warning( "tencent_tts._synthesize_sync no client provider=tencent voice_type={}", voice_type, ) return b"" try: from tencentcloud.common.exception.tencent_cloud_sdk_exception import ( TencentCloudSDKException, ) from tencentcloud.tts.v20190823 import models req = models.TextToVoiceRequest() req.Text = text req.SessionId = uuid.uuid4().hex req.VoiceType = voice_type req.PrimaryLanguage = primary_language req.SampleRate = 16000 req.Codec = self._codec # 显式声明使用新模型;大模型音色(501xxx)若不带该字段会被旧模型拒绝并静默返回空音频。 req.ModelType = MODEL_TYPE_LLM # 长期保留 INFO:TTS 实际请求腾讯云 SDK 时的关键参数 logger.info( "tencent_tts._synthesize_sync request voice_type={} primary_language={} " "model_type={} sample_rate={} codec={} text_len={}", voice_type, primary_language, MODEL_TYPE_LLM, req.SampleRate, self._codec, len(text or ""), ) resp = client.TextToVoice(req) request_id = getattr(resp, "RequestId", None) if resp is not None else None audio_b64 = getattr(resp, "Audio", "") if resp is not None else "" if not audio_b64: logger.warning( "tencent_tts._synthesize_sync empty audio voice_type={} " "primary_language={} model_type={} request_id={}", voice_type, primary_language, MODEL_TYPE_LLM, request_id, ) return b"" audio_bytes = base64.b64decode(audio_b64) # 长期保留 INFO:腾讯云 SDK 返回的 request_id + 音频字节数(用户排查必需) logger.info( "tencent_tts._synthesize_sync response request_id={} audio_bytes_len={} " "voice_type={} primary_language={}", request_id, len(audio_bytes), voice_type, primary_language, ) return audio_bytes except TencentCloudSDKException as e: logger.error( "Tencent TTS SDK error provider=tencent voice_type={} primary_language={} " "model_type={} code={} message={} request_id={} raw={}", voice_type, primary_language, MODEL_TYPE_LLM, getattr(e, "code", None), getattr(e, "message", None), getattr(e, "requestId", None), e, ) return b"" except Exception as e: logger.error( "Tencent TTS synthesize failed provider=tencent voice_type={} primary_language={}: {}", voice_type, primary_language, e, ) return b"" async def synthesize( self, text: str, voice: str = "alloy", *, language: str = "zh", ) -> bytes: if not self._secret_id or not self._secret_key: logger.error( "Tencent TTS credentials not configured provider=tencent secret_id_set={} secret_key_set={}", bool(self._secret_id), bool(self._secret_key), ) return b"" is_en = (language or "zh").strip().lower() == "en" primary_language = PRIMARY_LANGUAGE_EN if is_en else PRIMARY_LANGUAGE_ZH default_voice = self._voice_type_en if is_en else self._voice_type max_chars = MAX_CHARS_PER_REQUEST_EN if is_en else MAX_CHARS_PER_REQUEST_ZH # Default "alloy" aligns with OpenAI TTS naming. Caller 链路里目前不会传具体音色, # 因此实际只走 default_voice 分支,对应 settings.tts_voice_type / tts_voice_type_en。 v = voice.lower() if v == "alloy": voice_type = default_voice else: voice_type = VOICE_MAP.get(v, default_voice) chunks = _chunk_text(text, max_chars=max_chars) # 长期保留 INFO:adapter 入口的 language / voice_type / chunk_count(排查必需) logger.info( "tencent_tts.synthesize entry language={} voice_arg={} resolved_voice_type={} " "primary_language={} max_chars={} text_len={} chunk_count={}", language, voice, voice_type, primary_language, max_chars, len(text or ""), len(chunks), ) if not chunks: return b"" results: list[bytes] = [] for idx, chunk in enumerate(chunks): audio = await asyncio.to_thread( self._synthesize_sync, chunk, voice_type, primary_language ) if not audio: logger.warning( "tencent_tts.synthesize chunk failed chunk_index={} chunk_chars={} " "voice_type={} primary_language={}", idx, len(chunk), voice_type, primary_language, ) return b"" logger.debug( "tencent_tts.synthesize chunk ok chunk_index={} chunk_chars={} audio_bytes_len={}", idx, len(chunk), len(audio), ) results.append(audio) merged = b"".join(results) logger.debug( "tencent_tts.synthesize done language={} voice_type={} chunks={} total_bytes={}", language, voice_type, len(chunks), len(merged), ) return merged