feat(api): use Tencent ASR flash with 16k_zh_large and dev transcript logs

Replace CreateRecTask polling with recording-file flash API, add TENCENT_APP_ID,
remove server-side pydub slicing, and log ASR recognition text at INFO in development.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Kevin
2026-05-25 11:28:22 +08:00
parent 22d282dc01
commit 07979bfb09
22 changed files with 354 additions and 185 deletions

View File

@@ -1,7 +1,13 @@
"""Tencent Cloud ASR adapter — implements ASRProvider port."""
"""Tencent Cloud ASR adapter — 录音文件识别极速版 (Flash), implements ASRProvider port."""
import asyncio
import base64
import hashlib
import hmac
import json
import time
from typing import Any
import httpx
from app.core.business_telemetry import business_span
from app.core.logging import get_logger
@@ -9,82 +15,198 @@ from app.ports.asr import ASRTranscriptionError
# Module-level logger for this adapter.
logger = get_logger(__name__)
# Host and path prefix of the Tencent ASR flash recognition endpoint. Both are
# concatenated into the string that gets signed and into the request URL.
_FLASH_HOST = "asr.cloud.tencent.com"
_FLASH_PATH_PREFIX = "/asr/flash/v1/"
# Upload size cap for the flash edition (Tencent docs: local uploads <= 100 MB).
_MAX_FLASH_AUDIO_BYTES = 100 * 1024 * 1024
def _format_flash_sign_string(sorted_params: list[tuple[str, Any]]) -> str:
    """Assemble the string that is signed for a flash recognition request.

    Kept consistent with ``flash_recognizer._format_sign_string`` from the
    official SDK (per the original note on this function).

    Args:
        sorted_params: ``(key, value)`` pairs, expected to be pre-sorted by
            the caller; this function does not sort them.

    Returns:
        ``"POST" + host + path prefix + appid``, followed by ``"?"`` and the
        remaining pairs rendered as ``key=value`` joined with ``"&"``. When
        there are no pairs other than ``appid`` the ``"?"`` is omitted.
    """
    # The value of the first "appid" pair (if any) becomes the final path segment.
    app_segment = next(
        (str(val) for name, val in sorted_params if name == "appid"),
        "",
    )
    # Every other pair goes into the query part, in the order given.
    query = "&".join(
        f"{name}={val}" for name, val in sorted_params if name != "appid"
    )
    base = f"POST{_FLASH_HOST}{_FLASH_PATH_PREFIX}{app_segment}"
    if not query:
        return base
    return f"{base}?{query}"
def _build_flash_url_and_headers(
    secret_key: str, params: dict[str, Any]
) -> tuple[str, dict[str, str]]:
    """Sign the request parameters and derive the request URL and headers.

    Args:
        secret_key: Key used for the HMAC-SHA1 signature.
        params: Query parameters; they are sorted by key before signing.

    Returns:
        A ``(url, headers)`` tuple. ``headers`` carries ``Host`` and the
        base64-encoded signature under ``Authorization``.
    """
    ordered = sorted(params.items(), key=lambda pair: pair[0])
    to_sign = _format_flash_sign_string(ordered)

    mac = hmac.new(
        secret_key.encode("utf-8"),
        to_sign.encode("utf-8"),
        hashlib.sha1,
    )
    authorization = base64.b64encode(mac.digest()).decode("utf-8")

    # The signed string is the HTTP verb followed by host, path and query, so
    # dropping the leading "POST" leaves exactly the URL without its scheme.
    request_url = "https://" + to_sign[len("POST"):]
    request_headers = {
        "Host": _FLASH_HOST,
        "Authorization": authorization,
    }
    return request_url, request_headers
def _build_flash_query_params(
*,
app_id: str,
secret_id: str,
engine_type: str,
voice_format: str,
) -> dict[str, Any]:
return {
"appid": app_id,
"secretid": secret_id,
"timestamp": str(int(time.time())),
"engine_type": engine_type,
"voice_format": voice_format,
"speaker_diarization": 0,
"customization_id": "",
"filter_dirty": 0,
"filter_modal": 0,
"filter_punc": 0,
"convert_num_mode": 1,
"word_info": 0,
"first_channel_only": 1,
"reinforce_hotword": 0,
"sentence_max_length": 0,
}
class TencentASRProvider:
    """Tencent Cloud ASR adapter using the recording-file flash recognition HTTP endpoint.

    The audio bytes are posted as a raw request body to a signed URL, and the
    recognised text of every returned channel is concatenated into one string.
    All failures surface as ``ASRTranscriptionError``.
    """

    def __init__(
        self,
        secret_id: str,
        secret_key: str,
        app_id: str,
        *,
        engine_type: str = "16k_zh_large",
        request_timeout_seconds: float = 60.0,
    ):
        """Store credentials and request settings; performs no I/O.

        Args:
            secret_id: Tencent Cloud secret id.
            secret_key: Tencent Cloud secret key used to sign requests.
            app_id: Tencent Cloud app id; ``None`` is tolerated and
                surrounding whitespace is stripped.
            engine_type: Recognition engine sent with every request.
            request_timeout_seconds: Timeout passed to the HTTP POST.
        """
        self._secret_id = secret_id
        self._secret_key = secret_key
        self._app_id = (app_id or "").strip()
        self._engine_type = engine_type
        self._request_timeout_seconds = request_timeout_seconds

    def ensure_ready(self) -> bool:
        """Return True when secret id, secret key and app id are all non-empty."""
        return bool(self._secret_id and self._secret_key and self._app_id)

    async def transcribe(self, audio: bytes, format: str = "m4a") -> str:
        """Transcribe ``audio`` and return the recognised text.

        Args:
            audio: Raw audio file bytes.
            format: Audio container/codec name; lower-cased before sending.

        Raises:
            ASRTranscriptionError: On configuration, transport, service or
                empty-result failures.
        """
        with business_span("asr.transcribe", provider="tencent_flash"):
            return await self._transcribe_inner(audio, format)

    async def _transcribe_inner(self, audio: bytes, format: str) -> str:
        """Validate input, POST the audio to the flash endpoint and parse the reply."""
        if not self.ensure_ready():
            raise ASRTranscriptionError(
                "Tencent ASR flash not configured (need TENCENT_APP_ID, SECRET_ID, SECRET_KEY)"
            )
        # Reject empty input locally instead of spending a network round trip on it.
        if not audio:
            raise ASRTranscriptionError("Tencent ASR flash received empty audio")
        if len(audio) > _MAX_FLASH_AUDIO_BYTES:
            raise ASRTranscriptionError(
                f"Audio exceeds {_MAX_FLASH_AUDIO_BYTES // (1024 * 1024)}MB flash upload limit"
            )

        voice_format = (format or "m4a").lower()
        params = _build_flash_query_params(
            app_id=self._app_id,
            secret_id=self._secret_id,
            engine_type=self._engine_type,
            voice_format=voice_format,
        )
        url, headers = _build_flash_url_and_headers(self._secret_key, params)
        headers["Content-Type"] = "application/octet-stream"
        headers["Content-Length"] = str(len(audio))

        try:
            async with httpx.AsyncClient() as client:
                resp = await client.post(
                    url,
                    headers=headers,
                    content=audio,
                    timeout=self._request_timeout_seconds,
                )
                if resp.status_code >= 400:
                    raise ASRTranscriptionError(
                        f"Tencent ASR flash HTTP {resp.status_code}: {resp.text[:200]}"
                    )
                payload = resp.json()
        except ASRTranscriptionError:
            # Already in the domain error type; do not re-wrap.
            raise
        except httpx.HTTPError as e:
            logger.error("Tencent ASR flash HTTP failed: {}", e, exc_info=True)
            raise ASRTranscriptionError(f"Tencent ASR flash HTTP failed: {e!s}") from e
        except json.JSONDecodeError as e:
            logger.error("Tencent ASR flash invalid JSON: {}", e, exc_info=True)
            raise ASRTranscriptionError("Tencent ASR flash returned invalid JSON") from e
        except Exception as e:
            logger.error("Tencent ASR flash transcribe failed: {}", e, exc_info=True)
            raise ASRTranscriptionError(f"Tencent ASR flash transcribe failed: {e!s}") from e

        return self._parse_flash_response(payload, audio_len=len(audio), voice_format=voice_format)

    def _parse_flash_response(
        self, payload: dict[str, Any], *, audio_len: int, voice_format: str
    ) -> str:
        """Extract the recognised text from a decoded flash response.

        Args:
            payload: Decoded JSON body of the response.
            audio_len: Size of the submitted audio in bytes (logging only).
            voice_format: Format sent with the request (logging only).

        Returns:
            The non-empty ``text`` of every channel in ``flash_result``,
            concatenated without a separator.

        Raises:
            ASRTranscriptionError: If the payload is not a JSON object, the
                service reports a non-zero ``code``, or no text was recognised.
        """
        # Valid JSON is not necessarily an object; without this check a list or
        # scalar body would escape as AttributeError instead of the domain error.
        if not isinstance(payload, dict):
            logger.warning(
                "Tencent ASR flash unexpected payload type={} audio_len={} format={}",
                type(payload).__name__,
                audio_len,
                voice_format,
            )
            raise ASRTranscriptionError(
                "Tencent ASR flash returned an unexpected response shape"
            )

        code = payload.get("code")
        if code != 0:
            message = payload.get("message") or "unknown error"
            request_id = payload.get("request_id", "")
            logger.warning(
                "Tencent ASR flash error code={} message={} request_id={} audio_len={} format={}",
                code,
                message,
                request_id,
                audio_len,
                voice_format,
            )
            if code == 4004:
                raise ASRTranscriptionError(
                    "Tencent ASR flash resource pack exhausted (purchase 录音文件识别极速版)"
                )
            if code == 4003:
                raise ASRTranscriptionError(
                    "Tencent ASR flash service not enabled in console"
                )
            raise ASRTranscriptionError(
                f"Tencent ASR flash failed (code={code}): {message}"
            )

        flash_result = payload.get("flash_result")
        if not isinstance(flash_result, list):
            # Missing or malformed result list is treated like an empty one.
            flash_result = []

        texts: list[str] = []
        for channel in flash_result:
            if not isinstance(channel, dict):
                continue
            raw_text = channel.get("text")
            # Skip null or non-string text rather than failing on .strip().
            if not isinstance(raw_text, str):
                continue
            text = raw_text.strip()
            if text:
                texts.append(text)

        combined = "".join(texts)
        if combined:
            logger.debug(
                "Tencent ASR flash ok request_id={} audio_len={} audio_duration_ms={} "
                "voice_format={} chars={}",
                payload.get("request_id"),
                audio_len,
                payload.get("audio_duration"),
                voice_format,
                len(combined),
            )
            return combined

        logger.warning(
            "Tencent ASR flash empty flash_result, audio_len={} format={} request_id={}",
            audio_len,
            voice_format,
            payload.get("request_id"),
        )
        raise ASRTranscriptionError(
            "Tencent ASR flash empty result (check sample rate / format / audio)"
        )