feat(api): use Tencent ASR flash with 16k_zh_large and dev transcript logs

Replace CreateRecTask polling with the recording-file flash API, add TENCENT_APP_ID, remove server-side pydub slicing, and log ASR recognition text at INFO level in development.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-25 11:28:22 +08:00
parent 22d282dc01
commit 07979bfb09
22 changed files with 354 additions and 185 deletions
--- a/api/app/adapters/asr/tencent_asr.py
+++ b/api/app/adapters/asr/tencent_asr.py
@@ -1,7 +1,13 @@
-"""Tencent Cloud ASR adapter — implements ASRProvider port."""
+"""Tencent Cloud ASR adapter — 录音文件识别极速版 (Flash), implements ASRProvider port."""

-import asyncio
 import base64
+import hashlib
+import hmac
+import json
+import time
+from typing import Any
+
+import httpx

 from app.core.business_telemetry import business_span
 from app.core.logging import get_logger
@@ -9,82 +15,198 @@ from app.ports.asr import ASRTranscriptionError

 logger = get_logger(__name__)

+_FLASH_HOST = "asr.cloud.tencent.com"
+_FLASH_PATH_PREFIX = "/asr/flash/v1/"
+# 极速版本地上传上限（腾讯文档：≤100MB）
+_MAX_FLASH_AUDIO_BYTES = 100 * 1024 * 1024
+
+
+def _format_flash_sign_string(sorted_params: list[tuple[str, Any]]) -> str:
+    """与官方 flash_recognizer._format_sign_string 一致。"""
+    signstr = f"POST{_FLASH_HOST}{_FLASH_PATH_PREFIX}"
+    for key, value in sorted_params:
+        if key == "appid":
+            signstr += str(value)
+            break
+    signstr += "?"
+    for key, value in sorted_params:
+        if key == "appid":
+            continue
+        signstr += f"{key}={value}&"
+    return signstr[:-1]
+
+
+def _build_flash_url_and_headers(
+    secret_key: str, params: dict[str, Any]
+) -> tuple[str, dict[str, str]]:
+    sorted_params = sorted(params.items(), key=lambda item: item[0])
+    signstr = _format_flash_sign_string(sorted_params)
+    signature = base64.b64encode(
+        hmac.new(
+            secret_key.encode("utf-8"),
+            signstr.encode("utf-8"),
+            hashlib.sha1,
+        ).digest()
+    ).decode("utf-8")
+    url = "https://" + signstr[4:]
+    headers = {
+        "Host": _FLASH_HOST,
+        "Authorization": signature,
+    }
+    return url, headers
+
+
+def _build_flash_query_params(
+    *,
+    app_id: str,
+    secret_id: str,
+    engine_type: str,
+    voice_format: str,
+) -> dict[str, Any]:
+    return {
+        "appid": app_id,
+        "secretid": secret_id,
+        "timestamp": str(int(time.time())),
+        "engine_type": engine_type,
+        "voice_format": voice_format,
+        "speaker_diarization": 0,
+        "customization_id": "",
+        "filter_dirty": 0,
+        "filter_modal": 0,
+        "filter_punc": 0,
+        "convert_num_mode": 1,
+        "word_info": 0,
+        "first_channel_only": 1,
+        "reinforce_hotword": 0,
+        "sentence_max_length": 0,
+    }
+

 class TencentASRProvider:
    def __init__(
        self,
        secret_id: str,
        secret_key: str,
+        app_id: str,
        *,
        engine_type: str = "16k_zh_large",
+        request_timeout_seconds: float = 60.0,
    ):
        self._secret_id = secret_id
        self._secret_key = secret_key
+        self._app_id = (app_id or "").strip()
        self._engine_type = engine_type
-        self._client = None
-
-    def _get_client(self):
-        if self._client is not None:
-            return self._client
-        try:
-            from tencentcloud.asr.v20190614 import asr_client
-            from tencentcloud.common import credential
-            from tencentcloud.common.profile.client_profile import ClientProfile
-            from tencentcloud.common.profile.http_profile import HttpProfile
-
-            cred = credential.Credential(self._secret_id, self._secret_key)
-            http_profile = HttpProfile()
-            http_profile.endpoint = "asr.tencentcloudapi.com"
-            client_profile = ClientProfile()
-            client_profile.httpProfile = http_profile
-            self._client = asr_client.AsrClient(cred, "", client_profile)
-            return self._client
-        except Exception as e:
-            logger.error("Tencent ASR client init failed: {}", e)
-            return None
+        self._request_timeout_seconds = request_timeout_seconds

    def ensure_ready(self) -> bool:
-        return bool(self._secret_id and self._secret_key and self._get_client())
+        return bool(self._secret_id and self._secret_key and self._app_id)

    async def transcribe(self, audio: bytes, format: str = "m4a") -> str:
-        with business_span("asr.transcribe", provider="tencent"):
+        with business_span("asr.transcribe", provider="tencent_flash"):
            return await self._transcribe_inner(audio, format)

    async def _transcribe_inner(self, audio: bytes, format: str) -> str:
-        client = self._get_client()
-        if not client:
+        if not self.ensure_ready():
            raise ASRTranscriptionError(
-                "Tencent ASR client not initialized (check credentials)"
+                "Tencent ASR flash not configured (need TENCENT_APP_ID, SECRET_ID, SECRET_KEY)"
            )
+        if len(audio) > _MAX_FLASH_AUDIO_BYTES:
+            raise ASRTranscriptionError(
+                f"Audio exceeds {_MAX_FLASH_AUDIO_BYTES // (1024 * 1024)}MB flash upload limit"
+            )
+
+        voice_format = (format or "m4a").lower()
+        params = _build_flash_query_params(
+            app_id=self._app_id,
+            secret_id=self._secret_id,
+            engine_type=self._engine_type,
+            voice_format=voice_format,
+        )
+        url, headers = _build_flash_url_and_headers(self._secret_key, params)
+        headers["Content-Type"] = "application/octet-stream"
+        headers["Content-Length"] = str(len(audio))
+
        try:
-            from tencentcloud.asr.v20190614 import models
-
-            audio_base64 = base64.b64encode(audio).decode("utf-8")
-            req = models.SentenceRecognitionRequest()
-            req.EngSerViceType = self._engine_type
-            req.SourceType = 1
-            # 小写；与文档一致。iOS 常见为 m4a(AAC) 容器，与 16k 引擎匹配
-            req.VoiceFormat = (format or "m4a").lower()
-            req.Data = audio_base64
-            req.DataLen = len(audio)
-
-            # 腾讯 SDK 为同步阻塞调用；放到线程池里避免卡住事件循环。
-            resp = await asyncio.to_thread(client.SentenceRecognition, req)
-            text = (resp.Result or "").strip()
-            if text:
-                return text
-            err = getattr(resp, "Error", None) or getattr(resp, "Message", None)
-            logger.warning(
-                "Tencent ASR empty Result, audio_len={} format={} err={}",
-                len(audio),
-                req.VoiceFormat,
-                err,
-            )
-            raise ASRTranscriptionError(
-                "Tencent ASR empty Result (check sample rate / format / audio)"
-            )
+            async with httpx.AsyncClient() as client:
+                resp = await client.post(
+                    url,
+                    headers=headers,
+                    content=audio,
+                    timeout=self._request_timeout_seconds,
+                )
+            if resp.status_code >= 400:
+                raise ASRTranscriptionError(
+                    f"Tencent ASR flash HTTP {resp.status_code}: {resp.text[:200]}"
+                )
+            payload = resp.json()
        except ASRTranscriptionError:
            raise
+        except httpx.HTTPError as e:
+            logger.error("Tencent ASR flash HTTP failed: {}", e, exc_info=True)
+            raise ASRTranscriptionError(f"Tencent ASR flash HTTP failed: {e!s}") from e
+        except json.JSONDecodeError as e:
+            logger.error("Tencent ASR flash invalid JSON: {}", e, exc_info=True)
+            raise ASRTranscriptionError("Tencent ASR flash returned invalid JSON") from e
        except Exception as e:
-            logger.error("Tencent ASR transcribe failed: {}", e, exc_info=True)
-            raise ASRTranscriptionError(f"Tencent ASR transcribe failed: {e!s}") from e
+            logger.error("Tencent ASR flash transcribe failed: {}", e, exc_info=True)
+            raise ASRTranscriptionError(f"Tencent ASR flash transcribe failed: {e!s}") from e
+
+        return self._parse_flash_response(payload, audio_len=len(audio), voice_format=voice_format)
+
+    def _parse_flash_response(
+        self, payload: dict[str, Any], *, audio_len: int, voice_format: str
+    ) -> str:
+        code = payload.get("code")
+        if code != 0:
+            message = payload.get("message") or "unknown error"
+            request_id = payload.get("request_id", "")
+            logger.warning(
+                "Tencent ASR flash error code={} message={} request_id={} audio_len={} format={}",
+                code,
+                message,
+                request_id,
+                audio_len,
+                voice_format,
+            )
+            if code == 4004:
+                raise ASRTranscriptionError(
+                    "Tencent ASR flash resource pack exhausted (purchase 录音文件识别极速版)"
+                )
+            if code == 4003:
+                raise ASRTranscriptionError(
+                    "Tencent ASR flash service not enabled in console"
+                )
+            raise ASRTranscriptionError(
+                f"Tencent ASR flash failed (code={code}): {message}"
+            )
+
+        flash_result = payload.get("flash_result") or []
+        texts: list[str] = []
+        for channel in flash_result:
+            if not isinstance(channel, dict):
+                continue
+            text = (channel.get("text") or "").strip()
+            if text:
+                texts.append(text)
+        combined = "".join(texts)
+        if combined:
+            logger.debug(
+                "Tencent ASR flash ok request_id={} audio_len={} audio_duration_ms={} "
+                "voice_format={} chars={}",
+                payload.get("request_id"),
+                audio_len,
+                payload.get("audio_duration"),
+                voice_format,
+                len(combined),
+            )
+            return combined
+
+        logger.warning(
+            "Tencent ASR flash empty flash_result, audio_len={} format={} request_id={}",
+            audio_len,
+            voice_format,
+            payload.get("request_id"),
+        )
+        raise ASRTranscriptionError(
+            "Tencent ASR flash empty result (check sample rate / format / audio)"
+        )