feat(api): use Tencent ASR flash with 16k_zh_large and dev transcript logs
Replace CreateRecTask polling with recording-file flash API, add TENCENT_APP_ID, remove server-side pydub slicing, and log ASR recognition text at INFO in development. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -1,7 +1,13 @@
|
||||
"""Tencent Cloud ASR adapter — implements ASRProvider port."""
|
||||
"""Tencent Cloud ASR adapter — 录音文件识别极速版 (Flash), implements ASRProvider port."""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
from app.core.business_telemetry import business_span
|
||||
from app.core.logging import get_logger
|
||||
@@ -9,82 +15,198 @@ from app.ports.asr import ASRTranscriptionError
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
_FLASH_HOST = "asr.cloud.tencent.com"
|
||||
_FLASH_PATH_PREFIX = "/asr/flash/v1/"
|
||||
# 极速版本地上传上限(腾讯文档:≤100MB)
|
||||
_MAX_FLASH_AUDIO_BYTES = 100 * 1024 * 1024
|
||||
|
||||
|
||||
def _format_flash_sign_string(sorted_params: list[tuple[str, Any]]) -> str:
|
||||
"""与官方 flash_recognizer._format_sign_string 一致。"""
|
||||
signstr = f"POST{_FLASH_HOST}{_FLASH_PATH_PREFIX}"
|
||||
for key, value in sorted_params:
|
||||
if key == "appid":
|
||||
signstr += str(value)
|
||||
break
|
||||
signstr += "?"
|
||||
for key, value in sorted_params:
|
||||
if key == "appid":
|
||||
continue
|
||||
signstr += f"{key}={value}&"
|
||||
return signstr[:-1]
|
||||
|
||||
|
||||
def _build_flash_url_and_headers(
    secret_key: str, params: dict[str, Any]
) -> tuple[str, dict[str, str]]:
    """Sign the request parameters and derive the request URL and headers.

    Args:
        secret_key: Key used for the HMAC-SHA1 signature.
        params: Query parameters; they are sorted by key before signing.

    Returns:
        ``(url, headers)`` where ``url`` is the signed string with its leading
        ``POST`` verb replaced by ``https://`` and ``headers`` carries the
        ``Host`` and the base64-encoded signature as ``Authorization``.
    """
    ordered = sorted(params.items(), key=lambda item: item[0])
    sign_source = _format_flash_sign_string(ordered)

    digest = hmac.new(
        secret_key.encode("utf-8"),
        sign_source.encode("utf-8"),
        hashlib.sha1,
    ).digest()
    authorization = base64.b64encode(digest).decode("utf-8")

    # The sign string starts with the HTTP verb; everything after it is the
    # host + path + query that the request is actually sent to.
    request_url = "https://" + sign_source[len("POST"):]
    return request_url, {"Host": _FLASH_HOST, "Authorization": authorization}
|
||||
|
||||
|
||||
def _build_flash_query_params(
|
||||
*,
|
||||
app_id: str,
|
||||
secret_id: str,
|
||||
engine_type: str,
|
||||
voice_format: str,
|
||||
) -> dict[str, Any]:
|
||||
return {
|
||||
"appid": app_id,
|
||||
"secretid": secret_id,
|
||||
"timestamp": str(int(time.time())),
|
||||
"engine_type": engine_type,
|
||||
"voice_format": voice_format,
|
||||
"speaker_diarization": 0,
|
||||
"customization_id": "",
|
||||
"filter_dirty": 0,
|
||||
"filter_modal": 0,
|
||||
"filter_punc": 0,
|
||||
"convert_num_mode": 1,
|
||||
"word_info": 0,
|
||||
"first_channel_only": 1,
|
||||
"reinforce_hotword": 0,
|
||||
"sentence_max_length": 0,
|
||||
}
|
||||
|
||||
|
||||
class TencentASRProvider:
    """ASR provider backed by Tencent Cloud's flash recording-file recognition.

    Each transcription uploads the raw audio in one signed HTTPS POST and
    reads the transcript from the JSON response.
    """

    def __init__(
        self,
        secret_id: str,
        secret_key: str,
        app_id: str,
        *,
        engine_type: str = "16k_zh_large",
        request_timeout_seconds: float = 60.0,
    ):
        """Store credentials and request settings; performs no I/O.

        Args:
            secret_id: Tencent Cloud secret id.
            secret_key: Tencent Cloud secret key used to sign requests.
            app_id: Tencent Cloud app id; ``None`` and surrounding whitespace
                are tolerated and normalised to a stripped string.
            engine_type: Recognition engine sent as ``engine_type``.
            request_timeout_seconds: Timeout applied to the HTTP upload.
        """
        self._secret_id = secret_id
        self._secret_key = secret_key
        self._app_id = (app_id or "").strip()
        self._engine_type = engine_type
        self._request_timeout_seconds = request_timeout_seconds

    def ensure_ready(self) -> bool:
        """Return True when secret id, secret key and app id are all non-empty."""
        return bool(self._secret_id and self._secret_key and self._app_id)

    async def transcribe(self, audio: bytes, format: str = "m4a") -> str:
        """Transcribe ``audio`` inside an ``asr.transcribe`` business span.

        Args:
            audio: Raw audio bytes to upload.
            format: Container/codec name sent as ``voice_format``.

        Returns:
            The recognised text.

        Raises:
            ASRTranscriptionError: On any configuration, transport or
                recognition failure.
        """
        with business_span("asr.transcribe", provider="tencent_flash"):
            return await self._transcribe_inner(audio, format)

    async def _transcribe_inner(self, audio: bytes, format: str) -> str:
        """Validate input, POST the audio to the flash endpoint, parse the reply.

        Raises:
            ASRTranscriptionError: If credentials are missing, the audio
                exceeds the upload limit, the HTTP call fails or returns a
                status >= 400, the body is not valid JSON, or the response
                reports an error / empty result.
        """
        if not self.ensure_ready():
            raise ASRTranscriptionError(
                "Tencent ASR flash not configured (need TENCENT_APP_ID, SECRET_ID, SECRET_KEY)"
            )
        if len(audio) > _MAX_FLASH_AUDIO_BYTES:
            raise ASRTranscriptionError(
                f"Audio exceeds {_MAX_FLASH_AUDIO_BYTES // (1024 * 1024)}MB flash upload limit"
            )

        voice_format = (format or "m4a").lower()
        params = _build_flash_query_params(
            app_id=self._app_id,
            secret_id=self._secret_id,
            engine_type=self._engine_type,
            voice_format=voice_format,
        )
        url, headers = _build_flash_url_and_headers(self._secret_key, params)
        headers["Content-Type"] = "application/octet-stream"
        headers["Content-Length"] = str(len(audio))

        try:
            async with httpx.AsyncClient() as client:
                resp = await client.post(
                    url,
                    headers=headers,
                    content=audio,
                    timeout=self._request_timeout_seconds,
                )
            if resp.status_code >= 400:
                raise ASRTranscriptionError(
                    f"Tencent ASR flash HTTP {resp.status_code}: {resp.text[:200]}"
                )
            payload = resp.json()
        except ASRTranscriptionError:
            # Already in the shape callers expect; do not re-wrap.
            raise
        except httpx.HTTPError as e:
            logger.error("Tencent ASR flash HTTP failed: {}", e, exc_info=True)
            raise ASRTranscriptionError(f"Tencent ASR flash HTTP failed: {e!s}") from e
        except json.JSONDecodeError as e:
            logger.error("Tencent ASR flash invalid JSON: {}", e, exc_info=True)
            raise ASRTranscriptionError("Tencent ASR flash returned invalid JSON") from e
        except Exception as e:
            logger.error("Tencent ASR flash transcribe failed: {}", e, exc_info=True)
            raise ASRTranscriptionError(f"Tencent ASR flash transcribe failed: {e!s}") from e

        return self._parse_flash_response(payload, audio_len=len(audio), voice_format=voice_format)

    def _parse_flash_response(
        self, payload: dict[str, Any], *, audio_len: int, voice_format: str
    ) -> str:
        """Extract the transcript from a decoded flash response.

        Args:
            payload: Decoded JSON body of the response.
            audio_len: Size of the uploaded audio in bytes (logging only).
            voice_format: Format sent with the request (logging only).

        Returns:
            The non-empty ``text`` values of all ``flash_result`` entries,
            concatenated without a separator.

        Raises:
            ASRTranscriptionError: If the payload is not a JSON object, its
                ``code`` is not 0, or no entry carries any text.
        """
        if not isinstance(payload, dict):
            # Valid JSON that is not an object would otherwise surface as an
            # AttributeError, which callers catching ASRTranscriptionError miss.
            logger.warning(
                "Tencent ASR flash unexpected payload type={} audio_len={} format={}",
                type(payload).__name__,
                audio_len,
                voice_format,
            )
            raise ASRTranscriptionError("Tencent ASR flash returned unexpected JSON payload")

        code = payload.get("code")
        if code != 0:
            message = payload.get("message") or "unknown error"
            request_id = payload.get("request_id", "")
            logger.warning(
                "Tencent ASR flash error code={} message={} request_id={} audio_len={} format={}",
                code,
                message,
                request_id,
                audio_len,
                voice_format,
            )
            # Two codes get dedicated, actionable messages; the rest are generic.
            if code == 4004:
                raise ASRTranscriptionError(
                    "Tencent ASR flash resource pack exhausted (purchase 录音文件识别极速版)"
                )
            if code == 4003:
                raise ASRTranscriptionError(
                    "Tencent ASR flash service not enabled in console"
                )
            raise ASRTranscriptionError(
                f"Tencent ASR flash failed (code={code}): {message}"
            )

        flash_result = payload.get("flash_result") or []
        texts: list[str] = []
        for channel in flash_result:
            # Skip malformed entries instead of failing the whole response.
            if not isinstance(channel, dict):
                continue
            text = (channel.get("text") or "").strip()
            if text:
                texts.append(text)

        combined = "".join(texts)
        if combined:
            logger.debug(
                "Tencent ASR flash ok request_id={} audio_len={} audio_duration_ms={} "
                "voice_format={} chars={}",
                payload.get("request_id"),
                audio_len,
                payload.get("audio_duration"),
                voice_format,
                len(combined),
            )
            return combined

        logger.warning(
            "Tencent ASR flash empty flash_result, audio_len={} format={} request_id={}",
            audio_len,
            voice_format,
            payload.get("request_id"),
        )
        raise ASRTranscriptionError(
            "Tencent ASR flash empty result (check sample rate / format / audio)"
        )
|
||||
|
||||
@@ -87,6 +87,32 @@ def log_agent_summary(
|
||||
logger.info(message, *args)
|
||||
|
||||
|
||||
def asr_transcript_log_enabled() -> bool:
    """Return whether full ASR transcripts should be logged at INFO level.

    Enabled when the configured app environment is ``development``
    (case-insensitive, surrounding whitespace ignored); otherwise the decision
    is delegated to ``agent_verbose_enabled()`` (presumably the global
    DEBUG/TRACE switch - confirm against its definition).
    """
    environment = (settings.app_environment or "").strip().lower()
    return environment == "development" or agent_verbose_enabled()
|
||||
|
||||
|
||||
def log_asr_transcript_result(
    logger: Any,
    *,
    text: str,
    **context: Any,
) -> None:
    """Log an ASR transcript at INFO when ``asr_transcript_log_enabled()`` is true.

    Args:
        logger: Logger whose ``info`` method receives the message.
        text: Recognised text; it is passed through ``truncate_for_log``
            before being logged.
        **context: Extra ``key=value`` fields for the log line. Entries whose
            value is ``None`` or an empty string are omitted.
    """
    if not asr_transcript_log_enabled():
        return

    rendered_context = " ".join(
        f"{name}={value}"
        for name, value in context.items()
        if value is not None and value != ""
    )
    snippet = truncate_for_log(text)

    if not rendered_context:
        logger.info("ASR 识别结果 text={}", snippet)
        return
    logger.info("ASR 识别结果 {} text={}", rendered_context, snippet)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def agent_span(
|
||||
logger: Any,
|
||||
|
||||
@@ -205,6 +205,7 @@ class AsrConfig(BaseModel):
|
||||
|
||||
provider: Literal["tencent"] = "tencent"
|
||||
engine_type: Literal["16k_zh_large"] = "16k_zh_large"
|
||||
request_timeout_seconds: float = 60.0
|
||||
|
||||
|
||||
class TtsConfig(BaseModel):
|
||||
|
||||
@@ -42,6 +42,7 @@ class Settings(BaseSettings):
|
||||
|
||||
tencent_secret_id: str = ""
|
||||
tencent_secret_key: str = ""
|
||||
tencent_app_id: str = ""
|
||||
|
||||
wechat_pay_api_v3_key: str = ""
|
||||
wechat_pay_private_key: str = ""
|
||||
|
||||
@@ -107,7 +107,9 @@ def get_asr_provider() -> ASRProvider:
|
||||
return TencentASRProvider(
|
||||
secret_id=settings.tencent_secret_id,
|
||||
secret_key=settings.tencent_secret_key,
|
||||
app_id=settings.tencent_app_id,
|
||||
engine_type=asr_defaults.engine_type,
|
||||
request_timeout_seconds=asr_defaults.request_timeout_seconds,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import io
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
@@ -19,7 +18,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.agents.chat import ChatOrchestrator
|
||||
from app.agents.chat.reply_limits import segments_from_llm_response
|
||||
from app.core.agent_logging import agent_summary_enabled
|
||||
from app.core.agent_logging import agent_summary_enabled, log_asr_transcript_result
|
||||
from app.core.business_telemetry import business_span
|
||||
from app.core.config import settings
|
||||
from app.core.cos_url_keys import (
|
||||
@@ -617,64 +616,6 @@ async def _delayed_listening_feedback(
|
||||
await _send_segment_transition_feedback(conversation_id, 0)
|
||||
|
||||
|
||||
# ── 长音频切片转写 ────────────────────────────────────────────
|
||||
|
||||
MAX_ASR_CHUNK_MS = 55_000
|
||||
|
||||
|
||||
def _split_audio_bytes(audio_bytes: bytes, fmt: str) -> list[bytes]:
    """Split long audio into WAV slices of at most ``MAX_ASR_CHUNK_MS`` each.

    Audio no longer than the limit is returned untouched as a single-element
    list. Longer audio is converted to 16 kHz, mono, 2-byte samples and each
    slice is exported as WAV (the original note says this keeps slices within
    Tencent ASR's 3 MB limit - not verifiable from this code).

    Args:
        audio_bytes: Encoded audio in the container named by ``fmt``.
        fmt: Format name handed to pydub for decoding.

    Returns:
        The original bytes in a one-item list, or the list of WAV slices.
    """
    from pydub import AudioSegment as PydubSegment

    decoded = PydubSegment.from_file(io.BytesIO(audio_bytes), format=fmt)
    total_ms = len(decoded)
    if total_ms <= MAX_ASR_CHUNK_MS:
        return [audio_bytes]

    normalized = decoded.set_frame_rate(16000).set_channels(1).set_sample_width(2)

    def _export_wav(segment: Any) -> bytes:
        # Serialise one slice into an in-memory WAV file.
        sink = io.BytesIO()
        segment.export(sink, format="wav")
        return sink.getvalue()

    return [
        _export_wav(normalized[offset : offset + MAX_ASR_CHUNK_MS])
        for offset in range(0, total_ms, MAX_ASR_CHUNK_MS)
    ]
|
||||
|
||||
|
||||
async def _transcribe_long_audio(audio_bytes: bytes, fmt: str = "m4a") -> str:
    """Transcribe audio of any length using the configured ASR provider.

    Resolves the provider via ``get_asr_provider()`` and delegates to
    ``_transcribe_long_audio_inner``, which slices long recordings before
    transcription.
    """
    provider = get_asr_provider()
    transcript = await _transcribe_long_audio_inner(audio_bytes, fmt, provider)
    return transcript
|
||||
|
||||
|
||||
async def _transcribe_long_audio_inner(
    audio_bytes: bytes, fmt: str, asr: Any
) -> str:
    """Slice the audio if needed, transcribe the slices concurrently, and join.

    Slicing runs in a worker thread. If slicing fails, or yields at most one
    slice, the original bytes are transcribed directly in their original
    format. Otherwise every slice is transcribed as WAV; slices that raise,
    return an empty value, or are flagged by ``_is_transcribe_failure`` are
    dropped, and the remaining texts are concatenated in slice order.

    Args:
        audio_bytes: Encoded audio to transcribe.
        fmt: Format name of ``audio_bytes``.
        asr: Provider exposing an awaitable ``transcribe(audio, format=...)``.

    Returns:
        The joined transcript; empty when no slice produced usable text.
    """
    try:
        pieces = await asyncio.to_thread(_split_audio_bytes, audio_bytes, fmt)
    except Exception as exc:
        logger.warning("pydub 切片失败 ({}), 回退到直接转写", exc)
        return await asr.transcribe(audio_bytes, format=fmt)

    if len(pieces) <= 1:
        return await asr.transcribe(audio_bytes, format=fmt)

    logger.info("长音频切片: {} 段", len(pieces))
    pending = [asr.transcribe(piece, format="wav") for piece in pieces]
    # Collect exceptions as values so one bad slice does not discard the rest.
    outcomes = await asyncio.gather(*pending, return_exceptions=True)

    usable: list[str] = []
    for index, outcome in enumerate(outcomes):
        if isinstance(outcome, BaseException):
            logger.warning("切片 {} 转写异常: {}", index, outcome)
        elif outcome and not _is_transcribe_failure(outcome):
            usable.append(outcome)
    return "".join(usable)
|
||||
|
||||
|
||||
# ── 分段语音异步处理 ────────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -761,7 +702,19 @@ async def process_audio_segment(
|
||||
segment_index,
|
||||
)
|
||||
try:
|
||||
transcript_text = await _transcribe_long_audio(audio_bytes, fmt="m4a")
|
||||
asr = get_asr_provider()
|
||||
transcript_text = await asr.transcribe(audio_bytes, format="m4a")
|
||||
if transcript_text:
|
||||
log_asr_transcript_result(
|
||||
logger,
|
||||
text=transcript_text,
|
||||
conversation_id=conversation_id,
|
||||
voice_session_id=voice_session_id,
|
||||
segment_index=segment_index,
|
||||
duration_s=audio_duration,
|
||||
audio_len=len(audio_bytes),
|
||||
source="audio_segment",
|
||||
)
|
||||
except ASRTranscriptionError as e:
|
||||
logger.warning(
|
||||
"ASR 转写失败 segment_index={} conversation_id={}: {}",
|
||||
|
||||
@@ -8,8 +8,8 @@
|
||||
## 消息类型 (client → server)
|
||||
|
||||
- `TEXT`:文本消息。`data.text` 必填。可选 `data.tts_this_turn`(布尔):为 `true` 且服务端 `ENABLE_TTS` 开启且本轮回避 `skip_tts` 时,对该轮助手回复分段合成 TTS;默认为 `false`/缺省即不合成。**当开启本轮 TTS 时,每个助手分段服务端先推送 `tts_audio` 再推送该段 `agent_response`**,便于客户端先收音频再展示同段文字。
|
||||
- `AUDIO_SEGMENT`:语音分段。`data` 含 `audio_base64`、`segment_index`、`voice_session_id` / `client_segment_id`、`is_last`、`duration`。可选同上 `tts_this_turn`。
|
||||
- `AUDIO_MESSAGE`:整段音频(单次 ASR + 对话)。同上可选 `tts_this_turn`。
|
||||
- `AUDIO_SEGMENT`:语音分段(客户端约 15s 一段)。`data` 含 `audio_base64`、`segment_index`、`voice_session_id` / `client_segment_id`、`is_last`、`duration`。可选同上 `tts_this_turn`。服务端对每段调用录音文件识别极速版(`16k_zh_large`,HTTPS 同步返回)。
|
||||
- `AUDIO_MESSAGE`:整段音频(单次 ASR + 对话)。同上可选 `tts_this_turn`。单段建议 ≤100MB(极速版上限)。
|
||||
- `TRANSCRIBE_ONLY`:仅转写不回复
|
||||
- `TTS_CANCEL`:取消当前轮未完成的分段合成与下发
|
||||
- `TTS_REQUEST`:用户点击某一助手气泡「朗读」且该段尚无 TTS 时下发。`data` 含 `assistant_message_id`(落库 `conversation_messages.id`)、`segment_index`(与该条助手正文按 `[SPLIT]` 分段后的从 0 下标)、可选 `segment_text`(须与该分段正文一致,用于校验)。服务端若该段已有 URL 则只做预签名后推送 `tts_audio`(`data.manual=true`),**不重复合成**。
|
||||
|
||||
@@ -12,6 +12,7 @@ from starlette.websockets import WebSocketState
|
||||
|
||||
from app.agents.chat.background_voice import infer_background_voice
|
||||
from app.agents.chat.prompts_profile import format_user_profile_context
|
||||
from app.core.agent_logging import log_asr_transcript_result
|
||||
from app.core.config import settings
|
||||
from app.core.db import AsyncSessionLocal
|
||||
from app.core.dependencies import get_asr_provider
|
||||
@@ -596,15 +597,12 @@ async def websocket_endpoint(
|
||||
asr = get_asr_provider()
|
||||
audio_bytes = base64.b64decode(audio_base64)
|
||||
asr_text = await asr.transcribe(audio_bytes, "m4a")
|
||||
logger.debug(
|
||||
"ASR 转写完成: conversation_id={} chars={}",
|
||||
conversation_id,
|
||||
len(asr_text or ""),
|
||||
)
|
||||
logger.debug(
|
||||
"ASR 转写全文: conversation_id={} text={}",
|
||||
conversation_id,
|
||||
asr_text,
|
||||
log_asr_transcript_result(
|
||||
logger,
|
||||
text=asr_text or "",
|
||||
conversation_id=conversation_id,
|
||||
duration_s=audio_duration,
|
||||
source="audio_message",
|
||||
)
|
||||
|
||||
await manager.send_message(
|
||||
@@ -692,6 +690,12 @@ async def websocket_endpoint(
|
||||
asr = get_asr_provider()
|
||||
audio_bytes = base64.b64decode(audio_base64)
|
||||
asr_text = await asr.transcribe(audio_bytes, "m4a")
|
||||
log_asr_transcript_result(
|
||||
logger,
|
||||
text=asr_text or "",
|
||||
conversation_id=conversation_id,
|
||||
source="transcribe_only",
|
||||
)
|
||||
await manager.send_message(
|
||||
conversation_id,
|
||||
{
|
||||
|
||||
@@ -86,7 +86,7 @@ async def lifespan(app: FastAPI):
|
||||
asr_ready = True
|
||||
if asr_ready:
|
||||
logger.info(
|
||||
"ASR 服务已就绪(腾讯云一句话识别,引擎 {})",
|
||||
"ASR 服务已就绪(腾讯云录音文件识别极速版,引擎 {})",
|
||||
asr_defaults.engine_type,
|
||||
)
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user