feat(api): use Tencent ASR flash with 16k_zh_large and dev transcript logs

Replace CreateRecTask polling with recording-file flash API, add TENCENT_APP_ID,
remove server-side pydub slicing, and log ASR recognition text at INFO in development.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Kevin
2026-05-25 11:28:22 +08:00
parent 22d282dc01
commit 07979bfb09
22 changed files with 354 additions and 185 deletions

View File

@@ -2,7 +2,6 @@
import asyncio
import base64
import io
import time
import uuid
from dataclasses import dataclass, field
@@ -19,7 +18,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat import ChatOrchestrator
from app.agents.chat.reply_limits import segments_from_llm_response
from app.core.agent_logging import agent_summary_enabled
from app.core.agent_logging import agent_summary_enabled, log_asr_transcript_result
from app.core.business_telemetry import business_span
from app.core.config import settings
from app.core.cos_url_keys import (
@@ -617,64 +616,6 @@ async def _delayed_listening_feedback(
await _send_segment_transition_feedback(conversation_id, 0)
# ── 长音频切片转写 ────────────────────────────────────────────
# Upper bound, in milliseconds, for a single slice handed to the ASR provider.
MAX_ASR_CHUNK_MS = 55_000


def _split_audio_bytes(audio_bytes: bytes, fmt: str) -> list[bytes]:
    """Split long audio into slices of at most ``MAX_ASR_CHUNK_MS`` using pydub.

    Audio whose decoded length does not exceed the limit is returned
    untouched as a single-element list holding the original bytes.
    Longer audio is converted to 16 kHz, mono, 2-byte samples and each
    slice is exported as WAV (the original note says this keeps every
    slice within Tencent ASR's 3 MB limit).

    Args:
        audio_bytes: Encoded audio payload.
        fmt: Container/codec name passed to pydub as ``format``.

    Returns:
        The list of encoded slices, in playback order.
    """
    from pydub import AudioSegment as PydubSegment

    decoded = PydubSegment.from_file(io.BytesIO(audio_bytes), format=fmt)
    total_ms = len(decoded)
    if total_ms <= MAX_ASR_CHUNK_MS:
        return [audio_bytes]

    normalized = decoded.set_frame_rate(16000)
    normalized = normalized.set_channels(1)
    normalized = normalized.set_sample_width(2)

    def _to_wav(piece) -> bytes:
        # Serialize one slice into an in-memory WAV file.
        sink = io.BytesIO()
        piece.export(sink, format="wav")
        return sink.getvalue()

    return [
        _to_wav(normalized[offset : offset + MAX_ASR_CHUNK_MS])
        for offset in range(0, total_ms, MAX_ASR_CHUNK_MS)
    ]
async def _transcribe_long_audio(audio_bytes: bytes, fmt: str = "m4a") -> str:
    """Transcribe audio, slicing it first when it is longer than 55 s.

    Fetches the ASR provider via ``get_asr_provider()`` and delegates the
    actual work to ``_transcribe_long_audio_inner``.

    Args:
        audio_bytes: Encoded audio payload.
        fmt: Format name of ``audio_bytes``; defaults to ``"m4a"``.

    Returns:
        The transcript text produced by the inner routine.
    """
    provider = get_asr_provider()
    transcript = await _transcribe_long_audio_inner(audio_bytes, fmt, provider)
    return transcript
async def _transcribe_long_audio_inner(
    audio_bytes: bytes, fmt: str, asr: Any
) -> str:
    """Slice the audio off-thread, transcribe the slices concurrently, and join.

    Falls back to a single direct ``asr.transcribe`` call on the original
    bytes when slicing raises or yields at most one slice. Otherwise every
    slice is transcribed as WAV; slices that raised, came back empty, or
    are flagged by ``_is_transcribe_failure`` are left out of the result.

    Args:
        audio_bytes: Encoded audio payload.
        fmt: Format name of ``audio_bytes``.
        asr: Object exposing an awaitable ``transcribe(data, format=...)``.

    Returns:
        The usable slice transcripts concatenated in order (possibly empty),
        or the direct transcription result on the fallback paths.
    """
    try:
        pieces = await asyncio.to_thread(_split_audio_bytes, audio_bytes, fmt)
    except Exception as split_error:
        # Best effort: a slicing failure must not prevent transcription.
        logger.warning("pydub 切片失败 ({}), 回退到直接转写", split_error)
        return await asr.transcribe(audio_bytes, format=fmt)

    piece_count = len(pieces)
    if piece_count < 2:
        return await asr.transcribe(audio_bytes, format=fmt)

    logger.info("长音频切片: {}", piece_count)
    pending = [asr.transcribe(piece, format="wav") for piece in pieces]
    # return_exceptions=True lets one failed slice be skipped without
    # aborting the remaining ones.
    outcomes = await asyncio.gather(*pending, return_exceptions=True)

    collected: list[str] = []
    for position, outcome in enumerate(outcomes):
        if isinstance(outcome, BaseException):
            logger.warning("切片 {} 转写异常: {}", position, outcome)
        elif outcome and not _is_transcribe_failure(outcome):
            collected.append(outcome)
    return "".join(collected)
# ── 分段语音异步处理 ────────────────────────────────────────────
@@ -761,7 +702,19 @@ async def process_audio_segment(
segment_index,
)
try:
transcript_text = await _transcribe_long_audio(audio_bytes, fmt="m4a")
asr = get_asr_provider()
transcript_text = await asr.transcribe(audio_bytes, format="m4a")
if transcript_text:
log_asr_transcript_result(
logger,
text=transcript_text,
conversation_id=conversation_id,
voice_session_id=voice_session_id,
segment_index=segment_index,
duration_s=audio_duration,
audio_len=len(audio_bytes),
source="audio_segment",
)
except ASRTranscriptionError as e:
logger.warning(
"ASR 转写失败 segment_index={} conversation_id={}: {}",