feat(api): use Tencent ASR flash with 16k_zh_large and dev transcript logs

Replace CreateRecTask polling with the recording-file flash recognition API, add TENCENT_APP_ID, remove server-side pydub slicing, and log ASR recognition text at INFO in development.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-25 11:28:22 +08:00
parent 22d282dc01
commit 07979bfb09
22 changed files with 354 additions and 185 deletions
--- a/api/app/features/conversation/ws/pipeline.py
+++ b/api/app/features/conversation/ws/pipeline.py
@@ -2,7 +2,6 @@

 import asyncio
 import base64
-import io
 import time
 import uuid
 from dataclasses import dataclass, field
@@ -19,7 +18,7 @@ from sqlalchemy.ext.asyncio import AsyncSession

 from app.agents.chat import ChatOrchestrator
 from app.agents.chat.reply_limits import segments_from_llm_response
-from app.core.agent_logging import agent_summary_enabled
+from app.core.agent_logging import agent_summary_enabled, log_asr_transcript_result
 from app.core.business_telemetry import business_span
 from app.core.config import settings
 from app.core.cos_url_keys import (
@@ -617,64 +616,6 @@ async def _delayed_listening_feedback(
    await _send_segment_transition_feedback(conversation_id, 0)


-# ── 长音频切片转写 ────────────────────────────────────────────
-
-MAX_ASR_CHUNK_MS = 55_000
-
-
-def _split_audio_bytes(audio_bytes: bytes, fmt: str) -> list[bytes]:
-    """用 pydub 将长音频按 ≤55 s 切片，每片导出为 16 kHz mono WAV（腾讯 ASR 3 MB 限制内）。"""
-    from pydub import AudioSegment as PydubSegment
-
-    audio = PydubSegment.from_file(io.BytesIO(audio_bytes), format=fmt)
-    duration_ms = len(audio)
-
-    if duration_ms <= MAX_ASR_CHUNK_MS:
-        return [audio_bytes]
-
-    mono_16k = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
-    chunks: list[bytes] = []
-    for start in range(0, duration_ms, MAX_ASR_CHUNK_MS):
-        chunk = mono_16k[start : start + MAX_ASR_CHUNK_MS]
-        buf = io.BytesIO()
-        chunk.export(buf, format="wav")
-        chunks.append(buf.getvalue())
-    return chunks
-
-
-async def _transcribe_long_audio(audio_bytes: bytes, fmt: str = "m4a") -> str:
-    """超过 55 s 的音频自动切片后并行 ASR；短音频直接转写。"""
-    asr = get_asr_provider()
-    return await _transcribe_long_audio_inner(audio_bytes, fmt, asr)
-
-
-async def _transcribe_long_audio_inner(
-    audio_bytes: bytes, fmt: str, asr: Any
-) -> str:
-    try:
-        chunks = await asyncio.to_thread(_split_audio_bytes, audio_bytes, fmt)
-    except Exception as exc:
-        logger.warning("pydub 切片失败 ({}), 回退到直接转写", exc)
-        return await asr.transcribe(audio_bytes, format=fmt)
-
-    if len(chunks) <= 1:
-        return await asr.transcribe(audio_bytes, format=fmt)
-
-    logger.info("长音频切片: {} 段", len(chunks))
-    results = await asyncio.gather(
-        *[asr.transcribe(c, format="wav") for c in chunks],
-        return_exceptions=True,
-    )
-    texts: list[str] = []
-    for i, r in enumerate(results):
-        if isinstance(r, BaseException):
-            logger.warning("切片 {} 转写异常: {}", i, r)
-            continue
-        if r and not _is_transcribe_failure(r):
-            texts.append(r)
-    return "".join(texts)
-
-
 # ── 分段语音异步处理 ────────────────────────────────────────────


@@ -761,7 +702,19 @@ async def process_audio_segment(
                    segment_index,
                )
            try:
-                transcript_text = await _transcribe_long_audio(audio_bytes, fmt="m4a")
+                asr = get_asr_provider()
+                transcript_text = await asr.transcribe(audio_bytes, format="m4a")
+                if transcript_text:
+                    log_asr_transcript_result(
+                        logger,
+                        text=transcript_text,
+                        conversation_id=conversation_id,
+                        voice_session_id=voice_session_id,
+                        segment_index=segment_index,
+                        duration_s=audio_duration,
+                        audio_len=len(audio_bytes),
+                        source="audio_segment",
+                    )
            except ASRTranscriptionError as e:
                logger.warning(
                    "ASR 转写失败 segment_index={} conversation_id={}: {}",
--- a/api/app/features/conversation/ws/protocol.md
+++ b/api/app/features/conversation/ws/protocol.md
@@ -8,8 +8,8 @@
 ## 消息类型 (client → server)

 - `TEXT`：文本消息。`data.text` 必填。可选 `data.tts_this_turn`（布尔）：为 `true` 且服务端 `ENABLE_TTS` 开启且本轮回避 `skip_tts` 时，对该轮助手回复分段合成 TTS；默认为 `false`/缺省即不合成。**当开启本轮 TTS 时，每个助手分段服务端先推送 `tts_audio` 再推送该段 `agent_response`**，便于客户端先收音频再展示同段文字。
- `AUDIO_SEGMENT`：语音分段。`data` 含 `audio_base64`、`segment_index`、`voice_session_id` / `client_segment_id`、`is_last`、`duration`。可选同上 `tts_this_turn`。
- `AUDIO_MESSAGE`：整段音频（单次 ASR + 对话）。同上可选 `tts_this_turn`。
+- `AUDIO_SEGMENT`：语音分段（客户端约 15s 一段）。`data` 含 `audio_base64`、`segment_index`、`voice_session_id` / `client_segment_id`、`is_last`、`duration`。可选同上 `tts_this_turn`。服务端对每段调用录音文件识别极速版（`16k_zh_large`，HTTPS 同步返回）。
+- `AUDIO_MESSAGE`：整段音频（单次 ASR + 对话）。同上可选 `tts_this_turn`。单段建议 ≤100MB（极速版上限）。
 - `TRANSCRIBE_ONLY`：仅转写不回复
 - `TTS_CANCEL`：取消当前轮未完成的分段合成与下发
 - `TTS_REQUEST`：用户点击某一助手气泡「朗读」且该段尚无 TTS 时下发。`data` 含 `assistant_message_id`（落库 `conversation_messages.id`）、`segment_index`（与该条助手正文按 `[SPLIT]` 分段后的从 0 下标）、可选 `segment_text`（须与该分段正文一致，用于校验）。服务端若该段已有 URL 则只做预签名后推送 `tts_audio`（`data.manual=true`），**不重复合成**。
--- a/api/app/features/conversation/ws/router.py
+++ b/api/app/features/conversation/ws/router.py
@@ -12,6 +12,7 @@ from starlette.websockets import WebSocketState

 from app.agents.chat.background_voice import infer_background_voice
 from app.agents.chat.prompts_profile import format_user_profile_context
+from app.core.agent_logging import log_asr_transcript_result
 from app.core.config import settings
 from app.core.db import AsyncSessionLocal
 from app.core.dependencies import get_asr_provider
@@ -596,15 +597,12 @@ async def websocket_endpoint(
                                asr = get_asr_provider()
                                audio_bytes = base64.b64decode(audio_base64)
                                asr_text = await asr.transcribe(audio_bytes, "m4a")
-                                logger.debug(
-                                    "ASR 转写完成: conversation_id={} chars={}",
-                                    conversation_id,
-                                    len(asr_text or ""),
-                                )
-                                logger.debug(
-                                    "ASR 转写全文: conversation_id={} text={}",
-                                    conversation_id,
-                                    asr_text,
+                                log_asr_transcript_result(
+                                    logger,
+                                    text=asr_text or "",
+                                    conversation_id=conversation_id,
+                                    duration_s=audio_duration,
+                                    source="audio_message",
                                )

                                await manager.send_message(
@@ -692,6 +690,12 @@ async def websocket_endpoint(
                            asr = get_asr_provider()
                            audio_bytes = base64.b64decode(audio_base64)
                            asr_text = await asr.transcribe(audio_bytes, "m4a")
+                            log_asr_transcript_result(
+                                logger,
+                                text=asr_text or "",
+                                conversation_id=conversation_id,
+                                source="transcribe_only",
+                            )
                            await manager.send_message(
                                conversation_id,
                                {