From 07979bfb098073dac5bce6ab0ad9f3472db71157 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@brighteng.org>
Date: Mon, 25 May 2026 11:28:22 +0800
Subject: [PATCH] feat(api): use Tencent ASR flash with 16k_zh_large and dev
 transcript logs

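Replace the synchronous SentenceRecognition SDK call (one-sentence recognition,
run via asyncio.to_thread) with the recording-file flash HTTPS API, add
TENCENT_APP_ID, remove server-side pydub slicing (and the pydub dependency),
and log ASR recognition text at INFO in development.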

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 api/.env.example                             |   2 +
 api/.env.production                          |   5 +-
 api/.env.staging                             |   5 +-
 api/README.md                                |   2 +-
 api/app/adapters/asr/tencent_asr.py          | 234 ++++++++++++++-----
 api/app/core/agent_logging.py                |  26 +++
 api/app/core/app_config_models.py            |   1 +
 api/app/core/config.py                       |   1 +
 api/app/core/dependencies.py                 |   2 +
 api/app/features/conversation/ws/pipeline.py |  75 ++----
 api/app/features/conversation/ws/protocol.md |   4 +-
 api/app/features/conversation/ws/router.py   |  22 +-
 api/app/main.py                              |   2 +-
 api/config/default.toml                      |   2 +
 api/docs/部署指南.md                         |   3 +
 api/pyproject.toml                           |   1 -
 api/tests/test_asr_transcript_logging.py     |  58 +++++
 api/tests/test_default_toml_legacy_parity.py |   1 +
 api/tests/test_infra_regressions.py          |  77 +++---
 api/tests/test_settings_allowlist.py         |   3 +-
 api/uv.lock                                  |  11 -
 app-expo/src/features/voice/recorder.ts      |   2 +-
 22 files changed, 354 insertions(+), 185 deletions(-)
 create mode 100644 api/tests/test_asr_transcript_logging.py

diff --git a/api/.env.example b/api/.env.example
index 1c3056a..b3e0e53 100644
--- a/api/.env.example
+++ b/api/.env.example
@@ -29,6 +29,8 @@ ZHIPU_API_KEY=your_zhipu_api_key
 # ── 腾讯云凭证（SMS / ASR / TTS / COS 共用）──────────────────
 TENCENT_SECRET_ID=your_tencent_secret_id
 TENCENT_SECRET_KEY=your_tencent_secret_key
+# ASR 极速版必填：API 密钥管理页 AppId（与 SecretId 同页）
+TENCENT_APP_ID=your_tencent_app_id
 
 # ── WeChat Pay 密钥 ───────────────────────────────────────────
 WECHAT_PAY_API_V3_KEY=your_wechat_api_v3_key
diff --git a/api/.env.production b/api/.env.production
index 31dd3d7..8450f35 100644
--- a/api/.env.production
+++ b/api/.env.production
@@ -15,8 +15,9 @@ SECRET_KEY=cf47555c7ecbe5ddb7fd2113c59e08a8bcb110810c42f7c644e06a5acc898608
 DEEPSEEK_API_KEY=sk-09f17fb61c5a4299a3afc2a01de7af75
 ZHIPU_API_KEY=524eda18eb3848e881eefe4c7ef17ec2.xBmGUabYDEa44m3M
 
-TENCENT_SECRET_ID=AKIDa2ILCwUr56uVt31oU0JOHxPfGhvvkLiq
-TENCENT_SECRET_KEY=xiFbjlZ9XheS2NWYLvHRPAh2A5nGYcR2
+TENCENT_SECRET_ID=AKIDnX5gHgssuvoXBQYTCnJ5g6MPQXWTL8mD
+TENCENT_SECRET_KEY=MgsreEcqzzoQuFPVVnz11zpUEtPlwwjL
+TENCENT_APP_ID=1319381411
 
 WECHAT_PAY_API_V3_KEY=xjvGSJLGJAJfjgskfjslafjsajsdjals
 
diff --git a/api/.env.staging b/api/.env.staging
index 89aff48..8ea9f9d 100644
--- a/api/.env.staging
+++ b/api/.env.staging
@@ -19,8 +19,9 @@ SECRET_KEY=cf47555c7ecbe5ddb7fd2113c59e08a8bcb110810c42f7c644e06a5acc898608
 DEEPSEEK_API_KEY=sk-09f17fb61c5a4299a3afc2a01de7af75
 ZHIPU_API_KEY=524eda18eb3848e881eefe4c7ef17ec2.xBmGUabYDEa44m3M
 
-TENCENT_SECRET_ID=AKIDa2ILCwUr56uVt31oU0JOHxPfGhvvkLiq
-TENCENT_SECRET_KEY=xiFbjlZ9XheS2NWYLvHRPAh2A5nGYcR2
+TENCENT_SECRET_ID=AKIDnX5gHgssuvoXBQYTCnJ5g6MPQXWTL8mD
+TENCENT_SECRET_KEY=MgsreEcqzzoQuFPVVnz11zpUEtPlwwjL
+TENCENT_APP_ID=1319381411
 
 WECHAT_PAY_API_V3_KEY=xjvGSJLGJAJfjgskfjslafjsajsdjals
 
diff --git a/api/README.md b/api/README.md
index caa85a3..f20e4fa 100644
--- a/api/README.md
+++ b/api/README.md
@@ -533,7 +533,7 @@ ws://localhost:8000/ws/conversation/{conversation_id}?token={access_token}
 
 ### 3. 语音服务
 
-- **ASR (语音识别)**: 腾讯云一句话识别（引擎 `16k_zh_large`，含多方言）
+- **ASR (语音识别)**: 腾讯云录音文件识别极速版（Flash 同步接口，引擎 `16k_zh_large`）；需配置 `TENCENT_APP_ID`；客户端按 15s 分段上传
 - **TTS (语音合成)**: 使用 OpenAI TTS API 将文本转为语音
 
 ### 4. PDF 生成
diff --git a/api/app/adapters/asr/tencent_asr.py b/api/app/adapters/asr/tencent_asr.py
index 4e1790c..7224b49 100644
--- a/api/app/adapters/asr/tencent_asr.py
+++ b/api/app/adapters/asr/tencent_asr.py
@@ -1,7 +1,13 @@
-"""Tencent Cloud ASR adapter — implements ASRProvider port."""
+"""Tencent Cloud ASR adapter — 录音文件识别极速版 (Flash), implements ASRProvider port."""
 
-import asyncio
 import base64
+import hashlib
+import hmac
+import json
+import time
+from typing import Any
+
+import httpx
 
 from app.core.business_telemetry import business_span
 from app.core.logging import get_logger
@@ -9,82 +15,198 @@ from app.ports.asr import ASRTranscriptionError
 
 logger = get_logger(__name__)
 
+_FLASH_HOST = "asr.cloud.tencent.com"
+_FLASH_PATH_PREFIX = "/asr/flash/v1/"
+# 极速版本地上传上限（腾讯文档：≤100MB）
+_MAX_FLASH_AUDIO_BYTES = 100 * 1024 * 1024
+
+
+def _format_flash_sign_string(sorted_params: list[tuple[str, Any]]) -> str:
+    """与官方 flash_recognizer._format_sign_string 一致。"""
+    signstr = f"POST{_FLASH_HOST}{_FLASH_PATH_PREFIX}"
+    for key, value in sorted_params:
+        if key == "appid":
+            signstr += str(value)
+            break
+    signstr += "?"
+    for key, value in sorted_params:
+        if key == "appid":
+            continue
+        signstr += f"{key}={value}&"
+    return signstr[:-1]
+
+
+def _build_flash_url_and_headers(
+    secret_key: str, params: dict[str, Any]
+) -> tuple[str, dict[str, str]]:
+    sorted_params = sorted(params.items(), key=lambda item: item[0])
+    signstr = _format_flash_sign_string(sorted_params)
+    signature = base64.b64encode(
+        hmac.new(
+            secret_key.encode("utf-8"),
+            signstr.encode("utf-8"),
+            hashlib.sha1,
+        ).digest()
+    ).decode("utf-8")
+    url = "https://" + signstr[4:]
+    headers = {
+        "Host": _FLASH_HOST,
+        "Authorization": signature,
+    }
+    return url, headers
+
+
+def _build_flash_query_params(
+    *,
+    app_id: str,
+    secret_id: str,
+    engine_type: str,
+    voice_format: str,
+) -> dict[str, Any]:
+    return {
+        "appid": app_id,
+        "secretid": secret_id,
+        "timestamp": str(int(time.time())),
+        "engine_type": engine_type,
+        "voice_format": voice_format,
+        "speaker_diarization": 0,
+        "customization_id": "",
+        "filter_dirty": 0,
+        "filter_modal": 0,
+        "filter_punc": 0,
+        "convert_num_mode": 1,
+        "word_info": 0,
+        "first_channel_only": 1,
+        "reinforce_hotword": 0,
+        "sentence_max_length": 0,
+    }
+
 
 class TencentASRProvider:
     def __init__(
         self,
         secret_id: str,
         secret_key: str,
+        app_id: str,
         *,
         engine_type: str = "16k_zh_large",
+        request_timeout_seconds: float = 60.0,
     ):
         self._secret_id = secret_id
         self._secret_key = secret_key
+        self._app_id = (app_id or "").strip()
         self._engine_type = engine_type
-        self._client = None
-
-    def _get_client(self):
-        if self._client is not None:
-            return self._client
-        try:
-            from tencentcloud.asr.v20190614 import asr_client
-            from tencentcloud.common import credential
-            from tencentcloud.common.profile.client_profile import ClientProfile
-            from tencentcloud.common.profile.http_profile import HttpProfile
-
-            cred = credential.Credential(self._secret_id, self._secret_key)
-            http_profile = HttpProfile()
-            http_profile.endpoint = "asr.tencentcloudapi.com"
-            client_profile = ClientProfile()
-            client_profile.httpProfile = http_profile
-            self._client = asr_client.AsrClient(cred, "", client_profile)
-            return self._client
-        except Exception as e:
-            logger.error("Tencent ASR client init failed: {}", e)
-            return None
+        self._request_timeout_seconds = request_timeout_seconds
 
     def ensure_ready(self) -> bool:
-        return bool(self._secret_id and self._secret_key and self._get_client())
+        return bool(self._secret_id and self._secret_key and self._app_id)
 
     async def transcribe(self, audio: bytes, format: str = "m4a") -> str:
-        with business_span("asr.transcribe", provider="tencent"):
+        with business_span("asr.transcribe", provider="tencent_flash"):
             return await self._transcribe_inner(audio, format)
 
     async def _transcribe_inner(self, audio: bytes, format: str) -> str:
-        client = self._get_client()
-        if not client:
+        if not self.ensure_ready():
             raise ASRTranscriptionError(
-                "Tencent ASR client not initialized (check credentials)"
+                "Tencent ASR flash not configured (need TENCENT_APP_ID, SECRET_ID, SECRET_KEY)"
             )
+        if len(audio) > _MAX_FLASH_AUDIO_BYTES:
+            raise ASRTranscriptionError(
+                f"Audio exceeds {_MAX_FLASH_AUDIO_BYTES // (1024 * 1024)}MB flash upload limit"
+            )
+
+        voice_format = (format or "m4a").lower()
+        params = _build_flash_query_params(
+            app_id=self._app_id,
+            secret_id=self._secret_id,
+            engine_type=self._engine_type,
+            voice_format=voice_format,
+        )
+        url, headers = _build_flash_url_and_headers(self._secret_key, params)
+        headers["Content-Type"] = "application/octet-stream"
+        headers["Content-Length"] = str(len(audio))
+
         try:
-            from tencentcloud.asr.v20190614 import models
-
-            audio_base64 = base64.b64encode(audio).decode("utf-8")
-            req = models.SentenceRecognitionRequest()
-            req.EngSerViceType = self._engine_type
-            req.SourceType = 1
-            # 小写；与文档一致。iOS 常见为 m4a(AAC) 容器，与 16k 引擎匹配
-            req.VoiceFormat = (format or "m4a").lower()
-            req.Data = audio_base64
-            req.DataLen = len(audio)
-
-            # 腾讯 SDK 为同步阻塞调用；放到线程池里避免卡住事件循环。
-            resp = await asyncio.to_thread(client.SentenceRecognition, req)
-            text = (resp.Result or "").strip()
-            if text:
-                return text
-            err = getattr(resp, "Error", None) or getattr(resp, "Message", None)
-            logger.warning(
-                "Tencent ASR empty Result, audio_len={} format={} err={}",
-                len(audio),
-                req.VoiceFormat,
-                err,
-            )
-            raise ASRTranscriptionError(
-                "Tencent ASR empty Result (check sample rate / format / audio)"
-            )
+            async with httpx.AsyncClient() as client:
+                resp = await client.post(
+                    url,
+                    headers=headers,
+                    content=audio,
+                    timeout=self._request_timeout_seconds,
+                )
+            if resp.status_code >= 400:
+                raise ASRTranscriptionError(
+                    f"Tencent ASR flash HTTP {resp.status_code}: {resp.text[:200]}"
+                )
+            payload = resp.json()
         except ASRTranscriptionError:
             raise
+        except httpx.HTTPError as e:
+            logger.error("Tencent ASR flash HTTP failed: {}", e, exc_info=True)
+            raise ASRTranscriptionError(f"Tencent ASR flash HTTP failed: {e!s}") from e
+        except json.JSONDecodeError as e:
+            logger.error("Tencent ASR flash invalid JSON: {}", e, exc_info=True)
+            raise ASRTranscriptionError("Tencent ASR flash returned invalid JSON") from e
         except Exception as e:
-            logger.error("Tencent ASR transcribe failed: {}", e, exc_info=True)
-            raise ASRTranscriptionError(f"Tencent ASR transcribe failed: {e!s}") from e
+            logger.error("Tencent ASR flash transcribe failed: {}", e, exc_info=True)
+            raise ASRTranscriptionError(f"Tencent ASR flash transcribe failed: {e!s}") from e
+
+        return self._parse_flash_response(payload, audio_len=len(audio), voice_format=voice_format)
+
+    def _parse_flash_response(
+        self, payload: dict[str, Any], *, audio_len: int, voice_format: str
+    ) -> str:
+        code = payload.get("code")
+        if code != 0:
+            message = payload.get("message") or "unknown error"
+            request_id = payload.get("request_id", "")
+            logger.warning(
+                "Tencent ASR flash error code={} message={} request_id={} audio_len={} format={}",
+                code,
+                message,
+                request_id,
+                audio_len,
+                voice_format,
+            )
+            if code == 4004:
+                raise ASRTranscriptionError(
+                    "Tencent ASR flash resource pack exhausted (purchase 录音文件识别极速版)"
+                )
+            if code == 4003:
+                raise ASRTranscriptionError(
+                    "Tencent ASR flash service not enabled in console"
+                )
+            raise ASRTranscriptionError(
+                f"Tencent ASR flash failed (code={code}): {message}"
+            )
+
+        flash_result = payload.get("flash_result") or []
+        texts: list[str] = []
+        for channel in flash_result:
+            if not isinstance(channel, dict):
+                continue
+            text = (channel.get("text") or "").strip()
+            if text:
+                texts.append(text)
+        combined = "".join(texts)
+        if combined:
+            logger.debug(
+                "Tencent ASR flash ok request_id={} audio_len={} audio_duration_ms={} "
+                "voice_format={} chars={}",
+                payload.get("request_id"),
+                audio_len,
+                payload.get("audio_duration"),
+                voice_format,
+                len(combined),
+            )
+            return combined
+
+        logger.warning(
+            "Tencent ASR flash empty flash_result, audio_len={} format={} request_id={}",
+            audio_len,
+            voice_format,
+            payload.get("request_id"),
+        )
+        raise ASRTranscriptionError(
+            "Tencent ASR flash empty result (check sample rate / format / audio)"
+        )
diff --git a/api/app/core/agent_logging.py b/api/app/core/agent_logging.py
index 4786a59..0726236 100644
--- a/api/app/core/agent_logging.py
+++ b/api/app/core/agent_logging.py
@@ -87,6 +87,32 @@ def log_agent_summary(
         logger.info(message, *args)
 
 
+def asr_transcript_log_enabled() -> bool:
+    """development 环境或全局 DEBUG/TRACE 时以 INFO 输出 ASR 识别全文。"""
+    env = (settings.app_environment or "").strip().lower()
+    if env == "development":
+        return True
+    return agent_verbose_enabled()
+
+
+def log_asr_transcript_result(
+    logger: Any,
+    *,
+    text: str,
+    **context: Any,
+) -> None:
+    """在 ``asr_transcript_log_enabled()`` 时记录识别结果（过长文本会截断）。"""
+    if not asr_transcript_log_enabled():
+        return
+    parts = [f"{k}={v}" for k, v in context.items() if v is not None and v != ""]
+    ctx = " ".join(parts)
+    body = truncate_for_log(text)
+    if ctx:
+        logger.info("ASR 识别结果 {} text={}", ctx, body)
+    else:
+        logger.info("ASR 识别结果 text={}", body)
+
+
 @contextmanager
 def agent_span(
     logger: Any,
diff --git a/api/app/core/app_config_models.py b/api/app/core/app_config_models.py
index da63730..9da0374 100644
--- a/api/app/core/app_config_models.py
+++ b/api/app/core/app_config_models.py
@@ -205,6 +205,7 @@ class AsrConfig(BaseModel):
 
     provider: Literal["tencent"] = "tencent"
     engine_type: Literal["16k_zh_large"] = "16k_zh_large"
+    request_timeout_seconds: float = 60.0
 
 
 class TtsConfig(BaseModel):
diff --git a/api/app/core/config.py b/api/app/core/config.py
index bd66c5b..d7baf12 100644
--- a/api/app/core/config.py
+++ b/api/app/core/config.py
@@ -42,6 +42,7 @@ class Settings(BaseSettings):
 
     tencent_secret_id: str = ""
     tencent_secret_key: str = ""
+    tencent_app_id: str = ""
 
     wechat_pay_api_v3_key: str = ""
     wechat_pay_private_key: str = ""
diff --git a/api/app/core/dependencies.py b/api/app/core/dependencies.py
index eae3a57..3a06c0c 100644
--- a/api/app/core/dependencies.py
+++ b/api/app/core/dependencies.py
@@ -107,7 +107,9 @@ def get_asr_provider() -> ASRProvider:
     return TencentASRProvider(
         secret_id=settings.tencent_secret_id,
         secret_key=settings.tencent_secret_key,
+        app_id=settings.tencent_app_id,
         engine_type=asr_defaults.engine_type,
+        request_timeout_seconds=asr_defaults.request_timeout_seconds,
     )
 
 
diff --git a/api/app/features/conversation/ws/pipeline.py b/api/app/features/conversation/ws/pipeline.py
index 71c2776..b8a1cb4 100644
--- a/api/app/features/conversation/ws/pipeline.py
+++ b/api/app/features/conversation/ws/pipeline.py
@@ -2,7 +2,6 @@
 
 import asyncio
 import base64
-import io
 import time
 import uuid
 from dataclasses import dataclass, field
@@ -19,7 +18,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.agents.chat import ChatOrchestrator
 from app.agents.chat.reply_limits import segments_from_llm_response
-from app.core.agent_logging import agent_summary_enabled
+from app.core.agent_logging import agent_summary_enabled, log_asr_transcript_result
 from app.core.business_telemetry import business_span
 from app.core.config import settings
 from app.core.cos_url_keys import (
@@ -617,64 +616,6 @@ async def _delayed_listening_feedback(
     await _send_segment_transition_feedback(conversation_id, 0)
 
 
-# ── 长音频切片转写 ────────────────────────────────────────────
-
-MAX_ASR_CHUNK_MS = 55_000
-
-
-def _split_audio_bytes(audio_bytes: bytes, fmt: str) -> list[bytes]:
-    """用 pydub 将长音频按 ≤55 s 切片，每片导出为 16 kHz mono WAV（腾讯 ASR 3 MB 限制内）。"""
-    from pydub import AudioSegment as PydubSegment
-
-    audio = PydubSegment.from_file(io.BytesIO(audio_bytes), format=fmt)
-    duration_ms = len(audio)
-
-    if duration_ms <= MAX_ASR_CHUNK_MS:
-        return [audio_bytes]
-
-    mono_16k = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
-    chunks: list[bytes] = []
-    for start in range(0, duration_ms, MAX_ASR_CHUNK_MS):
-        chunk = mono_16k[start : start + MAX_ASR_CHUNK_MS]
-        buf = io.BytesIO()
-        chunk.export(buf, format="wav")
-        chunks.append(buf.getvalue())
-    return chunks
-
-
-async def _transcribe_long_audio(audio_bytes: bytes, fmt: str = "m4a") -> str:
-    """超过 55 s 的音频自动切片后并行 ASR；短音频直接转写。"""
-    asr = get_asr_provider()
-    return await _transcribe_long_audio_inner(audio_bytes, fmt, asr)
-
-
-async def _transcribe_long_audio_inner(
-    audio_bytes: bytes, fmt: str, asr: Any
-) -> str:
-    try:
-        chunks = await asyncio.to_thread(_split_audio_bytes, audio_bytes, fmt)
-    except Exception as exc:
-        logger.warning("pydub 切片失败 ({}), 回退到直接转写", exc)
-        return await asr.transcribe(audio_bytes, format=fmt)
-
-    if len(chunks) <= 1:
-        return await asr.transcribe(audio_bytes, format=fmt)
-
-    logger.info("长音频切片: {} 段", len(chunks))
-    results = await asyncio.gather(
-        *[asr.transcribe(c, format="wav") for c in chunks],
-        return_exceptions=True,
-    )
-    texts: list[str] = []
-    for i, r in enumerate(results):
-        if isinstance(r, BaseException):
-            logger.warning("切片 {} 转写异常: {}", i, r)
-            continue
-        if r and not _is_transcribe_failure(r):
-            texts.append(r)
-    return "".join(texts)
-
-
 # ── 分段语音异步处理 ────────────────────────────────────────────
 
 
@@ -761,7 +702,19 @@ async def process_audio_segment(
                     segment_index,
                 )
             try:
-                transcript_text = await _transcribe_long_audio(audio_bytes, fmt="m4a")
+                asr = get_asr_provider()
+                transcript_text = await asr.transcribe(audio_bytes, format="m4a")
+                if transcript_text:
+                    log_asr_transcript_result(
+                        logger,
+                        text=transcript_text,
+                        conversation_id=conversation_id,
+                        voice_session_id=voice_session_id,
+                        segment_index=segment_index,
+                        duration_s=audio_duration,
+                        audio_len=len(audio_bytes),
+                        source="audio_segment",
+                    )
             except ASRTranscriptionError as e:
                 logger.warning(
                     "ASR 转写失败 segment_index={} conversation_id={}: {}",
diff --git a/api/app/features/conversation/ws/protocol.md b/api/app/features/conversation/ws/protocol.md
index 1b03b0c..1f44931 100644
--- a/api/app/features/conversation/ws/protocol.md
+++ b/api/app/features/conversation/ws/protocol.md
@@ -8,8 +8,8 @@
 ## 消息类型 (client → server)
 
 - `TEXT`：文本消息。`data.text` 必填。可选 `data.tts_this_turn`（布尔）：为 `true` 且服务端 `ENABLE_TTS` 开启且本轮回避 `skip_tts` 时，对该轮助手回复分段合成 TTS；默认为 `false`/缺省即不合成。**当开启本轮 TTS 时，每个助手分段服务端先推送 `tts_audio` 再推送该段 `agent_response`**，便于客户端先收音频再展示同段文字。
-- `AUDIO_SEGMENT`：语音分段。`data` 含 `audio_base64`、`segment_index`、`voice_session_id` / `client_segment_id`、`is_last`、`duration`。可选同上 `tts_this_turn`。
-- `AUDIO_MESSAGE`：整段音频（单次 ASR + 对话）。同上可选 `tts_this_turn`。
+- `AUDIO_SEGMENT`：语音分段（客户端约 15s 一段）。`data` 含 `audio_base64`、`segment_index`、`voice_session_id` / `client_segment_id`、`is_last`、`duration`。可选同上 `tts_this_turn`。服务端对每段调用录音文件识别极速版（`16k_zh_large`，HTTPS 同步返回）。
+- `AUDIO_MESSAGE`：整段音频（单次 ASR + 对话）。同上可选 `tts_this_turn`。单段建议 ≤100MB（极速版上限）。
 - `TRANSCRIBE_ONLY`：仅转写不回复
 - `TTS_CANCEL`：取消当前轮未完成的分段合成与下发
 - `TTS_REQUEST`：用户点击某一助手气泡「朗读」且该段尚无 TTS 时下发。`data` 含 `assistant_message_id`（落库 `conversation_messages.id`）、`segment_index`（与该条助手正文按 `[SPLIT]` 分段后的从 0 下标）、可选 `segment_text`（须与该分段正文一致，用于校验）。服务端若该段已有 URL 则只做预签名后推送 `tts_audio`（`data.manual=true`），**不重复合成**。
diff --git a/api/app/features/conversation/ws/router.py b/api/app/features/conversation/ws/router.py
index 44f78fe..e180ed0 100644
--- a/api/app/features/conversation/ws/router.py
+++ b/api/app/features/conversation/ws/router.py
@@ -12,6 +12,7 @@ from starlette.websockets import WebSocketState
 
 from app.agents.chat.background_voice import infer_background_voice
 from app.agents.chat.prompts_profile import format_user_profile_context
+from app.core.agent_logging import log_asr_transcript_result
 from app.core.config import settings
 from app.core.db import AsyncSessionLocal
 from app.core.dependencies import get_asr_provider
@@ -596,15 +597,12 @@ async def websocket_endpoint(
                                 asr = get_asr_provider()
                                 audio_bytes = base64.b64decode(audio_base64)
                                 asr_text = await asr.transcribe(audio_bytes, "m4a")
-                                logger.debug(
-                                    "ASR 转写完成: conversation_id={} chars={}",
-                                    conversation_id,
-                                    len(asr_text or ""),
-                                )
-                                logger.debug(
-                                    "ASR 转写全文: conversation_id={} text={}",
-                                    conversation_id,
-                                    asr_text,
+                                log_asr_transcript_result(
+                                    logger,
+                                    text=asr_text or "",
+                                    conversation_id=conversation_id,
+                                    duration_s=audio_duration,
+                                    source="audio_message",
                                 )
 
                                 await manager.send_message(
@@ -692,6 +690,12 @@ async def websocket_endpoint(
                             asr = get_asr_provider()
                             audio_bytes = base64.b64decode(audio_base64)
                             asr_text = await asr.transcribe(audio_bytes, "m4a")
+                            log_asr_transcript_result(
+                                logger,
+                                text=asr_text or "",
+                                conversation_id=conversation_id,
+                                source="transcribe_only",
+                            )
                             await manager.send_message(
                                 conversation_id,
                                 {
diff --git a/api/app/main.py b/api/app/main.py
index bf4e959..2034783 100644
--- a/api/app/main.py
+++ b/api/app/main.py
@@ -86,7 +86,7 @@ async def lifespan(app: FastAPI):
             asr_ready = True
         if asr_ready:
             logger.info(
-                "ASR 服务已就绪（腾讯云一句话识别，引擎 {}）",
+                "ASR 服务已就绪（腾讯云录音文件识别极速版，引擎 {}）",
                 asr_defaults.engine_type,
             )
         else:
diff --git a/api/config/default.toml b/api/config/default.toml
index 66b824a..7462265 100644
--- a/api/config/default.toml
+++ b/api/config/default.toml
@@ -177,7 +177,9 @@ embedding_model = "embedding-3"
 
 [asr]
 provider = "tencent"
+# 录音文件识别极速版（Flash HTTPS 同步）；引擎 16k_zh_large；AppID 见 .env TENCENT_APP_ID
 engine_type = "16k_zh_large"
+request_timeout_seconds = 60.0
 
 [tts]
 provider = "tencent"
diff --git a/api/docs/部署指南.md b/api/docs/部署指南.md
index 5a66d79..7f7ebe9 100644
--- a/api/docs/部署指南.md
+++ b/api/docs/部署指南.md
@@ -88,6 +88,8 @@ SECRET_KEY=your_strong_random_secret_here
 # 腾讯云 API 密钥（SMS / ASR / TTS / COS 共用）
 TENCENT_SECRET_ID=your_secret_id_here
 TENCENT_SECRET_KEY=your_secret_key_here
+# ASR 录音文件识别极速版必填（API 密钥管理页 AppId）
+TENCENT_APP_ID=your_app_id_here
 ```
 
 编辑 `api/config/production.toml`（SMS 业务 ID 等非密钥项）：
@@ -109,6 +111,7 @@ refresh_token_expire_days = 7
 | `SECRET_KEY` | `.env` | JWT 签名密钥 | `openssl rand -hex 32` 输出 |
 | `TENCENT_SECRET_ID` | `.env` | 腾讯云 API 密钥 ID | `AKIDxxxxxxxxxxxxx` |
 | `TENCENT_SECRET_KEY` | `.env` | 腾讯云 API 密钥 Key | `xxxxxxxxxxxxxxxx` |
+| `TENCENT_APP_ID` | `.env` | 腾讯云 AppId（ASR 极速版） | `1259220000` |
 | `tencent_sms_sdk_app_id` | `config/production.toml` | 短信应用 ID | `1400xxxxxx` |
 | `tencent_sms_sign_name` | `config/production.toml` | 短信签名（不含【】） | `人生回响` |
 | `tencent_sms_template_id` | `config/production.toml` | 短信模板 ID（所有场景共用） | `123456` |
diff --git a/api/pyproject.toml b/api/pyproject.toml
index cc03ff1..bfc34b6 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -31,7 +31,6 @@ dependencies = [
     "psycopg[binary]>=3.2.0",
     "pydantic>=2.12.5",
     "pydantic-settings>=2.13.1",
-    "pydub>=0.25.1",
     "pyjwt>=2.12.0",
     "python-alipay-sdk>=3.4.0",
     "redis>=6.4.0",
diff --git a/api/tests/test_asr_transcript_logging.py b/api/tests/test_asr_transcript_logging.py
new file mode 100644
index 0000000..762d3bb
--- /dev/null
+++ b/api/tests/test_asr_transcript_logging.py
@@ -0,0 +1,58 @@
+"""ASR transcript logging helpers."""
+
+from app.core.agent_logging import asr_transcript_log_enabled, log_asr_transcript_result
+
+
+def test_asr_transcript_log_enabled_in_development(
+    monkeypatch,
+) -> None:
+    from app.core.config import settings
+
+    monkeypatch.setattr(settings, "app_environment", "development", raising=False)
+    monkeypatch.setattr(settings, "log_level", "INFO", raising=False)
+    assert asr_transcript_log_enabled() is True
+
+
+def test_asr_transcript_log_disabled_in_production_info(
+    monkeypatch,
+) -> None:
+    from app.core.config import settings
+
+    monkeypatch.setattr(settings, "app_environment", "production", raising=False)
+    monkeypatch.setattr(settings, "log_level", "INFO", raising=False)
+    assert asr_transcript_log_enabled() is False
+
+
+def test_log_asr_transcript_result_emits_info(
+    monkeypatch,
+    caplog,
+) -> None:
+    import logging
+
+    from app.core.config import settings
+
+    caplog.set_level(logging.INFO)
+    monkeypatch.setattr(settings, "app_environment", "development", raising=False)
+
+    class _Logger:
+        def info(self, msg, *args):
+            caplog.records.append(
+                logging.LogRecord(
+                    name="test",
+                    level=logging.INFO,
+                    pathname="",
+                    lineno=0,
+                    msg=msg.format(*args) if args else msg,
+                    args=(),
+                    exc_info=None,
+                )
+            )
+
+    log_asr_transcript_result(
+        _Logger(),
+        text="你好世界",
+        conversation_id="c1",
+        segment_index=0,
+    )
+    messages = [r.getMessage() for r in caplog.records]
+    assert any("ASR 识别结果" in m and "你好世界" in m for m in messages)
diff --git a/api/tests/test_default_toml_legacy_parity.py b/api/tests/test_default_toml_legacy_parity.py
index 8445895..679e2bf 100644
--- a/api/tests/test_default_toml_legacy_parity.py
+++ b/api/tests/test_default_toml_legacy_parity.py
@@ -18,5 +18,6 @@ def test_default_toml_matches_legacy_settings_defaults() -> None:
 
     assert cfg.asr.provider == "tencent"
     assert cfg.asr.engine_type == "16k_zh_large"
+    assert cfg.asr.request_timeout_seconds == 60.0
 
     assert cfg.misc.tencent_sms_template_param_count == 2
diff --git a/api/tests/test_infra_regressions.py b/api/tests/test_infra_regressions.py
index 3a38b8d..476a429 100644
--- a/api/tests/test_infra_regressions.py
+++ b/api/tests/test_infra_regressions.py
@@ -1,7 +1,4 @@
-import asyncio
-import sys
-from types import ModuleType, SimpleNamespace
-
+import httpx
 import pytest
 
 from app.adapters.asr.tencent_asr import TencentASRProvider
@@ -58,46 +55,52 @@ def test_post_commit_reuses_singleton_redis_client(
 
 
 @pytest.mark.asyncio
-async def test_tencent_asr_transcribe_uses_to_thread(
+async def test_tencent_asr_flash_transcribe(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    to_thread_calls: list[tuple[object, tuple[object, ...]]] = []
+    captured: dict[str, object] = {}
 
-    class FakeRequest:
-        EngSerViceType: str | None = None
-        SourceType: int | None = None
-        VoiceFormat: str | None = None
-        Data: str | None = None
-        DataLen: int | None = None
+    class FakeAsyncClient:
+        async def __aenter__(self):
+            return self
 
-    class FakeClient:
-        def SentenceRecognition(self, req: FakeRequest) -> SimpleNamespace:
-            return SimpleNamespace(Result=" 你好，世界 ")
+        async def __aexit__(self, *args):
+            return None
 
-    async def fake_to_thread(fn, *args):
-        to_thread_calls.append((fn, args))
-        return fn(*args)
+        async def post(self, url, *, headers=None, content=None, timeout=None):
+            captured["url"] = url
+            captured["headers"] = headers
+            captured["content"] = content
+            captured["timeout"] = timeout
+            return httpx.Response(
+                200,
+                json={
+                    "code": 0,
+                    "request_id": "req-1",
+                    "flash_result": [{"channel_id": 0, "text": " 你好，世界 "}],
+                },
+            )
 
-    models_module = ModuleType("tencentcloud.asr.v20190614.models")
-    models_module.SentenceRecognitionRequest = FakeRequest
-    package_module = ModuleType("tencentcloud.asr.v20190614")
-    package_module.models = models_module
-
-    monkeypatch.setitem(sys.modules, "tencentcloud.asr.v20190614", package_module)
-    monkeypatch.setattr(asyncio, "to_thread", fake_to_thread)
-
-    provider = TencentASRProvider("sid", "skey", engine_type="16k_zh_large")
-    client = FakeClient()
-    monkeypatch.setattr(provider, "_get_client", lambda: client)
+    monkeypatch.setattr(httpx, "AsyncClient", FakeAsyncClient)
 
+    provider = TencentASRProvider(
+        "sid",
+        "skey",
+        "1259220000",
+        engine_type="16k_zh_large",
+    )
     text = await provider.transcribe(b"fake-audio", format="m4a")
 
     assert text == "你好，世界"
-    assert len(to_thread_calls) == 1
-    fn, args = to_thread_calls[0]
-    assert getattr(fn, "__self__", None) is client
-    assert getattr(fn, "__name__", "") == "SentenceRecognition"
-    request = args[0]
-    assert request.EngSerViceType == "16k_zh_large"
-    assert request.VoiceFormat == "m4a"
-    assert request.DataLen == len(b"fake-audio")
+    assert captured["content"] == b"fake-audio"
+    assert captured["timeout"] == 60.0
+    url = str(captured["url"])
+    assert "engine_type=16k_zh_large" in url
+    assert "voice_format=m4a" in url
+    assert "/asr/flash/v1/1259220000?" in url
+    assert "secretid=sid" in url
+    headers = captured["headers"]
+    assert headers is not None
+    assert headers["Authorization"]
+    assert headers["Content-Type"] == "application/octet-stream"
+    assert headers["Content-Length"] == str(len(b"fake-audio"))
diff --git a/api/tests/test_settings_allowlist.py b/api/tests/test_settings_allowlist.py
index e72ae8f..13df18d 100644
--- a/api/tests/test_settings_allowlist.py
+++ b/api/tests/test_settings_allowlist.py
@@ -2,7 +2,7 @@
 
 from app.core.config import Settings
 
-ALLOWLIST_MAX_FIELDS = 22
+ALLOWLIST_MAX_FIELDS = 23
 
 EXPECTED_PREFIXES = (
     "database_",
@@ -13,6 +13,7 @@ EXPECTED_PREFIXES = (
     "deepseek_",
     "zhipu_",
     "tencent_secret_",
+    "tencent_app_",
     "wechat_pay_",
     "alipay_",
     "liblib_",
diff --git a/api/uv.lock b/api/uv.lock
index db076b2..58f076f 100644
--- a/api/uv.lock
+++ b/api/uv.lock
@@ -102,7 +102,6 @@ dependencies = [
     { name = "psycopg", extra = ["binary"] },
     { name = "pydantic" },
     { name = "pydantic-settings" },
-    { name = "pydub" },
     { name = "pyjwt" },
     { name = "python-alipay-sdk" },
     { name = "redis" },
@@ -152,7 +151,6 @@ requires-dist = [
     { name = "psycopg", extras = ["binary"], specifier = ">=3.2.0" },
     { name = "pydantic", specifier = ">=2.12.5" },
     { name = "pydantic-settings", specifier = ">=2.13.1" },
-    { name = "pydub", specifier = ">=0.25.1" },
     { name = "pyjwt", specifier = ">=2.12.0" },
     { name = "python-alipay-sdk", specifier = ">=3.4.0" },
     { name = "redis", specifier = ">=6.4.0" },
@@ -2043,15 +2041,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/00/4b/ccc026168948fec4f7555b9164c724cf4125eac006e176541483d2c959be/pydantic_settings-2.13.1-py3-none-any.whl", hash = "sha256:d56fd801823dbeae7f0975e1f8c8e25c258eb75d278ea7abb5d9cebb01b56237", size = 58929, upload-time = "2026-02-19T13:45:06.034Z" },
 ]
 
-[[package]]
-name = "pydub"
-version = "0.25.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/fe/9a/e6bca0eed82db26562c73b5076539a4a08d3cffd19c3cc5913a3e61145fd/pydub-0.25.1.tar.gz", hash = "sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f", size = 38326, upload-time = "2021-03-10T02:09:54.659Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/a6/53/d78dc063216e62fc55f6b2eebb447f6a4b0a59f55c8406376f76bf959b08/pydub-0.25.1-py2.py3-none-any.whl", hash = "sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6", size = 32327, upload-time = "2021-03-10T02:09:53.503Z" },
-]
-
 [[package]]
 name = "pydyf"
 version = "0.12.1"
diff --git a/app-expo/src/features/voice/recorder.ts b/app-expo/src/features/voice/recorder.ts
index 34ba7a4..f26e63c 100644
--- a/app-expo/src/features/voice/recorder.ts
+++ b/app-expo/src/features/voice/recorder.ts
@@ -18,7 +18,7 @@ type StatusListener = (status: RecorderStatus) => void;
 type RecordingCompleteListener = (uri: string, durationMs: number) => void;
 
 /**
- * Tencent SentenceRecognition uses `EngSerViceType=16k_zh_large` and
+ * Tencent ASR flash uses `16k_zh_large`; the client uploads ~15s segments with
  * `VoiceFormat=m4a`, so record speech in that shape directly instead of
  * relying on Expo's default 44.1 kHz stereo preset.
  */