From 07979bfb098073dac5bce6ab0ad9f3472db71157 Mon Sep 17 00:00:00 2001 From: Kevin Date: Mon, 25 May 2026 11:28:22 +0800 Subject: [PATCH] feat(api): use Tencent ASR flash with 16k_zh_large and dev transcript logs Replace CreateRecTask polling with recording-file flash API, add TENCENT_APP_ID, remove server-side pydub slicing, and log ASR recognition text at INFO in development. Co-authored-by: Cursor --- api/.env.example | 2 + api/.env.production | 5 +- api/.env.staging | 5 +- api/README.md | 2 +- api/app/adapters/asr/tencent_asr.py | 234 ++++++++++++++----- api/app/core/agent_logging.py | 26 +++ api/app/core/app_config_models.py | 1 + api/app/core/config.py | 1 + api/app/core/dependencies.py | 2 + api/app/features/conversation/ws/pipeline.py | 75 ++---- api/app/features/conversation/ws/protocol.md | 4 +- api/app/features/conversation/ws/router.py | 22 +- api/app/main.py | 2 +- api/config/default.toml | 2 + api/docs/部署指南.md | 3 + api/pyproject.toml | 1 - api/tests/test_asr_transcript_logging.py | 58 +++++ api/tests/test_default_toml_legacy_parity.py | 1 + api/tests/test_infra_regressions.py | 77 +++--- api/tests/test_settings_allowlist.py | 3 +- api/uv.lock | 11 - app-expo/src/features/voice/recorder.ts | 2 +- 22 files changed, 354 insertions(+), 185 deletions(-) create mode 100644 api/tests/test_asr_transcript_logging.py diff --git a/api/.env.example b/api/.env.example index 1c3056a..b3e0e53 100644 --- a/api/.env.example +++ b/api/.env.example @@ -29,6 +29,8 @@ ZHIPU_API_KEY=your_zhipu_api_key # ── 腾讯云凭证(SMS / ASR / TTS / COS 共用)────────────────── TENCENT_SECRET_ID=your_tencent_secret_id TENCENT_SECRET_KEY=your_tencent_secret_key +# ASR 极速版必填:API 密钥管理页 AppId(与 SecretId 同页) +TENCENT_APP_ID=your_tencent_app_id # ── WeChat Pay 密钥 ─────────────────────────────────────────── WECHAT_PAY_API_V3_KEY=your_wechat_api_v3_key diff --git a/api/.env.production b/api/.env.production index 31dd3d7..8450f35 100644 --- a/api/.env.production +++ b/api/.env.production @@ -15,8 +15,9 @@ SECRET_KEY=cf47555c7ecbe5ddb7fd2113c59e08a8bcb110810c42f7c644e06a5acc898608 DEEPSEEK_API_KEY=sk-09f17fb61c5a4299a3afc2a01de7af75 ZHIPU_API_KEY=524eda18eb3848e881eefe4c7ef17ec2.xBmGUabYDEa44m3M -TENCENT_SECRET_ID=AKIDa2ILCwUr56uVt31oU0JOHxPfGhvvkLiq -TENCENT_SECRET_KEY=xiFbjlZ9XheS2NWYLvHRPAh2A5nGYcR2 +TENCENT_SECRET_ID=AKIDnX5gHgssuvoXBQYTCnJ5g6MPQXWTL8mD +TENCENT_SECRET_KEY=MgsreEcqzzoQuFPVVnz11zpUEtPlwwjL +TENCENT_APP_ID=1319381411 WECHAT_PAY_API_V3_KEY=xjvGSJLGJAJfjgskfjslafjsajsdjals diff --git a/api/.env.staging b/api/.env.staging index 89aff48..8ea9f9d 100644 --- a/api/.env.staging +++ b/api/.env.staging @@ -19,8 +19,9 @@ SECRET_KEY=cf47555c7ecbe5ddb7fd2113c59e08a8bcb110810c42f7c644e06a5acc898608 DEEPSEEK_API_KEY=sk-09f17fb61c5a4299a3afc2a01de7af75 ZHIPU_API_KEY=524eda18eb3848e881eefe4c7ef17ec2.xBmGUabYDEa44m3M -TENCENT_SECRET_ID=AKIDa2ILCwUr56uVt31oU0JOHxPfGhvvkLiq -TENCENT_SECRET_KEY=xiFbjlZ9XheS2NWYLvHRPAh2A5nGYcR2 +TENCENT_SECRET_ID=AKIDnX5gHgssuvoXBQYTCnJ5g6MPQXWTL8mD +TENCENT_SECRET_KEY=MgsreEcqzzoQuFPVVnz11zpUEtPlwwjL +TENCENT_APP_ID=1319381411 WECHAT_PAY_API_V3_KEY=xjvGSJLGJAJfjgskfjslafjsajsdjals diff --git a/api/README.md b/api/README.md index caa85a3..f20e4fa 100644 --- a/api/README.md +++ b/api/README.md @@ -533,7 +533,7 @@ ws://localhost:8000/ws/conversation/{conversation_id}?token={access_token} ### 3. 语音服务 -- **ASR (语音识别)**: 腾讯云一句话识别(引擎 `16k_zh_large`,含多方言) +- **ASR (语音识别)**: 腾讯云录音文件识别极速版(Flash 同步接口,引擎 `16k_zh_large`);需配置 `TENCENT_APP_ID`;客户端按 15s 分段上传 - **TTS (语音合成)**: 使用 OpenAI TTS API 将文本转为语音 ### 4. PDF 生成 diff --git a/api/app/adapters/asr/tencent_asr.py b/api/app/adapters/asr/tencent_asr.py index 4e1790c..7224b49 100644 --- a/api/app/adapters/asr/tencent_asr.py +++ b/api/app/adapters/asr/tencent_asr.py @@ -1,7 +1,13 @@ -"""Tencent Cloud ASR adapter — implements ASRProvider port.""" +"""Tencent Cloud ASR adapter — 录音文件识别极速版 (Flash), implements ASRProvider port.""" -import asyncio import base64 +import hashlib +import hmac +import json +import time +from typing import Any + +import httpx from app.core.business_telemetry import business_span from app.core.logging import get_logger @@ -9,82 +15,198 @@ from app.ports.asr import ASRTranscriptionError logger = get_logger(__name__) +_FLASH_HOST = "asr.cloud.tencent.com" +_FLASH_PATH_PREFIX = "/asr/flash/v1/" +# 极速版本地上传上限(腾讯文档:≤100MB) +_MAX_FLASH_AUDIO_BYTES = 100 * 1024 * 1024 + + +def _format_flash_sign_string(sorted_params: list[tuple[str, Any]]) -> str: + """与官方 flash_recognizer._format_sign_string 一致。""" + signstr = f"POST{_FLASH_HOST}{_FLASH_PATH_PREFIX}" + for key, value in sorted_params: + if key == "appid": + signstr += str(value) + break + signstr += "?" + for key, value in sorted_params: + if key == "appid": + continue + signstr += f"{key}={value}&" + return signstr[:-1] + + +def _build_flash_url_and_headers( + secret_key: str, params: dict[str, Any] +) -> tuple[str, dict[str, str]]: + sorted_params = sorted(params.items(), key=lambda item: item[0]) + signstr = _format_flash_sign_string(sorted_params) + signature = base64.b64encode( + hmac.new( + secret_key.encode("utf-8"), + signstr.encode("utf-8"), + hashlib.sha1, + ).digest() + ).decode("utf-8") + url = "https://" + signstr[4:] + headers = { + "Host": _FLASH_HOST, + "Authorization": signature, + } + return url, headers + + +def _build_flash_query_params( + *, + app_id: str, + secret_id: str, + engine_type: str, + voice_format: str, +) -> dict[str, Any]: + return { + "appid": app_id, + "secretid": secret_id, + "timestamp": str(int(time.time())), + "engine_type": engine_type, + "voice_format": voice_format, + "speaker_diarization": 0, + "customization_id": "", + "filter_dirty": 0, + "filter_modal": 0, + "filter_punc": 0, + "convert_num_mode": 1, + "word_info": 0, + "first_channel_only": 1, + "reinforce_hotword": 0, + "sentence_max_length": 0, + } + class TencentASRProvider: def __init__( self, secret_id: str, secret_key: str, + app_id: str, *, engine_type: str = "16k_zh_large", + request_timeout_seconds: float = 60.0, ): self._secret_id = secret_id self._secret_key = secret_key + self._app_id = (app_id or "").strip() self._engine_type = engine_type - self._client = None - - def _get_client(self): - if self._client is not None: - return self._client - try: - from tencentcloud.asr.v20190614 import asr_client - from tencentcloud.common import credential - from tencentcloud.common.profile.client_profile import ClientProfile - from tencentcloud.common.profile.http_profile import HttpProfile - - cred = credential.Credential(self._secret_id, self._secret_key) - http_profile = HttpProfile() - http_profile.endpoint = "asr.tencentcloudapi.com" - client_profile = ClientProfile() - client_profile.httpProfile = http_profile - self._client = asr_client.AsrClient(cred, "", client_profile) - return self._client - except Exception as e: - logger.error("Tencent ASR client init failed: {}", e) - return None + self._request_timeout_seconds = request_timeout_seconds def ensure_ready(self) -> bool: - return bool(self._secret_id and self._secret_key and self._get_client()) + return bool(self._secret_id and self._secret_key and self._app_id) async def transcribe(self, audio: bytes, format: str = "m4a") -> str: - with business_span("asr.transcribe", provider="tencent"): + with business_span("asr.transcribe", provider="tencent_flash"): return await self._transcribe_inner(audio, format) async def _transcribe_inner(self, audio: bytes, format: str) -> str: - client = self._get_client() - if not client: + if not self.ensure_ready(): raise ASRTranscriptionError( - "Tencent ASR client not initialized (check credentials)" + "Tencent ASR flash not configured (need TENCENT_APP_ID, SECRET_ID, SECRET_KEY)" ) + if len(audio) > _MAX_FLASH_AUDIO_BYTES: + raise ASRTranscriptionError( + f"Audio exceeds {_MAX_FLASH_AUDIO_BYTES // (1024 * 1024)}MB flash upload limit" + ) + + voice_format = (format or "m4a").lower() + params = _build_flash_query_params( + app_id=self._app_id, + secret_id=self._secret_id, + engine_type=self._engine_type, + voice_format=voice_format, + ) + url, headers = _build_flash_url_and_headers(self._secret_key, params) + headers["Content-Type"] = "application/octet-stream" + headers["Content-Length"] = str(len(audio)) + try: - from tencentcloud.asr.v20190614 import models - - audio_base64 = base64.b64encode(audio).decode("utf-8") - req = models.SentenceRecognitionRequest() - req.EngSerViceType = self._engine_type - req.SourceType = 1 - # 小写;与文档一致。iOS 常见为 m4a(AAC) 容器,与 16k 引擎匹配 - req.VoiceFormat = (format or "m4a").lower() - req.Data = audio_base64 - req.DataLen = len(audio) - - # 腾讯 SDK 为同步阻塞调用;放到线程池里避免卡住事件循环。 - resp = await asyncio.to_thread(client.SentenceRecognition, req) - text = (resp.Result or "").strip() - if text: - return text - err = getattr(resp, "Error", None) or getattr(resp, "Message", None) - logger.warning( - "Tencent ASR empty Result, audio_len={} format={} err={}", - len(audio), - req.VoiceFormat, - err, - ) - raise ASRTranscriptionError( - "Tencent ASR empty Result (check sample rate / format / audio)" - ) + async with httpx.AsyncClient() as client: + resp = await client.post( + url, + headers=headers, + content=audio, + timeout=self._request_timeout_seconds, + ) + if resp.status_code >= 400: + raise ASRTranscriptionError( + f"Tencent ASR flash HTTP {resp.status_code}: {resp.text[:200]}" + ) + payload = resp.json() except ASRTranscriptionError: raise + except httpx.HTTPError as e: + logger.error("Tencent ASR flash HTTP failed: {}", e, exc_info=True) + raise ASRTranscriptionError(f"Tencent ASR flash HTTP failed: {e!s}") from e + except json.JSONDecodeError as e: + logger.error("Tencent ASR flash invalid JSON: {}", e, exc_info=True) + raise ASRTranscriptionError("Tencent ASR flash returned invalid JSON") from e except Exception as e: - logger.error("Tencent ASR transcribe failed: {}", e, exc_info=True) - raise ASRTranscriptionError(f"Tencent ASR transcribe failed: {e!s}") from e + logger.error("Tencent ASR flash transcribe failed: {}", e, exc_info=True) + raise ASRTranscriptionError(f"Tencent ASR flash transcribe failed: {e!s}") from e + + return self._parse_flash_response(payload, audio_len=len(audio), voice_format=voice_format) + + def _parse_flash_response( + self, payload: dict[str, Any], *, audio_len: int, voice_format: str + ) -> str: + code = payload.get("code") + if code != 0: + message = payload.get("message") or "unknown error" + request_id = payload.get("request_id", "") + logger.warning( + "Tencent ASR flash error code={} message={} request_id={} audio_len={} format={}", + code, + message, + request_id, + audio_len, + voice_format, + ) + if code == 4004: + raise ASRTranscriptionError( + "Tencent ASR flash resource pack exhausted (purchase 录音文件识别极速版)" + ) + if code == 4003: + raise ASRTranscriptionError( + "Tencent ASR flash service not enabled in console" + ) + raise ASRTranscriptionError( + f"Tencent ASR flash failed (code={code}): {message}" + ) + + flash_result = payload.get("flash_result") or [] + texts: list[str] = [] + for channel in flash_result: + if not isinstance(channel, dict): + continue + text = (channel.get("text") or "").strip() + if text: + texts.append(text) + combined = "".join(texts) + if combined: + logger.debug( + "Tencent ASR flash ok request_id={} audio_len={} audio_duration_ms={} " + "voice_format={} chars={}", + payload.get("request_id"), + audio_len, + payload.get("audio_duration"), + voice_format, + len(combined), + ) + return combined + + logger.warning( + "Tencent ASR flash empty flash_result, audio_len={} format={} request_id={}", + audio_len, + voice_format, + payload.get("request_id"), + ) + raise ASRTranscriptionError( + "Tencent ASR flash empty result (check sample rate / format / audio)" + ) diff --git a/api/app/core/agent_logging.py b/api/app/core/agent_logging.py index 4786a59..0726236 100644 --- a/api/app/core/agent_logging.py +++ b/api/app/core/agent_logging.py @@ -87,6 +87,32 @@ def log_agent_summary( logger.info(message, *args) +def asr_transcript_log_enabled() -> bool: + """development 环境或全局 DEBUG/TRACE 时以 INFO 输出 ASR 识别全文。""" + env = (settings.app_environment or "").strip().lower() + if env == "development": + return True + return agent_verbose_enabled() + + +def log_asr_transcript_result( + logger: Any, + *, + text: str, + **context: Any, +) -> None: + """在 ``asr_transcript_log_enabled()`` 时记录识别结果(过长文本会截断)。""" + if not asr_transcript_log_enabled(): + return + parts = [f"{k}={v}" for k, v in context.items() if v is not None and v != ""] + ctx = " ".join(parts) + body = truncate_for_log(text) + if ctx: + logger.info("ASR 识别结果 {} text={}", ctx, body) + else: + logger.info("ASR 识别结果 text={}", body) + + @contextmanager def agent_span( logger: Any, diff --git a/api/app/core/app_config_models.py b/api/app/core/app_config_models.py index da63730..9da0374 100644 --- a/api/app/core/app_config_models.py +++ b/api/app/core/app_config_models.py @@ -205,6 +205,7 @@ class AsrConfig(BaseModel): provider: Literal["tencent"] = "tencent" engine_type: Literal["16k_zh_large"] = "16k_zh_large" + request_timeout_seconds: float = 60.0 class TtsConfig(BaseModel): diff --git a/api/app/core/config.py b/api/app/core/config.py index bd66c5b..d7baf12 100644 --- a/api/app/core/config.py +++ b/api/app/core/config.py @@ -42,6 +42,7 @@ class Settings(BaseSettings): tencent_secret_id: str = "" tencent_secret_key: str = "" + tencent_app_id: str = "" wechat_pay_api_v3_key: str = "" wechat_pay_private_key: str = "" diff --git a/api/app/core/dependencies.py b/api/app/core/dependencies.py index eae3a57..3a06c0c 100644 --- a/api/app/core/dependencies.py +++ b/api/app/core/dependencies.py @@ -107,7 +107,9 @@ def get_asr_provider() -> ASRProvider: return TencentASRProvider( secret_id=settings.tencent_secret_id, secret_key=settings.tencent_secret_key, + app_id=settings.tencent_app_id, engine_type=asr_defaults.engine_type, + request_timeout_seconds=asr_defaults.request_timeout_seconds, ) diff --git a/api/app/features/conversation/ws/pipeline.py b/api/app/features/conversation/ws/pipeline.py index 71c2776..b8a1cb4 100644 --- a/api/app/features/conversation/ws/pipeline.py +++ b/api/app/features/conversation/ws/pipeline.py @@ -2,7 +2,6 @@ import asyncio import base64 -import io import time import uuid from dataclasses import dataclass, field @@ -19,7 +18,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.agents.chat import ChatOrchestrator from app.agents.chat.reply_limits import segments_from_llm_response -from app.core.agent_logging import agent_summary_enabled +from app.core.agent_logging import agent_summary_enabled, log_asr_transcript_result from app.core.business_telemetry import business_span from app.core.config import settings from app.core.cos_url_keys import ( @@ -617,64 +616,6 @@ async def _delayed_listening_feedback( await _send_segment_transition_feedback(conversation_id, 0) -# ── 长音频切片转写 ──────────────────────────────────────────── - -MAX_ASR_CHUNK_MS = 55_000 - - -def _split_audio_bytes(audio_bytes: bytes, fmt: str) -> list[bytes]: - """用 pydub 将长音频按 ≤55 s 切片,每片导出为 16 kHz mono WAV(腾讯 ASR 3 MB 限制内)。""" - from pydub import AudioSegment as PydubSegment - - audio = PydubSegment.from_file(io.BytesIO(audio_bytes), format=fmt) - duration_ms = len(audio) - - if duration_ms <= MAX_ASR_CHUNK_MS: - return [audio_bytes] - - mono_16k = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2) - chunks: list[bytes] = [] - for start in range(0, duration_ms, MAX_ASR_CHUNK_MS): - chunk = mono_16k[start : start + MAX_ASR_CHUNK_MS] - buf = io.BytesIO() - chunk.export(buf, format="wav") - chunks.append(buf.getvalue()) - return chunks - - -async def _transcribe_long_audio(audio_bytes: bytes, fmt: str = "m4a") -> str: - """超过 55 s 的音频自动切片后并行 ASR;短音频直接转写。""" - asr = get_asr_provider() - return await _transcribe_long_audio_inner(audio_bytes, fmt, asr) - - -async def _transcribe_long_audio_inner( - audio_bytes: bytes, fmt: str, asr: Any -) -> str: - try: - chunks = await asyncio.to_thread(_split_audio_bytes, audio_bytes, fmt) - except Exception as exc: - logger.warning("pydub 切片失败 ({}), 回退到直接转写", exc) - return await asr.transcribe(audio_bytes, format=fmt) - - if len(chunks) <= 1: - return await asr.transcribe(audio_bytes, format=fmt) - - logger.info("长音频切片: {} 段", len(chunks)) - results = await asyncio.gather( - *[asr.transcribe(c, format="wav") for c in chunks], - return_exceptions=True, - ) - texts: list[str] = [] - for i, r in enumerate(results): - if isinstance(r, BaseException): - logger.warning("切片 {} 转写异常: {}", i, r) - continue - if r and not _is_transcribe_failure(r): - texts.append(r) - return "".join(texts) - - # ── 分段语音异步处理 ──────────────────────────────────────────── @@ -761,7 +702,19 @@ async def process_audio_segment( segment_index, ) try: - transcript_text = await _transcribe_long_audio(audio_bytes, fmt="m4a") + asr = get_asr_provider() + transcript_text = await asr.transcribe(audio_bytes, format="m4a") + if transcript_text: + log_asr_transcript_result( + logger, + text=transcript_text, + conversation_id=conversation_id, + voice_session_id=voice_session_id, + segment_index=segment_index, + duration_s=audio_duration, + audio_len=len(audio_bytes), + source="audio_segment", + ) except ASRTranscriptionError as e: logger.warning( "ASR 转写失败 segment_index={} conversation_id={}: {}", diff --git a/api/app/features/conversation/ws/protocol.md b/api/app/features/conversation/ws/protocol.md index 1b03b0c..1f44931 100644 --- a/api/app/features/conversation/ws/protocol.md +++ b/api/app/features/conversation/ws/protocol.md @@ -8,8 +8,8 @@ ## 消息类型 (client → server) - `TEXT`:文本消息。`data.text` 必填。可选 `data.tts_this_turn`(布尔):为 `true` 且服务端 `ENABLE_TTS` 开启且本轮回避 `skip_tts` 时,对该轮助手回复分段合成 TTS;默认为 `false`/缺省即不合成。**当开启本轮 TTS 时,每个助手分段服务端先推送 `tts_audio` 再推送该段 `agent_response`**,便于客户端先收音频再展示同段文字。 -- `AUDIO_SEGMENT`:语音分段。`data` 含 `audio_base64`、`segment_index`、`voice_session_id` / `client_segment_id`、`is_last`、`duration`。可选同上 `tts_this_turn`。 -- `AUDIO_MESSAGE`:整段音频(单次 ASR + 对话)。同上可选 `tts_this_turn`。 +- `AUDIO_SEGMENT`:语音分段(客户端约 15s 一段)。`data` 含 `audio_base64`、`segment_index`、`voice_session_id` / `client_segment_id`、`is_last`、`duration`。可选同上 `tts_this_turn`。服务端对每段调用录音文件识别极速版(`16k_zh_large`,HTTPS 同步返回)。 +- `AUDIO_MESSAGE`:整段音频(单次 ASR + 对话)。同上可选 `tts_this_turn`。单段建议 ≤100MB(极速版上限)。 - `TRANSCRIBE_ONLY`:仅转写不回复 - `TTS_CANCEL`:取消当前轮未完成的分段合成与下发 - `TTS_REQUEST`:用户点击某一助手气泡「朗读」且该段尚无 TTS 时下发。`data` 含 `assistant_message_id`(落库 `conversation_messages.id`)、`segment_index`(与该条助手正文按 `[SPLIT]` 分段后的从 0 下标)、可选 `segment_text`(须与该分段正文一致,用于校验)。服务端若该段已有 URL 则只做预签名后推送 `tts_audio`(`data.manual=true`),**不重复合成**。 diff --git a/api/app/features/conversation/ws/router.py b/api/app/features/conversation/ws/router.py index 44f78fe..e180ed0 100644 --- a/api/app/features/conversation/ws/router.py +++ b/api/app/features/conversation/ws/router.py @@ -12,6 +12,7 @@ from starlette.websockets import WebSocketState from app.agents.chat.background_voice import infer_background_voice from app.agents.chat.prompts_profile import format_user_profile_context +from app.core.agent_logging import log_asr_transcript_result from app.core.config import settings from app.core.db import AsyncSessionLocal from app.core.dependencies import get_asr_provider @@ -596,15 +597,12 @@ async def websocket_endpoint( asr = get_asr_provider() audio_bytes = base64.b64decode(audio_base64) asr_text = await asr.transcribe(audio_bytes, "m4a") - logger.debug( - "ASR 转写完成: conversation_id={} chars={}", - conversation_id, - len(asr_text or ""), - ) - logger.debug( - "ASR 转写全文: conversation_id={} text={}", - conversation_id, - asr_text, + log_asr_transcript_result( + logger, + text=asr_text or "", + conversation_id=conversation_id, + duration_s=audio_duration, + source="audio_message", ) await manager.send_message( @@ -692,6 +690,12 @@ async def websocket_endpoint( asr = get_asr_provider() audio_bytes = base64.b64decode(audio_base64) asr_text = await asr.transcribe(audio_bytes, "m4a") + log_asr_transcript_result( + logger, + text=asr_text or "", + conversation_id=conversation_id, + source="transcribe_only", + ) await manager.send_message( conversation_id, { diff --git a/api/app/main.py b/api/app/main.py index bf4e959..2034783 100644 --- a/api/app/main.py +++ b/api/app/main.py @@ -86,7 +86,7 @@ async def lifespan(app: FastAPI): asr_ready = True if asr_ready: logger.info( - "ASR 服务已就绪(腾讯云一句话识别,引擎 {})", + "ASR 服务已就绪(腾讯云录音文件识别极速版,引擎 {})", asr_defaults.engine_type, ) else: diff --git a/api/config/default.toml b/api/config/default.toml index 66b824a..7462265 100644 --- a/api/config/default.toml +++ b/api/config/default.toml @@ -177,7 +177,9 @@ embedding_model = "embedding-3" [asr] provider = "tencent" +# 录音文件识别极速版(Flash HTTPS 同步);引擎 16k_zh_large;AppID 见 .env TENCENT_APP_ID engine_type = "16k_zh_large" +request_timeout_seconds = 60.0 [tts] provider = "tencent" diff --git a/api/docs/部署指南.md b/api/docs/部署指南.md index 5a66d79..7f7ebe9 100644 --- a/api/docs/部署指南.md +++ b/api/docs/部署指南.md @@ -88,6 +88,8 @@ SECRET_KEY=your_strong_random_secret_here # 腾讯云 API 密钥(SMS / ASR / TTS / COS 共用) TENCENT_SECRET_ID=your_secret_id_here TENCENT_SECRET_KEY=your_secret_key_here +# ASR 录音文件识别极速版必填(API 密钥管理页 AppId) +TENCENT_APP_ID=your_app_id_here ``` 编辑 `api/config/production.toml`(SMS 业务 ID 等非密钥项): @@ -109,6 +111,7 @@ refresh_token_expire_days = 7 | `SECRET_KEY` | `.env` | JWT 签名密钥 | `openssl rand -hex 32` 输出 | | `TENCENT_SECRET_ID` | `.env` | 腾讯云 API 密钥 ID | `AKIDxxxxxxxxxxxxx` | | `TENCENT_SECRET_KEY` | `.env` | 腾讯云 API 密钥 Key | `xxxxxxxxxxxxxxxx` | +| `TENCENT_APP_ID` | `.env` | 腾讯云 AppId(ASR 极速版) | `1259220000` | | `tencent_sms_sdk_app_id` | `config/production.toml` | 短信应用 ID | `1400xxxxxx` | | `tencent_sms_sign_name` | `config/production.toml` | 短信签名(不含【】) | `人生回响` | | `tencent_sms_template_id` | `config/production.toml` | 短信模板 ID(所有场景共用) | `123456` | diff --git a/api/pyproject.toml b/api/pyproject.toml index cc03ff1..bfc34b6 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -31,7 +31,6 @@ dependencies = [ "psycopg[binary]>=3.2.0", "pydantic>=2.12.5", "pydantic-settings>=2.13.1", - "pydub>=0.25.1", "pyjwt>=2.12.0", "python-alipay-sdk>=3.4.0", "redis>=6.4.0", diff --git a/api/tests/test_asr_transcript_logging.py b/api/tests/test_asr_transcript_logging.py new file mode 100644 index 0000000..762d3bb --- /dev/null +++ b/api/tests/test_asr_transcript_logging.py @@ -0,0 +1,58 @@ +"""ASR transcript logging helpers.""" + +from app.core.agent_logging import asr_transcript_log_enabled, log_asr_transcript_result + + +def test_asr_transcript_log_enabled_in_development( + monkeypatch, +) -> None: + from app.core.config import settings + + monkeypatch.setattr(settings, "app_environment", "development", raising=False) + monkeypatch.setattr(settings, "log_level", "INFO", raising=False) + assert asr_transcript_log_enabled() is True + + +def test_asr_transcript_log_disabled_in_production_info( + monkeypatch, +) -> None: + from app.core.config import settings + + monkeypatch.setattr(settings, "app_environment", "production", raising=False) + monkeypatch.setattr(settings, "log_level", "INFO", raising=False) + assert asr_transcript_log_enabled() is False + + +def test_log_asr_transcript_result_emits_info( + monkeypatch, + caplog, +) -> None: + import logging + + from app.core.config import settings + + caplog.set_level(logging.INFO) + monkeypatch.setattr(settings, "app_environment", "development", raising=False) + + class _Logger: + def info(self, msg, *args): + caplog.records.append( + logging.LogRecord( + name="test", + level=logging.INFO, + pathname="", + lineno=0, + msg=msg.format(*args) if args else msg, + args=(), + exc_info=None, + ) + ) + + log_asr_transcript_result( + _Logger(), + text="你好世界", + conversation_id="c1", + segment_index=0, + ) + messages = [r.getMessage() for r in caplog.records] + assert any("ASR 识别结果" in m and "你好世界" in m for m in messages) diff --git a/api/tests/test_default_toml_legacy_parity.py b/api/tests/test_default_toml_legacy_parity.py index 8445895..679e2bf 100644 --- a/api/tests/test_default_toml_legacy_parity.py +++ b/api/tests/test_default_toml_legacy_parity.py @@ -18,5 +18,6 @@ def test_default_toml_matches_legacy_settings_defaults() -> None: assert cfg.asr.provider == "tencent" assert cfg.asr.engine_type == "16k_zh_large" + assert cfg.asr.request_timeout_seconds == 60.0 assert cfg.misc.tencent_sms_template_param_count == 2 diff --git a/api/tests/test_infra_regressions.py b/api/tests/test_infra_regressions.py index 3a38b8d..476a429 100644 --- a/api/tests/test_infra_regressions.py +++ b/api/tests/test_infra_regressions.py @@ -1,7 +1,4 @@ -import asyncio -import sys -from types import ModuleType, SimpleNamespace - +import httpx import pytest from app.adapters.asr.tencent_asr import TencentASRProvider @@ -58,46 +55,52 @@ def test_post_commit_reuses_singleton_redis_client( @pytest.mark.asyncio -async def test_tencent_asr_transcribe_uses_to_thread( +async def test_tencent_asr_flash_transcribe( monkeypatch: pytest.MonkeyPatch, ) -> None: - to_thread_calls: list[tuple[object, tuple[object, ...]]] = [] + captured: dict[str, object] = {} - class FakeRequest: - EngSerViceType: str | None = None - SourceType: int | None = None - VoiceFormat: str | None = None - Data: str | None = None - DataLen: int | None = None + class FakeAsyncClient: + async def __aenter__(self): + return self - class FakeClient: - def SentenceRecognition(self, req: FakeRequest) -> SimpleNamespace: - return SimpleNamespace(Result=" 你好,世界 ") + async def __aexit__(self, *args): + return None - async def fake_to_thread(fn, *args): - to_thread_calls.append((fn, args)) - return fn(*args) + async def post(self, url, *, headers=None, content=None, timeout=None): + captured["url"] = url + captured["headers"] = headers + captured["content"] = content + captured["timeout"] = timeout + return httpx.Response( + 200, + json={ + "code": 0, + "request_id": "req-1", + "flash_result": [{"channel_id": 0, "text": " 你好,世界 "}], + }, + ) - models_module = ModuleType("tencentcloud.asr.v20190614.models") - models_module.SentenceRecognitionRequest = FakeRequest - package_module = ModuleType("tencentcloud.asr.v20190614") - package_module.models = models_module - - monkeypatch.setitem(sys.modules, "tencentcloud.asr.v20190614", package_module) - monkeypatch.setattr(asyncio, "to_thread", fake_to_thread) - - provider = TencentASRProvider("sid", "skey", engine_type="16k_zh_large") - client = FakeClient() - monkeypatch.setattr(provider, "_get_client", lambda: client) + monkeypatch.setattr(httpx, "AsyncClient", FakeAsyncClient) + provider = TencentASRProvider( + "sid", + "skey", + "1259220000", + engine_type="16k_zh_large", + ) text = await provider.transcribe(b"fake-audio", format="m4a") assert text == "你好,世界" - assert len(to_thread_calls) == 1 - fn, args = to_thread_calls[0] - assert getattr(fn, "__self__", None) is client - assert getattr(fn, "__name__", "") == "SentenceRecognition" - request = args[0] - assert request.EngSerViceType == "16k_zh_large" - assert request.VoiceFormat == "m4a" - assert request.DataLen == len(b"fake-audio") + assert captured["content"] == b"fake-audio" + assert captured["timeout"] == 60.0 + url = str(captured["url"]) + assert "engine_type=16k_zh_large" in url + assert "voice_format=m4a" in url + assert "/asr/flash/v1/1259220000?" in url + assert "secretid=sid" in url + headers = captured["headers"] + assert headers is not None + assert headers["Authorization"] + assert headers["Content-Type"] == "application/octet-stream" + assert headers["Content-Length"] == str(len(b"fake-audio")) diff --git a/api/tests/test_settings_allowlist.py b/api/tests/test_settings_allowlist.py index e72ae8f..13df18d 100644 --- a/api/tests/test_settings_allowlist.py +++ b/api/tests/test_settings_allowlist.py @@ -2,7 +2,7 @@ from app.core.config import Settings -ALLOWLIST_MAX_FIELDS = 22 +ALLOWLIST_MAX_FIELDS = 23 EXPECTED_PREFIXES = ( "database_", @@ -13,6 +13,7 @@ EXPECTED_PREFIXES = ( "deepseek_", "zhipu_", "tencent_secret_", + "tencent_app_", "wechat_pay_", "alipay_", "liblib_", diff --git a/api/uv.lock b/api/uv.lock index db076b2..58f076f 100644 --- a/api/uv.lock +++ b/api/uv.lock @@ -102,7 +102,6 @@ dependencies = [ { name = "psycopg", extra = ["binary"] }, { name = "pydantic" }, { name = "pydantic-settings" }, - { name = "pydub" }, { name = "pyjwt" }, { name = "python-alipay-sdk" }, { name = "redis" }, @@ -152,7 +151,6 @@ requires-dist = [ { name = "psycopg", extras = ["binary"], specifier = ">=3.2.0" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "pydantic-settings", specifier = ">=2.13.1" }, - { name = "pydub", specifier = ">=0.25.1" }, { name = "pyjwt", specifier = ">=2.12.0" }, { name = "python-alipay-sdk", specifier = ">=3.4.0" }, { name = "redis", specifier = ">=6.4.0" }, @@ -2043,15 +2041,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/4b/ccc026168948fec4f7555b9164c724cf4125eac006e176541483d2c959be/pydantic_settings-2.13.1-py3-none-any.whl", hash = "sha256:d56fd801823dbeae7f0975e1f8c8e25c258eb75d278ea7abb5d9cebb01b56237", size = 58929, upload-time = "2026-02-19T13:45:06.034Z" }, ] -[[package]] -name = "pydub" -version = "0.25.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/fe/9a/e6bca0eed82db26562c73b5076539a4a08d3cffd19c3cc5913a3e61145fd/pydub-0.25.1.tar.gz", hash = "sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f", size = 38326, upload-time = "2021-03-10T02:09:54.659Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a6/53/d78dc063216e62fc55f6b2eebb447f6a4b0a59f55c8406376f76bf959b08/pydub-0.25.1-py2.py3-none-any.whl", hash = "sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6", size = 32327, upload-time = "2021-03-10T02:09:53.503Z" }, -] - [[package]] name = "pydyf" version = "0.12.1" diff --git a/app-expo/src/features/voice/recorder.ts b/app-expo/src/features/voice/recorder.ts index 34ba7a4..f26e63c 100644 --- a/app-expo/src/features/voice/recorder.ts +++ b/app-expo/src/features/voice/recorder.ts @@ -18,7 +18,7 @@ type StatusListener = (status: RecorderStatus) => void; type RecordingCompleteListener = (uri: string, durationMs: number) => void; /** - * Tencent SentenceRecognition uses `EngSerViceType=16k_zh_large` and + * Tencent ASR flash (`16k_zh_large`); client sends ~15s m4a segments per upload. * `VoiceFormat=m4a`, so record speech in that shape directly instead of * relying on Expo's default 44.1 kHz stereo preset. */