Files
operating-room-monitor-server/app/services/baidu_speech.py
2026-04-28 10:41:48 +08:00

98 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
from threading import Lock
from typing import Any
from aip import AipSpeech
from app.config import Settings, settings as _default_settings
from app.services.audio_wav import pcm_s16le_to_wav_bytes
class BaiduSpeechNotConfiguredError(RuntimeError):
    """Raised when a Baidu speech API is called without credentials.

    Signals that `BAIDU_APP_ID` / `BAIDU_API_KEY` / `BAIDU_SECRET_KEY`
    have not been configured.
    """
class BaiduSpeechService:
    """Baidu short-speech recognition (ASR) and online speech synthesis.

    Thin wrapper over the `AipSpeech` client from `baidu-aip`. The client
    is created on first use from the service's settings and reused for
    every later call.
    """

    def __init__(self, app_settings: Settings | None = None) -> None:
        """Store the settings to use; fall back to the module defaults.

        Args:
            app_settings: Settings object to read Baidu credentials and
                options from. When omitted (or falsy) the module-level
                default settings are used.
        """
        self._s = app_settings or _default_settings
        # Built lazily by `_client_or_raise`; `None` until first needed.
        self._client: AipSpeech | None = None
        # Serialises creation of the shared client.
        self._lock = Lock()

    @property
    def configured(self) -> bool:
        """Whether the settings report Baidu speech as configured."""
        return self._s.baidu_speech_configured

    def _client_or_raise(self) -> AipSpeech:
        """Return the shared `AipSpeech` client, building it if necessary.

        Raises:
            BaiduSpeechNotConfiguredError: If `configured` is false.
        """
        if not self.configured:
            raise BaiduSpeechNotConfiguredError(
                "百度语音未配置:请设置 BAIDU_APP_ID、BAIDU_API_KEY、BAIDU_SECRET_KEY。"
            )
        with self._lock:
            existing = self._client
            if existing is not None:
                return existing

            cfg = self._s
            fresh = AipSpeech(
                cfg.baidu_speech_app_id,
                cfg.baidu_speech_api_key,
                cfg.baidu_speech_secret_key,
            )
            # Timeouts are optional: only override the SDK values when set.
            connect_ms = cfg.baidu_speech_connection_timeout_ms
            if connect_ms is not None:
                fresh.setConnectionTimeoutInMillis(connect_ms)
            socket_ms = cfg.baidu_speech_socket_timeout_ms
            if socket_ms is not None:
                fresh.setSocketTimeoutInMillis(socket_ms)

            self._client = fresh
            return fresh

    def asr(
        self,
        speech: bytes | None = None,
        format: str = "pcm",
        rate: int = 16000,
        options: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Run short-speech recognition and return Baidu's JSON response.

        The response carries fields such as `err_no` and `result`.

        `dev_pid` is always taken from the settings, replacing any value
        in `options`, so that a missing parameter cannot fall back to a
        server-side default that skews results (e.g. towards English).

        Args:
            speech: Raw audio bytes to recognise.
            format: Audio container/encoding name passed to the SDK.
            rate: Sample rate in Hz.
            options: Extra request fields; copied, never mutated.

        Raises:
            BaiduSpeechNotConfiguredError: If the service is not configured.
        """
        request_options: dict[str, Any] = dict(options) if options else {}
        request_options.update(dev_pid=int(self._s.baidu_speech_asr_dev_pid))
        client = self._client_or_raise()
        return client.asr(speech, format, rate, request_options)

    def asr_16k_mono_pcm_or_wav_fallback(
        self,
        pcm_s16le: bytes,
        *,
        rate: int = 16000,
        options: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Recognise raw PCM; on `err_no` 3301 retry once wrapped as WAV.

        Error 3301 is Baidu's "speech quality" error. In some environments
        the PCM and WAV paths behave differently on borderline samples, so
        a single WAV retry can improve the success rate.

        The first response is returned unchanged when it is not a dict,
        when its `err_no` is not 3301, when the buffer is shorter than
        1000 bytes, when WAV wrapping fails, or when the retry does not
        yield a dict. Otherwise the retry's response is returned.

        Args:
            pcm_s16le: PCM sample bytes to recognise.
            rate: Sample rate in Hz, used for both attempts.
            options: Extra request fields forwarded to `asr`.
        """
        first = self.asr(pcm_s16le, "pcm", rate, options)

        should_retry = (
            isinstance(first, dict)
            and first.get("err_no") == 3301
            and len(pcm_s16le) >= 1000
        )
        if not should_retry:
            return first

        try:
            wav_payload = pcm_s16le_to_wav_bytes(pcm_s16le, sample_rate=rate)
        except Exception:
            # Best effort: if wrapping fails, report the original error.
            return first

        second = self.asr(wav_payload, "wav", rate, options)
        if isinstance(second, dict):
            return second
        return first

    def synthesis(
        self,
        text: str,
        lang: str = "zh",
        ctp: int = 1,
        options: dict[str, Any] | None = None,
    ) -> bytes | dict[str, Any]:
        """Synthesise speech online.

        Returns:
            Audio bytes on success, or an error-information dict on failure.

        Raises:
            BaiduSpeechNotConfiguredError: If the service is not configured.
        """
        client = self._client_or_raise()
        return client.synthesis(text, lang, ctp, options)