# app/services/baidu_speech.py

from __future__ import annotations

from threading import Lock
from typing import Any

from aip import AipSpeech

from app.config import Settings, settings as _default_settings


class BaiduSpeechNotConfiguredError(RuntimeError):
    """Raised when an API is called without Baidu speech credentials configured.

    The required settings are BAIDU_SPEECH_APP_ID, BAIDU_SPEECH_API_KEY and
    BAIDU_SPEECH_SECRET_KEY.
    """


class BaiduSpeechService:
    """Baidu short-speech recognition (asr) and online speech synthesis.

    Thin wrapper around ``AipSpeech`` from the ``baidu-aip`` package. The
    underlying client is created on first use and reused afterwards.
    """

    def __init__(self, app_settings: Settings | None = None) -> None:
        """Bind the service to ``app_settings``, or to the default settings.

        Args:
            app_settings: Settings object to read the Baidu speech options
                from. When omitted (or falsy) the module-level default
                settings are used.
        """
        self._s = app_settings or _default_settings
        # Built lazily by `_client_or_raise`; stays None until first use.
        self._client: AipSpeech | None = None
        # Guards the lazy construction of `_client`.
        self._lock = Lock()

    @property
    def configured(self) -> bool:
        """Whether the bound settings report Baidu speech as configured."""
        return self._s.baidu_speech_configured

    def _build_client(self) -> AipSpeech:
        """Create an ``AipSpeech`` client and apply any configured timeouts."""
        cfg = self._s
        new_client = AipSpeech(
            cfg.baidu_speech_app_id,
            cfg.baidu_speech_api_key,
            cfg.baidu_speech_secret_key,
        )
        # A timeout of None means "leave the client's own default in place".
        connect_ms = cfg.baidu_speech_connection_timeout_ms
        if connect_ms is not None:
            new_client.setConnectionTimeoutInMillis(connect_ms)
        socket_ms = cfg.baidu_speech_socket_timeout_ms
        if socket_ms is not None:
            new_client.setSocketTimeoutInMillis(socket_ms)
        return new_client

    def _client_or_raise(self) -> AipSpeech:
        """Return the shared client, creating it on first call.

        Raises:
            BaiduSpeechNotConfiguredError: If the settings report that Baidu
                speech is not configured.
        """
        if not self.configured:
            raise BaiduSpeechNotConfiguredError(
                "百度语音未配置：请设置 BAIDU_SPEECH_APP_ID、BAIDU_SPEECH_API_KEY、"
                "BAIDU_SPEECH_SECRET_KEY"
            )
        with self._lock:
            cached = self._client
            if cached is None:
                # Assign only after the client is fully set up, so a failure
                # during construction leaves `_client` unset.
                cached = self._client = self._build_client()
            return cached

    def asr(
        self,
        speech: bytes | None = None,
        format: str = "pcm",
        rate: int = 16000,
        options: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Run short-speech recognition and return Baidu's JSON response.

        The response contains fields such as ``err_no`` and ``result``.

        ``dev_pid`` is always taken from the settings (the Mandarin model),
        overriding any ``dev_pid`` in ``options``, so that a missing value
        does not fall back to the server default and skew results towards
        English.

        Args:
            speech: Audio payload passed to the client.
            format: Audio format passed to the client.
            rate: Sample rate passed to the client.
            options: Extra request options; copied, never mutated.

        Raises:
            BaiduSpeechNotConfiguredError: If Baidu speech is not configured.
        """
        # Copy the caller's options and force dev_pid from configuration.
        request_options: dict[str, Any] = dict(
            options or {},
            dev_pid=int(self._s.baidu_speech_asr_dev_pid),
        )
        client = self._client_or_raise()
        return client.asr(speech, format, rate, request_options)

    def synthesis(
        self,
        text: str,
        lang: str = "zh",
        ctp: int = 1,
        options: dict[str, Any] | None = None,
    ) -> bytes | dict[str, Any]:
        """Run online speech synthesis.

        Returns:
            The audio bytes on success, or a dict with error information on
            failure.

        Raises:
            BaiduSpeechNotConfiguredError: If Baidu speech is not configured.
        """
        client = self._client_or_raise()
        return client.synthesis(text, lang, ctp, options)
-												feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor

											
										
										
											2026-04-21 18:33:54 +08:00
+								from __future__ import annotations
 								from threading import Lock
 								from typing import Any
 								from aip import AipSpeech
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路
- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理
- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总
- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位
- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过
- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								from app.config import Settings, settings as _default_settings
-												feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor

											
										
										
											2026-04-21 18:33:54 +08:00
 								class BaiduSpeechNotConfiguredError(RuntimeError):
 								    """未配置 BAIDU_SPEECH_APP_ID / API_KEY / SECRET_KEY 时调用接口会抛出。"""
 								class BaiduSpeechService:
 								    """百度短语音识别（asr）与在线语音合成（synthesis），基于 `baidu-aip` 的 `AipSpeech`。"""
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路
- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理
- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总
- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位
- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过
- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								    def __init__(self, app_settings: Settings | None = None) -> None:
 								        self._s = app_settings or _default_settings
-												feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor

											
										
										
											2026-04-21 18:33:54 +08:00
+								        self._client: AipSpeech | None = None
 								        self._lock = Lock()
 								    @property
 								    def configured(self) -> bool:
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路
- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理
- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总
- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位
- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过
- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								        return self._s.baidu_speech_configured
-												feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor

											
										
										
											2026-04-21 18:33:54 +08:00
 								    def _client_or_raise(self) -> AipSpeech:
 								        if not self.configured:
 								            raise BaiduSpeechNotConfiguredError(
 								                "百度语音未配置：请设置 BAIDU_SPEECH_APP_ID、BAIDU_SPEECH_API_KEY、"
 								                "BAIDU_SPEECH_SECRET_KEY"
 								            )
 								        with self._lock:
 								            if self._client is None:
 								                client = AipSpeech(
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路
- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理
- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总
- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位
- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过
- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								                    self._s.baidu_speech_app_id,
 								                    self._s.baidu_speech_api_key,
 								                    self._s.baidu_speech_secret_key,
-												feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor

											
										
										
											2026-04-21 18:33:54 +08:00
+								                )
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路
- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理
- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总
- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位
- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过
- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								                if self._s.baidu_speech_connection_timeout_ms is not None:
-												feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor

											
										
										
											2026-04-21 18:33:54 +08:00
+								                    client.setConnectionTimeoutInMillis(
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路
- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理
- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总
- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位
- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过
- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								                        self._s.baidu_speech_connection_timeout_ms
-												feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor

											
										
										
											2026-04-21 18:33:54 +08:00
+								                    )
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路
- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理
- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总
- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位
- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过
- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								                if self._s.baidu_speech_socket_timeout_ms is not None:
 								                    client.setSocketTimeoutInMillis(self._s.baidu_speech_socket_timeout_ms)
-												feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor

											
										
										
											2026-04-21 18:33:54 +08:00
+								                self._client = client
 								            return self._client
 								    def asr(
 								        self,
 								        speech: bytes | None = None,
 								        format: str = "pcm",
 								        rate: int = 16000,
 								        options: dict[str, Any] | None = None,
 								    ) -> dict[str, Any]:
-												feat: 语音确认、联调与运维增强

- 语音：序数解析（第一个/第二个等）、解析失败计数与 API detail.retry_remaining；
  百度 ASR 固定 dev_pid 为普通话；SurgeryPipelineError 支持 extra 并入 HTTP detail。
- Demo：demo 路由与假 RTSP、客户端 index 与 README；BackendResolver 与配置调整。
- 可观测：消耗 TSV 日志、语音文件日志、终端 Markdown 辅助；相关测试与依赖更新。
- 注意：.env 仍被 gitignore，本地密钥不会进入本提交。

Made-with: Cursor

											
										
										
											2026-04-23 14:24:20 +08:00
+								        """短语音识别。返回百度 JSON（含 `err_no`、`result` 等）。
 								        固定使用普通话模型（`dev_pid` 来自配置），避免未传参时误用服务端默认导致偏英语等结果。
 								        """
 								        merged: dict[str, Any] = dict(options or {})
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路
- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理
- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总
- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位
- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过
- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								        merged["dev_pid"] = int(self._s.baidu_speech_asr_dev_pid)
-												feat: 语音确认、联调与运维增强

- 语音：序数解析（第一个/第二个等）、解析失败计数与 API detail.retry_remaining；
  百度 ASR 固定 dev_pid 为普通话；SurgeryPipelineError 支持 extra 并入 HTTP detail。
- Demo：demo 路由与假 RTSP、客户端 index 与 README；BackendResolver 与配置调整。
- 可观测：消耗 TSV 日志、语音文件日志、终端 Markdown 辅助；相关测试与依赖更新。
- 注意：.env 仍被 gitignore，本地密钥不会进入本提交。

Made-with: Cursor

											
										
										
											2026-04-23 14:24:20 +08:00
+								        return self._client_or_raise().asr(speech, format, rate, merged)
-												feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor

											
										
										
											2026-04-21 18:33:54 +08:00
 								    def synthesis(
 								        self,
 								        text: str,
 								        lang: str = "zh",
 								        ctp: int = 1,
 								        options: dict[str, Any] | None = None,
 								    ) -> bytes | dict[str, Any]:
 								        """在线语音合成。成功为音频二进制；失败为错误信息 dict。"""
 								        return self._client_or_raise().synthesis(text, lang, ctp, options)