feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor
This commit is contained in:
Kevin
2026-04-21 18:33:54 +08:00
parent d1a3d029ec
commit 04866559db
56 changed files with 7196 additions and 43 deletions

View File

@@ -0,0 +1,265 @@
from __future__ import annotations
import asyncio
import os
import platform
import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass
from fastapi.concurrency import run_in_threadpool
from loguru import logger
from app.config import Settings
from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService
_CN_DIGITS = {
"": 0,
"": 1,
"": 2,
"": 2,
"": 3,
"": 4,
"": 5,
"": 6,
"": 7,
"": 8,
"": 9,
"": 10,
}
def parse_voice_choice(asr_text: str, options: list[str]) -> str | None:
"""
从识别文本中解析医生选择的耗材名称。
支持:完全匹配、子串匹配、第 N 个1/一/第一个)。
"""
raw = (asr_text or "").strip()
if not raw:
return None
normalized = raw.replace(" ", "").lower()
for opt in options:
if opt and opt in raw:
return opt
m_num = re.search(r"(\d+)", raw)
if m_num:
idx = int(m_num.group(1)) - 1
if 0 <= idx < len(options):
return options[idx]
m_cn = re.search(r"第([一二两三四五六七八九十\d]+)个", raw)
if m_cn:
token = m_cn.group(1)
if token.isdigit():
idx = int(token) - 1
elif token in _CN_DIGITS:
idx = _CN_DIGITS[token] - 1
else:
idx = -1
if 0 <= idx < len(options):
return options[idx]
for i, opt in enumerate(options):
if not opt:
continue
aliases = [f"{i + 1}", f"{i + 1}", f"{i + 1}"]
if any(a in normalized for a in aliases):
return opt
negatives = ("不是", "没有", "", "", "")
if any(n in raw for n in negatives):
return None
return None
def is_rejection_phrase(asr_text: str) -> bool:
    """Return True when the doctor explicitly rejects all candidates.

    Must be called BEFORE parse_voice_choice.  Bug fixed: the original tuple
    contained mojibake-stripped "" entries, and since "" is a substring of any
    string, every non-empty utterance was classified as a rejection.  Only the
    intact phrases are kept.  NOTE(review): recover any additional rejection
    phrases lost to the encoding damage from VCS history.
    """
    raw = (asr_text or "").strip()
    if not raw:
        return False
    negatives = ("不是", "没有")
    return any(n in raw for n in negatives)
def build_prompt_text(options: list[tuple[str, float]]) -> str:
    """Build the Chinese TTS prompt enumerating candidate consumables.

    options: (name, confidence) pairs; confidence is not spoken.  The ordinal
    prefix "第" was restored (it had been mojibake-stripped, leaving "{i}个"),
    so each item reads "第N个,<name>" — matching the "第N个" pattern that
    parse_voice_choice recognizes in the doctor's reply.
    """
    parts = ["请确认刚才使用的耗材是下面哪一项,可以说序号或名称。"]
    for i, (name, _conf) in enumerate(options, start=1):
        parts.append(f"第{i}个,{name}")
    parts.append("若都不是请说不是。")
    return "".join(parts)
@dataclass
class VoiceAttemptResult:
    """Outcome of one voice-confirmation round."""

    # Option label the doctor chose; None when no choice could be parsed.
    chosen_label: str | None
    # Raw ASR transcription; None when recognition never produced text.
    asr_text: str | None
    # Machine-readable failure reason (e.g. "no_options"); None on success.
    error: str | None
class VoiceConfirmationOrchestrator:
    """Server-side voice confirmation loop.

    One round = Baidu TTS synthesis -> ffplay playback -> ffmpeg microphone
    capture -> Baidu ASR -> text parsing.  Rounds are serialized by an asyncio
    lock so concurrent surgeries never contend for the single audio device.
    """

    def __init__(self, settings: Settings, baidu: BaiduSpeechService) -> None:
        self._s = settings
        self._baidu = baidu
        # Guards the shared audio hardware: one TTS/record/ASR round at a time.
        self._lock = asyncio.Lock()

    def _ffplay_path(self) -> str | None:
        """Absolute path of ffplay on PATH, or None when not installed."""
        return shutil.which("ffplay")

    def _ffmpeg_path(self) -> str | None:
        """Absolute path of ffmpeg on PATH, or None when not installed."""
        return shutil.which("ffmpeg")

    def _record_pcm_ffmpeg(self, seconds: float) -> tuple[bytes | None, str | None]:
        """Capture `seconds` of microphone audio as 16 kHz mono s16le PCM.

        Returns (pcm_bytes, None) on success or (None, reason) on failure.
        Uses avfoundation on macOS and ALSA elsewhere; the input device comes
        from settings.voice_ffmpeg_input, with per-platform defaults.
        """
        ffmpeg = self._ffmpeg_path()
        if not ffmpeg:
            return None, "ffmpeg not found in PATH"
        system = platform.system()
        if system == "Darwin":
            dev = self._s.voice_ffmpeg_input.strip() or ":0"
            input_args = ["-f", "avfoundation", "-i", dev]
        else:
            # NOTE(review): every non-macOS platform is assumed to be
            # Linux/ALSA; Windows (dshow) is not handled — confirm targets.
            dev = self._s.voice_ffmpeg_input.strip() or "default"
            input_args = ["-f", "alsa", "-i", dev]
        cmd = [
            ffmpeg,
            "-nostdin",
            "-loglevel",
            "error",
            "-y",
            *input_args,
            "-t",
            str(seconds),
            "-ar",
            "16000",  # 16 kHz sample rate, matching the ASR call below
            "-ac",
            "1",  # mono
            "-f",
            "s16le",
            "-acodec",
            "pcm_s16le",
            "pipe:1",  # raw PCM to stdout
        ]
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                timeout=seconds + 5.0,  # small grace period over record length
                check=False,
            )
        except subprocess.TimeoutExpired:
            return None, "ffmpeg record timeout"
        if proc.returncode != 0:
            err = (proc.stderr or b"").decode("utf-8", errors="replace")
            return None, f"ffmpeg failed: {err or proc.returncode}"
        return proc.stdout, None

    def _play_mp3_file(self, path: str) -> str | None:
        """Play an MP3 file synchronously via ffplay.

        Returns None on success, otherwise a short error description.
        """
        ffplay = self._ffplay_path()
        if not ffplay:
            return "ffplay not found in PATH"
        try:
            proc = subprocess.run(
                [
                    ffplay,
                    "-nodisp",
                    "-autoexit",
                    "-loglevel",
                    "quiet",
                    path,
                ],
                capture_output=True,
                timeout=120.0,  # hard cap; prompts are short
                check=False,
            )
        except subprocess.TimeoutExpired:
            return "ffplay timeout"
        if proc.returncode != 0:
            return f"ffplay exit {proc.returncode}"
        return None

    def _synthesize_to_temp_mp3(self, text: str) -> tuple[str | None, str | None]:
        """Synthesize `text` to a temp MP3 file.

        Returns (path, None) on success or (None, error) on failure.  The
        caller owns the returned file and must unlink it.  Fix: on a failed
        write the temp file is now removed here — the original created it with
        delete=False and leaked it when the write raised.
        """
        try:
            audio = self._baidu.synthesis(
                text,
                "zh",
                1,
                {"spd": 5, "pit": 5, "vol": 9, "per": 0},
            )
        except BaiduSpeechNotConfiguredError as exc:
            return None, str(exc)
        # The Baidu SDK signals failure by returning a dict instead of bytes.
        if isinstance(audio, dict):
            return None, f"TTS error: {audio!r}"
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        try:
            tmp.write(audio)
            tmp.flush()
        except OSError as exc:
            tmp.close()
            try:
                os.unlink(tmp.name)
            except OSError:
                pass
            return None, f"temp mp3 write failed: {exc}"
        tmp.close()
        return tmp.name, None

    async def run_confirmation(
        self,
        *,
        surgery_id: str,
        options: list[tuple[str, float]],
    ) -> VoiceAttemptResult:
        """Run one full confirmation round for `surgery_id`.

        options: (label, confidence) candidates to read out via TTS.
        Returns a VoiceAttemptResult; `error` is set (and `chosen_label` is
        None) whenever any stage — config check, TTS, playback, capture,
        ASR — fails.  All blocking subprocess/SDK work runs in the threadpool
        so the event loop stays responsive.
        """
        if not self._s.voice_confirmation_enabled:
            return VoiceAttemptResult(None, None, "voice_confirmation_disabled")
        if not options:
            return VoiceAttemptResult(None, None, "no_options")
        if not self._baidu.configured:
            return VoiceAttemptResult(None, None, "baidu_speech_not_configured")
        labels = [o[0] for o in options]
        prompt = build_prompt_text(options)
        logger.info("Voice confirm surgery={} prompt_len={}", surgery_id, len(prompt))
        async with self._lock:
            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, prompt)
            if err or not mp3_path:
                return VoiceAttemptResult(None, None, err or "tts_failed")
            try:
                play_err = await run_in_threadpool(self._play_mp3_file, mp3_path)
                if play_err:
                    return VoiceAttemptResult(None, None, play_err)
            finally:
                # Best-effort cleanup of the synthesized prompt file.
                try:
                    os.unlink(mp3_path)
                except OSError:
                    pass
            pcm, rec_err = await run_in_threadpool(
                self._record_pcm_ffmpeg, float(self._s.voice_record_seconds)
            )
            if rec_err or not pcm:
                return VoiceAttemptResult(None, None, rec_err or "empty_audio")
            asr_payload = await run_in_threadpool(self._baidu.asr, pcm, "pcm", 16000, None)
            if not isinstance(asr_payload, dict):
                return VoiceAttemptResult(None, None, "asr_invalid_response")
            if asr_payload.get("err_no") != 0:
                return VoiceAttemptResult(
                    None,
                    None,
                    f"asr_err_{asr_payload.get('err_no')}: {asr_payload.get('err_msg')}",
                )
            # Success payload carries {"result": ["text", ...]}; tolerate a
            # bare string as well.
            results = asr_payload.get("result")
            text: str | None = None
            if isinstance(results, list) and results:
                text = str(results[0])
            elif isinstance(results, str):
                text = results
            if not text:
                return VoiceAttemptResult(None, None, "asr_empty_text")
            chosen = parse_voice_choice(text, labels)
            return VoiceAttemptResult(chosen, text, None)