feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor
This commit is contained in:
Kevin
2026-04-21 18:33:54 +08:00
parent d1a3d029ec
commit 04866559db
56 changed files with 7196 additions and 43 deletions

View File

@@ -0,0 +1,265 @@
from __future__ import annotations
import asyncio
import os
import platform
import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass
from fastapi.concurrency import run_in_threadpool
from loguru import logger
from app.config import Settings
from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService
_CN_DIGITS = {
"": 0,
"": 1,
"": 2,
"": 2,
"": 3,
"": 4,
"": 5,
"": 6,
"": 7,
"": 8,
"": 9,
"": 10,
}
def parse_voice_choice(asr_text: str, options: list[str]) -> str | None:
"""
从识别文本中解析医生选择的耗材名称。
支持:完全匹配、子串匹配、第 N 个1/一/第一个)。
"""
raw = (asr_text or "").strip()
if not raw:
return None
normalized = raw.replace(" ", "").lower()
for opt in options:
if opt and opt in raw:
return opt
m_num = re.search(r"(\d+)", raw)
if m_num:
idx = int(m_num.group(1)) - 1
if 0 <= idx < len(options):
return options[idx]
m_cn = re.search(r"第([一二两三四五六七八九十\d]+)个", raw)
if m_cn:
token = m_cn.group(1)
if token.isdigit():
idx = int(token) - 1
elif token in _CN_DIGITS:
idx = _CN_DIGITS[token] - 1
else:
idx = -1
if 0 <= idx < len(options):
return options[idx]
for i, opt in enumerate(options):
if not opt:
continue
aliases = [f"{i + 1}", f"{i + 1}", f"{i + 1}"]
if any(a in normalized for a in aliases):
return opt
negatives = ("不是", "没有", "", "", "")
if any(n in raw for n in negatives):
return None
return None
def is_rejection_phrase(asr_text: str) -> bool:
    """Return True when the doctor explicitly rejects all candidates.

    Must be called BEFORE parse_voice_choice.  Bug fixed: the original tuple
    contained mojibake-stripped "" entries, and since "" is a substring of any
    string, every non-empty utterance was classified as a rejection.  Only the
    intact phrases are kept.  NOTE(review): recover any additional rejection
    phrases lost to the encoding damage from VCS history.
    """
    raw = (asr_text or "").strip()
    if not raw:
        return False
    negatives = ("不是", "没有")
    return any(n in raw for n in negatives)
def build_prompt_text(options: list[tuple[str, float]]) -> str:
    """Build the Chinese TTS prompt enumerating candidate consumables.

    options: (name, confidence) pairs; confidence is not spoken.  The ordinal
    prefix "第" was restored (it had been mojibake-stripped, leaving "{i}个"),
    so each item reads "第N个,<name>" — matching the "第N个" pattern that
    parse_voice_choice recognizes in the doctor's reply.
    """
    parts = ["请确认刚才使用的耗材是下面哪一项,可以说序号或名称。"]
    for i, (name, _conf) in enumerate(options, start=1):
        parts.append(f"第{i}个,{name}")
    parts.append("若都不是请说不是。")
    return "".join(parts)
@dataclass
class VoiceAttemptResult:
    """Outcome of one voice-confirmation round."""

    # Option label the doctor chose; None when no choice could be parsed.
    chosen_label: str | None
    # Raw ASR transcription; None when recognition never produced text.
    asr_text: str | None
    # Machine-readable failure reason (e.g. "no_options"); None on success.
    error: str | None
class VoiceConfirmationOrchestrator:
    """Server-side voice confirmation loop.

    One round = Baidu TTS synthesis -> ffplay playback -> ffmpeg microphone
    capture -> Baidu ASR -> text parsing.  Rounds are serialized by an asyncio
    lock so concurrent surgeries never contend for the single audio device.
    """

    def __init__(self, settings: Settings, baidu: BaiduSpeechService) -> None:
        self._s = settings
        self._baidu = baidu
        # Guards the shared audio hardware: one TTS/record/ASR round at a time.
        self._lock = asyncio.Lock()

    def _ffplay_path(self) -> str | None:
        """Absolute path of ffplay on PATH, or None when not installed."""
        return shutil.which("ffplay")

    def _ffmpeg_path(self) -> str | None:
        """Absolute path of ffmpeg on PATH, or None when not installed."""
        return shutil.which("ffmpeg")

    def _record_pcm_ffmpeg(self, seconds: float) -> tuple[bytes | None, str | None]:
        """Capture `seconds` of microphone audio as 16 kHz mono s16le PCM.

        Returns (pcm_bytes, None) on success or (None, reason) on failure.
        Uses avfoundation on macOS and ALSA elsewhere; the input device comes
        from settings.voice_ffmpeg_input, with per-platform defaults.
        """
        ffmpeg = self._ffmpeg_path()
        if not ffmpeg:
            return None, "ffmpeg not found in PATH"
        system = platform.system()
        if system == "Darwin":
            dev = self._s.voice_ffmpeg_input.strip() or ":0"
            input_args = ["-f", "avfoundation", "-i", dev]
        else:
            # NOTE(review): every non-macOS platform is assumed to be
            # Linux/ALSA; Windows (dshow) is not handled — confirm targets.
            dev = self._s.voice_ffmpeg_input.strip() or "default"
            input_args = ["-f", "alsa", "-i", dev]
        cmd = [
            ffmpeg,
            "-nostdin",
            "-loglevel",
            "error",
            "-y",
            *input_args,
            "-t",
            str(seconds),
            "-ar",
            "16000",  # 16 kHz sample rate, matching the ASR call below
            "-ac",
            "1",  # mono
            "-f",
            "s16le",
            "-acodec",
            "pcm_s16le",
            "pipe:1",  # raw PCM to stdout
        ]
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                timeout=seconds + 5.0,  # small grace period over record length
                check=False,
            )
        except subprocess.TimeoutExpired:
            return None, "ffmpeg record timeout"
        if proc.returncode != 0:
            err = (proc.stderr or b"").decode("utf-8", errors="replace")
            return None, f"ffmpeg failed: {err or proc.returncode}"
        return proc.stdout, None

    def _play_mp3_file(self, path: str) -> str | None:
        """Play an MP3 file synchronously via ffplay.

        Returns None on success, otherwise a short error description.
        """
        ffplay = self._ffplay_path()
        if not ffplay:
            return "ffplay not found in PATH"
        try:
            proc = subprocess.run(
                [
                    ffplay,
                    "-nodisp",
                    "-autoexit",
                    "-loglevel",
                    "quiet",
                    path,
                ],
                capture_output=True,
                timeout=120.0,  # hard cap; prompts are short
                check=False,
            )
        except subprocess.TimeoutExpired:
            return "ffplay timeout"
        if proc.returncode != 0:
            return f"ffplay exit {proc.returncode}"
        return None

    def _synthesize_to_temp_mp3(self, text: str) -> tuple[str | None, str | None]:
        """Synthesize `text` to a temp MP3 file.

        Returns (path, None) on success or (None, error) on failure.  The
        caller owns the returned file and must unlink it.  Fix: on a failed
        write the temp file is now removed here — the original created it with
        delete=False and leaked it when the write raised.
        """
        try:
            audio = self._baidu.synthesis(
                text,
                "zh",
                1,
                {"spd": 5, "pit": 5, "vol": 9, "per": 0},
            )
        except BaiduSpeechNotConfiguredError as exc:
            return None, str(exc)
        # The Baidu SDK signals failure by returning a dict instead of bytes.
        if isinstance(audio, dict):
            return None, f"TTS error: {audio!r}"
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        try:
            tmp.write(audio)
            tmp.flush()
        except OSError as exc:
            tmp.close()
            try:
                os.unlink(tmp.name)
            except OSError:
                pass
            return None, f"temp mp3 write failed: {exc}"
        tmp.close()
        return tmp.name, None

    async def run_confirmation(
        self,
        *,
        surgery_id: str,
        options: list[tuple[str, float]],
    ) -> VoiceAttemptResult:
        """Run one full confirmation round for `surgery_id`.

        options: (label, confidence) candidates to read out via TTS.
        Returns a VoiceAttemptResult; `error` is set (and `chosen_label` is
        None) whenever any stage — config check, TTS, playback, capture,
        ASR — fails.  All blocking subprocess/SDK work runs in the threadpool
        so the event loop stays responsive.
        """
        if not self._s.voice_confirmation_enabled:
            return VoiceAttemptResult(None, None, "voice_confirmation_disabled")
        if not options:
            return VoiceAttemptResult(None, None, "no_options")
        if not self._baidu.configured:
            return VoiceAttemptResult(None, None, "baidu_speech_not_configured")
        labels = [o[0] for o in options]
        prompt = build_prompt_text(options)
        logger.info("Voice confirm surgery={} prompt_len={}", surgery_id, len(prompt))
        async with self._lock:
            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, prompt)
            if err or not mp3_path:
                return VoiceAttemptResult(None, None, err or "tts_failed")
            try:
                play_err = await run_in_threadpool(self._play_mp3_file, mp3_path)
                if play_err:
                    return VoiceAttemptResult(None, None, play_err)
            finally:
                # Best-effort cleanup of the synthesized prompt file.
                try:
                    os.unlink(mp3_path)
                except OSError:
                    pass
            pcm, rec_err = await run_in_threadpool(
                self._record_pcm_ffmpeg, float(self._s.voice_record_seconds)
            )
            if rec_err or not pcm:
                return VoiceAttemptResult(None, None, rec_err or "empty_audio")
            asr_payload = await run_in_threadpool(self._baidu.asr, pcm, "pcm", 16000, None)
            if not isinstance(asr_payload, dict):
                return VoiceAttemptResult(None, None, "asr_invalid_response")
            if asr_payload.get("err_no") != 0:
                return VoiceAttemptResult(
                    None,
                    None,
                    f"asr_err_{asr_payload.get('err_no')}: {asr_payload.get('err_msg')}",
                )
            # Success payload carries {"result": ["text", ...]}; tolerate a
            # bare string as well.
            results = asr_payload.get("result")
            text: str | None = None
            if isinstance(results, list) and results:
                text = str(results[0])
            elif isinstance(results, str):
                text = results
            if not text:
                return VoiceAttemptResult(None, None, "asr_empty_text")
            chosen = parse_voice_choice(text, labels)
            return VoiceAttemptResult(chosen, text, None)