Files
operating-room-monitor-server/app/services/voice_confirm.py
Kevin 132702aea9 refactor: 统一耗材视觉算法并扩展语音确认至全量候选清单
- 以 ConsumableVisionAlgorithmService 替代 consumable_classifier 与 tear_action;
  可选手部检测权重,未配置时全帧分类;时间窗众数与 Excel 白名单配置。
- 语音待确认:ASR 先匹配 pending topk,再匹配本台 candidate_consumables;
  记账 item_id 与 vision 一致使用 name_to_code。
- 更新 config、Compose、.env.example、依赖(pandas/openpyxl)与测试。

Made-with: Cursor
2026-04-22 16:31:12 +08:00

288 lines
8.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import asyncio
import os
import platform
import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass
from fastapi.concurrency import run_in_threadpool
from loguru import logger
from app.config import Settings
from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService
# Chinese numerals accepted inside a "第N个" ordinal, mapped to their value.
# Keys must be distinct: duplicate (e.g. blank) keys silently collapse into a
# single dict entry and disable every lookup.
# The keys for 1-10 mirror the character class used by the ordinal regex in
# parse_voice_choice ("一二两三四五六七八九十"); "二" and "两" both mean 2.
# NOTE(review): the key for 0 is assumed to be "零" - confirm against upstream.
_CN_DIGITS = {
    "零": 0,
    "一": 1,
    "二": 2,
    "两": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
    "十": 10,
}
def parse_voice_choice(asr_text: str, options: list[str]) -> str | None:
    """Resolve which option the speaker picked from recognised speech.

    Strategies are tried in order and the first hit wins:

    1. An option name contained in the text. Longer names are tried first so
       that an option such as "纱布块" is not shadowed by the shorter option
       "纱布". The comparison is repeated with spaces removed and letters
       lower-cased to tolerate spacing and casing differences.
    2. The first run of Arabic digits, read as a 1-based position.
    3. A "第N个" ordinal, where N is Arabic digits or a numeral listed in
       ``_CN_DIGITS``.
    4. Any later run of Arabic digits that is a valid 1-based position.

    Digit runs are compared as whole numbers, so "12" never selects option 1.
    Rejection phrases are not interpreted here; see ``is_rejection_phrase``.

    Args:
        asr_text: Text returned by speech recognition; may be empty or None.
        options: Option names in the order they were announced.

    Returns:
        The matching entry of ``options``, or None when nothing matches.
    """
    raw = (asr_text or "").strip()
    if not raw:
        return None
    normalized = raw.replace(" ", "").lower()

    # sorted() is stable, so equally long names keep their announced order.
    by_length = sorted((opt for opt in options if opt), key=len, reverse=True)
    for opt in by_length:
        compact = opt.replace(" ", "").lower()
        if opt in raw or (compact and compact in normalized):
            return opt

    digit_runs = re.findall(r"\d+", raw)
    if digit_runs:
        idx = int(digit_runs[0]) - 1
        if 0 <= idx < len(options):
            return options[idx]

    m_cn = re.search(r"第([一二两三四五六七八九十\d]+)个", raw)
    if m_cn:
        token = m_cn.group(1)
        if token.isdigit():
            idx = int(token) - 1
        else:
            # Unknown or multi-character numerals resolve to an invalid index.
            idx = _CN_DIGITS.get(token, 0) - 1
        if 0 <= idx < len(options):
            return options[idx]

    # The first digit run was out of range; a later one may still be valid.
    for run in digit_runs[1:]:
        idx = int(run) - 1
        if 0 <= idx < len(options) and options[idx]:
            return options[idx]
    return None
def match_voice_choice_against_candidates(
    asr_text: str, candidates: list[str]
) -> str | None:
    """Match recognised speech against the surgery's candidate consumable list.

    Used as a fallback when the text did not match the announced top-k
    options. Names are compared by substring, longest name first, so that a
    short name does not win when a longer full name is also present.

    Args:
        asr_text: Text returned by speech recognition; may be empty or None.
        candidates: Candidate names. Falsy and blank entries are ignored, and
            non-string entries are compared by their ``str()`` form.

    Returns:
        The stripped candidate name found in the text, or None.
    """
    raw = (asr_text or "").strip()
    if not raw:
        return None
    # Coerce once so the filter and the kept value agree; calling .strip() on
    # a non-string entry would raise AttributeError.
    names = [str(c).strip() for c in candidates if c]
    names = [name for name in names if name]
    for name in sorted(names, key=len, reverse=True):
        if name in raw:
            return name
    return None
def is_rejection_phrase(asr_text: str) -> bool:
    """Return True when the speaker explicitly rejects every offered option.

    Intended to be called before ``parse_voice_choice``.

    Args:
        asr_text: Text returned by speech recognition; may be empty or None.

    Returns:
        True if the text contains one of the rejection phrases, else False.
    """
    raw = (asr_text or "").strip()
    if not raw:
        return False
    # Never list an empty phrase here: "" is a substring of every string, so
    # a single blank entry classifies every utterance as a rejection.
    # NOTE(review): three further phrases were blank in this copy of the
    # file; restore them from upstream if they are known.
    negatives = ("不是", "没有")
    return any(phrase in raw for phrase in negatives)
def build_prompt_text(options: list[tuple[str, float]]) -> str:
    """Compose the spoken prompt that lists the options to confirm.

    Each option is announced as "第N个,<name>。", the same ordinal form that
    ``parse_voice_choice`` recognises. The closing full stop keeps adjacent
    items from running together once the parts are concatenated.

    Args:
        options: ``(name, confidence)`` pairs in announcement order. The
            confidence is not spoken.

    Returns:
        The full prompt: introduction, numbered options, rejection hint.
    """
    intro = (
        "请确认刚才使用的耗材是下面哪一项,可以说序号或名称;"
        "若是清单内其它耗材,也可以直接说该耗材名称。"
    )
    items = [
        f"第{i}个,{name}。"
        for i, (name, _conf) in enumerate(options, start=1)
    ]
    return "".join([intro, *items, "若都不是请说不是。"])
@dataclass
class VoiceAttemptResult:
    """Outcome of one voice-confirmation attempt.

    Fields are positional, in this order: chosen label, recognised text,
    error.
    """

    # Option name resolved from the recognised text; None when nothing matched.
    chosen_label: str | None
    # Text returned by speech recognition; None when the attempt failed earlier.
    asr_text: str | None
    # Failure reason; None when the attempt reached text parsing.
    error: str | None
class VoiceConfirmationOrchestrator:
    """Server-side TTS playback, ffmpeg capture, Baidu ASR and text parsing."""

    def __init__(self, settings: Settings, baidu: BaiduSpeechService) -> None:
        self._s = settings
        self._baidu = baidu
        # Held for the whole speak/record/recognise cycle in run_confirmation,
        # so only one confirmation runs at a time.
        self._lock = asyncio.Lock()

    def _ffplay_path(self) -> str | None:
        """Return the ffplay executable found on PATH, or None."""
        return shutil.which("ffplay")

    def _ffmpeg_path(self) -> str | None:
        """Return the ffmpeg executable found on PATH, or None."""
        return shutil.which("ffmpeg")

    def _record_pcm_ffmpeg(self, seconds: float) -> tuple[bytes | None, str | None]:
        """Record from the audio input with ffmpeg and return raw PCM.

        The capture is requested as 16 kHz, mono, signed 16-bit little-endian
        PCM written to stdout. macOS uses the avfoundation input; every other
        platform uses ALSA.

        Args:
            seconds: Recording duration passed to ffmpeg's ``-t``.

        Returns:
            ``(pcm_bytes, None)`` on success, ``(None, error_message)`` on
            failure.
        """
        ffmpeg = self._ffmpeg_path()
        if not ffmpeg:
            return None, "ffmpeg not found in PATH"
        # Tolerate an unset (None) device setting as well as a blank one.
        configured = (self._s.voice_ffmpeg_input or "").strip()
        if platform.system() == "Darwin":
            input_args = ["-f", "avfoundation", "-i", configured or ":0"]
        else:
            input_args = ["-f", "alsa", "-i", configured or "default"]
        cmd = [
            ffmpeg,
            "-nostdin",
            "-loglevel",
            "error",
            "-y",
            *input_args,
            "-t",
            str(seconds),
            "-ar",
            "16000",
            "-ac",
            "1",
            "-f",
            "s16le",
            "-acodec",
            "pcm_s16le",
            "pipe:1",
        ]
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                # Grace period on top of the requested recording time.
                timeout=seconds + 5.0,
                check=False,
            )
        except subprocess.TimeoutExpired:
            return None, "ffmpeg record timeout"
        except OSError as exc:
            # The process could not be started; report instead of raising.
            return None, f"ffmpeg failed: {exc}"
        if proc.returncode != 0:
            err = (proc.stderr or b"").decode("utf-8", errors="replace")
            return None, f"ffmpeg failed: {err or proc.returncode}"
        return proc.stdout, None

    def _play_mp3_file(self, path: str) -> str | None:
        """Play an audio file with ffplay and wait for it to finish.

        Args:
            path: File to play.

        Returns:
            None on success, otherwise an error message.
        """
        ffplay = self._ffplay_path()
        if not ffplay:
            return "ffplay not found in PATH"
        try:
            proc = subprocess.run(
                [
                    ffplay,
                    "-nodisp",
                    "-autoexit",
                    "-loglevel",
                    "quiet",
                    path,
                ],
                capture_output=True,
                timeout=120.0,
                check=False,
            )
        except subprocess.TimeoutExpired:
            return "ffplay timeout"
        except OSError as exc:
            # The process could not be started; report instead of raising.
            return f"ffplay failed: {exc}"
        if proc.returncode != 0:
            return f"ffplay exit {proc.returncode}"
        return None

    def _synthesize_to_temp_mp3(self, text: str) -> tuple[str | None, str | None]:
        """Synthesize ``text`` to speech and store it in a temporary .mp3 file.

        The caller owns the returned file and must delete it.

        Args:
            text: Text to synthesize.

        Returns:
            ``(path, None)`` on success, ``(None, error_message)`` on failure.
        """
        try:
            audio = self._baidu.synthesis(
                text,
                "zh",
                1,
                # Presumably speed / pitch / volume / voice - see the Baidu
                # TTS API reference to confirm.
                {"spd": 5, "pit": 5, "vol": 9, "per": 0},
            )
        except BaiduSpeechNotConfiguredError as exc:
            return None, str(exc)
        if isinstance(audio, dict):
            # A dict result is treated as an error payload.
            return None, f"TTS error: {audio!r}"
        if not audio:
            return None, "TTS returned empty audio"
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        try:
            with tmp:
                tmp.write(audio)
        except Exception:
            # delete=False means nothing else removes the file on failure.
            try:
                os.unlink(tmp.name)
            except OSError:
                pass
            raise
        return tmp.name, None

    async def run_confirmation(
        self,
        *,
        surgery_id: str,
        options: list[tuple[str, float]],
    ) -> VoiceAttemptResult:
        """Speak the options, record the reply and resolve the chosen option.

        Blocking steps (TTS, playback, recording, ASR) run in the thread pool.
        Expected failures are reported through ``VoiceAttemptResult.error``
        instead of being raised.

        Args:
            surgery_id: Identifier used for logging only.
            options: ``(name, confidence)`` pairs to announce.

        Returns:
            A result carrying the chosen label and recognised text, or an
            error string when a step failed. ``chosen_label`` is None when the
            recognised text matched no option.
        """
        if not self._s.voice_confirmation_enabled:
            return VoiceAttemptResult(None, None, "voice_confirmation_disabled")
        if not options:
            return VoiceAttemptResult(None, None, "no_options")
        if not self._baidu.configured:
            return VoiceAttemptResult(None, None, "baidu_speech_not_configured")

        labels = [option[0] for option in options]
        prompt = build_prompt_text(options)
        logger.info("Voice confirm surgery={} prompt_len={}", surgery_id, len(prompt))

        async with self._lock:
            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, prompt)
            if err or not mp3_path:
                return VoiceAttemptResult(None, None, err or "tts_failed")
            try:
                play_err = await run_in_threadpool(self._play_mp3_file, mp3_path)
                if play_err:
                    return VoiceAttemptResult(None, None, play_err)
            finally:
                # Best-effort cleanup of the synthesized prompt file.
                try:
                    os.unlink(mp3_path)
                except OSError:
                    pass

            pcm, rec_err = await run_in_threadpool(
                self._record_pcm_ffmpeg, float(self._s.voice_record_seconds)
            )
            if rec_err or not pcm:
                return VoiceAttemptResult(None, None, rec_err or "empty_audio")

            try:
                asr_payload = await run_in_threadpool(
                    self._baidu.asr, pcm, "pcm", 16000, None
                )
            except BaiduSpeechNotConfiguredError as exc:
                # Same handling as the TTS path: report, do not raise.
                return VoiceAttemptResult(None, None, str(exc))
            if not isinstance(asr_payload, dict):
                return VoiceAttemptResult(None, None, "asr_invalid_response")
            if asr_payload.get("err_no") != 0:
                return VoiceAttemptResult(
                    None,
                    None,
                    f"asr_err_{asr_payload.get('err_no')}: {asr_payload.get('err_msg')}",
                )

            # "result" is accepted either as a list (first entry used) or as
            # a plain string.
            results = asr_payload.get("result")
            text: str | None = None
            if isinstance(results, list) and results:
                text = str(results[0])
            elif isinstance(results, str):
                text = results
            if not text:
                return VoiceAttemptResult(None, None, "asr_empty_text")

            chosen = parse_voice_choice(text, labels)
            return VoiceAttemptResult(chosen, text, None)