Files
operating-room-monitor-server/app/services/voice_confirm.py
Kevin 0c05463617 feat: 语音确认、联调与运维增强
- 语音:序数解析(第一个/第二个等)、解析失败计数与 API detail.retry_remaining;
  百度 ASR 固定 dev_pid 为普通话;SurgeryPipelineError 支持 extra 并入 HTTP detail。
- Demo:demo 路由与假 RTSP、客户端 index 与 README;BackendResolver 与配置调整。
- 可观测:消耗 TSV 日志、语音文件日志、终端 Markdown 辅助;相关测试与依赖更新。
- 注意:.env 仍被 gitignore,本地密钥不会进入本提交。

Made-with: Cursor
2026-04-23 14:24:20 +08:00

399 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import asyncio
import os
import platform
import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass
from fastapi.concurrency import run_in_threadpool
from loguru import logger
from app.config import Settings
from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService
# Single-character Chinese numerals understood by the ordinal parser.
# "两" is the colloquial form of "two" used in front of measure words.
_CN_DIGITS = {
    "零": 0,
    "一": 1,
    "二": 2,
    "两": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
    "十": 10,
}


def _cn_unit_value(ch: str) -> int | None:
    """Return the value (1-9) of a single Chinese numeral, or None otherwise."""
    value = _CN_DIGITS.get(ch)
    if value is None or not 1 <= value <= 9:
        return None
    return value


def _parse_ordinal_index_1based(token: str) -> int | None:
    """Parse an ordinal token into a 1-based index between 1 and 99.

    Accepted forms are decimal digit strings ("1", "12") and Chinese numerals
    up to ninety-nine ("一", "十", "十一", "二十", "二十三").

    Args:
        token: The numeral part of an utterance, without prefix or suffix.

    Returns:
        The 1-based index, or None when the token is empty, is zero, is out
        of range, or is not a numeral this parser understands.
    """
    t = (token or "").strip()
    if not t:
        return None

    # isdecimal() matches exactly the strings int() can convert; isdigit()
    # also accepts characters such as "²", for which int() raises ValueError.
    if t.isdecimal():
        v = int(t)
        return v if 1 <= v <= 99 else None

    if len(t) == 1:
        return 10 if t == "十" else _cn_unit_value(t)

    if len(t) == 2:
        if t[0] == "十":  # "十三" -> 13
            unit = _cn_unit_value(t[1])
            return None if unit is None else 10 + unit
        if t[1] == "十":  # "二十" -> 20
            tens = _cn_unit_value(t[0])
            return None if tens is None else tens * 10
        return None

    if len(t) == 3 and t[1] == "十":  # "二十三" -> 23
        tens = _cn_unit_value(t[0])
        unit = _cn_unit_value(t[2])
        if tens is not None and unit is not None:
            return tens * 10 + unit

    return None
def _label_from_ordinal_1based(n1: int, options: list[str]) -> str | None:
    """Return the option at 1-based position ``n1``, or None if out of range."""
    in_range = 1 <= n1 <= len(options)
    return options[n1 - 1] if in_range else None
def _choose_from_ordinal_text(raw: str, options: list[str]) -> str | None:
    """Resolve an ordinal-style utterance to one of ``options``.

    Forms are tried in this order; the first one that yields an in-range
    position wins:

    1. "第N" with an optional measure word ("第一个", "第2项").
    2. A pick verb followed by digits ("选3", "要2号").
    3. "option N" / "选项N".
    4. A lone numeral ("一", "二", "三"), only when there are at most three
       options.
    5. A trailing "N号" ("1号", "三号").

    Args:
        raw: Transcript text.
        options: Option labels in the order they were presented.

    Returns:
        The selected label, or None when this function recognises nothing.
    """
    n_opt = len(options)
    if n_opt < 1:
        return None

    # 1) Explicit "第N…"; trailing noise such as "第一个对" is tolerated.
    #    "百" stays in the character class so that "第一百个" is consumed as
    #    one (unparseable) token instead of being misread as "第一".
    for m in re.finditer(
        r"第([0-9]+|[一二两三四五六七八九十百]+)(?:个|项|款|的|种|名)?", raw
    ):
        n1 = _parse_ordinal_index_1based(m.group(1))
        if n1 is not None:
            ch = _label_from_ordinal_1based(n1, options)
            if ch is not None:
                return ch

    # 2) Pick verb + digits at the start of the text or after a separator.
    #    Full-width separators are included because Chinese transcripts are
    #    commonly punctuated with them.
    m_pick = re.search(
        r"(?:^|[\s,;:,;:、。])(?:选|要|就)\s*0*([1-9]\d?)(?:\s*号|个|项|款)?",
        raw,
    )
    if m_pick:
        ch = _label_from_ordinal_1based(int(m_pick.group(1)), options)
        if ch is not None:
            return ch

    # 3) "option 2" / "选项2"; the text is lower-cased first, so no
    #    case-insensitive flag is needed.
    norm_for_opt = raw.replace(" ", "").lower()
    m_op = re.search(r"(?:option|选项)\s*[::]?\s*(\d+)", norm_for_opt)
    if m_op:
        ch = _label_from_ordinal_1based(int(m_op.group(1)), options)
        if ch is not None:
            return ch

    s = raw.replace(" ", "")

    # 4) A lone numeral is only trusted for short lists, and then only the
    #    numerals that can address such a list.
    if n_opt <= 3:
        m_one = re.match(r"^([一二两三])$", s)
        if m_one:
            n1 = _parse_ordinal_index_1based(m_one.group(1))
            if n1 is not None:
                ch = _label_from_ordinal_1based(n1, options)
                if ch is not None:
                    return ch

    # 5) Trailing "N号".
    m_tail = re.search(r"([0-9一二两三四五六七八九十]+)\s*号$", s)
    if m_tail:
        n1 = _parse_ordinal_index_1based(m_tail.group(1))
        if n1 is not None:
            ch = _label_from_ordinal_1based(n1, options)
            if ch is not None:
                return ch

    return None
def parse_voice_choice(asr_text: str, options: list[str]) -> str | None:
    """Resolve an ASR transcript to the consumable label the doctor chose.

    Matching is attempted in this order:

    1. By name: an option whose text appears in the transcript. When several
       options match, the longest wins, so an exact or more specific name
       beats a shorter name contained in it.
    2. By ordinal phrase ("第一个", "选2", "3号", ...), delegated to
       ``_choose_from_ordinal_text``.
    3. By bare number: the first run of digits that, read as a whole number,
       is a valid 1-based position in ``options``.

    Args:
        asr_text: Recognised text; None or empty yields None.
        options: Option labels in the order they were presented.

    Returns:
        The chosen label, or None when nothing in the transcript identifies
        an option.
    """
    # Trim whitespace and sentence punctuation (ASCII and full-width) from
    # both ends so that "一。" or "纱布!" compare cleanly.
    edge = r"[。、,;:!?,;:!?\s]+"
    raw = re.sub(rf"^{edge}|{edge}$", "", (asr_text or "").strip())
    if not raw:
        return None

    # 1) Name match. An exact match is necessarily the longest contained
    #    option, so "longest wins" also gives exact matches priority.
    named = [opt for opt in options if opt and opt in raw]
    if named:
        return max(named, key=len)

    # 2) Ordinal phrases.
    chosen_ord = _choose_from_ordinal_text(raw, options)
    if chosen_ord is not None:
        return chosen_ord

    # 3) Bare numbers. Each digit run is interpreted as a whole number, so
    #    "15" can never select option 1 or option 5.
    for m_num in re.finditer(r"\d+", raw):
        position = int(m_num.group())
        if 1 <= position <= len(options):
            return options[position - 1]

    return None
def match_voice_choice_against_candidates(
    asr_text: str, candidates: list[str]
) -> str | None:
    """Find a candidate consumable whose name occurs in the transcript.

    Intended as a fallback for when the transcript did not match the top-k
    options shown for the pending item: the surgery's full candidate list is
    searched by substring instead. The longest matching name is returned so
    that a short name contained in several longer names does not win over
    the full name; among equally long matches the earliest candidate wins.

    Returns None for an empty transcript, an empty candidate list, or when
    no candidate name occurs in the transcript.
    """
    spoken = (asr_text or "").strip()
    if not spoken:
        return None

    names = []
    for item in candidates:
        if item and str(item).strip():
            names.append(item.strip())

    hits = [name for name in names if name in spoken]
    # max() keeps the first of several equally long names, matching a stable
    # longest-first ordering.
    return max(hits, key=len) if hits else None
def is_rejection_phrase(asr_text: str) -> bool:
    """Return True when the doctor explicitly rejects every candidate.

    Must be called before ``parse_voice_choice``.

    Args:
        asr_text: Recognised text; None or blank yields False.
    """
    raw = (asr_text or "").strip()
    if not raw:
        return False
    # Every entry must be non-empty: "" is a substring of any text and would
    # turn every transcript into a rejection.
    negatives = ("不是", "没有")
    return any(phrase in raw for phrase in negatives)
def build_prompt_text(options: list[tuple[str, float]]) -> str:
    """Build the spoken prompt that lists the candidate consumables.

    Each candidate is announced as "第N个,<name>。": the "第N个" wording is
    the ordinal form the reply parser recognises, and the closing full stop
    keeps one item from running into the next item's number.

    Args:
        options: ``(name, confidence)`` pairs in presentation order; the
            confidence is not spoken.

    Returns:
        The complete prompt text.
    """
    intro = "请确认刚才使用的耗材是下面哪一项。"
    items = [
        f"第{i}个,{name}。" for i, (name, _conf) in enumerate(options, start=1)
    ]
    return intro + "".join(items)
@dataclass
class VoiceAttemptResult:
    """Outcome of a single voice-confirmation attempt."""

    # Label parsed from the transcript; None when nothing matched or the
    # attempt failed before parsing.
    chosen_label: str | None
    # Text returned by speech recognition; None when no text was obtained.
    asr_text: str | None
    # Failure description; None when the attempt ran through to parsing.
    error: str | None
class VoiceConfirmationOrchestrator:
    """Server-side voice confirmation: TTS playback, ffmpeg capture, Baidu ASR, text parsing.

    An asyncio lock is held for the whole of each speak/record sequence, so
    sequences started through the same instance run one at a time.
    """

    def __init__(self, settings: Settings, baidu: BaiduSpeechService) -> None:
        self._s = settings
        self._baidu = baidu
        self._lock = asyncio.Lock()

    def _ffplay_path(self) -> str | None:
        """Return the ffplay executable found on PATH, or None."""
        return shutil.which("ffplay")

    def _ffmpeg_path(self) -> str | None:
        """Return the ffmpeg executable found on PATH, or None."""
        return shutil.which("ffmpeg")

    def _record_pcm_ffmpeg(self, seconds: float) -> tuple[bytes | None, str | None]:
        """Record audio with ffmpeg as raw mono 16 kHz signed 16-bit PCM.

        The capture device comes from ``settings.voice_ffmpeg_input``. When
        that is blank, ":0" (avfoundation) is used on macOS and "default"
        (ALSA) on every other platform.

        Args:
            seconds: Recording duration passed to ffmpeg's ``-t``.

        Returns:
            ``(pcm_bytes, None)`` on success, ``(None, message)`` on failure.
        """
        ffmpeg = self._ffmpeg_path()
        if not ffmpeg:
            return None, "ffmpeg not found in PATH"

        configured = (self._s.voice_ffmpeg_input or "").strip()
        if platform.system() == "Darwin":
            input_args = ["-f", "avfoundation", "-i", configured or ":0"]
        else:
            input_args = ["-f", "alsa", "-i", configured or "default"]

        cmd = [
            ffmpeg,
            "-nostdin",
            "-loglevel",
            "error",
            "-y",
            *input_args,
            "-t",
            str(seconds),
            "-ar",
            "16000",
            "-ac",
            "1",
            "-f",
            "s16le",
            "-acodec",
            "pcm_s16le",
            "pipe:1",
        ]
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                # Allow a few seconds beyond the recording for startup/teardown.
                timeout=seconds + 5.0,
                check=False,
            )
        except subprocess.TimeoutExpired:
            return None, "ffmpeg record timeout"
        except OSError as exc:
            # The executable vanished or cannot be run; report it like any
            # other ffmpeg failure instead of raising into the caller.
            return None, f"ffmpeg failed: {exc}"
        if proc.returncode != 0:
            err = (proc.stderr or b"").decode("utf-8", errors="replace")
            return None, f"ffmpeg failed: {err or proc.returncode}"
        return proc.stdout, None

    def _play_mp3_file(self, path: str) -> str | None:
        """Play an audio file with ffplay and wait for it to finish.

        Returns:
            None on success, otherwise an error message.
        """
        ffplay = self._ffplay_path()
        if not ffplay:
            return "ffplay not found in PATH"
        try:
            proc = subprocess.run(
                [
                    ffplay,
                    "-nodisp",
                    "-autoexit",
                    "-loglevel",
                    "quiet",
                    path,
                ],
                capture_output=True,
                timeout=120.0,
                check=False,
            )
        except subprocess.TimeoutExpired:
            return "ffplay timeout"
        except OSError as exc:
            return f"ffplay failed: {exc}"
        if proc.returncode != 0:
            return f"ffplay exit {proc.returncode}"
        return None

    def _synthesize_to_temp_mp3(self, text: str) -> tuple[str | None, str | None]:
        """Synthesize ``text`` with Baidu TTS into a temporary ``.mp3`` file.

        The caller owns the returned file and must delete it.

        Returns:
            ``(path, None)`` on success, ``(None, message)`` on failure.
        """
        try:
            audio = self._baidu.synthesis(
                text,
                "zh",
                1,
                {"spd": 5, "pit": 5, "vol": 9, "per": 0},
            )
        except BaiduSpeechNotConfiguredError as exc:
            return None, str(exc)
        # A dict result is treated as an error payload rather than audio.
        if isinstance(audio, dict):
            return None, f"TTS error: {audio!r}"
        if not isinstance(audio, (bytes, bytearray)) or not audio:
            return None, f"TTS error: unexpected audio payload {type(audio).__name__}"

        path: str | None = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
                path = tmp.name
                tmp.write(audio)
        except OSError as exc:
            # Do not leave a partial file behind when the write fails.
            if path is not None:
                self._remove_quietly(path)
            return None, f"TTS error: {exc}"
        return path, None

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Delete a temporary file, ignoring filesystem errors."""
        try:
            os.unlink(path)
        except OSError:
            # Best effort: a leftover temp file must not fail the voice flow.
            pass

    async def _play_and_discard(self, mp3_path: str) -> str | None:
        """Play a temporary audio file in the thread pool, then delete it.

        The file is removed whether or not playback succeeds.

        Returns:
            None on success, otherwise the playback error message.
        """
        try:
            return await run_in_threadpool(self._play_mp3_file, mp3_path)
        finally:
            self._remove_quietly(mp3_path)

    @staticmethod
    def _extract_asr_text(payload: object) -> tuple[str | None, str | None]:
        """Pull the recognised text out of an ASR response.

        Returns:
            ``(text, None)`` when the response carries non-empty text,
            otherwise ``(None, error_code)``.
        """
        if not isinstance(payload, dict):
            return None, "asr_invalid_response"
        if payload.get("err_no") != 0:
            return None, f"asr_err_{payload.get('err_no')}: {payload.get('err_msg')}"

        results = payload.get("result")
        text: str | None = None
        if isinstance(results, list) and results:
            # Only the first entry of a list result is used.
            text = str(results[0])
        elif isinstance(results, str):
            text = results
        if not text:
            return None, "asr_empty_text"
        return text, None

    async def speak_prompt(self, text: str) -> None:
        """Speak ``text`` via Baidu TTS and ffplay, without recording.

        Used to alert the operating room when a pending confirmation is
        enqueued. Does nothing when the text is blank, when either
        ``voice_tts_on_pending_enqueued`` or ``voice_confirmation_enabled``
        is off, or when Baidu speech is not configured. Failures are logged,
        not raised.
        """
        if not (text or "").strip():
            return
        if not self._s.voice_tts_on_pending_enqueued:
            return
        if not self._s.voice_confirmation_enabled:
            return
        if not self._baidu.configured:
            logger.debug("speak_prompt skipped: baidu_speech not configured")
            return

        async with self._lock:
            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, text)
            if err or not mp3_path:
                logger.warning("TTS synthesis failed: {}", err)
                return
            play_err = await self._play_and_discard(mp3_path)
            if play_err:
                logger.warning("TTS play failed: {}", play_err)

    async def run_confirmation(
        self,
        *,
        surgery_id: str,
        options: list[tuple[str, float]],
    ) -> VoiceAttemptResult:
        """Run one full confirmation round: prompt, record, recognise, parse.

        Args:
            surgery_id: Identifier used for logging only.
            options: ``(name, confidence)`` candidates in presentation order.

        Returns:
            A ``VoiceAttemptResult``. ``error`` is set when the round could
            not complete. Otherwise ``asr_text`` holds the transcript and
            ``chosen_label`` the parsed option, which is None when the
            transcript matched no option.
        """
        if not self._s.voice_confirmation_enabled:
            return VoiceAttemptResult(None, None, "voice_confirmation_disabled")
        if not options:
            return VoiceAttemptResult(None, None, "no_options")
        if not self._baidu.configured:
            return VoiceAttemptResult(None, None, "baidu_speech_not_configured")

        labels = [o[0] for o in options]
        prompt = build_prompt_text(options)
        logger.info("Voice confirm surgery={} prompt_len={}", surgery_id, len(prompt))

        async with self._lock:
            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, prompt)
            if err or not mp3_path:
                return VoiceAttemptResult(None, None, err or "tts_failed")

            play_err = await self._play_and_discard(mp3_path)
            if play_err:
                return VoiceAttemptResult(None, None, play_err)

            pcm, rec_err = await run_in_threadpool(
                self._record_pcm_ffmpeg, float(self._s.voice_record_seconds)
            )
            if rec_err or not pcm:
                return VoiceAttemptResult(None, None, rec_err or "empty_audio")

            asr_payload = await run_in_threadpool(self._baidu.asr, pcm, "pcm", 16000, None)
            text, asr_err = self._extract_asr_text(asr_payload)
            if asr_err or not text:
                return VoiceAttemptResult(None, None, asr_err or "asr_empty_text")

            chosen = parse_voice_choice(text, labels)
            return VoiceAttemptResult(chosen, text, None)