"""Voice confirmation of consumable usage.

Server-side TTS announcement (Baidu TTS + ffplay), microphone capture via
ffmpeg, Baidu ASR, and Chinese-text parsing of the doctor's spoken choice.
"""

from __future__ import annotations

import asyncio
import os
import platform
import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass

from fastapi.concurrency import run_in_threadpool
from loguru import logger

from app.config import Settings
from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService

# Chinese numeral characters -> integer value ("两" is a colloquial "two").
_CN_DIGITS = {
    "零": 0,
    "一": 1,
    "二": 2,
    "两": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
    "十": 10,
}


def _parse_ordinal_index_1based(token: str) -> int | None:
    """Parse "1", "3", "一", "三", "十一" etc. as a 1-based ordinal.

    Returns None when the token cannot be interpreted. Arabic numerals are
    accepted in the range 1..99; Chinese numerals up to "九十九" via the
    "X十Y" composition rules below.
    """
    t = (token or "").strip()
    if not t:
        return None
    if t.isdigit():
        v = int(t)
        return v if 1 <= v <= 99 else None
    # Single Chinese digit ("一".."九"); "零" and bare "十" handled separately.
    if t in _CN_DIGITS and t != "零" and t != "十":
        return int(_CN_DIGITS[t])
    if t == "十":
        return 10
    # "十X" => 10 + X  (e.g. "十一" -> 11)
    if len(t) == 2 and t[0] == "十" and t[1] in _CN_DIGITS and t[1] not in ("零", "十"):
        return 10 + int(_CN_DIGITS[t[1]])
    # "X十" => X * 10  (e.g. "三十" -> 30)
    if len(t) == 2 and t[1] == "十" and t[0] in _CN_DIGITS and t[0] != "零":
        return int(_CN_DIGITS[t[0]]) * 10
    # "X十Y" => X * 10 + Y  (e.g. "二十三" -> 23)
    if len(t) == 3 and t[0] in _CN_DIGITS and t[1] == "十" and t[2] in _CN_DIGITS:
        return int(_CN_DIGITS[t[0]]) * 10 + int(_CN_DIGITS[t[2]])
    return None


def _label_from_ordinal_1based(n1: int, options: list[str]) -> str | None:
    """Map a 1-based ordinal onto the options list; None if out of range."""
    if n1 < 1:
        return None
    idx = n1 - 1
    if 0 <= idx < len(options):
        return options[idx]
    return None


def _choose_from_ordinal_text(raw: str, options: list[str]) -> str | None:
    """Resolve phrasings like "第一个", "第2个", "选3", "1号" to an option.

    Returns None when this function does not recognize the utterance (the
    caller then falls through to weaker heuristics).
    """
    n_opt = len(options)
    if n_opt < 1:
        return None
    # 1) Explicit "第N个/项/款/…", tolerating trailing noise such as "第一个对".
    for m in re.finditer(
        r"第([0-9]+|[一二两三四五六七八九十百]+)(?:个|项|款|的|种|名)?", raw
    ):
        n1 = _parse_ordinal_index_1based(m.group(1))
        if n1 is not None:
            ch = _label_from_ordinal_1based(n1, options)
            if ch is not None:
                return ch
    # "选3" / "要2号" style: a pick verb followed by a small number.
    m_pick = re.search(
        r"(?:^|[\s,,;;::])(?:选|要|就)\s*0*([1-9]\d?)(?:\s*号|个|项|款)?",
        raw,
    )
    if m_pick:
        n1 = int(m_pick.group(1))
        ch = _label_from_ordinal_1based(n1, options)
        if ch is not None:
            return ch
    # "option 2" / "选项2". The input is pre-lowercased, so IGNORECASE is
    # redundant here but harmless.
    norm_for_opt = raw.replace(" ", "").lower()
    m_op = re.search(r"(?:option|选项)\s*[::]?\s*(\d+)", norm_for_opt, re.IGNORECASE)
    if m_op:
        n1 = int(m_op.group(1))
        ch = _label_from_ordinal_1based(n1, options)
        if ch is not None:
            return ch
    # 2) A lone "一"/"二" utterance, only when the candidate list is short
    #    (avoids misreading a stray numeral as a choice among many options).
    s = raw.replace(" ", "")
    if n_opt <= 3:
        m_one = re.match(r"^([一二两三四])$", s)
        if m_one:
            tok = m_one.group(1)
            if tok in _CN_DIGITS and tok not in ("零", "十"):
                n1 = int(_CN_DIGITS[tok])
                ch = _label_from_ordinal_1based(n1, options)
                if ch is not None:
                    return ch
    # Trailing "N号" (e.g. "3号", "十一号").
    m_tail = re.search(r"([0-9一二两三四五六七八九十]+)\s*号$", s)
    if m_tail:
        n1 = _parse_ordinal_index_1based(m_tail.group(1))
        if n1 is not None:
            ch = _label_from_ordinal_1based(n1, options)
            if ch is not None:
                return ch
    return None


def parse_voice_choice(asr_text: str, options: list[str]) -> str | None:
    """Parse the consumable name the doctor chose from the ASR transcript.

    Supports exact/substring name matches and ordinal phrasings
    ("1" / "一" / "第一个"). Returns None when nothing matches.
    """
    # Strip leading/trailing CJK punctuation and whitespace.
    raw = re.sub(
        r"^[。,、;:!?\s]+|[。,、;:!?\s]+$",
        "",
        (asr_text or "").strip(),
    )
    if not raw:
        return None
    normalized = raw.replace(" ", "").lower()
    # Direct name (substring) match wins over any ordinal heuristic.
    for opt in options:
        if opt and opt in raw:
            return opt
    chosen_ord = _choose_from_ordinal_text(raw, options)
    if chosen_ord is not None:
        return chosen_ord
    # Any bare number in range is taken as a 1-based choice.
    m_num = re.search(r"(\d+)", raw)
    if m_num:
        idx = int(m_num.group(1)) - 1
        if 0 <= idx < len(options):
            return options[idx]
    # Fallback "第N个" parse (mixed Arabic/Chinese token).
    m_cn = re.search(r"第([一二两三四五六七八九十\d]+)个", raw)
    if m_cn:
        token = m_cn.group(1)
        n1 = int(token) if token.isdigit() else _parse_ordinal_index_1based(token)
        if n1 is not None:
            ch = _label_from_ordinal_1based(n1, options)
            if ch is not None:
                return ch
    # Alias forms embedded in the normalized text.
    for i, opt in enumerate(options):
        if not opt:
            continue
        aliases = [f"第{i + 1}个", f"第{i + 1}", f"{i + 1}号"]
        if any(a in normalized for a in aliases):
            return opt
    # NOTE: the original trailing negative-word check was dead code (it
    # returned None either way); explicit rejection is detected separately
    # by is_rejection_phrase(), which callers must invoke first.
    return None


def match_voice_choice_against_candidates(
    asr_text: str, candidates: list[str]
) -> str | None:
    """Substring-match the transcript against the surgery's candidate list.

    Used when the utterance did not match the pending top-k prompt. Longer
    names are tried first so that a short fragment (e.g. "纱") resolves to
    the most specific full name it appears in.
    """
    raw = (asr_text or "").strip()
    if not raw:
        return None
    stripped = [c.strip() for c in candidates if c and str(c).strip()]
    if not stripped:
        return None
    for c in sorted(stripped, key=len, reverse=True):
        if c in raw:
            return c
    return None


def is_rejection_phrase(asr_text: str) -> bool:
    """True when the doctor explicitly rejects all candidates.

    Must be called BEFORE parse_voice_choice().
    """
    raw = (asr_text or "").strip()
    if not raw:
        return False
    negatives = ("不是", "没有", "否", "无", "错")
    return any(n in raw for n in negatives)


def build_prompt_text(options: list[tuple[str, float]]) -> str:
    """Build the Chinese TTS prompt enumerating the (name, confidence) options."""
    parts = ["请确认刚才使用的耗材是下面哪一项。"]
    for i, (name, _conf) in enumerate(options, start=1):
        parts.append(f"第{i}个,{name}。")
    return "".join(parts)


@dataclass
class VoiceAttemptResult:
    """Outcome of one confirmation round-trip."""

    chosen_label: str | None  # matched option name, if any
    asr_text: str | None  # raw ASR transcript, if any
    error: str | None  # machine-readable failure reason, if any


class VoiceConfirmationOrchestrator:
    """Server-side TTS announcement + ffmpeg capture + Baidu ASR + text parsing."""

    def __init__(self, settings: Settings, baidu: BaiduSpeechService) -> None:
        self._s = settings
        self._baidu = baidu
        # Serializes audio hardware use: one playback/record cycle at a time.
        self._lock = asyncio.Lock()

    def _ffplay_path(self) -> str | None:
        return shutil.which("ffplay")

    def _ffmpeg_path(self) -> str | None:
        return shutil.which("ffmpeg")

    def _record_pcm_ffmpeg(self, seconds: float) -> tuple[bytes | None, str | None]:
        """Capture `seconds` of 16 kHz mono s16le PCM from the default mic.

        Returns (pcm_bytes, None) on success or (None, error_message).
        """
        ffmpeg = self._ffmpeg_path()
        if not ffmpeg:
            return None, "ffmpeg not found in PATH"
        system = platform.system()
        if system == "Darwin":
            # macOS: AVFoundation capture; ":0" is the default audio device.
            dev = self._s.voice_ffmpeg_input.strip() or ":0"
            input_args = ["-f", "avfoundation", "-i", dev]
        else:
            # Linux (and others): ALSA default device.
            dev = self._s.voice_ffmpeg_input.strip() or "default"
            input_args = ["-f", "alsa", "-i", dev]
        cmd = [
            ffmpeg,
            "-nostdin",
            "-loglevel",
            "error",
            "-y",
            *input_args,
            "-t",
            str(seconds),
            "-ar",
            "16000",
            "-ac",
            "1",
            "-f",
            "s16le",
            "-acodec",
            "pcm_s16le",
            "pipe:1",
        ]
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                timeout=seconds + 5.0,  # small grace period beyond the record time
                check=False,
            )
        except subprocess.TimeoutExpired:
            return None, "ffmpeg record timeout"
        if proc.returncode != 0:
            err = (proc.stderr or b"").decode("utf-8", errors="replace")
            return None, f"ffmpeg failed: {err or proc.returncode}"
        return proc.stdout, None

    def _play_mp3_file(self, path: str) -> str | None:
        """Play an mp3 file via ffplay; returns an error string or None."""
        ffplay = self._ffplay_path()
        if not ffplay:
            return "ffplay not found in PATH"
        try:
            proc = subprocess.run(
                [
                    ffplay,
                    "-nodisp",
                    "-autoexit",
                    "-loglevel",
                    "quiet",
                    path,
                ],
                capture_output=True,
                timeout=120.0,
                check=False,
            )
        except subprocess.TimeoutExpired:
            return "ffplay timeout"
        if proc.returncode != 0:
            return f"ffplay exit {proc.returncode}"
        return None

    def _synthesize_to_temp_mp3(self, text: str) -> tuple[str | None, str | None]:
        """Synthesize `text` via Baidu TTS into a temp mp3 file.

        Returns (path, None) on success or (None, error_message). The caller
        owns the file and must unlink it.
        """
        try:
            audio = self._baidu.synthesis(
                text,
                "zh",
                1,
                {"spd": 5, "pit": 5, "vol": 9, "per": 0},
            )
        except BaiduSpeechNotConfiguredError as exc:
            return None, str(exc)
        # The Baidu SDK returns a dict payload on error, bytes on success.
        if isinstance(audio, dict):
            return None, f"TTS error: {audio!r}"
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        try:
            tmp.write(audio)
            tmp.flush()
        except OSError as exc:
            # delete=False: clean up ourselves so a failed write doesn't
            # leak the temp file (previously the exception escaped and the
            # file was orphaned).
            tmp.close()
            try:
                os.unlink(tmp.name)
            except OSError:
                pass
            return None, f"temp mp3 write failed: {exc}"
        finally:
            tmp.close()
        return tmp.name, None

    async def speak_prompt(self, text: str) -> None:
        """Baidu TTS + ffplay announcement only, no recording.

        Used to notify the operating room when a pending confirmation is
        enqueued. Best-effort: failures are logged, never raised.
        """
        if not (text or "").strip():
            return
        if not self._s.voice_tts_on_pending_enqueued:
            return
        if not self._s.voice_confirmation_enabled:
            return
        if not self._baidu.configured:
            logger.debug("speak_prompt skipped: baidu_speech not configured")
            return
        async with self._lock:
            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, text)
            if err or not mp3_path:
                logger.warning("TTS synthesis failed: {}", err)
                return
            try:
                play_err = await run_in_threadpool(self._play_mp3_file, mp3_path)
                if play_err:
                    logger.warning("TTS play failed: {}", play_err)
            finally:
                try:
                    os.unlink(mp3_path)
                except OSError:
                    pass

    async def run_confirmation(
        self,
        *,
        surgery_id: str,
        options: list[tuple[str, float]],
    ) -> VoiceAttemptResult:
        """Run one full announce -> record -> recognize -> parse cycle.

        Returns a VoiceAttemptResult; `error` carries a machine-readable
        reason on any failure, otherwise `chosen_label`/`asr_text` are set.
        """
        if not self._s.voice_confirmation_enabled:
            return VoiceAttemptResult(None, None, "voice_confirmation_disabled")
        if not options:
            return VoiceAttemptResult(None, None, "no_options")
        if not self._baidu.configured:
            return VoiceAttemptResult(None, None, "baidu_speech_not_configured")
        labels = [o[0] for o in options]
        prompt = build_prompt_text(options)
        logger.info("Voice confirm surgery={} prompt_len={}", surgery_id, len(prompt))
        async with self._lock:
            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, prompt)
            if err or not mp3_path:
                return VoiceAttemptResult(None, None, err or "tts_failed")
            try:
                play_err = await run_in_threadpool(self._play_mp3_file, mp3_path)
                if play_err:
                    return VoiceAttemptResult(None, None, play_err)
            finally:
                try:
                    os.unlink(mp3_path)
                except OSError:
                    pass
            pcm, rec_err = await run_in_threadpool(
                self._record_pcm_ffmpeg, float(self._s.voice_record_seconds)
            )
            if rec_err or not pcm:
                return VoiceAttemptResult(None, None, rec_err or "empty_audio")
            asr_payload = await run_in_threadpool(self._baidu.asr, pcm, "pcm", 16000, None)
            if not isinstance(asr_payload, dict):
                return VoiceAttemptResult(None, None, "asr_invalid_response")
            if asr_payload.get("err_no") != 0:
                return VoiceAttemptResult(
                    None,
                    None,
                    f"asr_err_{asr_payload.get('err_no')}: {asr_payload.get('err_msg')}",
                )
            results = asr_payload.get("result")
            text: str | None = None
            if isinstance(results, list) and results:
                text = str(results[0])
            elif isinstance(results, str):
                text = results
            if not text:
                return VoiceAttemptResult(None, None, "asr_empty_text")
            chosen = parse_voice_choice(text, labels)
            return VoiceAttemptResult(chosen, text, None)