# operating-room-monitor-server/app/services/voice_confirm.py

from __future__ import annotations

import asyncio
import os
import platform
import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass

from fastapi.concurrency import run_in_threadpool
from loguru import logger

from app.config import Settings
from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService


_CN_DIGITS = {
    "零": 0,
    "一": 1,
    "二": 2,
    "两": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
    "十": 10,
}


def _parse_ordinal_index_1based(token: str) -> int | None:
    """将「1」「3」「一」「三」「十一」等解析为 1-based 序数，失败返回 None。"""
    t = (token or "").strip()
    if not t:
        return None
    if t.isdigit():
        v = int(t)
        return v if 1 <= v <= 99 else None
    if t in _CN_DIGITS and t != "零" and t != "十":
        return int(_CN_DIGITS[t])
    if t == "十":
        return 10
    if len(t) == 2 and t[0] == "十" and t[1] in _CN_DIGITS and t[1] not in ("零", "十"):
        return 10 + int(_CN_DIGITS[t[1]])
    if len(t) == 2 and t[1] == "十" and t[0] in _CN_DIGITS and t[0] != "零":
        return int(_CN_DIGITS[t[0]]) * 10
    if len(t) == 3 and t[0] in _CN_DIGITS and t[1] == "十" and t[2] in _CN_DIGITS:
        return int(_CN_DIGITS[t[0]]) * 10 + int(_CN_DIGITS[t[2]])
    return None


def _label_from_ordinal_1based(n1: int, options: list[str]) -> str | None:
    if n1 < 1:
        return None
    idx = n1 - 1
    if 0 <= idx < len(options):
        return options[idx]
    return None


def _choose_from_ordinal_text(raw: str, options: list[str]) -> str | None:
    """从「第一个」「第2个」「选3」「1号」等表述解析选项。返回 None 表示本函数未识别。"""
    n_opt = len(options)
    if n_opt < 1:
        return None

    # 1) 显式「第N个/项/款/…」，允许夹带后噪声，如「第一个对」
    for m in re.finditer(
        r"第([0-9]+|[一二两三四五六七八九十百]+)(?:个|项|款|的|种|名)?", raw
    ):
        n1 = _parse_ordinal_index_1based(m.group(1))
        if n1 is not None:
            ch = _label_from_ordinal_1based(n1, options)
            if ch is not None:
                return ch
    m_pick = re.search(
        r"(?:^|[\s,，;；:：])(?:选|要|就)\s*0*([1-9]\d?)(?:\s*号|个|项|款)?",
        raw,
    )
    if m_pick:
        n1 = int(m_pick.group(1))
        ch = _label_from_ordinal_1based(n1, options)
        if ch is not None:
            return ch
    norm_for_opt = raw.replace(" ", "").lower()
    m_op = re.search(r"(?:option|选项)\s*[:：]?\s*(\d+)", norm_for_opt, re.IGNORECASE)
    if m_op:
        n1 = int(m_op.group(1))
        ch = _label_from_ordinal_1based(n1, options)
        if ch is not None:
            return ch

    # 2) 行首/句末「一」「二」单字，仅当候选项数较少时
    s = raw.replace(" ", "")
    if n_opt <= 3:
        m_one = re.match(r"^([一二两三四])$", s)
        if m_one:
            tok = m_one.group(1)
            if tok in _CN_DIGITS and tok not in ("零", "十"):
                n1 = int(_CN_DIGITS[tok])
                ch = _label_from_ordinal_1based(n1, options)
                if ch is not None:
                    return ch
    m_tail = re.search(r"([0-9一二两三四五六七八九十]+)\s*号$", s)
    if m_tail:
        n1 = _parse_ordinal_index_1based(m_tail.group(1))
        if n1 is not None:
            ch = _label_from_ordinal_1based(n1, options)
            if ch is not None:
                return ch

    return None


def parse_voice_choice(asr_text: str, options: list[str]) -> str | None:
    """Resolve which option the speaker chose from recognised speech.

    Strategies are tried in this order:

    1. An option name contained in the utterance. When several names match
       and one is contained in another (e.g. "纱布" inside "纱布块"), the
       containing, more specific name wins; otherwise list order decides.
    2. Ordinal phrasing handled by ``_choose_from_ordinal_text``.
    3. The first run of digits, read as a 1-based position.
    4. A "第N个" phrase.
    5. Per-option aliases "第N" / "N号", matched on digit boundaries so that
       "第12个" is never mistaken for "第1".

    This function does not detect rejections; ``is_rejection_phrase`` is the
    helper for that.

    Args:
        asr_text: Recognised text; None or empty yields None.
        options: Option labels in the order they were presented.

    Returns:
        The chosen option label, or None when nothing matched.
    """
    raw = re.sub(
        r"^[。，、；：！？\s]+|[。，、；：！？\s]+$",
        "",
        (asr_text or "").strip(),
    )
    if not raw:
        return None

    # 1) Name match. A name that is merely a fragment of another matched
    #    name is skipped so the longer, more specific label is returned.
    named = [opt for opt in options if opt and opt in raw]
    for opt in named:
        if not any(opt != other and opt in other for other in named):
            return opt

    # 2) Ordinal phrasing.
    chosen_ord = _choose_from_ordinal_text(raw, options)
    if chosen_ord is not None:
        return chosen_ord

    # 3) First bare number as a 1-based position.
    m_num = re.search(r"(\d+)", raw)
    if m_num:
        idx = int(m_num.group(1)) - 1
        if 0 <= idx < len(options):
            return options[idx]

    # 4) "第N个" fallback (also covers non-ASCII decimal digits via \d).
    m_cn = re.search(r"第([一二两三四五六七八九十\d]+)个", raw)
    if m_cn:
        token = m_cn.group(1)
        n1 = int(token) if token.isdigit() else _parse_ordinal_index_1based(token)
        if n1 is not None:
            ch = _label_from_ordinal_1based(n1, options)
            if ch is not None:
                return ch

    # 5) Per-option aliases. The look-arounds stop a position from matching
    #    inside a longer number ("第1" in "第12个", "2号" in "12号").
    normalized = raw.replace(" ", "").lower()
    for position, opt in enumerate(options, start=1):
        if not opt:
            continue
        if re.search(rf"第{position}(?!\d)|(?<!\d){position}号", normalized):
            return opt

    return None


def match_voice_choice_against_candidates(
    asr_text: str, candidates: list[str]
) -> str | None:
    """Find a candidate name that occurs in the recognised text.

    Candidates are tried longest first so that a short name does not win
    over a longer name that contains it; equally long names keep their
    original relative order.

    Args:
        asr_text: Recognised text; None or blank yields None.
        candidates: Candidate names; blank entries are ignored and the
            remaining ones are compared after stripping whitespace.

    Returns:
        The first (longest) stripped candidate contained in the text, or
        None when there is no match.
    """
    utterance = (asr_text or "").strip()
    if not utterance:
        return None

    names = []
    for cand in candidates:
        if cand and str(cand).strip():
            names.append(cand.strip())

    # list.sort is stable, so ties on length preserve the input order.
    names.sort(key=len, reverse=True)
    return next((name for name in names if name in utterance), None)


def is_rejection_phrase(asr_text: str) -> bool:
    """Return True when the speaker explicitly rejects every offered option.

    Intended to be called before ``parse_voice_choice``.

    The check looks for negative markers as substrings. Phrases that contain
    one of those characters without negating anything are neutralised first:
    the affirmations "没错" / "不错" / "无误" and the product-name prefix
    "无菌" (sterile). Without this, "没错" or "无菌纱布" would be treated as
    a rejection.

    Args:
        asr_text: Recognised text; None or blank yields False.

    Returns:
        True if a negative marker remains after neutralising the benign
        phrases, otherwise False.
    """
    raw = (asr_text or "").strip()
    if not raw:
        return False
    # Replace with a space (not "") so removing a benign phrase can never
    # join its neighbours into a new negative marker.
    for benign in ("没错", "不错", "无误", "无菌"):
        raw = raw.replace(benign, " ")
    negatives = ("不是", "没有", "否", "无", "错")
    return any(n in raw for n in negatives)


def build_prompt_text(options: list[tuple[str, float]]) -> str:
    """Build the spoken prompt that enumerates the options.

    Args:
        options: ``(name, confidence)`` pairs in presentation order; the
            confidence value is not used in the prompt.

    Returns:
        A fixed opening sentence followed by one numbered sentence per
        option, numbered from 1.
    """
    header = "请确认刚才使用的耗材是下面哪一项。"
    numbered = (
        f"第{position}个，{label}。"
        for position, (label, _score) in enumerate(options, start=1)
    )
    return header + "".join(numbered)


@dataclass
class VoiceAttemptResult:
    """Outcome of one voice-confirmation attempt."""

    # Option label resolved from the recognised text; None when nothing
    # matched or the attempt failed before parsing.
    chosen_label: str | None
    # Text returned by speech recognition; None when the attempt failed
    # before any text was obtained.
    asr_text: str | None
    # Error code/message describing why the attempt failed; None when the
    # attempt ran through to parsing.
    error: str | None


class VoiceConfirmationOrchestrator:
    """Server-side TTS playback + ffmpeg capture + Baidu ASR + text parsing.

    Playback and recording are serialised with an ``asyncio.Lock`` so that
    two prompts never overlap; blocking work (subprocesses, speech calls)
    is pushed to the thread pool.
    """

    def __init__(self, settings: Settings, baidu: BaiduSpeechService) -> None:
        self._s = settings
        self._baidu = baidu
        # Guards the speaker/microphone: held while speaking and recording.
        self._lock = asyncio.Lock()

    def _ffplay_path(self) -> str | None:
        """Return the ``ffplay`` executable found on PATH, or None."""
        return shutil.which("ffplay")

    def _ffmpeg_path(self) -> str | None:
        """Return the ``ffmpeg`` executable found on PATH, or None."""
        return shutil.which("ffmpeg")

    @staticmethod
    def _discard_file(path: str) -> None:
        """Delete ``path``; a failure to delete is deliberately ignored."""
        try:
            os.unlink(path)
        except OSError:
            pass

    def _record_pcm_ffmpeg(self, seconds: float) -> tuple[bytes | None, str | None]:
        """Record mono 16 kHz signed 16-bit PCM from the microphone (blocking).

        Args:
            seconds: Recording duration passed to ffmpeg ``-t``.

        Returns:
            ``(pcm_bytes, None)`` on success or ``(None, error_message)``.
        """
        ffmpeg = self._ffmpeg_path()
        if not ffmpeg:
            return None, "ffmpeg not found in PATH"
        # Tolerate an unset device setting instead of failing on ``.strip()``.
        configured_dev = (self._s.voice_ffmpeg_input or "").strip()
        if platform.system() == "Darwin":
            input_args = ["-f", "avfoundation", "-i", configured_dev or ":0"]
        else:
            # NOTE(review): every non-macOS platform is assumed to provide
            # ALSA (i.e. Linux) - confirm before deploying elsewhere.
            input_args = ["-f", "alsa", "-i", configured_dev or "default"]

        cmd = [
            ffmpeg,
            "-nostdin",
            "-loglevel",
            "error",
            "-y",
            *input_args,
            "-t",
            str(seconds),
            "-ar",
            "16000",
            "-ac",
            "1",
            "-f",
            "s16le",
            "-acodec",
            "pcm_s16le",
            "pipe:1",
        ]
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                # Grace period on top of the recording time for start-up.
                timeout=seconds + 5.0,
                check=False,
            )
        except subprocess.TimeoutExpired:
            return None, "ffmpeg record timeout"
        if proc.returncode != 0:
            err = (proc.stderr or b"").decode("utf-8", errors="replace")
            return None, f"ffmpeg failed: {err or proc.returncode}"
        return proc.stdout, None

    def _play_mp3_file(self, path: str) -> str | None:
        """Play an audio file with ffplay (blocking).

        Returns:
            None on success, otherwise an error message.
        """
        ffplay = self._ffplay_path()
        if not ffplay:
            return "ffplay not found in PATH"
        try:
            proc = subprocess.run(
                [
                    ffplay,
                    "-nodisp",
                    "-autoexit",
                    "-loglevel",
                    "quiet",
                    path,
                ],
                capture_output=True,
                timeout=120.0,
                check=False,
            )
        except subprocess.TimeoutExpired:
            return "ffplay timeout"
        if proc.returncode != 0:
            return f"ffplay exit {proc.returncode}"
        return None

    def _synthesize_to_temp_mp3(self, text: str) -> tuple[str | None, str | None]:
        """Synthesize ``text`` and store the audio in a temporary ``.mp3`` file.

        The caller owns the returned file and must delete it.

        Returns:
            ``(path, None)`` on success or ``(None, error_message)``. No
            temporary file is left behind on failure.
        """
        try:
            audio = self._baidu.synthesis(
                text,
                "zh",
                1,
                {"spd": 5, "pit": 5, "vol": 9, "per": 0},
            )
        except BaiduSpeechNotConfiguredError as exc:
            return None, str(exc)
        # A dict result is treated as an error payload rather than audio.
        if isinstance(audio, dict):
            return None, f"TTS error: {audio!r}"
        # Anything that is not non-empty binary data cannot be played.
        if not isinstance(audio, (bytes, bytearray)) or not audio:
            return None, f"TTS error: unexpected audio payload ({type(audio).__name__})"

        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        try:
            with tmp:  # closes (and thereby flushes) the file on exit
                tmp.write(audio)
        except OSError as exc:
            # delete=False means nobody else removes a half-written file.
            self._discard_file(tmp.name)
            return None, f"TTS error: cannot write temp file: {exc}"
        return tmp.name, None

    async def _play_then_discard(self, mp3_path: str) -> str | None:
        """Play ``mp3_path`` in the thread pool and always delete it afterwards.

        Returns:
            None on success, otherwise the playback error message.
        """
        try:
            return await run_in_threadpool(self._play_mp3_file, mp3_path)
        finally:
            self._discard_file(mp3_path)

    async def speak_prompt(self, text: str) -> None:
        """Speak ``text`` via Baidu TTS + ffplay without recording.

        Used to alert the operating room when a pending confirmation is
        enqueued. Does nothing when the text is blank, the relevant settings
        are disabled, or Baidu speech is not configured. Failures are logged,
        not raised.
        """
        if not (text or "").strip():
            return
        if not self._s.voice_tts_on_pending_enqueued:
            return
        if not self._s.voice_confirmation_enabled:
            return
        if not self._baidu.configured:
            logger.debug("speak_prompt skipped: baidu_speech not configured")
            return
        async with self._lock:
            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, text)
            if err or not mp3_path:
                logger.warning("TTS synthesis failed: {}", err)
                return
            play_err = await self._play_then_discard(mp3_path)
            if play_err:
                logger.warning("TTS play failed: {}", play_err)

    async def run_confirmation(
        self,
        *,
        surgery_id: str,
        options: list[tuple[str, float]],
    ) -> VoiceAttemptResult:
        """Run one confirmation round: speak the options, record, recognise, parse.

        Args:
            surgery_id: Identifier used for logging only.
            options: ``(label, confidence)`` pairs in presentation order.

        Returns:
            A ``VoiceAttemptResult``. On failure ``error`` is set and the other
            fields are None; otherwise ``asr_text`` holds the recognised text
            and ``chosen_label`` the parsed choice (None if nothing matched).
        """
        if not self._s.voice_confirmation_enabled:
            return VoiceAttemptResult(None, None, "voice_confirmation_disabled")
        if not options:
            return VoiceAttemptResult(None, None, "no_options")
        if not self._baidu.configured:
            return VoiceAttemptResult(None, None, "baidu_speech_not_configured")

        labels = [o[0] for o in options]
        prompt = build_prompt_text(options)
        logger.info("Voice confirm surgery={} prompt_len={}", surgery_id, len(prompt))

        # Speaking and recording share the lock so another prompt cannot
        # start playing while the microphone is capturing the answer.
        async with self._lock:
            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, prompt)
            if err or not mp3_path:
                return VoiceAttemptResult(None, None, err or "tts_failed")
            play_err = await self._play_then_discard(mp3_path)
            if play_err:
                return VoiceAttemptResult(None, None, play_err)

            pcm, rec_err = await run_in_threadpool(
                self._record_pcm_ffmpeg, float(self._s.voice_record_seconds)
            )
            if rec_err or not pcm:
                return VoiceAttemptResult(None, None, rec_err or "empty_audio")

        # Recognition runs outside the lock: the audio devices are free again.
        asr_payload = await run_in_threadpool(self._baidu.asr, pcm, "pcm", 16000, None)
        if not isinstance(asr_payload, dict):
            return VoiceAttemptResult(None, None, "asr_invalid_response")
        if asr_payload.get("err_no") != 0:
            return VoiceAttemptResult(
                None,
                None,
                f"asr_err_{asr_payload.get('err_no')}: {asr_payload.get('err_msg')}",
            )
        results = asr_payload.get("result")
        text: str | None = None
        if isinstance(results, list) and results:
            text = str(results[0])
        elif isinstance(results, str):
            text = results
        if not text:
            return VoiceAttemptResult(None, None, "asr_empty_text")

        # NOTE(review): is_rejection_phrase is not consulted here; callers
        # presumably check result.asr_text themselves - confirm.
        chosen = parse_voice_choice(text, labels)
        return VoiceAttemptResult(chosen, text, None)