feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路
- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理
- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总
- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位
- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过
- 补充集成测试、归档与设备张量等单测

Made-with: Cursor
2026-04-23 20:42:21 +08:00
parent 69980d8073
commit 3d7bd70355
55 changed files with 4544 additions and 2050 deletions
--- a/app/services/voice_confirm.py
+++ b/app/services/voice_confirm.py
@@ -1,19 +1,6 @@
 from __future__ import annotations

-import asyncio
-import os
-import platform
 import re
-import shutil
-import subprocess
-import tempfile
-from dataclasses import dataclass
-
-from fastapi.concurrency import run_in_threadpool
-from loguru import logger
-
-from app.config import Settings
-from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService


 _CN_DIGITS = {
@@ -200,199 +187,3 @@ def build_prompt_text(options: list[tuple[str, float]]) -> str:
    for i, (name, _conf) in enumerate(options, start=1):
        parts.append(f"第{i}个，{name}。")
    return "".join(parts)
-
-
-@dataclass
-class VoiceAttemptResult:
-    chosen_label: str | None
-    asr_text: str | None
-    error: str | None
-
-
-class VoiceConfirmationOrchestrator:
-    """服务端 TTS 播报 + ffmpeg 采集 + 百度 ASR + 文本解析。"""
-
-    def __init__(self, settings: Settings, baidu: BaiduSpeechService) -> None:
-        self._s = settings
-        self._baidu = baidu
-        self._lock = asyncio.Lock()
-
-    def _ffplay_path(self) -> str | None:
-        return shutil.which("ffplay")
-
-    def _ffmpeg_path(self) -> str | None:
-        return shutil.which("ffmpeg")
-
-    def _record_pcm_ffmpeg(self, seconds: float) -> tuple[bytes | None, str | None]:
-        ffmpeg = self._ffmpeg_path()
-        if not ffmpeg:
-            return None, "ffmpeg not found in PATH"
-        system = platform.system()
-        if system == "Darwin":
-            dev = self._s.voice_ffmpeg_input.strip() or ":0"
-            input_args = ["-f", "avfoundation", "-i", dev]
-        else:
-            dev = self._s.voice_ffmpeg_input.strip() or "default"
-            input_args = ["-f", "alsa", "-i", dev]
-
-        cmd = [
-            ffmpeg,
-            "-nostdin",
-            "-loglevel",
-            "error",
-            "-y",
-            *input_args,
-            "-t",
-            str(seconds),
-            "-ar",
-            "16000",
-            "-ac",
-            "1",
-            "-f",
-            "s16le",
-            "-acodec",
-            "pcm_s16le",
-            "pipe:1",
-        ]
-        try:
-            proc = subprocess.run(
-                cmd,
-                capture_output=True,
-                timeout=seconds + 5.0,
-                check=False,
-            )
-        except subprocess.TimeoutExpired:
-            return None, "ffmpeg record timeout"
-        if proc.returncode != 0:
-            err = (proc.stderr or b"").decode("utf-8", errors="replace")
-            return None, f"ffmpeg failed: {err or proc.returncode}"
-        return proc.stdout, None
-
-    def _play_mp3_file(self, path: str) -> str | None:
-        ffplay = self._ffplay_path()
-        if not ffplay:
-            return "ffplay not found in PATH"
-        try:
-            proc = subprocess.run(
-                [
-                    ffplay,
-                    "-nodisp",
-                    "-autoexit",
-                    "-loglevel",
-                    "quiet",
-                    path,
-                ],
-                capture_output=True,
-                timeout=120.0,
-                check=False,
-            )
-        except subprocess.TimeoutExpired:
-            return "ffplay timeout"
-        if proc.returncode != 0:
-            return f"ffplay exit {proc.returncode}"
-        return None
-
-    def _synthesize_to_temp_mp3(self, text: str) -> tuple[str | None, str | None]:
-        try:
-            audio = self._baidu.synthesis(
-                text,
-                "zh",
-                1,
-                {"spd": 5, "pit": 5, "vol": 9, "per": 0},
-            )
-        except BaiduSpeechNotConfiguredError as exc:
-            return None, str(exc)
-        if isinstance(audio, dict):
-            return None, f"TTS error: {audio!r}"
-        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
-        try:
-            tmp.write(audio)
-            tmp.flush()
-            path = tmp.name
-        finally:
-            tmp.close()
-        return path, None
-
-    async def speak_prompt(self, text: str) -> None:
-        """仅百度 TTS + ffplay 播报，不录音。供待确认入队时提示手术室。"""
-        if not (text or "").strip():
-            return
-        if not self._s.voice_tts_on_pending_enqueued:
-            return
-        if not self._s.voice_confirmation_enabled:
-            return
-        if not self._baidu.configured:
-            logger.debug("speak_prompt skipped: baidu_speech not configured")
-            return
-        async with self._lock:
-            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, text)
-            if err or not mp3_path:
-                logger.warning("TTS synthesis failed: {}", err)
-                return
-            try:
-                play_err = await run_in_threadpool(self._play_mp3_file, mp3_path)
-                if play_err:
-                    logger.warning("TTS play failed: {}", play_err)
-            finally:
-                try:
-                    os.unlink(mp3_path)
-                except OSError:
-                    pass
-
-    async def run_confirmation(
-        self,
-        *,
-        surgery_id: str,
-        options: list[tuple[str, float]],
-    ) -> VoiceAttemptResult:
-        if not self._s.voice_confirmation_enabled:
-            return VoiceAttemptResult(None, None, "voice_confirmation_disabled")
-        if not options:
-            return VoiceAttemptResult(None, None, "no_options")
-        if not self._baidu.configured:
-            return VoiceAttemptResult(None, None, "baidu_speech_not_configured")
-
-        labels = [o[0] for o in options]
-        prompt = build_prompt_text(options)
-        logger.info("Voice confirm surgery={} prompt_len={}", surgery_id, len(prompt))
-
-        async with self._lock:
-            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, prompt)
-            if err or not mp3_path:
-                return VoiceAttemptResult(None, None, err or "tts_failed")
-            try:
-                play_err = await run_in_threadpool(self._play_mp3_file, mp3_path)
-                if play_err:
-                    return VoiceAttemptResult(None, None, play_err)
-            finally:
-                try:
-                    os.unlink(mp3_path)
-                except OSError:
-                    pass
-
-            pcm, rec_err = await run_in_threadpool(
-                self._record_pcm_ffmpeg, float(self._s.voice_record_seconds)
-            )
-            if rec_err or not pcm:
-                return VoiceAttemptResult(None, None, rec_err or "empty_audio")
-
-        asr_payload = await run_in_threadpool(self._baidu.asr, pcm, "pcm", 16000, None)
-        if not isinstance(asr_payload, dict):
-            return VoiceAttemptResult(None, None, "asr_invalid_response")
-        if asr_payload.get("err_no") != 0:
-            return VoiceAttemptResult(
-                None,
-                None,
-                f"asr_err_{asr_payload.get('err_no')}: {asr_payload.get('err_msg')}",
-            )
-        results = asr_payload.get("result")
-        text: str | None = None
-        if isinstance(results, list) and results:
-            text = str(results[0])
-        elif isinstance(results, str):
-            text = results
-        if not text:
-            return VoiceAttemptResult(None, None, "asr_empty_text")
-
-        chosen = parse_voice_choice(text, labels)
-        return VoiceAttemptResult(chosen, text, None)