feat: surgery pipeline API, video inference, voice confirm, and tests

- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor
2026-04-21 18:33:54 +08:00
parent d1a3d029ec
commit 04866559db
56 changed files with 7196 additions and 43 deletions
--- a/app/services/voice_resolution.py
+++ b/app/services/voice_resolution.py
@@ -0,0 +1,349 @@
+"""Resolve pending consumable confirmation from uploaded WAV: MinIO + Baidu ASR + parse."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+
+from fastapi.concurrency import run_in_threadpool
+from loguru import logger
+
+from app.config import Settings
+from app.database import AsyncSessionLocal
+from app.repositories.voice_audits import VoiceAuditRepository
+from app.services.audio_wav import WavDecodeError, wav_bytes_to_pcm16k_mono_s16le
+from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService
+from app.services.minio_audio_storage import MinioAudioStorageService, StoredAudio
+from app.services.video.session_manager import CameraSessionManager
+from app.services.voice_confirm import is_rejection_phrase, parse_voice_choice
+from app.surgery_errors import SurgeryPipelineError
+
+
+@dataclass(frozen=True)
+class VoiceResolveResult:
+    """Immutable outcome of a successful voice confirmation attempt.
+
+    Instances are only built on the success paths of
+    ``VoiceConfirmationService.resolve_from_wav`` (option confirmed, or all
+    options rejected); failures are reported by raising instead.
+    """
+
+    # Label of the option the speaker picked; None when everything was rejected.
+    resolved_label: str | None
+    # True when the utterance was recognised as a rejection of all options.
+    rejected: bool
+    # Whitespace-stripped text returned by speech recognition.
+    asr_text: str | None
+    # Object key under which the uploaded audio was stored.
+    audio_object_key: str | None
+    # Human-readable status message returned alongside the result.
+    message: str
+
+
+class VoiceConfirmationService:
+    """Upload audio to MinIO, run Baidu ASR, parse choice, resolve pending queue entry."""
+
+    def __init__(
+        self,
+        settings: Settings,
+        sessions: CameraSessionManager,
+        baidu: BaiduSpeechService,
+        minio: MinioAudioStorageService,
+        audits: VoiceAuditRepository,
+    ) -> None:
+        self._s = settings
+        self._sessions = sessions
+        self._baidu = baidu
+        self._minio = minio
+        self._audits = audits
+
+    async def resolve_from_wav(
+        self,
+        *,
+        surgery_id: str,
+        confirmation_id: str,
+        wav_bytes: bytes,
+        filename: str,
+        content_type: str | None,
+    ) -> VoiceResolveResult:
+        _ = filename  # reserved for logging / future MIME sniff
+
+        if len(wav_bytes) > self._s.voice_upload_max_bytes:
+            await self._persist_audit(
+                surgery_id=surgery_id,
+                confirmation_id=confirmation_id,
+                status="invalid_audio",
+                audio_object_key=None,
+                audio_content_type=content_type,
+                audio_size_bytes=len(wav_bytes),
+                audio_sha256=None,
+                asr_text=None,
+                resolved_label=None,
+                options_snapshot_json=None,
+                error_message="音频超过大小限制",
+            )
+            raise SurgeryPipelineError(
+                "VOICE_AUDIO_INVALID",
+                f"音频大小超过限制（最大 {self._s.voice_upload_max_bytes} 字节）。",
+            )
+
+        if not self._minio.configured:
+            raise SurgeryPipelineError(
+                "MINIO_NOT_CONFIGURED",
+                "服务端未配置 MinIO，无法保存语音追溯文件。",
+            )
+
+        if not self._baidu.configured:
+            raise SurgeryPipelineError(
+                "BAIDU_NOT_CONFIGURED",
+                "服务端未配置百度语音，无法进行语音识别。",
+            )
+
+        pending = self._sessions.get_pending_confirmation_by_id(
+            surgery_id, confirmation_id
+        )
+        if pending is None:
+            raise SurgeryPipelineError(
+                "CONFIRMATION_NOT_FOUND",
+                "未找到该待确认项或已处理。",
+            )
+
+        option_labels = [a.strip() for a, _ in pending.options if a.strip()]
+        options_snapshot = json.dumps(
+            [{"label": a, "confidence": b} for a, b in pending.options],
+            ensure_ascii=False,
+        )
+
+        stored: StoredAudio | None = None
+        try:
+            await run_in_threadpool(self._minio.ensure_bucket)
+            stored = await run_in_threadpool(
+                lambda: self._minio.upload_voice_wav(
+                    surgery_id=surgery_id,
+                    confirmation_id=confirmation_id,
+                    data=wav_bytes,
+                    content_type=content_type,
+                )
+            )
+        except Exception as exc:
+            logger.warning("MinIO upload failed: {}", exc)
+            await self._persist_audit(
+                surgery_id=surgery_id,
+                confirmation_id=confirmation_id,
+                status="upload_failed",
+                audio_object_key=None,
+                audio_content_type=content_type,
+                audio_size_bytes=len(wav_bytes),
+                audio_sha256=None,
+                asr_text=None,
+                resolved_label=None,
+                options_snapshot_json=options_snapshot,
+                error_message=str(exc),
+            )
+            self._sessions.record_voice_trace(surgery_id, asr_text=None, error=str(exc))
+            raise SurgeryPipelineError(
+                "MINIO_UPLOAD_FAILED",
+                f"语音文件上传失败：{exc}",
+            ) from exc
+
+        try:
+            pcm = await run_in_threadpool(wav_bytes_to_pcm16k_mono_s16le, wav_bytes)
+        except WavDecodeError as exc:
+            await self._persist_audit(
+                surgery_id=surgery_id,
+                confirmation_id=confirmation_id,
+                status="invalid_audio",
+                audio_object_key=stored.object_key,
+                audio_content_type=content_type,
+                audio_size_bytes=stored.size_bytes,
+                audio_sha256=stored.sha256_hex,
+                asr_text=None,
+                resolved_label=None,
+                options_snapshot_json=options_snapshot,
+                error_message=str(exc),
+            )
+            self._sessions.record_voice_trace(surgery_id, asr_text=None, error=str(exc))
+            raise SurgeryPipelineError(
+                "VOICE_AUDIO_INVALID",
+                f"无法解析 WAV 音频：{exc}",
+            ) from exc
+
+        try:
+            asr_payload = await run_in_threadpool(
+                self._baidu.asr, pcm, "pcm", 16000, None
+            )
+        except BaiduSpeechNotConfiguredError as exc:
+            raise SurgeryPipelineError(
+                "BAIDU_NOT_CONFIGURED",
+                str(exc),
+            ) from exc
+        except Exception as exc:
+            await self._persist_audit(
+                surgery_id=surgery_id,
+                confirmation_id=confirmation_id,
+                status="asr_failed",
+                audio_object_key=stored.object_key,
+                audio_content_type=content_type,
+                audio_size_bytes=stored.size_bytes,
+                audio_sha256=stored.sha256_hex,
+                asr_text=None,
+                resolved_label=None,
+                options_snapshot_json=options_snapshot,
+                error_message=str(exc),
+            )
+            self._sessions.record_voice_trace(surgery_id, asr_text=None, error=str(exc))
+            raise SurgeryPipelineError(
+                "VOICE_ASR_FAILED",
+                f"语音识别调用失败：{exc}",
+            ) from exc
+
+        if not isinstance(asr_payload, dict):
+            msg = "ASR 返回格式异常"
+            await self._persist_audit(
+                surgery_id=surgery_id,
+                confirmation_id=confirmation_id,
+                status="asr_failed",
+                audio_object_key=stored.object_key,
+                audio_content_type=content_type,
+                audio_size_bytes=stored.size_bytes,
+                audio_sha256=stored.sha256_hex,
+                asr_text=None,
+                resolved_label=None,
+                options_snapshot_json=options_snapshot,
+                error_message=msg,
+            )
+            self._sessions.record_voice_trace(surgery_id, asr_text=None, error=msg)
+            raise SurgeryPipelineError("VOICE_ASR_FAILED", msg)
+
+        if asr_payload.get("err_no") != 0:
+            msg = (
+                f"asr_err_{asr_payload.get('err_no')}: "
+                f"{asr_payload.get('err_msg')}"
+            )
+            await self._persist_audit(
+                surgery_id=surgery_id,
+                confirmation_id=confirmation_id,
+                status="asr_failed",
+                audio_object_key=stored.object_key,
+                audio_content_type=content_type,
+                audio_size_bytes=stored.size_bytes,
+                audio_sha256=stored.sha256_hex,
+                asr_text=None,
+                resolved_label=None,
+                options_snapshot_json=options_snapshot,
+                error_message=msg,
+            )
+            self._sessions.record_voice_trace(surgery_id, asr_text=None, error=msg)
+            raise SurgeryPipelineError("VOICE_ASR_FAILED", msg)
+
+        results = asr_payload.get("result")
+        text: str | None = None
+        if isinstance(results, list) and results:
+            text = str(results[0])
+        elif isinstance(results, str):
+            text = results
+        text = (text or "").strip()
+
+        if not text:
+            msg = "语音识别结果为空"
+            await self._persist_audit(
+                surgery_id=surgery_id,
+                confirmation_id=confirmation_id,
+                status="asr_failed",
+                audio_object_key=stored.object_key,
+                audio_content_type=content_type,
+                audio_size_bytes=stored.size_bytes,
+                audio_sha256=stored.sha256_hex,
+                asr_text=None,
+                resolved_label=None,
+                options_snapshot_json=options_snapshot,
+                error_message=msg,
+            )
+            self._sessions.record_voice_trace(surgery_id, asr_text=None, error=msg)
+            raise SurgeryPipelineError("VOICE_ASR_FAILED", msg)
+
+        self._sessions.record_voice_trace(surgery_id, asr_text=text, error=None)
+
+        rejected = is_rejection_phrase(text)
+        chosen: str | None = None
+        if not rejected:
+            chosen = parse_voice_choice(text, option_labels)
+
+        if not rejected and not chosen:
+            msg = "无法从语音中匹配候选项，请重试或说「不是」否认全部"
+            await self._persist_audit(
+                surgery_id=surgery_id,
+                confirmation_id=confirmation_id,
+                status="parse_failed",
+                audio_object_key=stored.object_key,
+                audio_content_type=content_type,
+                audio_size_bytes=stored.size_bytes,
+                audio_sha256=stored.sha256_hex,
+                asr_text=text,
+                resolved_label=None,
+                options_snapshot_json=options_snapshot,
+                error_message=msg,
+            )
+            self._sessions.record_voice_trace(surgery_id, asr_text=text, error=msg)
+            raise SurgeryPipelineError("VOICE_PARSE_FAILED", msg)
+
+        await self._sessions.resolve_pending_confirmation(
+            surgery_id,
+            confirmation_id,
+            chosen_label=chosen,
+            rejected=rejected,
+        )
+
+        final_status = "rejected" if rejected else "recognized"
+        await self._persist_audit(
+            surgery_id=surgery_id,
+            confirmation_id=confirmation_id,
+            status=final_status,
+            audio_object_key=stored.object_key,
+            audio_content_type=content_type,
+            audio_size_bytes=stored.size_bytes,
+            audio_sha256=stored.sha256_hex,
+            asr_text=text,
+            resolved_label=chosen if not rejected else None,
+            options_snapshot_json=options_snapshot,
+            error_message=None,
+        )
+
+        if rejected:
+            return VoiceResolveResult(
+                resolved_label=None,
+                rejected=True,
+                asr_text=text,
+                audio_object_key=stored.object_key,
+                message="已否认全部候选，未记消耗。",
+            )
+        return VoiceResolveResult(
+            resolved_label=chosen,
+            rejected=False,
+            asr_text=text,
+            audio_object_key=stored.object_key,
+            message="已确认并记一条消耗。",
+        )
+
+    async def _persist_audit(
+        self,
+        *,
+        surgery_id: str,
+        confirmation_id: str,
+        status: str,
+        audio_object_key: str | None,
+        audio_content_type: str | None,
+        audio_size_bytes: int | None,
+        audio_sha256: str | None,
+        asr_text: str | None,
+        resolved_label: str | None,
+        options_snapshot_json: str | None,
+        error_message: str | None,
+    ) -> None:
+        try:
+            async with AsyncSessionLocal() as session:
+                async with session.begin():
+                    await self._audits.save_audit(
+                        session,
+                        surgery_id=surgery_id,
+                        confirmation_id=confirmation_id,
+                        status=status,
+                        audio_object_key=audio_object_key,
+                        audio_content_type=audio_content_type,
+                        audio_size_bytes=audio_size_bytes,
+                        audio_sha256=audio_sha256,
+                        asr_text=asr_text,
+                        resolved_label=resolved_label,
+                        options_snapshot_json=options_snapshot_json,
+                        error_message=error_message,
+                    )
+        except Exception as exc:
+            logger.error("Persist voice audit failed: {}", exc)