Files
operating-room-monitor-server/app/services/voice_resolution.py
Kevin 04866559db feat: surgery pipeline API, video inference, voice confirm, and tests
- Add FastAPI routes for surgery start/end, results, pending confirmation (WAV upload), and health checks.
- Implement RTSP/Hikvision capture, consumable classification, session manager, MinIO/Baidu voice resolution, and DB persistence.
- Add documentation (client API, video backends, staging checklist) and sample camera/RTSP config.
- Add pytest suite (API contract, session manager, voice, repositories, pipeline persistence) and httpx dev dependency.
- Replace deprecated HTTP_422_UNPROCESSABLE_ENTITY with HTTP_422_UNPROCESSABLE_CONTENT.
- Fix SurgeryPipeline DB reads to use an explicit transaction with autobegin disabled.

Made-with: Cursor
2026-04-21 18:33:54 +08:00

350 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Resolve pending consumable confirmation from uploaded WAV: MinIO + Baidu ASR + parse."""
from __future__ import annotations
import json
from dataclasses import dataclass
from fastapi.concurrency import run_in_threadpool
from loguru import logger
from app.config import Settings
from app.database import AsyncSessionLocal
from app.repositories.voice_audits import VoiceAuditRepository
from app.services.audio_wav import WavDecodeError, wav_bytes_to_pcm16k_mono_s16le
from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService
from app.services.minio_audio_storage import MinioAudioStorageService, StoredAudio
from app.services.video.session_manager import CameraSessionManager
from app.services.voice_confirm import is_rejection_phrase, parse_voice_choice
from app.surgery_errors import SurgeryPipelineError
@dataclass(frozen=True)
class VoiceResolveResult:
    """Immutable outcome of one voice-confirmation attempt."""

    # Label that was confirmed; None when the speaker rejected all options.
    resolved_label: str | None
    # True when the utterance was classified as a rejection phrase.
    rejected: bool
    # Text recognized by ASR for the uploaded audio.
    asr_text: str | None
    # Object key under which the uploaded audio was stored.
    audio_object_key: str | None
    # Human-readable summary of the outcome.
    message: str
class VoiceConfirmationService:
    """Upload audio to MinIO, run Baidu ASR, parse choice, resolve pending queue entry."""

    def __init__(
        self,
        settings: Settings,
        sessions: CameraSessionManager,
        baidu: BaiduSpeechService,
        minio: MinioAudioStorageService,
        audits: VoiceAuditRepository,
    ) -> None:
        """Store the collaborators; no I/O is performed here."""
        self._s = settings
        self._sessions = sessions
        self._baidu = baidu
        self._minio = minio
        self._audits = audits

    async def resolve_from_wav(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        wav_bytes: bytes,
        filename: str,
        content_type: str | None,
    ) -> VoiceResolveResult:
        """Resolve one pending confirmation from an uploaded WAV recording.

        Steps, in order: size check, MinIO/Baidu configuration checks, pending
        entry lookup, MinIO upload, WAV decode, Baidu ASR, phrase parsing, and
        finally resolution through the session manager.

        Args:
            surgery_id: Surgery the pending confirmation belongs to.
            confirmation_id: Identifier of the pending confirmation.
            wav_bytes: Raw bytes of the uploaded audio file.
            filename: Client-supplied file name (currently unused).
            content_type: Client-supplied MIME type, stored with the audit row.

        Returns:
            A ``VoiceResolveResult`` describing a confirmation or a rejection.

        Raises:
            SurgeryPipelineError: with one of the codes ``VOICE_AUDIO_INVALID``,
                ``MINIO_NOT_CONFIGURED``, ``BAIDU_NOT_CONFIGURED``,
                ``CONFIRMATION_NOT_FOUND``, ``MINIO_UPLOAD_FAILED``,
                ``VOICE_ASR_FAILED`` or ``VOICE_PARSE_FAILED``.
        """
        _ = filename  # reserved for logging / future MIME sniff

        # Fields shared by every audit row written for this request.
        base = {
            "surgery_id": surgery_id,
            "confirmation_id": confirmation_id,
            "content_type": content_type,
            "upload_size": len(wav_bytes),
        }

        max_bytes = self._s.voice_upload_max_bytes
        if len(wav_bytes) > max_bytes:
            # Oversized uploads are audited but not added to the voice trace.
            await self._record_outcome(
                **base,
                stored=None,
                options_snapshot=None,
                status="invalid_audio",
                error="音频超过大小限制",
                trace=False,
            )
            raise SurgeryPipelineError(
                "VOICE_AUDIO_INVALID",
                f"音频大小超过限制(最大 {max_bytes} 字节)。",
            )

        # Configuration problems are reported without writing an audit row.
        if not self._minio.configured:
            raise SurgeryPipelineError(
                "MINIO_NOT_CONFIGURED",
                "服务端未配置 MinIO无法保存语音追溯文件。",
            )
        if not self._baidu.configured:
            raise SurgeryPipelineError(
                "BAIDU_NOT_CONFIGURED",
                "服务端未配置百度语音,无法进行语音识别。",
            )

        pending = self._sessions.get_pending_confirmation_by_id(
            surgery_id, confirmation_id
        )
        if pending is None:
            raise SurgeryPipelineError(
                "CONFIRMATION_NOT_FOUND",
                "未找到该待确认项或已处理。",
            )

        # Blank labels are dropped for matching but kept in the audit snapshot.
        option_labels = [
            label.strip() for label, _ in pending.options if label.strip()
        ]
        options_snapshot = json.dumps(
            [
                {"label": label, "confidence": confidence}
                for label, confidence in pending.options
            ],
            ensure_ascii=False,
        )

        # Upload happens before decoding, so the audit row for undecodable
        # audio still carries the stored object's key.
        try:
            await run_in_threadpool(self._minio.ensure_bucket)
            stored: StoredAudio = await run_in_threadpool(
                lambda: self._minio.upload_voice_wav(
                    surgery_id=surgery_id,
                    confirmation_id=confirmation_id,
                    data=wav_bytes,
                    content_type=content_type,
                )
            )
        except Exception as exc:
            logger.warning("MinIO upload failed: {}", exc)
            await self._record_outcome(
                **base,
                stored=None,
                options_snapshot=options_snapshot,
                status="upload_failed",
                error=str(exc),
            )
            raise SurgeryPipelineError(
                "MINIO_UPLOAD_FAILED",
                f"语音文件上传失败:{exc}",
            ) from exc

        # From here on every audit row references the stored object.
        audited = {**base, "stored": stored, "options_snapshot": options_snapshot}

        try:
            pcm = await run_in_threadpool(wav_bytes_to_pcm16k_mono_s16le, wav_bytes)
        except WavDecodeError as exc:
            await self._record_outcome(
                **audited, status="invalid_audio", error=str(exc)
            )
            raise SurgeryPipelineError(
                "VOICE_AUDIO_INVALID",
                f"无法解析 WAV 音频:{exc}",
            ) from exc

        try:
            asr_payload = await run_in_threadpool(
                self._baidu.asr, pcm, "pcm", 16000, None
            )
        except BaiduSpeechNotConfiguredError as exc:
            # Like the up-front configuration checks: no audit row, no trace.
            raise SurgeryPipelineError(
                "BAIDU_NOT_CONFIGURED",
                str(exc),
            ) from exc
        except Exception as exc:
            await self._record_outcome(
                **audited, status="asr_failed", error=str(exc)
            )
            raise SurgeryPipelineError(
                "VOICE_ASR_FAILED",
                f"语音识别调用失败:{exc}",
            ) from exc

        text, asr_error = self._parse_asr_payload(asr_payload)
        if asr_error is not None:
            await self._record_outcome(
                **audited, status="asr_failed", error=asr_error
            )
            raise SurgeryPipelineError("VOICE_ASR_FAILED", asr_error)

        self._sessions.record_voice_trace(surgery_id, asr_text=text, error=None)

        # A rejection phrase takes precedence; option matching is skipped then.
        rejected = is_rejection_phrase(text)
        chosen: str | None = None
        if not rejected:
            chosen = parse_voice_choice(text, option_labels)
        if not rejected and not chosen:
            msg = "无法从语音中匹配候选项,请重试或说「不是」否认全部"
            await self._record_outcome(
                **audited, status="parse_failed", asr_text=text, error=msg
            )
            raise SurgeryPipelineError("VOICE_PARSE_FAILED", msg)

        await self._sessions.resolve_pending_confirmation(
            surgery_id,
            confirmation_id,
            chosen_label=chosen,
            rejected=rejected,
        )

        # The trace for the recognized text was already recorded above.
        await self._record_outcome(
            **audited,
            status="rejected" if rejected else "recognized",
            asr_text=text,
            resolved_label=None if rejected else chosen,
            trace=False,
        )

        if rejected:
            return VoiceResolveResult(
                resolved_label=None,
                rejected=True,
                asr_text=text,
                audio_object_key=stored.object_key,
                message="已否认全部候选,未记消耗。",
            )
        return VoiceResolveResult(
            resolved_label=chosen,
            rejected=False,
            asr_text=text,
            audio_object_key=stored.object_key,
            message="已确认并记一条消耗。",
        )

    @staticmethod
    def _parse_asr_payload(payload: object) -> tuple[str, str | None]:
        """Validate an ASR response and extract the recognized text.

        Returns:
            ``(text, None)`` with the stripped first result on success, or
            ``("", error_message)`` when the payload is not a dict, reports a
            non-zero ``err_no``, or yields no text.
        """
        if not isinstance(payload, dict):
            return "", "ASR 返回格式异常"
        if payload.get("err_no") != 0:
            return "", (
                f"asr_err_{payload.get('err_no')}: "
                f"{payload.get('err_msg')}"
            )

        results = payload.get("result")
        raw: str | None = None
        if isinstance(results, list) and results:
            raw = str(results[0])
        elif isinstance(results, str):
            raw = results

        text = (raw or "").strip()
        if not text:
            return "", "语音识别结果为空"
        return text, None

    async def _record_outcome(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        content_type: str | None,
        upload_size: int,
        stored: StoredAudio | None,
        options_snapshot: str | None,
        status: str,
        asr_text: str | None = None,
        resolved_label: str | None = None,
        error: str | None = None,
        trace: bool = True,
    ) -> None:
        """Persist one audit row and, if ``trace`` is set, record a voice trace.

        When ``stored`` is None (nothing was uploaded) the audit row carries no
        object key or hash and falls back to ``upload_size`` for the size.
        """
        uploaded = stored is not None
        await self._persist_audit(
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            status=status,
            audio_object_key=stored.object_key if uploaded else None,
            audio_content_type=content_type,
            audio_size_bytes=stored.size_bytes if uploaded else upload_size,
            audio_sha256=stored.sha256_hex if uploaded else None,
            asr_text=asr_text,
            resolved_label=resolved_label,
            options_snapshot_json=options_snapshot,
            error_message=error,
        )
        if trace:
            self._sessions.record_voice_trace(
                surgery_id, asr_text=asr_text, error=error
            )

    async def _persist_audit(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        status: str,
        audio_object_key: str | None,
        audio_content_type: str | None,
        audio_size_bytes: int | None,
        audio_sha256: str | None,
        asr_text: str | None,
        resolved_label: str | None,
        options_snapshot_json: str | None,
        error_message: str | None,
    ) -> None:
        """Save one audit row in its own session and transaction.

        Best-effort: any failure is logged and swallowed so that an audit
        problem never masks the outcome of the voice request itself.
        """
        try:
            async with AsyncSessionLocal() as session:
                async with session.begin():
                    await self._audits.save_audit(
                        session,
                        surgery_id=surgery_id,
                        confirmation_id=confirmation_id,
                        status=status,
                        audio_object_key=audio_object_key,
                        audio_content_type=audio_content_type,
                        audio_size_bytes=audio_size_bytes,
                        audio_sha256=audio_sha256,
                        asr_text=asr_text,
                        resolved_label=resolved_label,
                        options_snapshot_json=options_snapshot_json,
                        error_message=error_message,
                    )
        except Exception as exc:
            # logger.exception keeps the traceback, which a plain error log of
            # str(exc) would drop.
            logger.exception("Persist voice audit failed: {}", exc)