Files
operating-room-monitor-server/app/services/voice_resolution.py
Kevin 69980d8073 feat: align surgery API with schemas and extend client tooling
- Refactor app API and schemas; adjust surgery pipeline, repository, and session manager.

- Improve consumption TSV logging and consumable vision integration; trim voice resolution.

- Add Baidu Face 1:N search script, .env.example entries, and client API integration doc.

- Update demo client, staging checklist, surgery interface doc, and related tests; add sample face image.

Made-with: Cursor
2026-04-23 16:09:20 +08:00

696 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Resolve pending consumable confirmation from uploaded WAV: MinIO + Baidu ASR + parse."""
from __future__ import annotations
import json
from dataclasses import dataclass
from fastapi.concurrency import run_in_threadpool
from loguru import logger
from app.config import Settings
from app.services.voice_file_log import emit_voice_event
from app.database import AsyncSessionLocal
from app.repositories.voice_audits import VoiceAuditRepository
from app.services.audio_wav import WavDecodeError, wav_bytes_to_pcm16k_mono_s16le
from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService
from app.services.minio_audio_storage import MinioAudioStorageService, StoredAudio
from app.services.video.session_manager import CameraSessionManager
from app.services.voice_confirm import (
is_rejection_phrase,
match_voice_choice_against_candidates,
parse_voice_choice,
)
from app.surgery_errors import SurgeryPipelineError
@dataclass(frozen=True)
class VoiceResolveResult:
    """Immutable outcome of resolving one pending confirmation from a reply."""

    # Label that was confirmed; None when the reply rejected every candidate.
    resolved_label: str | None
    # True when the reply was a rejection of all candidates.
    rejected: bool
    # Recognized text the decision was based on.
    asr_text: str | None
    # Object key of the stored audio; None when no audio was uploaded
    # (the recognized-text path).
    audio_object_key: str | None
    # Status message describing the outcome, returned to the caller.
    message: str
class VoiceConfirmationService:
    """Resolve a pending consumable confirmation from a spoken or typed reply.

    The WAV path stores the audio in MinIO, runs Baidu ASR on it, parses the
    recognized text into a choice (or a rejection) and resolves the pending
    queue entry.  The text path skips storage and ASR and applies the same
    parsing rules to text recognized on the client.

    Failures are written to the audit repository and to the voice trace log
    before a ``SurgeryPipelineError`` is raised.
    """

    def __init__(
        self,
        settings: Settings,
        sessions: CameraSessionManager,
        baidu: BaiduSpeechService,
        minio: MinioAudioStorageService,
        audits: VoiceAuditRepository,
    ) -> None:
        """Store the collaborators; performs no I/O."""
        self._s = settings
        self._sessions = sessions
        self._baidu = baidu
        self._minio = minio
        self._audits = audits

    def _emit_voice_trace(
        self,
        *,
        source: str,
        status: str,
        surgery_id: str,
        confirmation_id: str,
        asr_text: str | None = None,
        resolved_label: str | None = None,
        rejected: bool | str | None = None,
        error_message: str | None = None,
        audio_object_key: str | None = None,
    ) -> None:
        """Forward one voice event to ``emit_voice_event`` with the settings."""
        emit_voice_event(
            self._s,
            surgery_id=surgery_id,
            source=source,
            status=status,
            confirmation_id=confirmation_id,
            asr_text=asr_text,
            resolved_label=resolved_label,
            rejected=rejected,
            error_message=error_message,
            audio_object_key=audio_object_key,
        )

    def synthesize_prompt_to_mp3(self, text: str) -> bytes:
        """Synthesize *text* with Baidu online TTS for direct browser playback.

        Uses the same parameters as ``voice_confirm._synthesize_to_temp_mp3``.

        Raises:
            SurgeryPipelineError: ``TTS_TEXT_EMPTY`` when *text* is blank,
                ``BAIDU_NOT_CONFIGURED`` when Baidu speech is not configured,
                ``TTS_ERROR`` when the service returns a dict instead of audio.
        """
        prompt = (text or "").strip()
        if not prompt:
            raise SurgeryPipelineError("TTS_TEXT_EMPTY", "提示文本为空。")
        try:
            # NOTE(review): presumably mirrors the Baidu SDK's
            # synthesis(text, lang, ctp, options) — confirm against
            # BaiduSpeechService.
            audio = self._baidu.synthesis(
                prompt, "zh", 1, {"spd": 5, "pit": 5, "vol": 9, "per": 0}
            )
        except BaiduSpeechNotConfiguredError as exc:
            raise SurgeryPipelineError(
                "BAIDU_NOT_CONFIGURED",
                "服务端未配置百度语音，无法合成播报音频。",
            ) from exc
        # A dict result is treated as an error payload rather than audio.
        if isinstance(audio, dict):
            raise SurgeryPipelineError("TTS_ERROR", f"百度 TTS 失败: {audio!r}")
        return audio

    async def resolve_from_wav(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        wav_bytes: bytes,
        filename: str,
        content_type: str | None,
    ) -> VoiceResolveResult:
        """Resolve a pending confirmation from an uploaded WAV recording.

        Steps: size check, configuration checks, pending lookup, MinIO upload,
        WAV decode to 16 kHz PCM, Baidu ASR, reply parsing, queue resolution.

        Args:
            surgery_id: Surgery the confirmation belongs to.
            confirmation_id: Identifier of the pending confirmation.
            wav_bytes: Raw bytes of the uploaded recording.
            filename: Uploaded file name; currently unused.
            content_type: MIME type reported by the client, if any.

        Returns:
            The resolution outcome (confirmed label or rejection).

        Raises:
            SurgeryPipelineError: with one of the codes ``VOICE_AUDIO_INVALID``,
                ``MINIO_NOT_CONFIGURED``, ``BAIDU_NOT_CONFIGURED``,
                ``CONFIRMATION_NOT_FOUND``, ``MINIO_UPLOAD_FAILED``,
                ``VOICE_ASR_FAILED`` or ``VOICE_PARSE_FAILED``.
        """
        _ = filename  # reserved for logging / future MIME sniff

        max_bytes = self._s.voice_upload_max_bytes
        if len(wav_bytes) > max_bytes:
            # Oversized uploads are audited but not added to the session trace.
            await self._record_failure(
                source="wav",
                status="invalid_audio",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                error_message="音频超过大小限制",
                audio_content_type=content_type,
                audio_size_bytes=len(wav_bytes),
                session_trace=False,
            )
            raise SurgeryPipelineError(
                "VOICE_AUDIO_INVALID",
                f"音频大小超过限制（最大 {max_bytes} 字节）。",
            )

        if not self._minio.configured:
            # NOTE(review): this message looks like it is missing a comma
            # after "MinIO"; left byte-identical because clients may see it.
            raise self._traced_error(
                source="wav",
                status="minio_not_configured",
                code="MINIO_NOT_CONFIGURED",
                message="服务端未配置 MinIO无法保存语音追溯文件。",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
            )
        if not self._baidu.configured:
            raise self._traced_error(
                source="wav",
                status="baidu_not_configured",
                code="BAIDU_NOT_CONFIGURED",
                message="服务端未配置百度语音，无法进行语音识别。",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
            )

        option_labels, options_snapshot = self._require_pending(
            source="wav",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
        )

        try:
            await run_in_threadpool(self._minio.ensure_bucket)
            stored: StoredAudio = await run_in_threadpool(
                self._minio.upload_voice_wav,
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                data=wav_bytes,
                content_type=content_type,
            )
        except Exception as exc:
            logger.warning("MinIO upload failed: {}", exc)
            # Nothing was stored, so the audit carries only the upload size.
            await self._record_failure(
                source="wav",
                status="upload_failed",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                error_message=str(exc),
                options_snapshot_json=options_snapshot,
                audio_content_type=content_type,
                audio_size_bytes=len(wav_bytes),
            )
            raise SurgeryPipelineError(
                "MINIO_UPLOAD_FAILED",
                f"语音文件上传失败：{exc}",
            ) from exc

        async def fail(
            status: str, message: str, *, asr_text: str | None = None
        ) -> None:
            """Record a failure that happened after the audio was stored."""
            await self._record_failure(
                source="wav",
                status=status,
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                error_message=message,
                asr_text=asr_text,
                options_snapshot_json=options_snapshot,
                audio_object_key=stored.object_key,
                audio_content_type=content_type,
                audio_size_bytes=stored.size_bytes,
                audio_sha256=stored.sha256_hex,
            )

        try:
            pcm = await run_in_threadpool(wav_bytes_to_pcm16k_mono_s16le, wav_bytes)
        except WavDecodeError as exc:
            await fail("invalid_audio", str(exc))
            raise SurgeryPipelineError(
                "VOICE_AUDIO_INVALID",
                f"无法解析 WAV 音频：{exc}",
            ) from exc

        try:
            asr_payload = await run_in_threadpool(
                self._baidu.asr, pcm, "pcm", 16000, None
            )
        except BaiduSpeechNotConfiguredError as exc:
            # Like the earlier configuration checks: traced, but not audited.
            raise self._traced_error(
                source="wav",
                status="baidu_not_configured",
                code="BAIDU_NOT_CONFIGURED",
                message=str(exc),
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                audio_object_key=stored.object_key,
            ) from exc
        except Exception as exc:
            await fail("asr_failed", str(exc))
            raise SurgeryPipelineError(
                "VOICE_ASR_FAILED",
                f"语音识别调用失败：{exc}",
            ) from exc

        text, asr_error = self._parse_asr_payload(asr_payload)
        if asr_error is not None:
            await fail("asr_failed", asr_error)
            raise SurgeryPipelineError("VOICE_ASR_FAILED", asr_error)

        self._sessions.record_voice_trace(surgery_id, asr_text=text, error=None)
        rejected, chosen = self._match_reply(surgery_id, text, option_labels)

        if not rejected and not chosen:
            _, retry_remaining = await self._sessions.record_voice_parse_failure(
                surgery_id, confirmation_id
            )
            base = (
                "无法从语音中匹配候选项或本台手术候选清单中的耗材名称，"
                "请重试或说「不是」否认全部。"
            )
            if retry_remaining > 0:
                msg = (
                    f"{base} 本次未听清或未能解析，"
                    f"您还可重试 {retry_remaining} 次，"
                    "请说「第一个」「第二个」等序号或候选项全名。"
                )
            else:
                msg = (
                    f"{base} 本轮重试机会已用完，"
                    "请再清晰地说序号/全名，或说「不是」否认全部。"
                )
            await fail("parse_failed", msg, asr_text=text)
            raise SurgeryPipelineError(
                "VOICE_PARSE_FAILED",
                msg,
                extra={
                    "confirmation_id": confirmation_id,
                    "retry_remaining": retry_remaining,
                },
            )

        return await self._apply_resolution(
            source="wav",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            text=text,
            chosen=chosen,
            rejected=rejected,
            options_snapshot_json=options_snapshot,
            audio_object_key=stored.object_key,
            audio_content_type=content_type,
            audio_size_bytes=stored.size_bytes,
            audio_sha256=stored.sha256_hex,
        )

    async def resolve_from_recognized_text(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        recognized_text: str,
    ) -> VoiceResolveResult:
        """Resolve a pending confirmation from text recognized on the client.

        Intended for text produced by client-side recognition such as the
        browser Web Speech API.  Skips MinIO and Baidu ASR; the parsing rules
        are the same as in ``resolve_from_wav``.

        Raises:
            SurgeryPipelineError: ``CONFIRMATION_NOT_FOUND``,
                ``VOICE_TEXT_EMPTY`` or ``VOICE_PARSE_FAILED``.
        """
        option_labels, options_snapshot = self._require_pending(
            source="text",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
        )

        text = (recognized_text or "").strip()
        if not text:
            await self._record_failure(
                source="text",
                status="client_stt_empty",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                error_message="客户端识别文本为空",
                options_snapshot_json=options_snapshot,
                # The session trace uses its own short English marker here.
                session_error="empty text",
            )
            raise SurgeryPipelineError("VOICE_TEXT_EMPTY", "recognized_text 为空。")

        self._sessions.record_voice_trace(surgery_id, asr_text=text, error=None)
        rejected, chosen = self._match_reply(surgery_id, text, option_labels)

        if not rejected and not chosen:
            _, retry_remaining = await self._sessions.record_voice_parse_failure(
                surgery_id, confirmation_id
            )
            base = (
                "无法从文本中匹配候选项或本台手术候选清单中的耗材名称，"
                "请重试或说「不是」否认全部。"
            )
            if retry_remaining > 0:
                msg = (
                    f"{base} 本次未能解析，"
                    f"您还可重试 {retry_remaining} 次，"
                    "请输入「第一个」「第二个」等或候选项全名。"
                )
            else:
                msg = (
                    f"{base} 本轮重试机会已用完，"
                    "请再输入序号/全名，或说「不是」否认全部。"
                )
            await self._record_failure(
                source="text",
                status="client_stt_parse_failed",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                error_message=msg,
                asr_text=text,
                options_snapshot_json=options_snapshot,
            )
            raise SurgeryPipelineError(
                "VOICE_PARSE_FAILED",
                msg,
                extra={
                    "confirmation_id": confirmation_id,
                    "retry_remaining": retry_remaining,
                },
            )

        return await self._apply_resolution(
            source="text",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            text=text,
            chosen=chosen,
            rejected=rejected,
            options_snapshot_json=options_snapshot,
        )

    def _traced_error(
        self,
        *,
        source: str,
        status: str,
        code: str,
        message: str,
        surgery_id: str,
        confirmation_id: str,
        audio_object_key: str | None = None,
    ) -> SurgeryPipelineError:
        """Emit a trace event and return the matching error for the caller.

        The exception is returned rather than raised so the caller can use
        ``raise ... from exc`` and keep the raise site in its own frame.
        """
        self._emit_voice_trace(
            source=source,
            status=status,
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            error_message=message,
            audio_object_key=audio_object_key,
        )
        return SurgeryPipelineError(code, message)

    def _require_pending(
        self,
        *,
        source: str,
        surgery_id: str,
        confirmation_id: str,
    ) -> tuple[list[str], str]:
        """Look up the pending confirmation and describe its options.

        Returns:
            ``(option_labels, options_snapshot_json)``: the stripped non-empty
            labels used for parsing, and a JSON snapshot of every
            label/confidence pair for the audit record.

        Raises:
            SurgeryPipelineError: ``CONFIRMATION_NOT_FOUND`` when no pending
                entry exists for the given ids.
        """
        pending = self._sessions.get_pending_confirmation_by_id(
            surgery_id, confirmation_id
        )
        if pending is None:
            raise self._traced_error(
                source=source,
                status="confirmation_not_found",
                code="CONFIRMATION_NOT_FOUND",
                message="未找到该待确认项或已处理。",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
            )
        option_labels = [
            label.strip() for label, _ in pending.options if label.strip()
        ]
        # The snapshot keeps the labels exactly as stored (not stripped).
        options_snapshot = json.dumps(
            [
                {"label": label, "confidence": confidence}
                for label, confidence in pending.options
            ],
            ensure_ascii=False,
        )
        return option_labels, options_snapshot

    @staticmethod
    def _parse_asr_payload(payload: object) -> tuple[str, str | None]:
        """Extract the recognized text from a Baidu ASR response.

        Returns:
            ``(text, None)`` on success, or ``("", error_message)`` when the
            payload is not a dict, reports a non-zero ``err_no``, or carries
            no non-blank result.
        """
        if not isinstance(payload, dict):
            return "", "ASR 返回格式异常"
        err_no = payload.get("err_no")
        if err_no != 0:
            return "", f"asr_err_{err_no}: {payload.get('err_msg')}"
        results = payload.get("result")
        if isinstance(results, list) and results:
            # Only the first entry of a list result is used.
            text = str(results[0])
        elif isinstance(results, str):
            text = results
        else:
            text = ""
        text = text.strip()
        if not text:
            return "", "语音识别结果为空"
        return text, None

    def _match_reply(
        self,
        surgery_id: str,
        text: str,
        option_labels: list[str],
    ) -> tuple[bool, str | None]:
        """Interpret *text* as a rejection or as a choice.

        The pending options are tried first; if none matches, the surgery's
        candidate consumable list is used as a fallback.

        Returns:
            ``(rejected, chosen)``; ``chosen`` is ``None`` on rejection or
            when nothing matched.
        """
        rejected = is_rejection_phrase(text)
        if rejected:
            return rejected, None
        chosen = parse_voice_choice(text, option_labels)
        if chosen is None:
            surgery_candidates = self._sessions.get_surgery_candidate_consumables(
                surgery_id
            )
            chosen = match_voice_choice_against_candidates(text, surgery_candidates)
        return rejected, chosen

    async def _record_failure(
        self,
        *,
        source: str,
        status: str,
        surgery_id: str,
        confirmation_id: str,
        error_message: str,
        asr_text: str | None = None,
        options_snapshot_json: str | None = None,
        audio_object_key: str | None = None,
        audio_content_type: str | None = None,
        audio_size_bytes: int | None = None,
        audio_sha256: str | None = None,
        session_trace: bool = True,
        session_error: str | None = None,
    ) -> None:
        """Audit a failed attempt, then trace it in the session and the log.

        The order is fixed: audit row, session trace, voice event.  The caller
        raises the error afterwards.

        Args:
            session_trace: When false, the session trace step is skipped.
            session_error: Error text for the session trace; defaults to
                *error_message*.
        """
        await self._persist_audit(
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            status=status,
            audio_object_key=audio_object_key,
            audio_content_type=audio_content_type,
            audio_size_bytes=audio_size_bytes,
            audio_sha256=audio_sha256,
            asr_text=asr_text,
            resolved_label=None,
            options_snapshot_json=options_snapshot_json,
            error_message=error_message,
        )
        if session_trace:
            self._sessions.record_voice_trace(
                surgery_id,
                asr_text=asr_text,
                error=error_message if session_error is None else session_error,
            )
        self._emit_voice_trace(
            source=source,
            status=status,
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            asr_text=asr_text,
            error_message=error_message,
            audio_object_key=audio_object_key,
        )

    async def _apply_resolution(
        self,
        *,
        source: str,
        surgery_id: str,
        confirmation_id: str,
        text: str,
        chosen: str | None,
        rejected: bool,
        options_snapshot_json: str,
        audio_object_key: str | None = None,
        audio_content_type: str | None = None,
        audio_size_bytes: int | None = None,
        audio_sha256: str | None = None,
    ) -> VoiceResolveResult:
        """Resolve the queue entry, audit and trace it, and build the result."""
        await self._sessions.resolve_pending_confirmation(
            surgery_id,
            confirmation_id,
            chosen_label=chosen,
            rejected=rejected,
        )
        final_status = "rejected" if rejected else "recognized"
        resolved_label = None if rejected else chosen
        await self._persist_audit(
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            status=final_status,
            audio_object_key=audio_object_key,
            audio_content_type=audio_content_type,
            audio_size_bytes=audio_size_bytes,
            audio_sha256=audio_sha256,
            asr_text=text,
            resolved_label=resolved_label,
            options_snapshot_json=options_snapshot_json,
            error_message=None,
        )
        self._emit_voice_trace(
            source=source,
            status=final_status,
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            asr_text=text,
            resolved_label=resolved_label,
            rejected=rejected,
            audio_object_key=audio_object_key,
        )
        if rejected:
            return VoiceResolveResult(
                resolved_label=None,
                rejected=True,
                asr_text=text,
                audio_object_key=audio_object_key,
                message="已否认全部候选，未记消耗。",
            )
        return VoiceResolveResult(
            resolved_label=chosen,
            rejected=False,
            asr_text=text,
            audio_object_key=audio_object_key,
            message="已确认并记一条消耗。",
        )

    async def _persist_audit(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        status: str,
        audio_object_key: str | None,
        audio_content_type: str | None,
        audio_size_bytes: int | None,
        audio_sha256: str | None,
        asr_text: str | None,
        resolved_label: str | None,
        options_snapshot_json: str | None,
        error_message: str | None,
    ) -> None:
        """Save one audit row in its own session and transaction.

        Best-effort: any exception is logged and swallowed so that an audit
        failure never masks the outcome of the voice flow itself.
        """
        try:
            async with AsyncSessionLocal() as session:
                async with session.begin():
                    await self._audits.save_audit(
                        session,
                        surgery_id=surgery_id,
                        confirmation_id=confirmation_id,
                        status=status,
                        audio_object_key=audio_object_key,
                        audio_content_type=audio_content_type,
                        audio_size_bytes=audio_size_bytes,
                        audio_sha256=audio_sha256,
                        asr_text=asr_text,
                        resolved_label=resolved_label,
                        options_snapshot_json=options_snapshot_json,
                        error_message=error_message,
                    )
        except Exception as exc:
            # logger.exception keeps the ERROR level and adds the traceback.
            logger.exception("Persist voice audit failed: {}", exc)