Files
operating-room-monitor-server/app/services/voice_resolution.py
Kevin 0c05463617 feat: 语音确认、联调与运维增强
- 语音:序数解析(第一个/第二个等)、解析失败计数与 API detail.retry_remaining;
  百度 ASR 固定 dev_pid 为普通话;SurgeryPipelineError 支持 extra 并入 HTTP detail。
- Demo:demo 路由与假 RTSP、客户端 index 与 README;BackendResolver 与配置调整。
- 可观测:消耗 TSV 日志、语音文件日志、终端 Markdown 辅助;相关测试与依赖更新。
- 注意:.env 仍被 gitignore,本地密钥不会进入本提交。

Made-with: Cursor
2026-04-23 14:24:20 +08:00

713 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Resolve pending consumable confirmation from uploaded WAV: MinIO + Baidu ASR + parse."""
from __future__ import annotations
import json
from dataclasses import dataclass
from fastapi.concurrency import run_in_threadpool
from loguru import logger
from app.config import Settings
from app.services.voice_file_log import emit_voice_event
from app.database import AsyncSessionLocal
from app.db.models import VoiceConfirmationAudit
from app.repositories.voice_audits import VoiceAuditRepository
from app.services.audio_wav import WavDecodeError, wav_bytes_to_pcm16k_mono_s16le
from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService
from app.services.minio_audio_storage import MinioAudioStorageService, StoredAudio
from app.services.video.session_manager import CameraSessionManager
from app.services.voice_confirm import (
is_rejection_phrase,
match_voice_choice_against_candidates,
parse_voice_choice,
)
from app.surgery_errors import SurgeryPipelineError
@dataclass(frozen=True)
class VoiceResolveResult:
    """Immutable outcome of resolving one pending consumable confirmation."""

    # Label that was confirmed; None when the speaker rejected all candidates.
    resolved_label: str | None
    # True when the recognized text was classified as a rejection phrase.
    rejected: bool
    # Recognized text (from Baidu ASR or supplied by the client), stripped.
    asr_text: str | None
    # Object key of the stored recording; None on the text-only path.
    audio_object_key: str | None
    # Human-readable summary of what was done.
    message: str
class VoiceConfirmationService:
    """Upload audio to MinIO, run Baidu ASR, parse choice, resolve pending queue entry.

    Besides the WAV flow, the service also resolves confirmations from text
    that was already recognized on the client, synthesizes spoken prompts
    through Baidu TTS, and lists stored voice-confirmation audit rows.
    """
def __init__(
    self,
    settings: Settings,
    sessions: CameraSessionManager,
    baidu: BaiduSpeechService,
    minio: MinioAudioStorageService,
    audits: VoiceAuditRepository,
) -> None:
    """Keep references to the collaborators used by the voice flows.

    Args:
        settings: Application settings (read for the upload size limit and
            passed along when emitting voice events).
        sessions: Session manager holding pending confirmations and traces.
        baidu: Baidu speech client used for ASR and TTS.
        minio: Storage service that keeps the uploaded recordings.
        audits: Repository used to save and list audit rows.
    """
    # External speech/storage clients.
    self._baidu, self._minio = baidu, minio
    # In-process session state and the audit repository.
    self._sessions, self._audits = sessions, audits
    self._s = settings
def _emit_voice_trace(
    self,
    *,
    source: str,
    status: str,
    surgery_id: str,
    confirmation_id: str,
    asr_text: str | None = None,
    resolved_label: str | None = None,
    rejected: bool | str | None = None,
    error_message: str | None = None,
    audio_object_key: str | None = None,
) -> None:
    """Forward one voice-confirmation event to ``emit_voice_event``.

    All arguments are passed through unchanged, together with the service
    settings; optional fields default to ``None``.
    """
    event_fields = {
        "surgery_id": surgery_id,
        "source": source,
        "status": status,
        "confirmation_id": confirmation_id,
        "asr_text": asr_text,
        "resolved_label": resolved_label,
        "rejected": rejected,
        "error_message": error_message,
        "audio_object_key": audio_object_key,
    }
    emit_voice_event(self._s, **event_fields)
def synthesize_prompt_to_mp3(self, text: str) -> bytes:
    """Synthesize *text* with Baidu online TTS for direct playback in a browser.

    Uses the same synthesis parameters as
    ``voice_confirm._synthesize_to_temp_mp3``.

    Args:
        text: Prompt to speak; surrounding whitespace is stripped.

    Returns:
        The value returned by the Baidu client when it is not a dict.

    Raises:
        SurgeryPipelineError: ``TTS_TEXT_EMPTY`` when the prompt is blank,
            ``BAIDU_NOT_CONFIGURED`` when the client reports it is not
            configured, ``TTS_ERROR`` when the client returns a dict.
    """
    prompt = (text or "").strip()
    if not prompt:
        raise SurgeryPipelineError("TTS_TEXT_EMPTY", "提示文本为空。")

    tts_options = {"spd": 5, "pit": 5, "vol": 9, "per": 0}
    try:
        audio = self._baidu.synthesis(prompt, "zh", 1, tts_options)
    except BaiduSpeechNotConfiguredError as exc:
        raise SurgeryPipelineError(
            "BAIDU_NOT_CONFIGURED",
            "服务端未配置百度语音,无法合成播报音频。",
        ) from exc

    # A dict result is treated as an error payload rather than audio.
    if not isinstance(audio, dict):
        return audio
    raise SurgeryPipelineError("TTS_ERROR", f"百度 TTS 失败: {audio!r}")
async def resolve_from_wav(
    self,
    *,
    surgery_id: str,
    confirmation_id: str,
    wav_bytes: bytes,
    filename: str,
    content_type: str | None,
) -> VoiceResolveResult:
    """Resolve a pending confirmation from an uploaded WAV recording.

    Steps, in order: enforce the upload size limit, check that MinIO and
    Baidu are configured, look up the pending confirmation, upload the
    recording to MinIO, decode it with ``wav_bytes_to_pcm16k_mono_s16le``,
    run Baidu ASR, then interpret the recognized text as a rejection or a
    choice and resolve the pending entry.

    Args:
        surgery_id: Surgery the confirmation belongs to.
        confirmation_id: Identifier of the pending confirmation.
        wav_bytes: Raw uploaded audio.
        filename: Currently unused; reserved for logging / MIME sniffing.
        content_type: Content type reported for the upload, stored as-is.

    Returns:
        A ``VoiceResolveResult`` describing the confirmed label or rejection.

    Raises:
        SurgeryPipelineError: with code ``VOICE_AUDIO_INVALID``,
            ``MINIO_NOT_CONFIGURED``, ``BAIDU_NOT_CONFIGURED``,
            ``CONFIRMATION_NOT_FOUND``, ``MINIO_UPLOAD_FAILED``,
            ``VOICE_ASR_FAILED`` or ``VOICE_PARSE_FAILED``; the last one
            carries ``confirmation_id`` and ``retry_remaining`` in ``extra``.
    """
    _ = filename  # reserved for logging / future MIME sniff

    # --- Pre-flight checks: nothing has been uploaded yet -----------------
    if len(wav_bytes) > self._s.voice_upload_max_bytes:
        too_large = "音频超过大小限制"
        await self._persist_audit(
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            status="invalid_audio",
            audio_object_key=None,
            audio_content_type=content_type,
            audio_size_bytes=len(wav_bytes),
            audio_sha256=None,
            asr_text=None,
            resolved_label=None,
            options_snapshot_json=None,
            error_message=too_large,
        )
        self._emit_voice_trace(
            source="wav",
            status="invalid_audio",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            error_message=too_large,
        )
        raise SurgeryPipelineError(
            "VOICE_AUDIO_INVALID",
            f"音频大小超过限制(最大 {self._s.voice_upload_max_bytes} 字节)。",
        )

    if not self._minio.configured:
        no_minio = "服务端未配置 MinIO无法保存语音追溯文件。"
        self._emit_voice_trace(
            source="wav",
            status="minio_not_configured",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            error_message=no_minio,
        )
        raise SurgeryPipelineError("MINIO_NOT_CONFIGURED", no_minio)

    if not self._baidu.configured:
        no_baidu = "服务端未配置百度语音,无法进行语音识别。"
        self._emit_voice_trace(
            source="wav",
            status="baidu_not_configured",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            error_message=no_baidu,
        )
        raise SurgeryPipelineError("BAIDU_NOT_CONFIGURED", no_baidu)

    pending = self._sessions.get_pending_confirmation_by_id(
        surgery_id, confirmation_id
    )
    if pending is None:
        not_found = "未找到该待确认项或已处理。"
        self._emit_voice_trace(
            source="wav",
            status="confirmation_not_found",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            error_message=not_found,
        )
        raise SurgeryPipelineError("CONFIRMATION_NOT_FOUND", not_found)

    # Blank labels are dropped for matching but kept in the audit snapshot.
    option_labels = [
        label.strip() for label, _conf in pending.options if label.strip()
    ]
    options_snapshot = json.dumps(
        [
            {"label": label, "confidence": confidence}
            for label, confidence in pending.options
        ],
        ensure_ascii=False,
    )

    async def _fail(
        status: str,
        error_message: str,
        *,
        stored: StoredAudio | None = None,
        asr_text: str | None = None,
    ) -> None:
        # Bind the per-request context once so each failure site stays short.
        await self._record_wav_failure(
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            status=status,
            error_message=error_message,
            content_type=content_type,
            upload_size_bytes=len(wav_bytes),
            options_snapshot=options_snapshot,
            stored=stored,
            asr_text=asr_text,
        )

    # --- Store the recording for traceability -----------------------------
    try:
        await run_in_threadpool(self._minio.ensure_bucket)
        stored = await run_in_threadpool(
            lambda: self._minio.upload_voice_wav(
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                data=wav_bytes,
                content_type=content_type,
            )
        )
    except Exception as exc:
        logger.warning("MinIO upload failed: {}", exc)
        await _fail("upload_failed", str(exc))
        raise SurgeryPipelineError(
            "MINIO_UPLOAD_FAILED",
            f"语音文件上传失败:{exc}",
        ) from exc

    # --- Decode and recognize ---------------------------------------------
    try:
        pcm = await run_in_threadpool(wav_bytes_to_pcm16k_mono_s16le, wav_bytes)
    except WavDecodeError as exc:
        await _fail("invalid_audio", str(exc), stored=stored)
        raise SurgeryPipelineError(
            "VOICE_AUDIO_INVALID",
            f"无法解析 WAV 音频:{exc}",
        ) from exc

    try:
        asr_payload = await run_in_threadpool(
            self._baidu.asr, pcm, "pcm", 16000, None
        )
    except BaiduSpeechNotConfiguredError as exc:
        # NOTE(review): unlike the other post-upload failures, this path only
        # emits a file trace; no audit row and no session trace are written.
        self._emit_voice_trace(
            source="wav",
            status="baidu_not_configured",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            error_message=str(exc),
            audio_object_key=stored.object_key,
        )
        raise SurgeryPipelineError(
            "BAIDU_NOT_CONFIGURED",
            str(exc),
        ) from exc
    except Exception as exc:
        await _fail("asr_failed", str(exc), stored=stored)
        raise SurgeryPipelineError(
            "VOICE_ASR_FAILED",
            f"语音识别调用失败:{exc}",
        ) from exc

    # Validate the ASR payload; every problem maps to the same failure status.
    text = ""
    asr_error: str | None = None
    if not isinstance(asr_payload, dict):
        asr_error = "ASR 返回格式异常"
    elif asr_payload.get("err_no") != 0:
        asr_error = (
            f"asr_err_{asr_payload.get('err_no')}: "
            f"{asr_payload.get('err_msg')}"
        )
    else:
        results = asr_payload.get("result")
        if isinstance(results, list) and results:
            # Only the first entry of a list result is used.
            text = str(results[0]).strip()
        elif isinstance(results, str):
            text = results.strip()
        if not text:
            asr_error = "语音识别结果为空"
    if asr_error is not None:
        await _fail("asr_failed", asr_error, stored=stored)
        raise SurgeryPipelineError("VOICE_ASR_FAILED", asr_error)

    # --- Interpret the recognized text ------------------------------------
    self._sessions.record_voice_trace(surgery_id, asr_text=text, error=None)
    rejected = is_rejection_phrase(text)
    chosen: str | None = None
    if not rejected:
        chosen = parse_voice_choice(text, option_labels)
        if chosen is None:
            # Fall back to the surgery-wide candidate list.
            surgery_candidates = self._sessions.get_surgery_candidate_consumables(
                surgery_id
            )
            chosen = match_voice_choice_against_candidates(
                text, surgery_candidates
            )

    if not rejected and not chosen:
        _, retry_remaining = await self._sessions.record_voice_parse_failure(
            surgery_id, confirmation_id
        )
        base = (
            "无法从语音中匹配候选项或本台手术候选清单中的耗材名称,"
            "请重试或说「不是」否认全部。"
        )
        if retry_remaining > 0:
            msg = (
                f"{base} 本次未听清或未能解析,"
                f"您还可重试 {retry_remaining} 次,"
                "请说「第一个」「第二个」等序号或候选项全名。"
            )
        else:
            msg = (
                f"{base} 本轮重试机会已用完,"
                "请再清晰地说序号/全名,或说「不是」否认全部。"
            )
        await _fail("parse_failed", msg, stored=stored, asr_text=text)
        raise SurgeryPipelineError(
            "VOICE_PARSE_FAILED",
            msg,
            extra={
                "confirmation_id": confirmation_id,
                "retry_remaining": retry_remaining,
            },
        )

    # --- Success: resolve the queue entry, then audit and trace -----------
    await self._sessions.resolve_pending_confirmation(
        surgery_id,
        confirmation_id,
        chosen_label=chosen,
        rejected=rejected,
    )
    final_status = "rejected" if rejected else "recognized"
    recorded_label = None if rejected else chosen
    await self._persist_audit(
        surgery_id=surgery_id,
        confirmation_id=confirmation_id,
        status=final_status,
        audio_object_key=stored.object_key,
        audio_content_type=content_type,
        audio_size_bytes=stored.size_bytes,
        audio_sha256=stored.sha256_hex,
        asr_text=text,
        resolved_label=recorded_label,
        options_snapshot_json=options_snapshot,
        error_message=None,
    )
    self._emit_voice_trace(
        source="wav",
        status=final_status,
        surgery_id=surgery_id,
        confirmation_id=confirmation_id,
        asr_text=text,
        resolved_label=recorded_label,
        rejected=rejected,
        audio_object_key=stored.object_key,
    )
    if rejected:
        return VoiceResolveResult(
            resolved_label=None,
            rejected=True,
            asr_text=text,
            audio_object_key=stored.object_key,
            message="已否认全部候选,未记消耗。",
        )
    return VoiceResolveResult(
        resolved_label=chosen,
        rejected=False,
        asr_text=text,
        audio_object_key=stored.object_key,
        message="已确认并记一条消耗。",
    )

async def _record_wav_failure(
    self,
    *,
    surgery_id: str,
    confirmation_id: str,
    status: str,
    error_message: str,
    content_type: str | None,
    upload_size_bytes: int,
    options_snapshot: str | None,
    stored: StoredAudio | None = None,
    asr_text: str | None = None,
) -> None:
    """Record one failed WAV attempt: audit row, session trace, file trace.

    When ``stored`` is given, its object key, size and SHA-256 are recorded;
    otherwise the key and hash are ``None`` and ``upload_size_bytes`` is used
    as the size. The caller remains responsible for raising the error.
    """
    object_key = stored.object_key if stored is not None else None
    await self._persist_audit(
        surgery_id=surgery_id,
        confirmation_id=confirmation_id,
        status=status,
        audio_object_key=object_key,
        audio_content_type=content_type,
        audio_size_bytes=(
            stored.size_bytes if stored is not None else upload_size_bytes
        ),
        audio_sha256=stored.sha256_hex if stored is not None else None,
        asr_text=asr_text,
        resolved_label=None,
        options_snapshot_json=options_snapshot,
        error_message=error_message,
    )
    self._sessions.record_voice_trace(
        surgery_id, asr_text=asr_text, error=error_message
    )
    self._emit_voice_trace(
        source="wav",
        status=status,
        surgery_id=surgery_id,
        confirmation_id=confirmation_id,
        asr_text=asr_text,
        error_message=error_message,
        audio_object_key=object_key,
    )
async def resolve_from_recognized_text(
    self,
    *,
    surgery_id: str,
    confirmation_id: str,
    recognized_text: str,
) -> VoiceResolveResult:
    """Resolve a pending confirmation from text recognized on the client.

    Intended for text produced locally by the client (e.g. browser Web
    Speech). MinIO and Baidu ASR are not involved; the parsing rules are the
    same as in ``resolve_from_wav``.

    Raises:
        SurgeryPipelineError: ``CONFIRMATION_NOT_FOUND``, ``VOICE_TEXT_EMPTY``
            or ``VOICE_PARSE_FAILED`` (with ``confirmation_id`` and
            ``retry_remaining`` in ``extra``).
    """
    pending = self._sessions.get_pending_confirmation_by_id(
        surgery_id, confirmation_id
    )
    if pending is None:
        not_found = "未找到该待确认项或已处理。"
        self._emit_voice_trace(
            source="text",
            status="confirmation_not_found",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            error_message=not_found,
        )
        raise SurgeryPipelineError("CONFIRMATION_NOT_FOUND", not_found)

    # Blank labels are dropped for matching but kept in the audit snapshot.
    option_labels = []
    for raw_label, _confidence in pending.options:
        trimmed = raw_label.strip()
        if trimmed:
            option_labels.append(trimmed)
    options_snapshot = json.dumps(
        [
            {"label": raw_label, "confidence": confidence}
            for raw_label, confidence in pending.options
        ],
        ensure_ascii=False,
    )

    async def audit(
        status: str,
        *,
        asr_text: str | None,
        resolved_label: str | None,
        error_message: str | None,
    ) -> None:
        # Text-only attempts never have stored audio, so those fields are None.
        await self._persist_audit(
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            status=status,
            audio_object_key=None,
            audio_content_type=None,
            audio_size_bytes=None,
            audio_sha256=None,
            asr_text=asr_text,
            resolved_label=resolved_label,
            options_snapshot_json=options_snapshot,
            error_message=error_message,
        )

    text = (recognized_text or "").strip()
    if not text:
        empty_msg = "客户端识别文本为空"
        await audit(
            "client_stt_empty",
            asr_text=None,
            resolved_label=None,
            error_message=empty_msg,
        )
        self._sessions.record_voice_trace(
            surgery_id, asr_text=None, error="empty text"
        )
        self._emit_voice_trace(
            source="text",
            status="client_stt_empty",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            error_message=empty_msg,
        )
        raise SurgeryPipelineError("VOICE_TEXT_EMPTY", "recognized_text 为空。")

    self._sessions.record_voice_trace(surgery_id, asr_text=text, error=None)
    rejected = is_rejection_phrase(text)
    chosen: str | None = None
    if not rejected:
        chosen = parse_voice_choice(text, option_labels)
        if chosen is None:
            # Fall back to the surgery-wide candidate list.
            chosen = match_voice_choice_against_candidates(
                text,
                self._sessions.get_surgery_candidate_consumables(surgery_id),
            )

    if not (rejected or chosen):
        _, retry_remaining = await self._sessions.record_voice_parse_failure(
            surgery_id, confirmation_id
        )
        base = (
            "无法从文本中匹配候选项或本台手术候选清单中的耗材名称,"
            "请重试或说「不是」否认全部。"
        )
        if retry_remaining > 0:
            msg = (
                f"{base} 本次未能解析,"
                f"您还可重试 {retry_remaining} 次,"
                "请输入「第一个」「第二个」等或候选项全名。"
            )
        else:
            msg = (
                f"{base} 本轮重试机会已用完,"
                "请再输入序号/全名,或说「不是」否认全部。"
            )
        await audit(
            "client_stt_parse_failed",
            asr_text=text,
            resolved_label=None,
            error_message=msg,
        )
        self._sessions.record_voice_trace(surgery_id, asr_text=text, error=msg)
        self._emit_voice_trace(
            source="text",
            status="client_stt_parse_failed",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            asr_text=text,
            error_message=msg,
        )
        raise SurgeryPipelineError(
            "VOICE_PARSE_FAILED",
            msg,
            extra={
                "confirmation_id": confirmation_id,
                "retry_remaining": retry_remaining,
            },
        )

    await self._sessions.resolve_pending_confirmation(
        surgery_id,
        confirmation_id,
        chosen_label=chosen,
        rejected=rejected,
    )
    final_status = "rejected" if rejected else "recognized"
    recorded_label = None if rejected else chosen
    await audit(
        final_status,
        asr_text=text,
        resolved_label=recorded_label,
        error_message=None,
    )
    self._emit_voice_trace(
        source="text",
        status=final_status,
        surgery_id=surgery_id,
        confirmation_id=confirmation_id,
        asr_text=text,
        resolved_label=recorded_label,
        rejected=rejected,
    )
    if not rejected:
        return VoiceResolveResult(
            resolved_label=chosen,
            rejected=False,
            asr_text=text,
            audio_object_key=None,
            message="已确认并记一条消耗。",
        )
    return VoiceResolveResult(
        resolved_label=None,
        rejected=True,
        asr_text=text,
        audio_object_key=None,
        message="已否认全部候选,未记消耗。",
    )
async def list_voice_audits_for_surgery(
    self,
    surgery_id: str,
    *,
    limit: int = 50,
    offset: int = 0,
) -> tuple[list[VoiceConfirmationAudit], int]:
    """Read one page of ``voice_confirmation_audits`` rows for a surgery.

    Meant for internal queries and reports. Opens a short-lived database
    session and returns whatever ``list_by_surgery`` yields for the given
    ``limit``/``offset`` (annotated as a list of rows plus an int —
    presumably the total count; confirm against the repository).
    """
    async with AsyncSessionLocal() as db:
        page = await self._audits.list_by_surgery(
            db,
            surgery_id,
            limit=limit,
            offset=offset,
        )
        return page
async def _persist_audit(
    self,
    *,
    surgery_id: str,
    confirmation_id: str,
    status: str,
    audio_object_key: str | None,
    audio_content_type: str | None,
    audio_size_bytes: int | None,
    audio_sha256: str | None,
    asr_text: str | None,
    resolved_label: str | None,
    options_snapshot_json: str | None,
    error_message: str | None,
) -> None:
    """Save one audit row inside its own session and transaction.

    Best effort: any exception raised while saving is logged and swallowed
    so that auditing never interrupts the calling flow.
    """
    row = {
        "surgery_id": surgery_id,
        "confirmation_id": confirmation_id,
        "status": status,
        "audio_object_key": audio_object_key,
        "audio_content_type": audio_content_type,
        "audio_size_bytes": audio_size_bytes,
        "audio_sha256": audio_sha256,
        "asr_text": asr_text,
        "resolved_label": resolved_label,
        "options_snapshot_json": options_snapshot_json,
        "error_message": error_message,
    }
    try:
        async with AsyncSessionLocal() as db, db.begin():
            await self._audits.save_audit(db, **row)
    except Exception as exc:
        logger.error("Persist voice audit failed: {}", exc)