Files
operating-room-monitor-server/app/services/voice_resolution.py
Kevin 3d7bd70355 feat: 手术视频消耗、待确认与持久化改造
- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路\n- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理\n- 消耗日志:TSV/Markdown 含 top2/top3;item_id 优先产品编码;待确认记「待确认」行,语音确认后落正式行并更新汇总\n- 待确认时内存/DB 明细为占位行,确认后替换;拒绝时移除占位\n- 分类 probs 先 detach/cpu 再转 NumPy,修复 MPS/CUDA 上推理被静默跳过\n- 补充集成测试、归档与设备张量等单测

Made-with: Cursor
2026-04-23 20:42:21 +08:00

604 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Resolve pending consumable confirmation from uploaded WAV: MinIO + Baidu ASR + parse.
本模块把语音识别流程线性化为一系列「阶段」,每个失败阶段都走 `VoiceAuditEmitter.fail`
统一写审计 + trace + 抛 `SurgeryPipelineError`。成功路径走 `emitter.success`。
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from fastapi.concurrency import run_in_threadpool
from loguru import logger
from sqlalchemy.ext.asyncio import async_sessionmaker
from app.config import Settings
from app.database import AsyncSessionLocal
from app.repositories.voice_audits import VoiceAuditRepository
from app.services.audio_wav import WavDecodeError, wav_bytes_to_pcm16k_mono_s16le
from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService
from app.services.minio_audio_storage import MinioAudioStorageService, StoredAudio
from app.services.pending_confirmation_port import PendingConfirmationStore
from app.services.voice_audit_emitter import VoiceAuditContext, VoiceAuditEmitter
from app.services.voice_confirm import (
is_rejection_phrase,
match_voice_choice_against_candidates,
parse_voice_choice,
)
from app.surgery_errors import SurgeryPipelineError
@dataclass(frozen=True)
class VoiceResolveResult:
    """Immutable outcome of resolving one pending confirmation by voice or text."""

    # Label that was confirmed; None when every candidate was rejected.
    resolved_label: str | None
    # True when the speaker rejected all candidates.
    rejected: bool
    # Text that was parsed (ASR output or client-recognized text).
    asr_text: str | None
    # Object key of the stored WAV; None on the text-only path.
    audio_object_key: str | None
    # Human-readable summary of the outcome.
    message: str
class VoiceConfirmationService:
    """Resolve a pending consumable confirmation from speech or recognized text.

    The WAV path uploads the audio to MinIO, decodes it to PCM, runs Baidu ASR
    and parses the recognized text against the pending options. The text path
    skips storage and ASR and parses client-supplied text directly.

    Every failing stage awaits ``VoiceAuditEmitter.fail`` and raises the
    exception it returns; the success path awaits ``VoiceAuditEmitter.success``.
    """

    def __init__(
        self,
        settings: Settings,
        sessions: PendingConfirmationStore,
        baidu: BaiduSpeechService,
        minio: MinioAudioStorageService,
        audits: VoiceAuditRepository,
        session_factory: async_sessionmaker | None = None,
        audit_emitter: VoiceAuditEmitter | None = None,
    ) -> None:
        """Store collaborators and build a default audit emitter when none is given.

        Args:
            settings: Application settings (``voice_upload_max_bytes`` is read here).
            sessions: Store holding pending confirmations and voice traces.
            baidu: Baidu speech client used for ASR and TTS.
            minio: Storage client used to keep the uploaded WAV.
            audits: Repository handed to the default audit emitter.
            session_factory: DB session factory; falls back to ``AsyncSessionLocal``.
            audit_emitter: Emitter to use instead of the default one.
        """
        self._s = settings
        self._sessions = sessions
        self._baidu = baidu
        self._minio = minio
        self._audits = audits
        self._session_factory: async_sessionmaker = session_factory or AsyncSessionLocal
        self._emitter = audit_emitter or VoiceAuditEmitter(
            settings=settings,
            audits=audits,
            session_factory=self._session_factory,
        )

    # ------------------------------------------------------------------
    # TTS: public interface kept unchanged
    # ------------------------------------------------------------------
    def synthesize_prompt_to_mp3(self, text: str) -> bytes:
        """Synthesize *text* with Baidu TTS and return the audio payload.

        Raises:
            SurgeryPipelineError: ``TTS_TEXT_EMPTY`` for blank text,
                ``BAIDU_NOT_CONFIGURED`` when the client is not configured,
                ``TTS_ERROR`` when the client returns a dict instead of audio.
        """
        t = (text or "").strip()
        if not t:
            raise SurgeryPipelineError("TTS_TEXT_EMPTY", "提示文本为空。")
        try:
            r = self._baidu.synthesis(
                t, "zh", 1, {"spd": 5, "pit": 5, "vol": 9, "per": 0}
            )
        except BaiduSpeechNotConfiguredError as exc:
            raise SurgeryPipelineError(
                "BAIDU_NOT_CONFIGURED",
                "服务端未配置百度语音,无法合成播报音频。",
            ) from exc
        # A dict result is treated as an error payload rather than audio.
        if isinstance(r, dict):
            raise SurgeryPipelineError("TTS_ERROR", f"百度 TTS 失败: {r!r}")
        return r

    def _session_trace(self, surgery_id: str):
        """Return a ``(asr_text, error)`` callback that records a voice trace for *surgery_id*."""

        def _recorder(asr_text: str | None, error: str | None) -> None:
            self._sessions.record_voice_trace(
                surgery_id, asr_text=asr_text, error=error
            )

        return _recorder

    @staticmethod
    def _snapshot_options(pending) -> tuple[list[str], str]:
        """Return ``(option_labels, options_snapshot_json)`` for a pending entry.

        Labels are stripped and blank ones dropped. The JSON snapshot keeps
        every ``(label, confidence)`` pair exactly as stored.
        """
        option_labels = [
            label.strip() for label, _ in pending.options if label.strip()
        ]
        options_snapshot = json.dumps(
            [
                {"label": label, "confidence": confidence}
                for label, confidence in pending.options
            ],
            ensure_ascii=False,
        )
        return option_labels, options_snapshot

    async def _finish_resolution(
        self,
        *,
        source: str,
        surgery_id: str,
        confirmation_id: str,
        text: str,
        chosen: str | None,
        rejected: bool,
        options_snapshot: str,
        audio_object_key: str | None,
        ctx: VoiceAuditContext | None = None,
    ) -> VoiceResolveResult:
        """Resolve the pending entry, emit the success audit and build the result."""
        await self._sessions.resolve_pending_confirmation(
            surgery_id,
            confirmation_id,
            chosen_label=chosen,
            rejected=rejected,
        )
        # Forward ``ctx`` only when the caller has one, so the emitter's own
        # default still applies on the text path.
        ctx_kwargs = {} if ctx is None else {"ctx": ctx}
        await self._emitter.success(
            source=source,
            status="rejected" if rejected else "recognized",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            asr_text=text,
            resolved_label=None if rejected else chosen,
            rejected=rejected,
            options_snapshot_json=options_snapshot,
            **ctx_kwargs,
        )
        if rejected:
            return VoiceResolveResult(
                resolved_label=None,
                rejected=True,
                asr_text=text,
                audio_object_key=audio_object_key,
                message="已否认全部候选,未记消耗。",
            )
        return VoiceResolveResult(
            resolved_label=chosen,
            rejected=False,
            asr_text=text,
            audio_object_key=audio_object_key,
            message="已确认并记一条消耗。",
        )

    # ------------------------------------------------------------------
    # Main entry points
    # ------------------------------------------------------------------
    async def resolve_from_wav(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        wav_bytes: bytes,
        filename: str,
        content_type: str | None,
    ) -> VoiceResolveResult:
        """Resolve a pending confirmation from an uploaded WAV recording.

        Stages, in order: size check, provider configuration check, pending
        lookup, MinIO upload, PCM decode, ASR call, text extraction, choice
        parsing, then resolution and success audit.

        Raises:
            The exception returned by ``VoiceAuditEmitter.fail`` for whichever
            stage fails.
        """
        _ = filename  # reserved for future MIME sniff

        # 1) validate_size
        if len(wav_bytes) > self._s.voice_upload_max_bytes:
            raise await self._emitter.fail(
                source="wav",
                status="invalid_audio",
                code="VOICE_AUDIO_INVALID",
                message=(
                    f"音频大小超过限制(最大 {self._s.voice_upload_max_bytes} 字节)。"
                ),
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                ctx=VoiceAuditContext(
                    audio_content_type=content_type,
                    audio_size_bytes=len(wav_bytes),
                ),
                # Oversize uploads deliberately skip the session trace,
                # matching the original implementation.
                record_session_trace=False,
            )

        # 2) ensure_providers_configured
        if not self._minio.configured:
            raise await self._emitter.fail(
                source="wav",
                status="minio_not_configured",
                code="MINIO_NOT_CONFIGURED",
                message="服务端未配置 MinIO,无法保存语音追溯文件。",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                persist_audit=False,
                record_session_trace=False,
            )
        if not self._baidu.configured:
            raise await self._emitter.fail(
                source="wav",
                status="baidu_not_configured",
                code="BAIDU_NOT_CONFIGURED",
                message="服务端未配置百度语音,无法进行语音识别。",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                persist_audit=False,
                record_session_trace=False,
            )

        # 3) fetch_pending
        pending = self._sessions.get_pending_confirmation_by_id(
            surgery_id, confirmation_id
        )
        if pending is None:
            raise await self._emitter.fail(
                source="wav",
                status="confirmation_not_found",
                code="CONFIRMATION_NOT_FOUND",
                message="未找到该待确认项或已处理。",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                persist_audit=False,
                record_session_trace=False,
            )
        option_labels, options_snapshot = self._snapshot_options(pending)
        session_trace = self._session_trace(surgery_id)

        # 4) upload_wav
        stored = await self._upload_wav(
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            wav_bytes=wav_bytes,
            content_type=content_type,
            options_snapshot=options_snapshot,
            session_trace=session_trace,
        )
        audio_ctx = VoiceAuditContext(
            audio_object_key=stored.object_key,
            audio_content_type=content_type,
            audio_size_bytes=stored.size_bytes,
            audio_sha256=stored.sha256_hex,
        )

        # 5) decode_pcm
        pcm = await self._decode_pcm(
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            wav_bytes=wav_bytes,
            ctx=audio_ctx,
            options_snapshot=options_snapshot,
            session_trace=session_trace,
        )

        # 6) call_asr
        asr_payload = await self._call_asr(
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            pcm=pcm,
            ctx=audio_ctx,
            options_snapshot=options_snapshot,
            session_trace=session_trace,
        )

        # 7) extract_text
        text = await self._extract_text_from_asr(
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            asr_payload=asr_payload,
            ctx=audio_ctx,
            options_snapshot=options_snapshot,
            session_trace=session_trace,
        )
        self._sessions.record_voice_trace(surgery_id, asr_text=text, error=None)

        # 8) parse_choice
        rejected, chosen = await self._parse_choice_or_fail(
            source="wav",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            text=text,
            option_labels=option_labels,
            options_snapshot=options_snapshot,
            ctx=audio_ctx,
            session_trace=session_trace,
        )

        # 9) persist_success (includes resolve_pending_confirmation on the session store)
        return await self._finish_resolution(
            source="wav",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            text=text,
            chosen=chosen,
            rejected=rejected,
            options_snapshot=options_snapshot,
            audio_object_key=stored.object_key,
            ctx=audio_ctx,
        )

    async def resolve_from_recognized_text(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        recognized_text: str,
    ) -> VoiceResolveResult:
        """Resolve a pending confirmation from text recognized in the browser.

        MinIO and Baidu ASR are not used; the parsing rules are the same as in
        ``resolve_from_wav``.

        Raises:
            The exception returned by ``VoiceAuditEmitter.fail`` when the
            pending entry is missing, the text is blank, or parsing fails.
        """
        pending = self._sessions.get_pending_confirmation_by_id(
            surgery_id, confirmation_id
        )
        if pending is None:
            raise await self._emitter.fail(
                source="text",
                status="confirmation_not_found",
                code="CONFIRMATION_NOT_FOUND",
                message="未找到该待确认项或已处理。",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                persist_audit=False,
                record_session_trace=False,
            )
        option_labels, options_snapshot = self._snapshot_options(pending)
        session_trace = self._session_trace(surgery_id)

        text = (recognized_text or "").strip()
        if not text:
            raise await self._emitter.fail(
                source="text",
                status="client_stt_empty",
                code="VOICE_TEXT_EMPTY",
                message="客户端识别文本为空",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                options_snapshot_json=options_snapshot,
                # Records the fixed "empty text" marker instead of whatever
                # arguments the emitter passes to the recorder.
                session_trace_recorder=lambda _a, _m: self._sessions.record_voice_trace(
                    surgery_id, asr_text=None, error="empty text"
                ),
            )
        self._sessions.record_voice_trace(surgery_id, asr_text=text, error=None)

        rejected, chosen = await self._parse_choice_or_fail(
            source="text",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            text=text,
            option_labels=option_labels,
            options_snapshot=options_snapshot,
            ctx=VoiceAuditContext(),
            session_trace=session_trace,
            parse_status_on_failure="client_stt_parse_failed",
            parse_message_prefix=(
                "无法从文本中匹配候选项或本台手术候选清单中的耗材名称,"
                "请重试或说「不是」否认全部。"
            ),
            parse_retry_hint_still=(
                "请输入「第一个」「第二个」等或候选项全名。"
            ),
            parse_retry_hint_exhausted=(
                "请再输入序号/全名,或说「不是」否认全部。"
            ),
        )
        return await self._finish_resolution(
            source="text",
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            text=text,
            chosen=chosen,
            rejected=rejected,
            options_snapshot=options_snapshot,
            audio_object_key=None,
        )

    # ------------------------------------------------------------------
    # Internal stages
    # ------------------------------------------------------------------
    async def _upload_wav(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        wav_bytes: bytes,
        content_type: str | None,
        options_snapshot: str,
        session_trace,
    ) -> StoredAudio:
        """Ensure the bucket exists and upload the WAV, both in the thread pool.

        Any exception is logged and converted into an ``upload_failed`` audit failure.
        """
        try:
            await run_in_threadpool(self._minio.ensure_bucket)
            return await run_in_threadpool(
                lambda: self._minio.upload_voice_wav(
                    surgery_id=surgery_id,
                    confirmation_id=confirmation_id,
                    data=wav_bytes,
                    content_type=content_type,
                )
            )
        except Exception as exc:
            logger.warning("MinIO upload failed: {}", exc)
            raise await self._emitter.fail(
                source="wav",
                status="upload_failed",
                code="MINIO_UPLOAD_FAILED",
                message=f"语音文件上传失败:{exc}",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                ctx=VoiceAuditContext(
                    audio_content_type=content_type,
                    audio_size_bytes=len(wav_bytes),
                ),
                options_snapshot_json=options_snapshot,
                session_trace_recorder=session_trace,
            ) from exc

    async def _decode_pcm(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        wav_bytes: bytes,
        ctx: VoiceAuditContext,
        options_snapshot: str,
        session_trace,
    ) -> bytes:
        """Convert the WAV bytes to PCM in the thread pool.

        A ``WavDecodeError`` becomes an ``invalid_audio`` audit failure.
        """
        try:
            return await run_in_threadpool(wav_bytes_to_pcm16k_mono_s16le, wav_bytes)
        except WavDecodeError as exc:
            raise await self._emitter.fail(
                source="wav",
                status="invalid_audio",
                code="VOICE_AUDIO_INVALID",
                message=f"无法解析 WAV 音频:{exc}",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                ctx=ctx,
                options_snapshot_json=options_snapshot,
                session_trace_recorder=session_trace,
            ) from exc

    async def _call_asr(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        pcm: bytes,
        ctx: VoiceAuditContext,
        options_snapshot: str,
        session_trace,
    ) -> object:
        """Run Baidu ASR on the PCM buffer in the thread pool and return its raw payload.

        ``BaiduSpeechNotConfiguredError`` becomes a ``baidu_not_configured``
        failure (no audit row, no session trace); any other exception is
        logged and becomes an ``asr_failed`` failure.
        """
        try:
            return await run_in_threadpool(self._baidu.asr, pcm, "pcm", 16000, None)
        except BaiduSpeechNotConfiguredError as exc:
            raise await self._emitter.fail(
                source="wav",
                status="baidu_not_configured",
                code="BAIDU_NOT_CONFIGURED",
                message=str(exc),
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                ctx=ctx,
                persist_audit=False,
                record_session_trace=False,
            ) from exc
        except Exception as exc:
            # Logged for parity with the upload stage.
            logger.warning("Baidu ASR call failed: {}", exc)
            raise await self._emitter.fail(
                source="wav",
                status="asr_failed",
                code="VOICE_ASR_FAILED",
                message=f"语音识别调用失败:{exc}",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                ctx=ctx,
                options_snapshot_json=options_snapshot,
                session_trace_recorder=session_trace,
            ) from exc

    async def _extract_text_from_asr(
        self,
        *,
        surgery_id: str,
        confirmation_id: str,
        asr_payload: object,
        ctx: VoiceAuditContext,
        options_snapshot: str,
        session_trace,
    ) -> str:
        """Return the stripped recognized text from an ASR payload.

        Fails with ``VOICE_ASR_FAILED`` when the payload is not a dict, when
        ``err_no`` is not 0, or when no non-blank text is present.
        """
        if not isinstance(asr_payload, dict):
            raise await self._emitter.fail(
                source="wav",
                status="asr_failed",
                code="VOICE_ASR_FAILED",
                message="ASR 返回格式异常",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                ctx=ctx,
                options_snapshot_json=options_snapshot,
                session_trace_recorder=session_trace,
            )
        if asr_payload.get("err_no") != 0:
            msg = (
                f"asr_err_{asr_payload.get('err_no')}: "
                f"{asr_payload.get('err_msg')}"
            )
            raise await self._emitter.fail(
                source="wav",
                status="asr_failed",
                code="VOICE_ASR_FAILED",
                message=msg,
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                ctx=ctx,
                options_snapshot_json=options_snapshot,
                session_trace_recorder=session_trace,
            )
        results = asr_payload.get("result")
        text: str | None = None
        if isinstance(results, list) and results:
            first = results[0]
            # str(None) would yield the literal "None" and be parsed as speech;
            # treat a null first entry as an empty result instead.
            text = None if first is None else str(first)
        elif isinstance(results, str):
            text = results
        text = (text or "").strip()
        if not text:
            raise await self._emitter.fail(
                source="wav",
                status="asr_failed",
                code="VOICE_ASR_FAILED",
                message="语音识别结果为空",
                surgery_id=surgery_id,
                confirmation_id=confirmation_id,
                ctx=ctx,
                options_snapshot_json=options_snapshot,
                session_trace_recorder=session_trace,
            )
        return text

    async def _parse_choice_or_fail(
        self,
        *,
        source: str,
        surgery_id: str,
        confirmation_id: str,
        text: str,
        option_labels: list[str],
        options_snapshot: str,
        ctx: VoiceAuditContext,
        session_trace,
        parse_status_on_failure: str = "parse_failed",
        parse_message_prefix: str = (
            "无法从语音中匹配候选项或本台手术候选清单中的耗材名称,"
            "请重试或说「不是」否认全部。"
        ),
        parse_retry_hint_still: str = (
            "请说「第一个」「第二个」等序号或候选项全名。"
        ),
        parse_retry_hint_exhausted: str = (
            "请再清晰地说序号/全名,或说「不是」否认全部。"
        ),
    ) -> tuple[bool, str | None]:
        """Interpret *text* as a rejection or a choice, or fail with a retry hint.

        Returns:
            ``(rejected, chosen)``: ``chosen`` is None when the text is a rejection.

        On failure the parse failure is recorded on the session store and the
        remaining retry count is woven into the error message and passed to the
        emitter as extra data.
        """
        rejected = is_rejection_phrase(text)
        chosen: str | None = None
        if not rejected:
            chosen = parse_voice_choice(text, option_labels)
            if chosen is None:
                # Fall back to the surgery-wide candidate consumable list.
                candidates = self._sessions.get_surgery_candidate_consumables(surgery_id)
                chosen = match_voice_choice_against_candidates(text, candidates)
        if rejected or chosen:
            return rejected, chosen

        _, retry_remaining = await self._sessions.record_voice_parse_failure(
            surgery_id, confirmation_id
        )
        if retry_remaining > 0:
            if source == "wav":
                suffix = (
                    f" 本次未听清或未能解析,您还可重试 {retry_remaining} 次,"
                    f"{parse_retry_hint_still}"
                )
            else:
                suffix = (
                    f" 本次未能解析,您还可重试 {retry_remaining} 次,"
                    f"{parse_retry_hint_still}"
                )
        else:
            suffix = f" 本轮重试机会已用完,{parse_retry_hint_exhausted}"
        msg = parse_message_prefix + suffix
        raise await self._emitter.fail(
            source=source,
            status=parse_status_on_failure,
            code="VOICE_PARSE_FAILED",
            message=msg,
            surgery_id=surgery_id,
            confirmation_id=confirmation_id,
            ctx=ctx,
            asr_text=text,
            options_snapshot_json=options_snapshot,
            session_trace_recorder=session_trace,
            include_extra={
                "confirmation_id": confirmation_id,
                "retry_remaining": retry_remaining,
            },
        )