"""Resolve pending consumable confirmation from uploaded WAV: MinIO + Baidu ASR + parse. 本模块把语音识别流程线性化为一系列「阶段」,每个失败阶段都走 `VoiceAuditEmitter.fail` 统一写审计 + trace + 抛 `SurgeryPipelineError`。成功路径走 `emitter.success`。 """ from __future__ import annotations import json from dataclasses import dataclass from fastapi.concurrency import run_in_threadpool from loguru import logger from sqlalchemy.ext.asyncio import async_sessionmaker from app.baked import pipeline as bp from app.config import Settings from app.database import AsyncSessionLocal from app.repositories.voice_audits import VoiceAuditRepository from app.services.audio_wav import WavDecodeError, wav_bytes_to_pcm16k_mono_s16le from app.services.baidu_speech import BaiduSpeechNotConfiguredError, BaiduSpeechService from app.services.minio_audio_storage import MinioAudioStorageService, StoredAudio from app.services.pending_confirmation_port import PendingConfirmationStore from app.services.voice_audit_emitter import VoiceAuditContext, VoiceAuditEmitter from app.services.voice_confirm import ( is_rejection_phrase, match_voice_choice_against_candidates, parse_voice_choice, ) from app.surgery_errors import SurgeryPipelineError @dataclass(frozen=True) class VoiceResolveResult: resolved_label: str | None rejected: bool asr_text: str | None audio_object_key: str | None message: str class VoiceConfirmationService: """Upload audio to MinIO, run Baidu ASR, parse choice, resolve pending queue entry.""" def __init__( self, settings: Settings, sessions: PendingConfirmationStore, baidu: BaiduSpeechService, minio: MinioAudioStorageService, audits: VoiceAuditRepository, session_factory: async_sessionmaker | None = None, audit_emitter: VoiceAuditEmitter | None = None, ) -> None: self._s = settings self._sessions = sessions self._baidu = baidu self._minio = minio self._audits = audits self._session_factory: async_sessionmaker = session_factory or AsyncSessionLocal self._emitter = audit_emitter or VoiceAuditEmitter( settings=settings, audits=audits, session_factory=self._session_factory, ) # ------------------------------------------------------------------ # TTS:保持对外接口不变 # ------------------------------------------------------------------ def synthesize_prompt_to_mp3(self, text: str) -> bytes: t = (text or "").strip() if not t: raise SurgeryPipelineError("TTS_TEXT_EMPTY", "提示文本为空。") try: r = self._baidu.synthesis( t, "zh", 1, {"spd": 5, "pit": 5, "vol": 9, "per": 0} ) except BaiduSpeechNotConfiguredError as exc: raise SurgeryPipelineError( "BAIDU_NOT_CONFIGURED", "服务端未配置百度语音,无法合成播报音频。", ) from exc if isinstance(r, dict): raise SurgeryPipelineError("TTS_ERROR", f"百度 TTS 失败: {r!r}") return r def _session_trace(self, surgery_id: str): def _recorder(asr_text: str | None, error: str | None) -> None: self._sessions.record_voice_trace( surgery_id, asr_text=asr_text, error=error ) return _recorder # ------------------------------------------------------------------ # 主入口 # ------------------------------------------------------------------ async def resolve_from_wav( self, *, surgery_id: str, confirmation_id: str, wav_bytes: bytes, filename: str, content_type: str | None, ) -> VoiceResolveResult: _ = filename # reserved for future MIME sniff # 1) validate_size if len(wav_bytes) > bp.VOICE_UPLOAD_MAX_BYTES: raise await self._emitter.fail( source="wav", status="invalid_audio", code="VOICE_AUDIO_INVALID", message=( f"音频大小超过限制(最大 {bp.VOICE_UPLOAD_MAX_BYTES} 字节)。" ), surgery_id=surgery_id, confirmation_id=confirmation_id, ctx=VoiceAuditContext( audio_content_type=content_type, audio_size_bytes=len(wav_bytes), ), # 仅对大小越界的情况按原实现不记 session trace record_session_trace=False, ) # 2) ensure_providers_configured if not self._minio.configured: raise await self._emitter.fail( source="wav", status="minio_not_configured", code="MINIO_NOT_CONFIGURED", message="服务端未配置 MinIO,无法保存语音追溯文件。", surgery_id=surgery_id, confirmation_id=confirmation_id, persist_audit=False, record_session_trace=False, ) if not self._baidu.configured: raise await self._emitter.fail( source="wav", status="baidu_not_configured", code="BAIDU_NOT_CONFIGURED", message="服务端未配置百度语音,无法进行语音识别。", surgery_id=surgery_id, confirmation_id=confirmation_id, persist_audit=False, record_session_trace=False, ) # 3) fetch_pending pending = self._sessions.get_pending_confirmation_by_id( surgery_id, confirmation_id ) if pending is None: raise await self._emitter.fail( source="wav", status="confirmation_not_found", code="CONFIRMATION_NOT_FOUND", message="未找到该待确认项或已处理。", surgery_id=surgery_id, confirmation_id=confirmation_id, persist_audit=False, record_session_trace=False, ) option_labels = [a.strip() for a, _ in pending.options if a.strip()] options_snapshot = json.dumps( [{"label": a, "confidence": b} for a, b in pending.options], ensure_ascii=False, ) session_trace = self._session_trace(surgery_id) # 4) upload_wav stored = await self._upload_wav( surgery_id=surgery_id, confirmation_id=confirmation_id, wav_bytes=wav_bytes, content_type=content_type, options_snapshot=options_snapshot, session_trace=session_trace, ) audio_ctx = VoiceAuditContext( audio_object_key=stored.object_key, audio_content_type=content_type, audio_size_bytes=stored.size_bytes, audio_sha256=stored.sha256_hex, ) # 5) decode_pcm pcm = await self._decode_pcm( surgery_id=surgery_id, confirmation_id=confirmation_id, wav_bytes=wav_bytes, ctx=audio_ctx, options_snapshot=options_snapshot, session_trace=session_trace, ) # 6) call_asr asr_payload = await self._call_asr( surgery_id=surgery_id, confirmation_id=confirmation_id, pcm=pcm, ctx=audio_ctx, options_snapshot=options_snapshot, session_trace=session_trace, ) # 7) extract_text text = await self._extract_text_from_asr( surgery_id=surgery_id, confirmation_id=confirmation_id, asr_payload=asr_payload, ctx=audio_ctx, options_snapshot=options_snapshot, session_trace=session_trace, ) self._sessions.record_voice_trace(surgery_id, asr_text=text, error=None) # 8) parse_choice rejected, chosen = await self._parse_choice_or_fail( source="wav", surgery_id=surgery_id, confirmation_id=confirmation_id, text=text, option_labels=option_labels, options_snapshot=options_snapshot, ctx=audio_ctx, session_trace=session_trace, ) # 9) persist_success(含 session 内的 resolve_pending_confirmation) await self._sessions.resolve_pending_confirmation( surgery_id, confirmation_id, chosen_label=chosen, rejected=rejected, ) final_status = "rejected" if rejected else "recognized" await self._emitter.success( source="wav", status=final_status, surgery_id=surgery_id, confirmation_id=confirmation_id, ctx=audio_ctx, asr_text=text, resolved_label=chosen if not rejected else None, rejected=rejected, options_snapshot_json=options_snapshot, ) if rejected: return VoiceResolveResult( resolved_label=None, rejected=True, asr_text=text, audio_object_key=stored.object_key, message="已否认全部候选,未记消耗。", ) return VoiceResolveResult( resolved_label=chosen, rejected=False, asr_text=text, audio_object_key=stored.object_key, message="已确认并记一条消耗。", ) async def resolve_from_recognized_text( self, *, surgery_id: str, confirmation_id: str, recognized_text: str, ) -> VoiceResolveResult: """浏览器本机识别文本,不经 MinIO/百度 ASR,解析规则与 `resolve_from_wav` 一致。""" pending = self._sessions.get_pending_confirmation_by_id( surgery_id, confirmation_id ) if pending is None: raise await self._emitter.fail( source="text", status="confirmation_not_found", code="CONFIRMATION_NOT_FOUND", message="未找到该待确认项或已处理。", surgery_id=surgery_id, confirmation_id=confirmation_id, persist_audit=False, record_session_trace=False, ) option_labels = [a.strip() for a, _ in pending.options if a.strip()] options_snapshot = json.dumps( [{"label": a, "confidence": b} for a, b in pending.options], ensure_ascii=False, ) session_trace = self._session_trace(surgery_id) text = (recognized_text or "").strip() if not text: raise await self._emitter.fail( source="text", status="client_stt_empty", code="VOICE_TEXT_EMPTY", message="客户端识别文本为空", surgery_id=surgery_id, confirmation_id=confirmation_id, options_snapshot_json=options_snapshot, session_trace_recorder=lambda _a, _m: self._sessions.record_voice_trace( surgery_id, asr_text=None, error="empty text" ), ) self._sessions.record_voice_trace(surgery_id, asr_text=text, error=None) rejected, chosen = await self._parse_choice_or_fail( source="text", surgery_id=surgery_id, confirmation_id=confirmation_id, text=text, option_labels=option_labels, options_snapshot=options_snapshot, ctx=VoiceAuditContext(), session_trace=session_trace, parse_status_on_failure="client_stt_parse_failed", parse_message_prefix=( "无法从文本中匹配候选项或本台手术候选清单中的耗材名称," "请重试或说「不是」否认全部。" ), parse_retry_hint_still=( "请输入「第一个」「第二个」等或候选项全名。" ), parse_retry_hint_exhausted=( "请再输入序号/全名,或说「不是」否认全部。" ), ) await self._sessions.resolve_pending_confirmation( surgery_id, confirmation_id, chosen_label=chosen, rejected=rejected, ) final_status = "rejected" if rejected else "recognized" await self._emitter.success( source="text", status=final_status, surgery_id=surgery_id, confirmation_id=confirmation_id, asr_text=text, resolved_label=chosen if not rejected else None, rejected=rejected, options_snapshot_json=options_snapshot, ) if rejected: return VoiceResolveResult( resolved_label=None, rejected=True, asr_text=text, audio_object_key=None, message="已否认全部候选,未记消耗。", ) return VoiceResolveResult( resolved_label=chosen, rejected=False, asr_text=text, audio_object_key=None, message="已确认并记一条消耗。", ) # ------------------------------------------------------------------ # 内部阶段 # ------------------------------------------------------------------ async def _upload_wav( self, *, surgery_id: str, confirmation_id: str, wav_bytes: bytes, content_type: str | None, options_snapshot: str, session_trace, ) -> StoredAudio: try: await run_in_threadpool(self._minio.ensure_bucket) return await run_in_threadpool( lambda: self._minio.upload_voice_wav( surgery_id=surgery_id, confirmation_id=confirmation_id, data=wav_bytes, content_type=content_type, ) ) except Exception as exc: logger.warning("MinIO upload failed: {}", exc) raise await self._emitter.fail( source="wav", status="upload_failed", code="MINIO_UPLOAD_FAILED", message=f"语音文件上传失败:{exc}", surgery_id=surgery_id, confirmation_id=confirmation_id, ctx=VoiceAuditContext( audio_content_type=content_type, audio_size_bytes=len(wav_bytes), ), options_snapshot_json=options_snapshot, session_trace_recorder=session_trace, ) from exc async def _decode_pcm( self, *, surgery_id: str, confirmation_id: str, wav_bytes: bytes, ctx: VoiceAuditContext, options_snapshot: str, session_trace, ) -> bytes: try: return await run_in_threadpool(wav_bytes_to_pcm16k_mono_s16le, wav_bytes) except WavDecodeError as exc: raise await self._emitter.fail( source="wav", status="invalid_audio", code="VOICE_AUDIO_INVALID", message=f"无法解析 WAV 音频:{exc}", surgery_id=surgery_id, confirmation_id=confirmation_id, ctx=ctx, options_snapshot_json=options_snapshot, session_trace_recorder=session_trace, ) from exc async def _call_asr( self, *, surgery_id: str, confirmation_id: str, pcm: bytes, ctx: VoiceAuditContext, options_snapshot: str, session_trace, ) -> object: try: return await run_in_threadpool(self._baidu.asr, pcm, "pcm", 16000, None) except BaiduSpeechNotConfiguredError as exc: raise await self._emitter.fail( source="wav", status="baidu_not_configured", code="BAIDU_NOT_CONFIGURED", message=str(exc), surgery_id=surgery_id, confirmation_id=confirmation_id, ctx=ctx, persist_audit=False, record_session_trace=False, ) from exc except Exception as exc: raise await self._emitter.fail( source="wav", status="asr_failed", code="VOICE_ASR_FAILED", message=f"语音识别调用失败:{exc}", surgery_id=surgery_id, confirmation_id=confirmation_id, ctx=ctx, options_snapshot_json=options_snapshot, session_trace_recorder=session_trace, ) from exc async def _extract_text_from_asr( self, *, surgery_id: str, confirmation_id: str, asr_payload: object, ctx: VoiceAuditContext, options_snapshot: str, session_trace, ) -> str: if not isinstance(asr_payload, dict): raise await self._emitter.fail( source="wav", status="asr_failed", code="VOICE_ASR_FAILED", message="ASR 返回格式异常", surgery_id=surgery_id, confirmation_id=confirmation_id, ctx=ctx, options_snapshot_json=options_snapshot, session_trace_recorder=session_trace, ) if asr_payload.get("err_no") != 0: msg = ( f"asr_err_{asr_payload.get('err_no')}: " f"{asr_payload.get('err_msg')}" ) raise await self._emitter.fail( source="wav", status="asr_failed", code="VOICE_ASR_FAILED", message=msg, surgery_id=surgery_id, confirmation_id=confirmation_id, ctx=ctx, options_snapshot_json=options_snapshot, session_trace_recorder=session_trace, ) results = asr_payload.get("result") text: str | None = None if isinstance(results, list) and results: text = str(results[0]) elif isinstance(results, str): text = results text = (text or "").strip() if not text: raise await self._emitter.fail( source="wav", status="asr_failed", code="VOICE_ASR_FAILED", message="语音识别结果为空", surgery_id=surgery_id, confirmation_id=confirmation_id, ctx=ctx, options_snapshot_json=options_snapshot, session_trace_recorder=session_trace, ) return text async def _parse_choice_or_fail( self, *, source, surgery_id: str, confirmation_id: str, text: str, option_labels: list[str], options_snapshot: str, ctx: VoiceAuditContext, session_trace, parse_status_on_failure: str = "parse_failed", parse_message_prefix: str = ( "无法从语音中匹配候选项或本台手术候选清单中的耗材名称," "请重试或说「不是」否认全部。" ), parse_retry_hint_still: str = ( "请说「第一个」「第二个」等序号或候选项全名。" ), parse_retry_hint_exhausted: str = ( "请再清晰地说序号/全名,或说「不是」否认全部。" ), ) -> tuple[bool, str | None]: rejected = is_rejection_phrase(text) chosen: str | None = None if not rejected: chosen = parse_voice_choice(text, option_labels) if chosen is None: candidates = self._sessions.get_surgery_candidate_consumables(surgery_id) chosen = match_voice_choice_against_candidates(text, candidates) if rejected or chosen: return rejected, chosen _, retry_remaining = await self._sessions.record_voice_parse_failure( surgery_id, confirmation_id ) if retry_remaining > 0: if source == "wav": suffix = ( f" 本次未听清或未能解析,您还可重试 {retry_remaining} 次," f"{parse_retry_hint_still}" ) else: suffix = ( f" 本次未能解析,您还可重试 {retry_remaining} 次," f"{parse_retry_hint_still}" ) else: suffix = f" 本轮重试机会已用完,{parse_retry_hint_exhausted}" msg = parse_message_prefix + suffix raise await self._emitter.fail( source=source, status=parse_status_on_failure, code="VOICE_PARSE_FAILED", message=msg, surgery_id=surgery_id, confirmation_id=confirmation_id, ctx=ctx, asr_text=text, options_snapshot_json=options_snapshot, session_trace_recorder=session_trace, include_extra={ "confirmation_id": confirmation_id, "retry_remaining": retry_remaining, }, )