"""Local faster-whisper ASR adapter — implements ASRProvider port.""" from __future__ import annotations import asyncio import os import re import tempfile from typing import Any, Iterable from app.core.logging import get_logger logger = get_logger(__name__) _SUBTITLE_WATERMARK_RE = re.compile( r"(字幕|听译|压制|字幕组).{0,20}(by|BY|By)|字幕\s*by", re.UNICODE, ) def _looks_like_subtitle_hallucination(text: str) -> bool: """静音时第二遍易吐出视频字幕水印;仅丢弃此类短句。""" t = (text or "").strip() if len(t) > 48: return False if _SUBTITLE_WATERMARK_RE.search(t): return True if len(t) <= 12 and "字幕" in t and not re.search(r"[??!!。,、]", t): return True return False def _join_segment_text(segments: Iterable[Any]) -> tuple[str, int]: segs = list(segments) return "".join(str(getattr(seg, "text", "") or "") for seg in segs).strip(), len( segs ) _DEFAULT_CACHE_DIR = os.path.normpath( os.path.join( os.path.dirname(os.path.abspath(__file__)), "..", "..", "..", "models", "whisper", ) ) class WhisperASRProvider: def __init__( self, model_size: str = "small", device: str = "auto", compute_type: str = "auto", cache_dir: str = "", ): self._model_size = model_size self._device = device self._compute_type = compute_type self._cache_dir = cache_dir self._model = None def _load_model(self) -> bool: if self._model is not None: return True try: from faster_whisper import WhisperModel device = self._device compute_type = self._compute_type if device == "auto": try: import torch # type: ignore[import-untyped] device = "cuda" if torch.cuda.is_available() else "cpu" except ImportError: device = "cpu" if compute_type == "auto": compute_type = "float16" if device == "cuda" else "int8" download_root = self._cache_dir or _DEFAULT_CACHE_DIR local_files_only = bool(self._cache_dir) os.makedirs(download_root, exist_ok=True) self._model = WhisperModel( self._model_size, device=device, compute_type=compute_type, download_root=download_root, local_files_only=local_files_only, ) return True except Exception as e: logger.error("Failed to load Whisper model: {}", e) return False def ensure_ready(self) -> bool: return self._load_model() async def transcribe(self, audio: bytes, format: str = "m4a") -> str: # 与 v1.1.0 相同的单次 transcribe;推理放线程池,避免阻塞 asyncio(tag 上为同步调用)。 self._load_model() if not self._model: return "" model = self._model def _sync_transcribe() -> str: tmp_path = None try: with tempfile.NamedTemporaryFile( suffix=f".{format}", delete=False ) as tmp: tmp.write(audio) tmp_path = tmp.name segments, _info = model.transcribe( tmp_path, language="zh", beam_size=5, vad_filter=True, vad_parameters={ "min_silence_duration_ms": 500, "threshold": 0.35, "min_speech_duration_ms": 200, }, ) text, pass1_seg_count = _join_segment_text(segments) used_second_pass = False pass2_seg_count = 0 pass3_seg_count = 0 if not text: logger.info( "Whisper VAD pass 无文本,关闭 VAD 再试一次(短录音易被 VAD 判为静音)" ) segments2, _info2 = model.transcribe( tmp_path, language="zh", beam_size=5, vad_filter=False, condition_on_previous_text=False, # 略抬高:减少边界片段被标成 no_speech 而整段为空 no_speech_threshold=0.85, ) raw2, pass2_seg_count = _join_segment_text(segments2) used_second_pass = True if raw2 and _looks_like_subtitle_hallucination(raw2): logger.info( "Whisper 丢弃疑似字幕水印幻听: {!r}", raw2[:120], ) text = "" else: text = raw2 if not text and used_second_pass: try: from faster_whisper import decode_audio audio_np = decode_audio(tmp_path, sampling_rate=16000) segments3, _info3 = model.transcribe( audio_np, language="zh", beam_size=5, vad_filter=False, condition_on_previous_text=False, no_speech_threshold=0.85, ) raw3, pass3_seg_count = _join_segment_text(segments3) if raw3 and _looks_like_subtitle_hallucination(raw3): logger.info( "Whisper decode_audio 回退仍是疑似字幕水印幻听: {!r}", raw3[:120], ) elif raw3: text = raw3 except Exception as ex: logger.warning("Whisper decode_audio 回退失败: {}", ex) return text except Exception as e: logger.error("Whisper transcribe failed: {}", e) return "" finally: if tmp_path and os.path.exists(tmp_path): try: os.remove(tmp_path) except OSError: pass return await asyncio.to_thread(_sync_transcribe)