# life-echo/api/app/adapters/asr/whisper_local.py

"""Local faster-whisper ASR adapter — implements ASRProvider port."""

from __future__ import annotations

import asyncio
import os
import re
import tempfile
import threading
from typing import Any, Iterable

from app.core.business_telemetry import business_span
from app.core.logging import get_logger
from app.ports.asr import ASRTranscriptionError

logger = get_logger(__name__)

# Matches subtitle-credit "watermark" lines: one of the keywords 字幕 (subtitles),
# 听译 (listen-and-translate), 压制 (encoding) or 字幕组 (subtitle group) followed
# within 20 characters by "by"/"BY"/"By", or 字幕 followed by optional whitespace
# and a lowercase "by".
_SUBTITLE_WATERMARK_RE = re.compile(
    r"(字幕|听译|压制|字幕组).{0,20}(by|BY|By)|字幕\s*by", flags=re.UNICODE
)


def _looks_like_subtitle_hallucination(text: str) -> bool:
    """Return True when *text* looks like a hallucinated subtitle watermark.

    On silent audio the second transcription pass tends to emit subtitle
    credits from videos; only short strings of that kind are flagged.

    A string longer than 48 characters (after stripping) is never flagged.
    Otherwise it is flagged when it matches ``_SUBTITLE_WATERMARK_RE``, or when
    it is at most 12 characters long, contains "字幕" and has none of the
    listed punctuation marks.
    """
    candidate = text.strip() if text else ""

    if len(candidate) > 48:
        return False
    if _SUBTITLE_WATERMARK_RE.search(candidate) is not None:
        return True

    # Very short, punctuation-free fragment that mentions "subtitles".
    return (
        len(candidate) <= 12
        and "字幕" in candidate
        and re.search(r"[？?！!。，、]", candidate) is None
    )


def _join_segment_text(segments: Iterable[Any]) -> tuple[str, int]:
    """Concatenate the ``text`` of every segment and count the segments.

    The iterable is consumed exactly once. A segment whose ``text`` attribute
    is missing or falsy contributes an empty string but is still counted.

    Returns:
        A ``(text, count)`` pair: the joined text with surrounding whitespace
        stripped, and the number of segments seen.
    """
    pieces: list[str] = []
    for segment in segments:
        value = getattr(segment, "text", "")
        pieces.append(str(value) if value else "")
    return "".join(pieces).strip(), len(pieces)


# Default model directory used when no cache_dir is configured:
# "models/whisper" three directory levels above the directory of this module.
_DEFAULT_CACHE_DIR = os.path.normpath(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        *([os.pardir] * 3),
        "models",
        "whisper",
    )
)


class WhisperASRProvider:
    """ASR provider backed by a locally loaded faster-whisper model.

    The model is loaded lazily on first use (or eagerly via ``ensure_ready``)
    and cached on the instance. Transcription always requests
    ``language="zh"``.

    Args:
        model_size: Model name/size handed to ``WhisperModel``.
        device: Device handed to ``WhisperModel``. ``"auto"`` selects
            ``"cuda"`` when torch is importable and reports CUDA available,
            otherwise ``"cpu"``.
        compute_type: Compute type handed to ``WhisperModel``. ``"auto"``
            selects ``"float16"`` on cuda and ``"int8"`` otherwise.
        cache_dir: Model directory. When non-empty it is used as the download
            root and the model is loaded with ``local_files_only=True``; when
            empty, ``_DEFAULT_CACHE_DIR`` is used with
            ``local_files_only=False``.
    """

    def __init__(
        self,
        model_size: str = "small",
        device: str = "auto",
        compute_type: str = "auto",
        cache_dir: str = "",
    ):
        self._model_size = model_size
        self._device = device
        self._compute_type = compute_type
        self._cache_dir = cache_dir
        self._model = None
        # The lazy load can be triggered from several worker threads at once
        # (concurrent transcribe() calls); the lock makes sure it runs once.
        self._load_lock = threading.Lock()

    def _load_model(self) -> bool:
        """Load the model once and cache it on the instance.

        Returns:
            True when a model is available. False when loading failed; the
            error is logged rather than raised.
        """
        if self._model is not None:
            return True
        with self._load_lock:
            # Re-check under the lock: another thread may have finished
            # loading while this one was waiting.
            if self._model is not None:
                return True
            try:
                from faster_whisper import WhisperModel

                device = self._device
                if device == "auto":
                    try:
                        import torch  # type: ignore[import-untyped]

                        device = "cuda" if torch.cuda.is_available() else "cpu"
                    except ImportError:
                        device = "cpu"

                compute_type = self._compute_type
                if compute_type == "auto":
                    compute_type = "float16" if device == "cuda" else "int8"

                download_root = self._cache_dir or _DEFAULT_CACHE_DIR
                os.makedirs(download_root, exist_ok=True)

                self._model = WhisperModel(
                    self._model_size,
                    device=device,
                    compute_type=compute_type,
                    download_root=download_root,
                    # An explicitly configured directory is used offline only.
                    local_files_only=bool(self._cache_dir),
                )
                return True
            except Exception as e:
                logger.error("Failed to load Whisper model: {}", e)
                return False

    def ensure_ready(self) -> bool:
        """Eagerly load the model and report whether it is available."""
        return self._load_model()

    async def transcribe(self, audio: bytes, format: str = "m4a") -> str:
        """Transcribe encoded audio bytes to text inside a telemetry span.

        Args:
            audio: Raw bytes of the audio file.
            format: File extension (without dot) used for the temporary file.

        Returns:
            The transcribed text; may be empty when nothing was recognised.

        Raises:
            ASRTranscriptionError: If the model cannot be loaded or the
                transcription fails.
        """
        with business_span("asr.transcribe", provider="whisper"):
            return await self._transcribe_inner(audio, format)

    async def _transcribe_inner(self, audio: bytes, format: str) -> str:
        """Run model loading and inference off the event loop.

        Both steps are blocking calls, so each is dispatched with
        ``asyncio.to_thread``. The thread hop for loading is skipped once the
        model is cached.
        """
        if self._model is None:
            await asyncio.to_thread(self._load_model)
        model = self._model
        if model is None:
            raise ASRTranscriptionError("Whisper model not loaded")

        return await asyncio.to_thread(
            self._transcribe_blocking, model, audio, format
        )

    def _transcribe_blocking(self, model: Any, audio: bytes, format: str) -> str:
        """Write *audio* to a temp file and transcribe it (blocking).

        Runs a first pass with the VAD filter on; if that yields no text,
        falls back to ``_transcribe_without_vad``. The temp file is always
        removed afterwards.

        Raises:
            ASRTranscriptionError: Wrapping any failure of file creation or of
                the first/second transcription pass.
        """
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(
                suffix=f".{format}", delete=False
            ) as tmp:
                # Record the path before writing so that a failed write still
                # gets cleaned up in ``finally``.
                tmp_path = tmp.name
                tmp.write(audio)

            segments, _ = model.transcribe(
                tmp_path,
                language="zh",
                beam_size=5,
                vad_filter=True,
                vad_parameters={
                    "min_silence_duration_ms": 500,
                    "threshold": 0.35,
                    "min_speech_duration_ms": 200,
                },
            )
            text, _ = _join_segment_text(segments)
            if not text:
                text = self._transcribe_without_vad(model, tmp_path)
            return text
        except ASRTranscriptionError:
            raise
        except Exception as e:
            logger.error("Whisper transcribe failed: {}", e)
            raise ASRTranscriptionError(f"Whisper transcribe failed: {e!s}") from e
        finally:
            if tmp_path is not None:
                # Best-effort cleanup: a missing or undeletable temp file must
                # not mask the transcription result or error.
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    def _transcribe_without_vad(self, model: Any, path: str) -> str:
        """Fallback passes used when the VAD pass produced no text.

        Pass 2 transcribes the file with VAD disabled. If that is empty or is
        discarded as a subtitle-watermark hallucination, pass 3 decodes the
        file with ``decode_audio`` and transcribes the decoded audio with the
        same options. A failure in pass 3 is logged and ignored.

        Returns:
            The recovered text, or ``""`` when both passes yield nothing usable.
        """
        logger.info(
            "Whisper VAD pass 无文本，关闭 VAD 再试一次（短录音易被 VAD 判为静音）"
        )
        no_vad_options = {
            "language": "zh",
            "beam_size": 5,
            "vad_filter": False,
            "condition_on_previous_text": False,
            # Slightly raised: fewer boundary segments get labelled no_speech,
            # which would otherwise leave the whole result empty.
            "no_speech_threshold": 0.85,
        }

        segments, _ = model.transcribe(path, **no_vad_options)
        raw, _ = _join_segment_text(segments)
        if raw and _looks_like_subtitle_hallucination(raw):
            logger.info(
                "Whisper 丢弃疑似字幕水印幻听: {!r}",
                raw[:120],
            )
        elif raw:
            return raw

        try:
            from faster_whisper import decode_audio

            audio_np = decode_audio(path, sampling_rate=16000)
            segments, _ = model.transcribe(audio_np, **no_vad_options)
            raw, _ = _join_segment_text(segments)
            if raw and _looks_like_subtitle_hallucination(raw):
                logger.info(
                    "Whisper decode_audio 回退仍是疑似字幕水印幻听: {!r}",
                    raw[:120],
                )
            elif raw:
                return raw
        except Exception as ex:
            logger.warning("Whisper decode_audio 回退失败: {}", ex)

        return ""