api/app/adapters/asr/whisper_local.py

"""Local faster-whisper ASR adapter — implements ASRProvider port."""

from __future__ import annotations

import asyncio
import os
import re
import tempfile
import threading
from typing import Any, Iterable

from app.core.logging import get_logger

logger = get_logger(__name__)

# Subtitle-credit watermark: a credit keyword followed within 20 characters by
# "by"/"BY"/"By", or "字幕" followed by optional whitespace and "by".
_SUBTITLE_WATERMARK_RE = re.compile(
    r"(字幕|听译|压制|字幕组).{0,20}(by|BY|By)|字幕\s*by",
    re.UNICODE,
)


def _looks_like_subtitle_hallucination(text: str) -> bool:
    """Return True when *text* looks like a hallucinated subtitle watermark.

    On silent audio the retry pass tends to emit video subtitle credits; only
    short strings of that kind are flagged so they can be discarded.

    Text longer than 48 characters (after stripping) is never flagged. Shorter
    text is flagged when it matches the watermark pattern, or when it is at
    most 12 characters, mentions "字幕" and contains no sentence punctuation.
    """
    candidate = text.strip() if text else ""
    if len(candidate) > 48:
        return False

    if _SUBTITLE_WATERMARK_RE.search(candidate) is not None:
        return True

    is_very_short = len(candidate) <= 12
    mentions_subtitle = "字幕" in candidate
    has_punctuation = re.search(r"[？?！!。，、]", candidate) is not None
    return is_very_short and mentions_subtitle and not has_punctuation


def _join_segment_text(segments: Iterable[Any]) -> tuple[str, int]:
    """Concatenate the ``text`` of every segment and count the segments.

    A segment without a ``text`` attribute, or with a falsy one, contributes
    an empty string but is still counted.

    Returns:
        A ``(text, count)`` pair: the joined text with surrounding whitespace
        stripped, and the number of segments consumed from *segments*.
    """
    pieces: list[str] = []
    for segment in segments:
        value = getattr(segment, "text", "")
        pieces.append(str(value or ""))
    return "".join(pieces).strip(), len(pieces)


# Default model directory: "models/whisper" three levels above the directory
# containing this file, normalised so the ".." components are collapsed.
_DEFAULT_CACHE_DIR = os.path.normpath(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        *(("..",) * 3),
        "models",
        "whisper",
    )
)


class WhisperASRProvider:
    """Speech-to-text provider backed by a local faster-whisper model.

    The model is built lazily on first use (or eagerly through
    ``ensure_ready``) and cached on the instance. Transcription is fixed to
    ``language="zh"`` and retries without VAD when the first pass yields no
    text.
    """

    def __init__(
        self,
        model_size: str = "small",
        device: str = "auto",
        compute_type: str = "auto",
        cache_dir: str = "",
    ) -> None:
        """Store the configuration; the model itself is not loaded here.

        Args:
            model_size: Model name/size handed to ``WhisperModel``.
            device: Target device; ``"auto"`` picks ``"cuda"`` when torch is
                importable and reports CUDA as available, otherwise ``"cpu"``.
            compute_type: Compute type; ``"auto"`` resolves to ``"float16"``
                on CUDA and ``"int8"`` otherwise.
            cache_dir: Model directory. When non-empty the model is loaded
                with ``local_files_only=True``; when empty the module default
                directory is used and ``local_files_only`` is False.
        """
        self._model_size = model_size
        self._device = device
        self._compute_type = compute_type
        self._cache_dir = cache_dir
        self._model = None
        # transcribe() loads the model from worker threads; this lock keeps
        # concurrent first calls from each constructing their own model.
        self._load_lock = threading.Lock()

    def _load_model(self) -> bool:
        """Create the model once and cache it on the instance.

        Returns:
            True when a model is available, False when loading failed (the
            error is logged rather than raised).
        """
        if self._model is not None:
            return True
        with self._load_lock:
            # Another thread may have finished loading while we waited.
            if self._model is not None:
                return True
            try:
                from faster_whisper import WhisperModel

                device = self._device
                if device == "auto":
                    try:
                        import torch  # type: ignore[import-untyped]

                        device = "cuda" if torch.cuda.is_available() else "cpu"
                    except ImportError:
                        device = "cpu"

                compute_type = self._compute_type
                if compute_type == "auto":
                    compute_type = "float16" if device == "cuda" else "int8"

                download_root = self._cache_dir or _DEFAULT_CACHE_DIR
                os.makedirs(download_root, exist_ok=True)

                self._model = WhisperModel(
                    self._model_size,
                    device=device,
                    compute_type=compute_type,
                    download_root=download_root,
                    # Only an explicitly configured directory is treated as
                    # local-files-only.
                    local_files_only=bool(self._cache_dir),
                )
                return True
            except Exception as e:
                logger.error("Failed to load Whisper model: {}", e)
                return False

    def ensure_ready(self) -> bool:
        """Load the model if needed; return True when it is available."""
        return self._load_model()

    async def transcribe(self, audio: bytes, format: str = "m4a") -> str:
        """Transcribe encoded audio bytes to text.

        Model loading and inference both run in a worker thread so that
        neither blocks the asyncio event loop.

        Args:
            audio: Encoded audio content.
            format: File extension (without the dot) used as the suffix of
                the temporary file handed to the model.

        Returns:
            The recognised text, or ``""`` when the model is unavailable,
            nothing was recognised, or transcription failed (failures are
            logged, not raised).
        """
        return await asyncio.to_thread(self._transcribe_blocking, audio, format)

    def _transcribe_blocking(self, audio: bytes, audio_format: str) -> str:
        """Blocking body of ``transcribe``; intended to run in a worker thread."""
        self._load_model()
        model = self._model
        if model is None:
            return ""

        tmp_path = None
        try:
            # delete=False: the file is closed before the model opens it by
            # path, so it has to outlive the ``with`` block; it is removed in
            # ``finally``.
            with tempfile.NamedTemporaryFile(
                suffix=f".{audio_format}", delete=False
            ) as tmp:
                # Record the path before writing so a failed write still
                # gets cleaned up.
                tmp_path = tmp.name
                tmp.write(audio)
            return self._transcribe_file(model, tmp_path)
        except Exception as e:
            logger.error("Whisper transcribe failed: {}", e)
            return ""
        finally:
            if tmp_path:
                try:
                    os.remove(tmp_path)
                except OSError:
                    # Best-effort cleanup; a missing or locked temp file must
                    # not mask the transcription result.
                    pass

    def _transcribe_file(self, model: Any, path: str) -> str:
        """Transcribe the audio file at *path* using up to three passes.

        Pass 1 uses VAD. If it yields no text, pass 2 retries without VAD on
        the same file. If that is empty or looks like a subtitle-watermark
        hallucination, pass 3 decodes the file to a 16 kHz waveform and
        retries without VAD once more.
        """
        segments, _info = model.transcribe(
            path,
            language="zh",
            beam_size=5,
            vad_filter=True,
            vad_parameters={
                "min_silence_duration_ms": 500,
                "threshold": 0.35,
                "min_speech_duration_ms": 200,
            },
        )
        text, _ = _join_segment_text(segments)
        if text:
            return text

        logger.info(
            "Whisper VAD pass 无文本，关闭 VAD 再试一次（短录音易被 VAD 判为静音）"
        )
        retry_text = self._transcribe_without_vad(model, path)
        if retry_text and not _looks_like_subtitle_hallucination(retry_text):
            return retry_text
        if retry_text:
            logger.info(
                "Whisper 丢弃疑似字幕水印幻听: {!r}",
                retry_text[:120],
            )

        try:
            from faster_whisper import decode_audio

            waveform = decode_audio(path, sampling_rate=16000)
            fallback_text = self._transcribe_without_vad(model, waveform)
            if fallback_text and _looks_like_subtitle_hallucination(fallback_text):
                logger.info(
                    "Whisper decode_audio 回退仍是疑似字幕水印幻听: {!r}",
                    fallback_text[:120],
                )
                return ""
            return fallback_text
        except Exception as ex:
            # The fallback is optional; its failure just means "no text".
            logger.warning("Whisper decode_audio 回退失败: {}", ex)
            return ""

    @staticmethod
    def _transcribe_without_vad(model: Any, source: Any) -> str:
        """Run one no-VAD decoding pass over *source* and return the joined text.

        *source* is whatever ``model.transcribe`` accepts; callers pass either
        a file path or a decoded waveform.
        """
        segments, _info = model.transcribe(
            source,
            language="zh",
            beam_size=5,
            vad_filter=False,
            condition_on_previous_text=False,
            # Slightly raised so fewer boundary segments get flagged as
            # no_speech, which would otherwise leave the whole result empty.
            no_speech_threshold=0.85,
        )
        text, _ = _join_segment_text(segments)
        return text
-												Merge branch 'refactor/backend-architecture' into development

											
										
										
											2026-03-18 17:18:23 +08:00
+								"""Local faster-whisper ASR adapter — implements ASRProvider port."""
-												feat(memory,conversation): 记忆富化/证据包、时间线幂等字段与对话分段全链路

数据库
- 新增迁移 0003：timeline_events.memory_source_id 外键 → memory_sources，便于按 ingest 源做时间线幂等

后端 - 记忆
- 新增 ingest 后 LLM 富化（摘要/事实/时间线），可配置开关与最大字符数
- 新增证据包组装：合并 chunk、摘要、事实、时间线、故事等检索结果；支持空 query 时是否仍带 rolling 等开关
- repo/retriever/service/router/schemas/summarizer/timeline/extractor 等扩展；文档 memory-retrieval.md 更新

后端 - 对话 WS
- 增加 PING/PONG；分段 ASR 日志与空音频处理；转写失败与「无助手回复」错误提示更明确
- 助手多段回复持久化使用统一分隔符，与分段逻辑一致

后端 - Agent
- reply_limits：按 [SPLIT] 与段落拆段，并保证非空 fallback，供 WS 与 TTS 多段下发

后端 - 回忆录任务
- transcript ingest 记录 source_id；任务成功结?

											
										
										
											2026-03-27 16:01:28 +08:00
+								from __future__ import annotations
 								import asyncio
-												Merge branch 'refactor/backend-architecture' into development

											
										
										
											2026-03-18 17:18:23 +08:00
+								import os
-												feat(memory,conversation): 记忆富化/证据包、时间线幂等字段与对话分段全链路

数据库
- 新增迁移 0003：timeline_events.memory_source_id 外键 → memory_sources，便于按 ingest 源做时间线幂等

后端 - 记忆
- 新增 ingest 后 LLM 富化（摘要/事实/时间线），可配置开关与最大字符数
- 新增证据包组装：合并 chunk、摘要、事实、时间线、故事等检索结果；支持空 query 时是否仍带 rolling 等开关
- repo/retriever/service/router/schemas/summarizer/timeline/extractor 等扩展；文档 memory-retrieval.md 更新

后端 - 对话 WS
- 增加 PING/PONG；分段 ASR 日志与空音频处理；转写失败与「无助手回复」错误提示更明确
- 助手多段回复持久化使用统一分隔符，与分段逻辑一致

后端 - Agent
- reply_limits：按 [SPLIT] 与段落拆段，并保证非空 fallback，供 WS 与 TTS 多段下发

后端 - 回忆录任务
- transcript ingest 记录 source_id；任务成功结?

											
										
										
											2026-03-27 16:01:28 +08:00
+								import re
-												Merge branch 'refactor/backend-architecture' into development

											
										
										
											2026-03-18 17:18:23 +08:00
+								import tempfile
-												feat(memory,conversation): 记忆富化/证据包、时间线幂等字段与对话分段全链路

数据库
- 新增迁移 0003：timeline_events.memory_source_id 外键 → memory_sources，便于按 ingest 源做时间线幂等

后端 - 记忆
- 新增 ingest 后 LLM 富化（摘要/事实/时间线），可配置开关与最大字符数
- 新增证据包组装：合并 chunk、摘要、事实、时间线、故事等检索结果；支持空 query 时是否仍带 rolling 等开关
- repo/retriever/service/router/schemas/summarizer/timeline/extractor 等扩展；文档 memory-retrieval.md 更新

后端 - 对话 WS
- 增加 PING/PONG；分段 ASR 日志与空音频处理；转写失败与「无助手回复」错误提示更明确
- 助手多段回复持久化使用统一分隔符，与分段逻辑一致

后端 - Agent
- reply_limits：按 [SPLIT] 与段落拆段，并保证非空 fallback，供 WS 与 TTS 多段下发

后端 - 回忆录任务
- transcript ingest 记录 source_id；任务成功结?

											
										
										
											2026-03-27 16:01:28 +08:00
+								from typing import Any, Iterable
 								from app.core.logging import get_logger
-												Merge branch 'refactor/backend-architecture' into development

											
										
										
											2026-03-18 17:18:23 +08:00
 								logger = get_logger(__name__)
-												feat(memory,conversation): 记忆富化/证据包、时间线幂等字段与对话分段全链路

数据库
- 新增迁移 0003：timeline_events.memory_source_id 外键 → memory_sources，便于按 ingest 源做时间线幂等

后端 - 记忆
- 新增 ingest 后 LLM 富化（摘要/事实/时间线），可配置开关与最大字符数
- 新增证据包组装：合并 chunk、摘要、事实、时间线、故事等检索结果；支持空 query 时是否仍带 rolling 等开关
- repo/retriever/service/router/schemas/summarizer/timeline/extractor 等扩展；文档 memory-retrieval.md 更新

后端 - 对话 WS
- 增加 PING/PONG；分段 ASR 日志与空音频处理；转写失败与「无助手回复」错误提示更明确
- 助手多段回复持久化使用统一分隔符，与分段逻辑一致

后端 - Agent
- reply_limits：按 [SPLIT] 与段落拆段，并保证非空 fallback，供 WS 与 TTS 多段下发

后端 - 回忆录任务
- transcript ingest 记录 source_id；任务成功结?

											
										
										
											2026-03-27 16:01:28 +08:00
+								_SUBTITLE_WATERMARK_RE = re.compile(
 								    r"(字幕|听译|压制|字幕组).{0,20}(by|BY|By)|字幕\s*by",
 								    re.UNICODE,
 								)
 								def _looks_like_subtitle_hallucination(text: str) -> bool:
 								    """静音时第二遍易吐出视频字幕水印；仅丢弃此类短句。"""
 								    t = (text or "").strip()
 								    if len(t) > 48:
 								        return False
 								    if _SUBTITLE_WATERMARK_RE.search(t):
 								        return True
 								    if len(t) <= 12 and "字幕" in t and not re.search(r"[？?！!。，、]", t):
 								        return True
 								    return False
 								def _join_segment_text(segments: Iterable[Any]) -> tuple[str, int]:
 								    segs = list(segments)
 								    return "".join(str(getattr(seg, "text", "") or "") for seg in segs).strip(), len(
 								        segs
 								    )
-												Merge branch 'refactor/backend-architecture' into development

											
										
										
											2026-03-18 17:18:23 +08:00
+								_DEFAULT_CACHE_DIR = os.path.normpath(
-												chore/ 删除无用文件

											
										
										
											2026-03-19 14:36:14 +08:00
+								    os.path.join(
 								        os.path.dirname(os.path.abspath(__file__)),
 								        "..",
 								        "..",
 								        "..",
 								        "models",
 								        "whisper",
 								    )
-												Merge branch 'refactor/backend-architecture' into development

											
										
										
											2026-03-18 17:18:23 +08:00
+								)
 								class WhisperASRProvider:
 								    def __init__(
 								        self,
 								        model_size: str = "small",
 								        device: str = "auto",
 								        compute_type: str = "auto",
 								        cache_dir: str = "",
 								    ):
 								        self._model_size = model_size
 								        self._device = device
 								        self._compute_type = compute_type
 								        self._cache_dir = cache_dir
 								        self._model = None
 								    def _load_model(self) -> bool:
 								        if self._model is not None:
 								            return True
 								        try:
 								            from faster_whisper import WhisperModel
 								            device = self._device
 								            compute_type = self._compute_type
 								            if device == "auto":
 								                try:
 								                    import torch  # type: ignore[import-untyped]
-												chore/ 删除无用文件

											
										
										
											2026-03-19 14:36:14 +08:00
-												Merge branch 'refactor/backend-architecture' into development

											
										
										
											2026-03-18 17:18:23 +08:00
+								                    device = "cuda" if torch.cuda.is_available() else "cpu"
 								                except ImportError:
 								                    device = "cpu"
 								            if compute_type == "auto":
 								                compute_type = "float16" if device == "cuda" else "int8"
 								            download_root = self._cache_dir or _DEFAULT_CACHE_DIR
 								            local_files_only = bool(self._cache_dir)
 								            os.makedirs(download_root, exist_ok=True)
 								            self._model = WhisperModel(
 								                self._model_size,
 								                device=device,
 								                compute_type=compute_type,
 								                download_root=download_root,
 								                local_files_only=local_files_only,
 								            )
 								            return True
 								        except Exception as e:
-												feat(api+app): 对话阶段化、回忆录流水线与客户端会话体验
- DB: segments 用户输入文本（Alembic 0002）
- Chat: 阶段检测/阶段提示/回复限制，编排与访谈/画像 prompts 调整
- Memoir: 忠实度检查 agent，叙事与分类等链路更新
- Core: agent 日志、Alembic 启动、LangChain/日志/配置等
- Story: time_hints；Memory 检索与相关测试
- Expo: 助手头像、会话页与消息拆分、实时会话与文案/i18n
- Docs/scripts/tests: 迁移脚本、LLM JSON/记忆检索文档、新增单测

											
										
										
											2026-03-26 12:13:36 +08:00
+								            logger.error("Failed to load Whisper model: {}", e)
-												Merge branch 'refactor/backend-architecture' into development

											
										
										
											2026-03-18 17:18:23 +08:00
+								            return False
 								    def ensure_ready(self) -> bool:
 								        return self._load_model()
 								    async def transcribe(self, audio: bytes, format: str = "m4a") -> str:
-												feat(memory,conversation): 记忆富化/证据包、时间线幂等字段与对话分段全链路

数据库
- 新增迁移 0003：timeline_events.memory_source_id 外键 → memory_sources，便于按 ingest 源做时间线幂等

后端 - 记忆
- 新增 ingest 后 LLM 富化（摘要/事实/时间线），可配置开关与最大字符数
- 新增证据包组装：合并 chunk、摘要、事实、时间线、故事等检索结果；支持空 query 时是否仍带 rolling 等开关
- repo/retriever/service/router/schemas/summarizer/timeline/extractor 等扩展；文档 memory-retrieval.md 更新

后端 - 对话 WS
- 增加 PING/PONG；分段 ASR 日志与空音频处理；转写失败与「无助手回复」错误提示更明确
- 助手多段回复持久化使用统一分隔符，与分段逻辑一致

后端 - Agent
- reply_limits：按 [SPLIT] 与段落拆段，并保证非空 fallback，供 WS 与 TTS 多段下发

后端 - 回忆录任务
- transcript ingest 记录 source_id；任务成功结?

											
										
										
											2026-03-27 16:01:28 +08:00
+								        # 与 v1.1.0 相同的单次 transcribe；推理放线程池，避免阻塞 asyncio（tag 上为同步调用）。
-												Merge branch 'refactor/backend-architecture' into development

											
										
										
											2026-03-18 17:18:23 +08:00
+								        self._load_model()
 								        if not self._model:
 								            return ""
-												feat(memory,conversation): 记忆富化/证据包、时间线幂等字段与对话分段全链路

数据库
- 新增迁移 0003：timeline_events.memory_source_id 外键 → memory_sources，便于按 ingest 源做时间线幂等

后端 - 记忆
- 新增 ingest 后 LLM 富化（摘要/事实/时间线），可配置开关与最大字符数
- 新增证据包组装：合并 chunk、摘要、事实、时间线、故事等检索结果；支持空 query 时是否仍带 rolling 等开关
- repo/retriever/service/router/schemas/summarizer/timeline/extractor 等扩展；文档 memory-retrieval.md 更新

后端 - 对话 WS
- 增加 PING/PONG；分段 ASR 日志与空音频处理；转写失败与「无助手回复」错误提示更明确
- 助手多段回复持久化使用统一分隔符，与分段逻辑一致

后端 - Agent
- reply_limits：按 [SPLIT] 与段落拆段，并保证非空 fallback，供 WS 与 TTS 多段下发

后端 - 回忆录任务
- transcript ingest 记录 source_id；任务成功结?

											
										
										
											2026-03-27 16:01:28 +08:00
+								        model = self._model
 								        def _sync_transcribe() -> str:
 								            tmp_path = None
 								            try:
 								                with tempfile.NamedTemporaryFile(
 								                    suffix=f".{format}", delete=False
 								                ) as tmp:
 								                    tmp.write(audio)
 								                    tmp_path = tmp.name
 								                segments, _info = model.transcribe(
 								                    tmp_path,
 								                    language="zh",
 								                    beam_size=5,
 								                    vad_filter=True,
 								                    vad_parameters={
 								                        "min_silence_duration_ms": 500,
 								                        "threshold": 0.35,
 								                        "min_speech_duration_ms": 200,
 								                    },
 								                )
 								                text, pass1_seg_count = _join_segment_text(segments)
 								                used_second_pass = False
 								                pass2_seg_count = 0
 								                pass3_seg_count = 0
 								                if not text:
 								                    logger.info(
 								                        "Whisper VAD pass 无文本，关闭 VAD 再试一次（短录音易被 VAD 判为静音）"
 								                    )
 								                    segments2, _info2 = model.transcribe(
 								                        tmp_path,
 								                        language="zh",
 								                        beam_size=5,
 								                        vad_filter=False,
 								                        condition_on_previous_text=False,
 								                        # 略抬高：减少边界片段被标成 no_speech 而整段为空
 								                        no_speech_threshold=0.85,
 								                    )
 								                    raw2, pass2_seg_count = _join_segment_text(segments2)
 								                    used_second_pass = True
 								                    if raw2 and _looks_like_subtitle_hallucination(raw2):
 								                        logger.info(
 								                            "Whisper 丢弃疑似字幕水印幻听: {!r}",
 								                            raw2[:120],
 								                        )
 								                        text = ""
 								                    else:
 								                        text = raw2
 								                if not text and used_second_pass:
 								                    try:
 								                        from faster_whisper import decode_audio
 								                        audio_np = decode_audio(tmp_path, sampling_rate=16000)
 								                        segments3, _info3 = model.transcribe(
 								                            audio_np,
 								                            language="zh",
 								                            beam_size=5,
 								                            vad_filter=False,
 								                            condition_on_previous_text=False,
 								                            no_speech_threshold=0.85,
 								                        )
 								                        raw3, pass3_seg_count = _join_segment_text(segments3)
 								                        if raw3 and _looks_like_subtitle_hallucination(raw3):
 								                            logger.info(
 								                                "Whisper decode_audio 回退仍是疑似字幕水印幻听: {!r}",
 								                                raw3[:120],
 								                            )
 								                        elif raw3:
 								                            text = raw3
 								                    except Exception as ex:
 								                        logger.warning("Whisper decode_audio 回退失败: {}", ex)
 								                return text
 								            except Exception as e:
 								                logger.error("Whisper transcribe failed: {}", e)
 								                return ""
 								            finally:
 								                if tmp_path and os.path.exists(tmp_path):
 								                    try:
 								                        os.remove(tmp_path)
 								                    except OSError:
 								                        pass
 								        return await asyncio.to_thread(_sync_transcribe)