Files
life-echo/api/app/adapters/asr/whisper_local.py
Kevin e4bf0710c7 feat(memory,conversation): 记忆富化/证据包、时间线幂等字段与对话分段全链路
数据库
- 新增迁移 0003:timeline_events.memory_source_id 外键 → memory_sources,便于按 ingest 源做时间线幂等

后端 - 记忆
- 新增 ingest 后 LLM 富化(摘要/事实/时间线),可配置开关与最大字符数
- 新增证据包组装:合并 chunk、摘要、事实、时间线、故事等检索结果;支持空 query 时是否仍带 rolling 等开关
- repo/retriever/service/router/schemas/summarizer/timeline/extractor 等扩展;文档 memory-retrieval.md 更新

后端 - 对话 WS
- 增加 PING/PONG;分段 ASR 日志与空音频处理;转写失败与「无助手回复」错误提示更明确
- 助手多段回复持久化使用统一分隔符,与分段逻辑一致

后端 - Agent
- reply_limits:按 [SPLIT] 与段落拆段,并保证非空 fallback,供 WS 与 TTS 多段下发

后端 - 回忆录任务
- transcript ingest 记录 source_id;任务成功结…
2026-03-27 16:24:43 +08:00

196 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Local faster-whisper ASR adapter — implements ASRProvider port."""
from __future__ import annotations
import asyncio
import os
import re
import tempfile
from typing import Any, Iterable
from app.core.logging import get_logger
logger = get_logger(__name__)
_SUBTITLE_WATERMARK_RE = re.compile(
r"(字幕|听译|压制|字幕组).{0,20}(by|BY|By)|字幕\s*by",
re.UNICODE,
)
def _looks_like_subtitle_hallucination(text: str) -> bool:
"""静音时第二遍易吐出视频字幕水印;仅丢弃此类短句。"""
t = (text or "").strip()
if len(t) > 48:
return False
if _SUBTITLE_WATERMARK_RE.search(t):
return True
if len(t) <= 12 and "字幕" in t and not re.search(r"[?!。,、]", t):
return True
return False
def _join_segment_text(segments: Iterable[Any]) -> tuple[str, int]:
segs = list(segments)
return "".join(str(getattr(seg, "text", "") or "") for seg in segs).strip(), len(
segs
)
# Default model directory: "models/whisper" located three directory levels
# above the directory that contains this file.
_DEFAULT_CACHE_DIR = os.path.normpath(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        os.pardir,
        os.pardir,
        os.pardir,
        "models",
        "whisper",
    )
)
class WhisperASRProvider:
    """Speech-to-text provider backed by a local faster-whisper model.

    The model is created lazily on first use and cached on the instance.
    Failures while loading or transcribing are logged and reported to the
    caller as ``False`` / an empty string instead of being raised.
    """

    def __init__(
        self,
        model_size: str = "small",
        device: str = "auto",
        compute_type: str = "auto",
        cache_dir: str = "",
    ):
        """Store the configuration; the model itself is not loaded here.

        Args:
            model_size: Model identifier handed to ``WhisperModel``.
            device: Device name, or ``"auto"`` to pick ``"cuda"`` when torch
                reports CUDA as available and ``"cpu"`` otherwise.
            compute_type: Compute type, or ``"auto"`` for ``"float16"`` on
                CUDA and ``"int8"`` elsewhere.
            cache_dir: Model directory. When non-empty it is used as
                ``download_root`` and the model is opened with
                ``local_files_only=True``; when empty the module default
                directory is used with ``local_files_only=False``.
        """
        self._model_size = model_size
        self._device = device
        self._compute_type = compute_type
        self._cache_dir = cache_dir
        self._model: Any = None

    def _load_model(self) -> bool:
        """Create the Whisper model once and cache it on the instance.

        Returns:
            True when a model is available, False when loading failed (the
            error is logged, not raised).
        """
        if self._model is not None:
            return True
        try:
            # Imported lazily so the module can be imported without the
            # dependency installed.
            from faster_whisper import WhisperModel

            device = self._device
            if device == "auto":
                try:
                    import torch  # type: ignore[import-untyped]

                    device = "cuda" if torch.cuda.is_available() else "cpu"
                except ImportError:
                    device = "cpu"

            compute_type = self._compute_type
            if compute_type == "auto":
                compute_type = "float16" if device == "cuda" else "int8"

            download_root = self._cache_dir or _DEFAULT_CACHE_DIR
            os.makedirs(download_root, exist_ok=True)
            self._model = WhisperModel(
                self._model_size,
                device=device,
                compute_type=compute_type,
                download_root=download_root,
                # An explicitly configured cache directory is treated as the
                # only source of model files.
                local_files_only=bool(self._cache_dir),
            )
            return True
        except Exception as e:
            logger.error("Failed to load Whisper model: {}", e)
            return False

    def ensure_ready(self) -> bool:
        """Load the model if necessary; return whether it is available."""
        return self._load_model()

    async def transcribe(self, audio: bytes, format: str = "m4a") -> str:
        """Transcribe *audio* to text.

        Inference runs in a worker thread so the event loop is not blocked.

        Args:
            audio: Encoded audio bytes.
            format: File extension (without the dot) used for the temporary
                file that is handed to the model.

        Returns:
            The recognized text, or "" when the audio is empty, the model is
            unavailable, nothing was recognized, or transcription failed.
        """
        if not audio:
            # Nothing to decode: skip the temp file and the model entirely.
            return ""
        self._load_model()
        model = self._model
        if not model:
            return ""
        return await asyncio.to_thread(self._transcribe_sync, model, audio, format)

    def _transcribe_sync(self, model: Any, audio: bytes, audio_format: str) -> str:
        """Blocking transcription: VAD pass first, then no-VAD fallbacks."""
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(
                suffix=f".{audio_format}", delete=False
            ) as tmp:
                # Record the path before writing so the file is still removed
                # in ``finally`` if the write itself fails.
                tmp_path = tmp.name
                tmp.write(audio)

            segments, _info = model.transcribe(
                tmp_path,
                language="zh",
                beam_size=5,
                vad_filter=True,
                vad_parameters={
                    "min_silence_duration_ms": 500,
                    "threshold": 0.35,
                    "min_speech_duration_ms": 200,
                },
            )
            text, _count = _join_segment_text(segments)
            if text:
                return text

            # Short recordings are easily classified as silence by VAD, so
            # retry once with VAD disabled.
            logger.info(
                "Whisper VAD pass 无文本,关闭 VAD 再试一次(短录音易被 VAD 判为静音)"
            )
            second = self._transcribe_without_vad(model, tmp_path)
            if second and _looks_like_subtitle_hallucination(second):
                logger.info(
                    "Whisper 丢弃疑似字幕水印幻听: {!r}",
                    second[:120],
                )
            elif second:
                return second

            return self._decode_audio_fallback(model, tmp_path)
        except Exception as e:
            logger.error("Whisper transcribe failed: {}", e)
            return ""
        finally:
            if tmp_path:
                # Best-effort cleanup; a missing file is also an OSError.
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    @staticmethod
    def _transcribe_without_vad(model: Any, source: Any) -> str:
        """Run one transcription with VAD disabled and return the joined text."""
        segments, _info = model.transcribe(
            source,
            language="zh",
            beam_size=5,
            vad_filter=False,
            condition_on_previous_text=False,
            # Raised slightly so fewer borderline segments are labelled
            # no_speech, which would leave the whole result empty.
            no_speech_threshold=0.85,
        )
        text, _count = _join_segment_text(segments)
        return text

    def _decode_audio_fallback(self, model: Any, path: str) -> str:
        """Last resort: decode the file to 16 kHz samples and transcribe those.

        Returns "" when decoding fails, nothing is recognized, or the result
        looks like a subtitle watermark.
        """
        try:
            from faster_whisper import decode_audio

            samples = decode_audio(path, sampling_rate=16000)
            third = self._transcribe_without_vad(model, samples)
            if third and _looks_like_subtitle_hallucination(third):
                logger.info(
                    "Whisper decode_audio 回退仍是疑似字幕水印幻听: {!r}",
                    third[:120],
                )
                return ""
            return third
        except Exception as ex:
            logger.warning("Whisper decode_audio 回退失败: {}", ex)
            return ""