# api/services/asr_service.py

"""
ASR 服务：语音转文字
使用本地 faster-whisper 模型进行语音识别
"""
import asyncio
import base64
import logging
import os
import tempfile
import threading
from typing import Optional

logger = logging.getLogger(__name__)

# Model configuration, read from the environment once at import time.
# Available model sizes: tiny, base, small, medium, large-v2, large-v3.
# tiny/base suit CPU; small/medium need more resources; large needs a GPU.
ASR_MODEL_SIZE = os.environ.get("ASR_MODEL_SIZE", "small")
# Inference device: auto, cpu, or cuda.
ASR_DEVICE = os.environ.get("ASR_DEVICE", "auto")
# Compute precision: auto, int8, float16, or float32.
ASR_COMPUTE_TYPE = os.environ.get("ASR_COMPUTE_TYPE", "auto")


class ASRService:
    """Speech-to-text service backed by a local faster-whisper model.

    The model is loaded lazily on first use (or eagerly via
    ``ensure_ready``) and then reused for every transcription. A failed
    load is not cached: the next call tries again, and the text of the most
    recent failure is kept in ``_load_error``.
    """

    def __init__(self):
        self.model = None  # WhisperModel instance once loaded, otherwise None
        self._model_loaded = False  # set to True only after a successful load
        self._load_error = None  # message of the most recent load failure
        # Serializes model loading: ensure_ready() is meant to run in a
        # background thread, and transcribe() workers can also trigger the
        # lazy load at the same time.
        self._load_lock = threading.Lock()

    def _load_model(self) -> bool:
        """Load the Whisper model if it is not loaded yet.

        Blocking call. Once a load has succeeded, later calls return
        immediately. A failed load leaves the service unloaded, so the next
        call retries.

        Returns:
            True if the model is available, False otherwise (the reason is
            stored in ``self._load_error`` and logged).
        """
        if self._model_loaded:
            return self.model is not None

        with self._load_lock:
            # Another thread may have finished loading while we waited.
            if self._model_loaded:
                return self.model is not None

            # Import separately so that only a missing faster-whisper package
            # is reported as "not installed"; import errors raised later
            # (e.g. while constructing the model) are reported as load
            # failures with their real message.
            try:
                from faster_whisper import WhisperModel
            except ImportError:
                self._load_error = "faster-whisper 未安装，请运行: pip install faster-whisper"
                logger.error(self._load_error)
                return False

            try:
                logger.info(f"正在加载 Whisper 模型: {ASR_MODEL_SIZE}, device={ASR_DEVICE}, compute_type={ASR_COMPUTE_TYPE}")

                # Resolve the effective device and compute type.
                device = ASR_DEVICE
                compute_type = ASR_COMPUTE_TYPE

                if device == "auto":
                    # Prefer CUDA when torch reports it; fall back to CPU when
                    # CUDA is unavailable or torch is not installed.
                    try:
                        import torch
                        device = "cuda" if torch.cuda.is_available() else "cpu"
                    except ImportError:
                        device = "cpu"

                if compute_type == "auto":
                    # float16 on GPU; int8 quantization on CPU for speed.
                    compute_type = "float16" if device == "cuda" else "int8"

                self.model = WhisperModel(
                    ASR_MODEL_SIZE,
                    device=device,
                    compute_type=compute_type,
                )

                # Clear any error left over from an earlier failed attempt,
                # then publish the loaded state (model is assigned first so
                # the lock-free fast path never sees loaded-without-model).
                self._load_error = None
                self._model_loaded = True
                logger.info(f"Whisper 模型加载成功: {ASR_MODEL_SIZE} on {device} ({compute_type})")
                return True

            except Exception as e:
                self._load_error = f"加载 Whisper 模型失败: {str(e)}"
                logger.error(self._load_error, exc_info=True)
                return False

    def ensure_ready(self) -> bool:
        """Make sure the model is loaded (for preloading/checking at startup).

        Synchronous and blocking; run it in a background thread when called
        during application initialization.

        Returns:
            True if the model is ready, False otherwise.
        """
        return self._load_model()

    def is_ready(self) -> bool:
        """Return True if the model has been loaded and is usable."""
        return self._model_loaded and self.model is not None

    async def transcribe(self, audio_base64: str) -> Optional[str]:
        """Transcribe Base64-encoded audio to text.

        Model loading, temp-file I/O and decoding are all blocking, so the
        whole pipeline runs in the event loop's default executor instead of
        on the loop thread.

        Args:
            audio_base64: Base64-encoded audio data.

        Returns:
            The stripped transcript (possibly empty). On failure, a string
            starting with ``"转写失败: "`` followed by the error message;
            ordinary exceptions are reported this way rather than raised.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._transcribe_sync, audio_base64)

    def _transcribe_sync(self, audio_base64: str) -> str:
        """Blocking implementation of ``transcribe``; see it for the contract."""
        # Lazy-load the model on first use.
        self._load_model()

        model = self.model
        if model is None:
            error_msg = self._load_error or "ASR 模型未加载"
            logger.warning(error_msg)
            return f"转写失败: {error_msg}"

        tmp_file_path = None
        try:
            audio_bytes = base64.b64decode(audio_base64)

            # The model is given a file path, so spill the audio to a temp
            # file. delete=False keeps the file after the handle is closed;
            # it is removed in the finally block below.
            with tempfile.NamedTemporaryFile(suffix=".m4a", delete=False) as tmp_file:
                # Record the path before writing so a failed write (e.g. disk
                # full) still gets cleaned up.
                tmp_file_path = tmp_file.name
                tmp_file.write(audio_bytes)

            # language="zh" pins recognition to Chinese (skips detection);
            # beam_size=5 is the library default and may be tuned.
            segments, info = model.transcribe(
                tmp_file_path,
                language="zh",
                beam_size=5,
                vad_filter=True,  # drop silent stretches via VAD
                vad_parameters=dict(
                    min_silence_duration_ms=500,  # minimum silence length
                ),
            )

            # Per the faster-whisper docs the segments are produced lazily,
            # so the actual decoding happens during this join; it must stay
            # inside the worker thread and before the temp file is removed.
            transcript_text = "".join(segment.text for segment in segments)

            logger.info(f"ASR 转写完成: 语言={info.language}, 概率={info.language_probability:.2f}, 文本长度={len(transcript_text)}")

            return transcript_text.strip()

        except Exception as e:
            logger.error(f"ASR 转写失败: {e}", exc_info=True)
            return f"转写失败: {str(e)}"
        finally:
            # Best-effort temp-file cleanup: never let it mask the result,
            # but do not hide unexpected failures either.
            if tmp_file_path is not None:
                try:
                    os.remove(tmp_file_path)
                except FileNotFoundError:
                    pass
                except OSError as cleanup_error:
                    logger.warning(f"清理临时文件失败: {tmp_file_path}: {cleanup_error}")

# Shared module-level instance. Constructing it only initializes state; the
# Whisper model itself is loaded later, on the first _load_model() call.
asr_service = ASRService()
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
+								"""
 								ASR 服务：语音转文字
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								使用本地 faster-whisper 模型进行语音识别
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
+								"""
 								import base64
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								import logging
-												refactor: 更新服务模块

- 更新ASR服务配置
- 更新服务模块导出

											
										
										
											2026-01-18 15:57:54 +08:00
+								import os
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								import tempfile
 								from typing import Optional
-												refactor: 更新服务模块

- 更新ASR服务配置
- 更新服务模块导出

											
										
										
											2026-01-18 15:57:54 +08:00
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								logger = logging.getLogger(__name__)
 								# 模型配置
 								# 可选模型: tiny, base, small, medium, large-v2, large-v3
 								# tiny/base 适合 CPU，small/medium 需要更多资源，large 需要 GPU
 								ASR_MODEL_SIZE = os.getenv("ASR_MODEL_SIZE", "small")
 								ASR_DEVICE = os.getenv("ASR_DEVICE", "auto")  # auto, cpu, cuda
 								ASR_COMPUTE_TYPE = os.getenv("ASR_COMPUTE_TYPE", "auto")  # auto, int8, float16, float32
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
 								class ASRService:
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								    """
 								    ASR 服务（语音转文字）
 								    使用 faster-whisper 本地模型
 								    """
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
 								    def __init__(self):
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								        self.model = None
 								        self._model_loaded = False
 								        self._load_error = None
 								    def _load_model(self) -> bool:
 								        """加载模型（首次调用时执行，后续直接返回）。返回是否加载成功。"""
 								        if self._model_loaded:
 								            return self.model is not None
 								        try:
 								            from faster_whisper import WhisperModel
 								            logger.info(f"正在加载 Whisper 模型: {ASR_MODEL_SIZE}, device={ASR_DEVICE}, compute_type={ASR_COMPUTE_TYPE}")
 								            # 确定设备和计算类型
 								            device = ASR_DEVICE
 								            compute_type = ASR_COMPUTE_TYPE
 								            if device == "auto":
 								                # 自动检测：优先使用 CUDA，否则使用 CPU
 								                try:
 								                    import torch
 								                    device = "cuda" if torch.cuda.is_available() else "cpu"
 								                except ImportError:
 								                    device = "cpu"
 								            if compute_type == "auto":
 								                # 根据设备自动选择计算类型
 								                if device == "cuda":
 								                    compute_type = "float16"  # GPU 使用 float16
 								                else:
 								                    compute_type = "int8"  # CPU 使用 int8 量化，速度更快
 								            self.model = WhisperModel(
 								                ASR_MODEL_SIZE,
 								                device=device,
 								                compute_type=compute_type
 								            )
 								            self._model_loaded = True
 								            logger.info(f"Whisper 模型加载成功: {ASR_MODEL_SIZE} on {device} ({compute_type})")
 								            return True
 								        except ImportError as e:
 								            self._load_error = "faster-whisper 未安装，请运行: pip install faster-whisper"
 								            logger.error(self._load_error)
 								            return False
 								        except Exception as e:
 								            self._load_error = f"加载 Whisper 模型失败: {str(e)}"
 								            logger.error(self._load_error, exc_info=True)
 								            return False
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								    def ensure_ready(self) -> bool:
 								        """
 								        确保 ASR 模型已就绪（用于启动时预加载与检查）。
 								        可在应用初始化时调用；为同步阻塞调用，建议在后台线程执行。
 								        返回是否就绪。
 								        """
 								        return self._load_model()
 								    def is_ready(self) -> bool:
 								        """检查 ASR 模型是否已加载并可用。"""
 								        return self._model_loaded and self.model is not None
 								    async def transcribe(self, audio_base64: str) -> Optional[str]:
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
+								        """
 								        转写音频为文字
 								        Args:
 								            audio_base64: Base64 编码的音频数据
 								        Returns:
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								            转写文本，失败时返回错误信息
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
+								        """
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								        # 懒加载模型
 								        self._load_model()
 								        if not self.model:
 								            error_msg = self._load_error or "ASR 模型未加载"
 								            logger.warning(error_msg)
 								            return f"转写失败: {error_msg}"
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								        tmp_file_path = None
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
+								        try:
 								            # 解码 Base64 音频
 								            audio_bytes = base64.b64decode(audio_base64)
 								            # 保存临时文件
 								            with tempfile.NamedTemporaryFile(suffix=".m4a", delete=False) as tmp_file:
 								                tmp_file.write(audio_bytes)
 								                tmp_file_path = tmp_file.name
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								            # 使用 faster-whisper 转写
 								            # language="zh" 指定中文，可以提高识别速度
 								            # beam_size=5 是默认值，可以调整
 								            segments, info = self.model.transcribe(
 								                tmp_file_path,
 								                language="zh",
 								                beam_size=5,
 								                vad_filter=True,  # 启用 VAD 过滤静音部分
 								                vad_parameters=dict(
 								                    min_silence_duration_ms=500,  # 最小静音时长
 								                )
 								            )
 								            # 合并所有转写片段
 								            transcript_text = "".join(segment.text for segment in segments)
 								            logger.info(f"ASR 转写完成: 语言={info.language}, 概率={info.language_probability:.2f}, 文本长度={len(transcript_text)}")
 								            return transcript_text.strip() if transcript_text else ""
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
+								        except Exception as e:
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								            logger.error(f"ASR 转写失败: {e}", exc_info=True)
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
+								            return f"转写失败: {str(e)}"
-												feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-02-03 11:29:38 +08:00
+								        finally:
 								            # 清理临时文件
 								            if tmp_file_path and os.path.exists(tmp_file_path):
 								                try:
 								                    os.remove(tmp_file_path)
 								                except Exception:
 								                    pass
-												添加API服务模块

											
										
										
											2026-01-07 11:56:46 +08:00
 								# 全局实例
 								asr_service = ASRService()