feat: 扩展后端WebSocket和语音识别功能

- 扩展websocket.py支持语音消息
- 优化asr_service.py语音识别服务
- 更新main.py和requirements.txt
- 更新.env.production配置

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-03 11:29:38 +08:00
parent 748f252c2f
commit 76fd7da3c9
5 changed files with 209 additions and 33 deletions
--- a/api/routers/websocket.py
+++ b/api/routers/websocket.py
@@ -19,6 +19,7 @@ from database.models import Conversation, Segment
 from database.models import User as UserModel
 from services.auth_service import verify_token
 from services.memoir_state_service import get_or_create_state
+from services.asr_service import asr_service
 from fastapi import HTTPException, status

 logger = logging.getLogger(__name__)
@@ -28,8 +29,9 @@ class MessageType(str, Enum):
    """WebSocket 消息类型"""
    CONNECT = "connect"
    AUDIO_CHUNK = "audio_chunk"
+    AUDIO_MESSAGE = "audio_message"  # 完整音频消息（类似微信语音）
    TEXT = "text"  # 文本消息
-    TRANSCRIPT = "transcript"
+    TRANSCRIPT = "transcript"  # 语音转文字结果
    AGENT_RESPONSE = "agent_response"
    TTS_AUDIO = "tts_audio"
    END_CONVERSATION = "end_conversation"
@@ -190,6 +192,70 @@ async def websocket_endpoint(
                                manager=manager
                            )
                    
+                    elif msg_type == MessageType.AUDIO_MESSAGE:
+                        # 处理完整音频消息（类似微信语音）
+                        data = message.get("data", {})
+                        audio_base64 = data.get("audio_base64", "")
+                        audio_duration = data.get("duration", 0)
+                        
+                        if audio_base64:
+                            logger.info(f"收到音频消息，时长: {audio_duration}s")
+                            
+                            try:
+                                # 1. ASR 转写
+                                transcript_text = await asr_service.transcribe(audio_base64)
+                                logger.info(f"ASR 转写结果: {transcript_text}")
+                                
+                                # 2. 发送转写结果给客户端
+                                await manager.send_message(conversation_id, {
+                                    "type": MessageType.TRANSCRIPT,
+                                    "conversation_id": conversation_id,
+                                    "data": {
+                                        "text": transcript_text,
+                                        "audio_duration": audio_duration
+                                    },
+                                    "timestamp": datetime.now(timezone.utc).isoformat()
+                                })
+                                
+                                # 3. 保存段落到数据库（包含转写文本和音频信息）
+                                segment = Segment(
+                                    id=str(uuid.uuid4()),
+                                    conversation_id=conversation_id,
+                                    transcript_text=transcript_text,
+                                    audio_url=f"audio:{audio_duration}s",  # 简化存储，标记为音频消息
+                                    processed=False
+                                )
+                                db.add(segment)
+                                await db.commit()
+                                await db.refresh(segment)
+                                await manager.background_runner.queue_message(conversation.user_id, segment.id)
+                                
+                                # 4. Agent 生成回应（基于转写文本）
+                                if transcript_text and not transcript_text.startswith("转写失败"):
+                                    await process_user_message(
+                                        conversation_id=conversation_id,
+                                        user_message=transcript_text,
+                                        conversation=conversation,
+                                        segment=segment,
+                                        db=db,
+                                        manager=manager
+                                    )
+                                else:
+                                    # 转写失败，发送错误消息
+                                    await manager.send_message(conversation_id, {
+                                        "type": MessageType.ERROR,
+                                        "data": {"message": "语音转写失败，请重试或使用文字输入"},
+                                        "timestamp": datetime.now(timezone.utc).isoformat()
+                                    })
+                                    
+                            except Exception as e:
+                                logger.error(f"处理音频消息失败: {e}", exc_info=True)
+                                await manager.send_message(conversation_id, {
+                                    "type": MessageType.ERROR,
+                                    "data": {"message": f"处理音频消息失败: {str(e)}"},
+                                    "timestamp": datetime.now(timezone.utc).isoformat()
+                                })
+                    
                    elif msg_type == MessageType.END_CONVERSATION:
                        # 结束对话
                        conversation.status = "ended"