diff --git a/.github/workflows/app-expo-deploy.yml b/.github/workflows/app-expo-deploy.yml index 6c05400..521a081 100644 --- a/.github/workflows/app-expo-deploy.yml +++ b/.github/workflows/app-expo-deploy.yml @@ -92,6 +92,15 @@ jobs: npm run lint npm run test:ci + - name: Set API environment + working-directory: app-expo + run: | + case "${{ steps.env.outputs.env }}" in + prod) node scripts/use-env.js production ;; + stage) node scripts/use-env.js staging ;; + *) node scripts/use-env.js development ;; + esac + - name: Export web build working-directory: app-expo run: npx expo export -p web diff --git a/api/.env.example b/api/.env.example index e473cf5..78b8235 100644 --- a/api/.env.example +++ b/api/.env.example @@ -65,6 +65,16 @@ TENCENT_SECRET_ID=your_tencent_asr_secret_id TENCENT_SECRET_KEY=your_tencent_asr_secret_key # TENCENT_ASR_APP_ID= +# ============================================================================= +# TTS (openai | tencent) +# ============================================================================= +TTS_PROVIDER=tencent +# 仅 TTS_PROVIDER=openai 时需要 +# OPENAI_API_KEY=your_openai_api_key +# 仅 TTS_PROVIDER=tencent 时生效,与 ASR 共用 TENCENT_SECRET_ID / TENCENT_SECRET_KEY +# 音色 ID 见 https://cloud.tencent.com/document/product/1073/92668 +TTS_VOICE_TYPE=603004 + # ============================================================================= # WeChat Pay # ============================================================================= diff --git a/api/app/adapters/tts/tencent_tts.py b/api/app/adapters/tts/tencent_tts.py new file mode 100644 index 0000000..13fbc9b --- /dev/null +++ b/api/app/adapters/tts/tencent_tts.py @@ -0,0 +1,79 @@ +"""Tencent Cloud TTS adapter — implements TTSProvider port. 
class TencentTTSProvider:
    """Tencent Cloud text-to-speech adapter (implements the TTSProvider port).

    Wraps the ``TextToVoice`` call of the ``tencentcloud`` SDK. The SDK is
    imported lazily inside the methods, and every failure path returns empty
    bytes instead of raising, so callers can treat TTS as best-effort.
    """

    # Per-request text limits. The original inline note gives roughly 150
    # Chinese characters or 500 English letters per request.
    # NOTE(review): how the service counts mixed-language text is not visible
    # here, so any non-ASCII character selects the stricter limit.
    _MAX_CHARS_NON_ASCII = 150
    _MAX_CHARS_ASCII = 500

    def __init__(
        self,
        secret_id: str,
        secret_key: str,
        voice_type: int = 603004,
        codec: str = "mp3",
        sample_rate: int = 16000,
    ):
        """Store credentials and synthesis settings; no network call is made.

        Args:
            secret_id: Tencent Cloud API secret id.
            secret_key: Tencent Cloud API secret key.
            voice_type: Tencent voice id sent as ``VoiceType``.
            codec: Audio codec sent as ``Codec``.
            sample_rate: Sample rate sent as ``SampleRate``.
        """
        self._secret_id = secret_id
        self._secret_key = secret_key
        self._voice_type = voice_type
        self._codec = codec
        self._sample_rate = sample_rate
        self._client = None  # created on first use by _get_client()

    def _get_client(self):
        """Return the cached SDK client, creating it on first use.

        Returns:
            The ``TtsClient`` instance, or ``None`` when the SDK cannot be
            imported or the client cannot be constructed (the error is logged).
        """
        if self._client is not None:
            return self._client
        try:
            from tencentcloud.common import credential
            from tencentcloud.common.profile.client_profile import ClientProfile
            from tencentcloud.common.profile.http_profile import HttpProfile
            from tencentcloud.tts.v20190823 import tts_client

            cred = credential.Credential(self._secret_id, self._secret_key)
            http_profile = HttpProfile()
            http_profile.endpoint = "tts.tencentcloudapi.com"
            client_profile = ClientProfile()
            client_profile.httpProfile = http_profile
            self._client = tts_client.TtsClient(cred, "", client_profile)
            return self._client
        except Exception as e:
            logger.error("Tencent TTS client init failed: %s", e)
            return None

    @classmethod
    def _clip_text(cls, text: str) -> str:
        """Truncate ``text`` to the per-request length limit.

        Pure-ASCII text is cut at 500 characters; text containing any
        non-ASCII character is cut at 150, so that Chinese input is not sent
        over the limit (the previous fixed 500-character cut was).
        """
        limit = cls._MAX_CHARS_ASCII if text.isascii() else cls._MAX_CHARS_NON_ASCII
        return text[:limit]

    def _synthesize_sync(self, text: str) -> bytes:
        """Blocking synthesis; intended to run in a worker thread.

        Returns:
            Decoded audio bytes, or ``b""`` when no client is available or the
            response carries no audio. SDK exceptions propagate to the caller.
        """
        client = self._get_client()
        if not client:
            return b""
        from tencentcloud.tts.v20190823 import models

        req = models.TextToVoiceRequest()
        req.Text = self._clip_text(text)
        req.SessionId = f"tts-{uuid.uuid4().hex}"
        req.VoiceType = self._voice_type
        req.Codec = self._codec
        req.SampleRate = self._sample_rate
        req.PrimaryLanguage = 1  # 1 = Chinese

        resp = client.TextToVoice(req)
        # The response carries the audio base64-encoded.
        return base64.b64decode(resp.Audio) if resp.Audio else b""

    async def synthesize(self, text: str, voice: str = "alloy") -> bytes:
        """Convert text to speech without blocking the event loop.

        Args:
            text: Text to speak; empty or whitespace-only text is skipped.
            voice: Accepted for signature compatibility and ignored; the voice
                is chosen by the ``voice_type`` given at construction.

        Returns:
            Audio bytes in the configured codec (mp3 by default), or ``b""``
            when the text is blank, credentials are missing, or synthesis
            fails (the failure is logged, never raised).
        """
        if not text or not text.strip():
            return b""
        if not self._secret_id or not self._secret_key:
            return b""
        try:
            return await asyncio.to_thread(self._synthesize_sync, text)
        except Exception as e:
            logger.error("Tencent TTS synthesize failed: %s", e)
            return b""
+from app.core.dependencies import get_asr_provider, get_tts_provider from app.features.memoir.state_service import get_or_create_state logger = get_logger(__name__) + +async def _send_tts_audio(conversation_id: str, text: str) -> None: + """Synthesize text to speech and send TTS_AUDIO if successful.""" + try: + tts = get_tts_provider() + audio_bytes = await tts.synthesize(text) + if not audio_bytes: + logger.warning( + "TTS skipped: synthesize returned empty. Check TTS config in .env" + ) + return + await manager.send_message(conversation_id, { + "type": MessageType.TTS_AUDIO, + "conversation_id": conversation_id, + "data": { + "audio_base64": base64.b64encode(audio_bytes).decode("utf-8"), + "format": "mp3", + }, + "timestamp": datetime.now(timezone.utc).isoformat(), + }) + except Exception as e: + err_str = str(e) + if "PkgExhausted" in err_str: + logger.warning( + "TTS skipped: 腾讯云语音合成资源包已用尽,请在控制台购买或开通后付费: %s", + err_str[:100], + ) + else: + logger.error("TTS synthesize failed: %s", e) + # ── Agent 实例(从 ConnectionManager 移出) ───────────────────── conversation_agent = ConversationAgent() memory_agent = MemoryAgent() @@ -447,6 +477,7 @@ async def process_user_message( "data": {"text": response_text, "index": i, "total": len(responses)}, "timestamp": datetime.now(timezone.utc).isoformat(), }) + await _send_tts_audio(conversation_id, response_text) if i < len(responses) - 1: await asyncio.sleep(0.5) return @@ -498,6 +529,7 @@ async def process_user_message( "data": {"text": response_text, "index": i, "total": len(responses)}, "timestamp": datetime.now(timezone.utc).isoformat(), }) + await _send_tts_audio(conversation_id, response_text) if i < len(responses) - 1: await asyncio.sleep(0.5) diff --git a/app-expo/.gitignore b/app-expo/.gitignore index f8c6c2e..7769eac 100644 --- a/app-expo/.gitignore +++ b/app-expo/.gitignore @@ -32,6 +32,8 @@ yarn-error.* # local env files .env*.local +# generated .env (from use-env script) +.env # typescript *.tsbuildinfo diff --git 
a/app-expo/package.json b/app-expo/package.json index 597d6db..d1c183c 100644 --- a/app-expo/package.json +++ b/app-expo/package.json @@ -3,11 +3,15 @@ "main": "expo-router/entry", "version": "1.0.0", "scripts": { + "use-env": "node scripts/use-env.js", + "prestart": "npm run use-env -- development", "start": "expo start", + "start:staging": "npm run use-env -- staging && expo start", + "start:prod": "npm run use-env -- production && expo start", "reset-project": "node ./scripts/reset-project.js", - "android": "expo run:android", - "ios": "expo run:ios", - "web": "expo start --web", + "android": "npm run use-env -- development && expo run:android", + "ios": "npm run use-env -- development && expo run:ios", + "web": "npm run use-env -- development && expo start --web", "lint": "expo lint", "test": "jest --watch", "test:changed": "jest --onlyChanged --coverage=false", diff --git a/app-expo/src/app/(main)/conversation/[id].tsx b/app-expo/src/app/(main)/conversation/[id].tsx index 5b3ece2..82b7ac3 100644 --- a/app-expo/src/app/(main)/conversation/[id].tsx +++ b/app-expo/src/app/(main)/conversation/[id].tsx @@ -1,6 +1,14 @@ import { Image } from 'expo-image'; import { useLocalSearchParams } from 'expo-router'; -import { Mic, Pause, Play, PlusCircle, Type, X } from 'lucide-react-native'; +import { + Mic, + Pause, + Play, + PlusCircle, + Type, + Volume2, + X, +} from 'lucide-react-native'; import React, { useCallback, useEffect, useRef, useState } from 'react'; import { Alert, @@ -23,6 +31,7 @@ import { useThemeColors } from '@/hooks/use-theme-colors'; import { useMessages, useRealtimeSession } from '@/features/conversation/hooks'; import type { MessageItem } from '@/features/conversation/types'; import { audioFocus } from '@/core/audio/audio-focus'; +import { usePlayer } from '@/features/voice/hooks/use-player'; import { useRecorder } from '@/features/voice/hooks/use-recorder'; import { useAudioPlayer, useAudioPlayerStatus } from 'expo-audio'; @@ -543,10 +552,12 @@ 
export default function ConversationScreen() { const { t } = useTranslation('conversation'); const { t: tApp } = useTranslation('app'); const { data: messages } = useMessages(id); + const { enqueueTtsAudio, status: playerStatus } = usePlayer(); const { connectionState, streamingMessage, sendText, sendVoiceMessage } = useRealtimeSession({ - conversationId: id, + conversationId: id ?? '', enabled: !!id, + onTtsAudio: enqueueTtsAudio, }); const handleRecordingComplete = useCallback( @@ -606,6 +617,14 @@ export default function ConversationScreen() { title={ {tApp('name')} + {playerStatus === 'playing' && ( + + )} void; } const MIN_RECORDING_DURATION_SEC = 1; @@ -136,6 +137,7 @@ interface RealtimeSessionState { export function useRealtimeSession({ conversationId, enabled = true, + onTtsAudio, }: UseRealtimeSessionOptions): RealtimeSessionState { const queryClient = useQueryClient(); const sessionRef = useRef(null); @@ -168,6 +170,7 @@ export function useRealtimeSession({ conversationId, queryClient, onStreamingText: handleStreamingText, + onTtsAudio, onError: handleError, onStateChange: setConnectionState, }); @@ -181,7 +184,7 @@ export function useRealtimeSession({ setConnectionState('disconnected'); setStreamingMessage(null); }; - }, [conversationId, enabled, queryClient, handleStreamingText, handleError]); + }, [conversationId, enabled, queryClient, handleStreamingText, handleError, onTtsAudio]); const sendText = useCallback( (text: string) => { diff --git a/app-expo/src/features/conversation/realtime-session.ts b/app-expo/src/features/conversation/realtime-session.ts index 7869da9..933db80 100644 --- a/app-expo/src/features/conversation/realtime-session.ts +++ b/app-expo/src/features/conversation/realtime-session.ts @@ -18,6 +18,7 @@ interface RealtimeSessionOptions { conversationId: string; queryClient: QueryClient; onStreamingText?: StreamingTextCallback; + onTtsAudio?: (audioBase64: string) => void; onError?: ErrorCallback; onStateChange?: WsStateListener; } @@ 
-38,6 +39,7 @@ export class RealtimeSession { private conversationId: string; private queryClient: QueryClient; private onStreamingText?: StreamingTextCallback; + private onTtsAudio?: (audioBase64: string) => void; private onError?: ErrorCallback; private unsubEvent: (() => void) | null = null; private unsubState: (() => void) | null = null; @@ -49,6 +51,7 @@ export class RealtimeSession { this.conversationId = options.conversationId; this.queryClient = options.queryClient; this.onStreamingText = options.onStreamingText; + this.onTtsAudio = options.onTtsAudio; this.onError = options.onError; this.unsubEvent = this.client.onEvent(this.handleEvent); @@ -117,6 +120,11 @@ export class RealtimeSession { return; } + if (event.kind === 'tts_audio_received') { + this.onTtsAudio?.(event.audioBase64); + return; + } + handleWsEvent(this.queryClient, event); if (event.kind === 'session_error') {