From 3d01085442bf6ffec53d45d36a5b5c8f6de3f83b Mon Sep 17 00:00:00 2001 From: Kevin Date: Tue, 12 May 2026 10:42:44 +0800 Subject: [PATCH] =?UTF-8?q?fix(conversation):=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E5=AE=9E=E6=97=B6=E4=BC=9A=E8=AF=9D=20TTS/=E5=9B=9E=E5=A4=8D?= =?UTF-8?q?=E8=A2=AB=E7=A6=BB=E5=B1=8F=20WS=20=E6=8A=A2=E5=8D=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 列表预热仅预取消息缓存,避免后台 WebSocket 覆盖服务端连接 - RealtimeSession UI 回调按 owner 独占,防止 offscreen 覆盖聊天页 - 列表页聚焦时再 prewarm,会话页 TTS 入队优先 base64 - 管线下发 TTS 同时带 audio_base64 与 audio_url;协议说明同步 - 移除 TTS 排查用前后端调试日志,保留错误/告警 - 补充 WS / RealtimeSession / entry-warmup / 播放器相关单测 Co-authored-by: Cursor --- api/app/adapters/tts/tencent_tts.py | 33 --- api/app/features/conversation/ws/pipeline.py | 118 +------- api/app/features/conversation/ws/protocol.md | 2 +- api/app/features/conversation/ws/router.py | 42 --- app-expo/src/app/(main)/conversation/[id].tsx | 53 +++- app-expo/src/app/(tabs)/index.tsx | 5 +- app-expo/src/core/audio/audio-focus.ts | 12 +- app-expo/src/core/settings/app-settings.ts | 12 + app-expo/src/core/ws/types.ts | 3 +- .../conversation-ws-background-pool.ts | 12 +- .../src/features/conversation/entry-warmup.ts | 20 +- app-expo/src/features/conversation/hooks.ts | 23 +- .../features/conversation/realtime-session.ts | 207 ++++++++++++-- .../src/features/voice/hooks/use-player.ts | 15 +- app-expo/tests/core/ws/client.test.ts | 39 +++ .../conversation/entry-warmup.test.ts | 15 ++ .../realtime-session-sync-order.test.ts | 253 ++++++++++++++++++ .../tests/features/voice/use-player.test.tsx | 40 +++ 18 files changed, 643 insertions(+), 261 deletions(-) create mode 100644 app-expo/tests/features/conversation/realtime-session-sync-order.test.ts diff --git a/api/app/adapters/tts/tencent_tts.py b/api/app/adapters/tts/tencent_tts.py index 501cdd1..2377fa3 100644 --- a/api/app/adapters/tts/tencent_tts.py +++ b/api/app/adapters/tts/tencent_tts.py @@ -137,18 +137,6 @@ class TencentTTSProvider: # 显式声明使用新模型;大模型音色(501xxx)若不带该字段会被旧模型拒绝并静默返回空音频。 req.ModelType = MODEL_TYPE_LLM - # 长期保留 INFO:TTS 实际请求腾讯云 SDK 时的关键参数 - logger.info( - "tencent_tts._synthesize_sync request voice_type={} primary_language={} " - "model_type={} sample_rate={} codec={} text_len={}", - voice_type, - primary_language, - MODEL_TYPE_LLM, - req.SampleRate, - self._codec, - len(text or ""), - ) - resp = client.TextToVoice(req) request_id = getattr(resp, "RequestId", None) if resp is not None else None audio_b64 = getattr(resp, "Audio", "") if resp is not None else "" @@ -163,15 +151,6 @@ class TencentTTSProvider: ) return b"" audio_bytes = base64.b64decode(audio_b64) - # 长期保留 INFO:腾讯云 SDK 返回的 request_id + 音频字节数(用户排查必需) - logger.info( - "tencent_tts._synthesize_sync response request_id={} audio_bytes_len={} " - "voice_type={} primary_language={}", - request_id, - len(audio_bytes), - voice_type, - primary_language, - ) return audio_bytes except TencentCloudSDKException as e: logger.error( @@ -225,18 +204,6 @@ class TencentTTSProvider: else: voice_type = VOICE_MAP.get(v, default_voice) chunks = _chunk_text(text, max_chars=max_chars) - # 长期保留 INFO:adapter 入口的 language / voice_type / chunk_count(排查必需) - logger.info( - "tencent_tts.synthesize entry language={} voice_arg={} resolved_voice_type={} " - "primary_language={} max_chars={} text_len={} chunk_count={}", - language, - voice, - voice_type, - primary_language, - max_chars, - len(text or ""), - len(chunks), - ) if not chunks: return b"" diff --git a/api/app/features/conversation/ws/pipeline.py b/api/app/features/conversation/ws/pipeline.py index 6270422..9f5090e 100644 --- a/api/app/features/conversation/ws/pipeline.py +++ b/api/app/features/conversation/ws/pipeline.py @@ -99,41 +99,10 @@ async def _send_tts_audio( ) -> str | None: """Synthesize TTS, upload to COS, append Redis, send TTS_AUDIO. Returns public URL or None.""" current_epoch = _tts_epoch_value(conversation_id) - # 长期保留 INFO:TTS 决策与执行链路必须在 INFO 级别全程可见 - logger.info( - "pipeline._send_tts_audio entry conversation_id={} chunk_index={} chunk_total={} " - "text_len={} language={} manual={} tts_epoch_start={} current_epoch={} " - "enable_tts={} provider={}", - conversation_id, - chunk_index, - chunk_total, - len(text or ""), - language, - manual, - tts_epoch_start, - current_epoch, - settings.enable_tts, - settings.tts_provider, - ) # enable_tts:仅禁用「助手回复自动生成 TTS」(want_tts 路径);用户点喇叭(manual=True)仍可合成。 if not manual and not settings.enable_tts: - logger.info( - "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False " - "url_set=False audio_bytes_len=0 reason=enable_tts_false", - conversation_id, - chunk_index, - ) return None if current_epoch != tts_epoch_start: - logger.info( - "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False " - "url_set=False audio_bytes_len=0 reason=epoch_mismatch_pre_synth " - "tts_epoch_start={} current_epoch={}", - conversation_id, - chunk_index, - tts_epoch_start, - current_epoch, - ) return None try: tts = get_tts_provider() @@ -148,50 +117,19 @@ async def _send_tts_audio( (text or "")[:30], settings.tts_provider, ) - logger.info( - "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False " - "url_set=False audio_bytes_len=0 reason=synthesize_empty", - conversation_id, - chunk_index, - ) return None if _tts_epoch_value(conversation_id) != tts_epoch_start: - logger.info( - "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False " - "url_set=False audio_bytes_len={} reason=epoch_mismatch_post_synth", - conversation_id, - chunk_index, - len(audio_bytes), - ) return None ext = _tts_object_ext(settings.tts_codec) content_type = _tts_codec_to_content_type(settings.tts_codec) storage = get_object_storage() key = f"conversations/{conversation_id}/tts/{uuid.uuid4().hex}.{ext}" - upload_started = time.perf_counter() - logger.debug( - "pipeline._send_tts_audio uploading key={} audio_bytes_len={} content_type={}", - key, - len(audio_bytes), - content_type, - ) public_url = storage.upload(key, audio_bytes, content_type) - upload_ms = (time.perf_counter() - upload_started) * 1000 # 与 `tts_delivery.apply_presigned_tts_urls_to_messages` / 回忆录图片 presign 一致:下发可播 URL playback_url = storage.get_url(key, expires=TTS_PRESIGNED_EXPIRES_SEC) - logger.debug( - "pipeline._send_tts_audio uploaded key={} audio_bytes_len={} upload_ms={:.2f} " - "public_url_set={} playback_url_set={}", - key, - len(audio_bytes), - upload_ms, - bool(public_url), - bool(playback_url), - ) - audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") payload_data: Dict[str, Any] = { - "audio_base64": audio_b64, "format": settings.tts_codec, + "audio_base64": base64.b64encode(audio_bytes).decode("utf-8"), "audio_url": playback_url, "index": chunk_index, "total": chunk_total, @@ -200,16 +138,6 @@ async def _send_tts_audio( payload_data["assistant_message_id"] = assistant_message_id if manual: payload_data["manual"] = True - logger.debug( - "pipeline._send_tts_audio sending TTS_AUDIO conversation_id={} chunk_index={} " - "chunk_total={} payload_fields={} audio_b64_len={} manual={}", - conversation_id, - chunk_index, - chunk_total, - sorted(payload_data.keys()), - len(audio_b64), - manual, - ) await manager.send_message( conversation_id, { @@ -219,16 +147,6 @@ async def _send_tts_audio( "timestamp": datetime.now(timezone.utc).isoformat(), }, ) - logger.info( - "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=True " - "url_set={} audio_bytes_len={} upload_ms={:.2f} manual={}", - conversation_id, - chunk_index, - bool(public_url), - len(audio_bytes), - upload_ms, - manual, - ) return public_url except Exception as e: err_str = str(e) @@ -239,13 +157,6 @@ async def _send_tts_audio( ) else: logger.error("TTS synthesize failed: {}", e) - logger.info( - "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False " - "url_set=False audio_bytes_len=0 reason=exception err={}", - conversation_id, - chunk_index, - type(e).__name__, - ) return None @@ -1035,18 +946,6 @@ async def process_user_message( segment.id, len(user_message or ""), ) - # 长期保留:TTS 决策入口(pipeline 层);INFO 级别可见所有控制位 - logger.info( - "pipeline.process_user_message entry conversation_id={} segment_id={} " - "tts_this_turn={} force_skip_tts={} enable_tts={} provider={} user_language={}", - conversation_id, - segment.id, - tts_this_turn, - force_skip_tts, - settings.enable_tts, - settings.tts_provider, - user_language, - ) is_from_voice = bool(segment.audio_url) voice_session_id = _voice_session_id_from_audio_url(segment.audio_url) audio_dur = getattr(segment, "audio_duration_seconds", None) @@ -1074,21 +973,6 @@ async def process_user_message( skip_tts = bool(turn.skip_tts) want_voice = bool(tts_this_turn) if tts_this_turn is not None else False want_tts = want_voice and settings.enable_tts and not skip_tts - # 长期保留 INFO:TTS 决策最终结论;不再被 agent_summary_enabled 门控 - logger.info( - "pipeline.process_user_message tts_decision conversation_id={} segment_id={} " - "tts_this_turn={} force_skip_tts={} enable_tts={} skip_tts_from_turn={} " - "want_voice={} want_tts={} response_segments={}", - conversation_id, - segment.id, - tts_this_turn, - force_skip_tts, - settings.enable_tts, - skip_tts, - want_voice, - want_tts, - len(turn.messages), - ) if agent_summary_enabled(): logger.info( "pipeline.process_user_message duration_ms={:.2f} " diff --git a/api/app/features/conversation/ws/protocol.md b/api/app/features/conversation/ws/protocol.md index c198c8a..1b03b0c 100644 --- a/api/app/features/conversation/ws/protocol.md +++ b/api/app/features/conversation/ws/protocol.md @@ -20,7 +20,7 @@ - `TRANSCRIPT`: ASR 转写结果 - `AGENT_RESPONSE`: AI 回复文本分段 -- `TTS_AUDIO`: 语音合成结果(可与 `COS` 签名 URL、`base64` 并存)。按需朗读成功时 `data.manual` 可为 `true`,提示客户端应播放(即使用户未开「本轮 Speak」)。 +- `TTS_AUDIO`: 语音合成结果。标准链路下发 `data.audio_base64` 作为本轮即时播放通道,并同时下发 `data.audio_url`(COS 预签名可播放 URL)供客户端绑定气泡、退出重进后复播。按需朗读成功时 `data.manual` 可为 `true`,提示客户端应播放(即使用户未开「本轮 Speak」)。 - `MEMOIR_UPDATE`: 回忆录更新通知 - `ERROR`: 错误信息 diff --git a/api/app/features/conversation/ws/router.py b/api/app/features/conversation/ws/router.py index fbb32e3..cb57568 100644 --- a/api/app/features/conversation/ws/router.py +++ b/api/app/features/conversation/ws/router.py @@ -289,13 +289,6 @@ async def websocket_endpoint( data = message.get("data") or {} text_message = data.get("text", "") tts_this_turn = bool(data.get("tts_this_turn")) - # 长期保留:TTS 决策入口可见性(INFO 级别即可定位 FE 是否带 tts_this_turn) - logger.info( - "ws.user_message tts_this_turn={} conversation_id={} text_len={}", - tts_this_turn, - conversation_id, - len(text_message or ""), - ) if text_message: can_send, quota_msg = await check_ws_quota( @@ -397,16 +390,6 @@ async def websocket_endpoint( is_last = bool(data.get("is_last", False)) audio_duration = int(data.get("duration", 0) or 0) tts_this_turn_segment = bool(data.get("tts_this_turn")) - # 长期保留:分段语音轮的 TTS 决策入口可见性 - logger.info( - "ws.audio_segment tts_this_turn={} is_last={} " - "conversation_id={} voice_session_id={} segment_index_raw={}", - tts_this_turn_segment, - is_last, - conversation_id, - voice_session_id, - segment_index_raw, - ) if not audio_base64: await manager.send_message( @@ -524,13 +507,6 @@ async def websocket_endpoint( audio_base64 = data.get("audio_base64", "") audio_duration = data.get("duration", 0) tts_this_turn = bool(data.get("tts_this_turn")) - # 长期保留:单次整段音频路径的 TTS 决策入口可见性 - logger.info( - "ws.audio_message tts_this_turn={} conversation_id={} duration_s={}", - tts_this_turn, - conversation_id, - audio_duration, - ) if audio_base64: can_send, quota_msg = await check_ws_quota( @@ -719,15 +695,6 @@ async def websocket_endpoint( st_val = None else: st_val = str(st).strip() or None - logger.info( - "ws.TTS_REQUEST received conversation_id={} user_id={} " - "assistant_message_id={} segment_index={} segment_text_len={}", - conversation_id, - user_id, - str(aid).strip(), - seg_idx, - len(st_val or ""), - ) ok, err_msg = await handle_tts_request_on_demand( conversation_id=conversation_id, user_id=user_id, @@ -736,15 +703,6 @@ async def websocket_endpoint( segment_text=st_val, db=db, ) - logger.info( - "ws.TTS_REQUEST handled conversation_id={} assistant_message_id={} " - "segment_index={} ok={} err_msg={}", - conversation_id, - str(aid).strip(), - seg_idx, - ok, - err_msg, - ) if not ok: await manager.send_message( conversation_id, diff --git a/app-expo/src/app/(main)/conversation/[id].tsx b/app-expo/src/app/(main)/conversation/[id].tsx index b460c9b..45a871a 100644 --- a/app-expo/src/app/(main)/conversation/[id].tsx +++ b/app-expo/src/app/(main)/conversation/[id].tsx @@ -46,6 +46,10 @@ import { Icon } from '@/components/ui/icon'; import { Text } from '@/components/ui/text'; import { ScreenHeader } from '@/components/screen-header'; import { resolveApiMediaUrl } from '@/core/api/media-url'; +import { + getTtsSpeakDefault, + setTtsSpeakDefault, +} from '@/core/settings/app-settings'; import { useAppSettings } from '@/hooks/use-app-settings'; import { useThemeColors } from '@/hooks/use-theme-colors'; import { useTypography } from '@/core/typography-context'; @@ -1238,15 +1242,14 @@ export default function ConversationScreen() { const handleTtsSegment = useCallback( (p: TtsSegmentPayload) => { - // 闸门用于丢弃「用户已打断后」迟到的自动 TTS;按需朗读 (manual) 是当前明确操作,必须放行。 - const allowByGate = - p.manual === true || ttsGate.current.shouldAcceptIncomingTts(); - if (!allowByGate) return; const convId = id ?? ''; const cosUrl = p.audioUrl?.trim(); + const isManualPlayback = !!p.manual; + const shouldAutoPlay = !!p.autoPlay; /** - * 播放走 WS,但「再读」依赖 MessageItem.ttsAudioUrls。乐观提交的消息没有 URL, - * 服务端 attach 要等整轮结束;收到 COS URL 时写入缓存,按钮才能用。 + * COS URL 先写入缓存:**与Speak无关**,否则录音/打断后闸门关闭时会跳过 merge, + * 按需朗读下发的 `tts_audio` 无法绑定到气泡,喇叭表现为「不可用」。 + * 闸门仅约束是否入队播放(迟到自动朗读),不禁用 URL 附着。 */ if (cosUrl && convId) { queryClient.setQueryData( @@ -1307,8 +1310,13 @@ export default function ConversationScreen() { ); } + const gateAllowsPlayback = + isManualPlayback || ttsGate.current.shouldAcceptIncomingTts(); const shouldEnqueue = - p.manual === true || lastUserMessageRequestedTtsRef.current; + isManualPlayback || + shouldAutoPlay || + lastUserMessageRequestedTtsRef.current; + if (!gateAllowsPlayback) return; if (!shouldEnqueue) return; const listKey = @@ -1325,8 +1333,8 @@ export default function ConversationScreen() { ...shared, uri: `data:audio/mp3;base64,${p.audioBase64}`, }); - } else if (p.audioUrl) { - void enqueue({ ...shared, uri: p.audioUrl }); + } else if (cosUrl) { + void enqueue({ ...shared, uri: cosUrl }); } }, [enqueue, id, queryClient], @@ -1399,8 +1407,9 @@ export default function ConversationScreen() { const [input, setInput] = useState(''); const [inputResetKey, setInputResetKey] = useState(0); - /** 本条发出的用户消息是否请求助手朗读(先 TTS 再出字) */ + /** 本条发出的用户消息是否请求助手朗读(先 TTS 再出字);默认值从存储恢复 */ const [ttsThisTurn, setTtsThisTurn] = useState(false); + const [ttsSpeakPrefReady, setTtsSpeakPrefReady] = useState(false); const [inputMode, setInputMode] = useState('text'); const [isKeyboardVisible, setIsKeyboardVisible] = useState(false); const inputModeRef = useRef('text'); @@ -1489,6 +1498,24 @@ export default function ConversationScreen() { inputModeRef.current = inputMode; }, [inputMode]); + useEffect(() => { + let cancelled = false; + void (async () => { + try { + const v = await getTtsSpeakDefault(); + if (!cancelled) { + setTtsThisTurn(v); + setTtsSpeakPrefReady(true); + } + } catch { + if (!cancelled) setTtsSpeakPrefReady(true); + } + })(); + return () => { + cancelled = true; + }; + }, []); + useEffect(() => { const onKeyboardWillShow = () => { if (inputModeRef.current !== 'text') return; @@ -1687,7 +1714,11 @@ export default function ConversationScreen() { { + setTtsThisTurn(v); + void setTtsSpeakDefault(v); + }} trackColor={{ false: CHAT_COLORS.outline, true: CHAT_COLORS.secondaryContainer, diff --git a/app-expo/src/app/(tabs)/index.tsx b/app-expo/src/app/(tabs)/index.tsx index ff5235b..99b4906 100644 --- a/app-expo/src/app/(tabs)/index.tsx +++ b/app-expo/src/app/(tabs)/index.tsx @@ -2,6 +2,7 @@ import { Image } from 'expo-image'; import { router } from 'expo-router'; import React, { useEffect, useRef, useState } from 'react'; import { useQueryClient } from '@tanstack/react-query'; +import { useIsFocused } from '@react-navigation/native'; import { Alert, AppState, @@ -302,6 +303,7 @@ function findTodayConversationToResume( export default function ConversationsScreen() { const { t } = useTranslation('conversation'); const queryClient = useQueryClient(); + const isFocused = useIsFocused(); const { data: conversations = [], isLoading } = useConversations(); const createConversation = useCreateConversation(); @@ -448,12 +450,13 @@ export default function ConversationsScreen() { * 单槽连接池:换会话会自动 dispose 旧槽,所以这里只挑一条最像即将被点的。 */ useEffect(() => { + if (!isFocused) return; if (isLoading) return; const candidate = todayConversation ?? conversations.find(conversationHasAnyMessage); if (!candidate) return; prewarmConversationSession(queryClient, candidate.id); - }, [isLoading, conversations, todayConversation, queryClient]); + }, [isFocused, isLoading, conversations, todayConversation, queryClient]); return ( diff --git a/app-expo/src/core/audio/audio-focus.ts b/app-expo/src/core/audio/audio-focus.ts index 8241782..5c40395 100644 --- a/app-expo/src/core/audio/audio-focus.ts +++ b/app-expo/src/core/audio/audio-focus.ts @@ -20,7 +20,10 @@ function notify() { */ export const audioFocus = { async acquireForRecording(): Promise { - if (currentOwner === 'recorder') return true; + if (currentOwner === 'recorder') { + await setIsAudioActiveAsync(true); + return true; + } if (currentOwner === 'player') { await this.releaseIfOwnedBy('player'); @@ -30,6 +33,7 @@ export const audioFocus = { playsInSilentMode: true, allowsRecording: true, }); + await setIsAudioActiveAsync(true); currentOwner = 'recorder'; notify(); @@ -37,7 +41,10 @@ export const audioFocus = { }, async acquireForPlayback(): Promise { - if (currentOwner === 'player') return true; + if (currentOwner === 'player') { + await setIsAudioActiveAsync(true); + return true; + } if (currentOwner === 'recorder') { return false; @@ -47,6 +54,7 @@ export const audioFocus = { playsInSilentMode: true, allowsRecording: false, }); + await setIsAudioActiveAsync(true); currentOwner = 'player'; notify(); diff --git a/app-expo/src/core/settings/app-settings.ts b/app-expo/src/core/settings/app-settings.ts index 17ddec1..12854bd 100644 --- a/app-expo/src/core/settings/app-settings.ts +++ b/app-expo/src/core/settings/app-settings.ts @@ -14,6 +14,7 @@ const KEY_LANGUAGE = 'app_settings_language'; const KEY_LARGE_TEXT = 'app_settings_large_text'; const KEY_DARK_MODE = 'app_settings_dark_mode'; const KEY_THEME_NAME = 'app_settings_theme_name'; +const KEY_TTS_SPEAK_DEFAULT = 'app_settings_tts_speak_default'; const webFallback: Record = {}; @@ -83,5 +84,16 @@ export async function setThemeName(value: ThemeName): Promise { await setStored(KEY_THEME_NAME, value); } +/** 会话页「Speak / 本轮朗读」开关是否默认开启(跨会话记忆) */ +export async function getTtsSpeakDefault(): Promise { + const v = await getStored(KEY_TTS_SPEAK_DEFAULT); + if (v == null || v === '') return false; + return v === 'true'; +} + +export async function setTtsSpeakDefault(value: boolean): Promise { + await setStored(KEY_TTS_SPEAK_DEFAULT, value ? 'true' : 'false'); +} + export { supportedLanguages, THEME_NAMES }; export type { AppLanguage, ThemeName }; diff --git a/app-expo/src/core/ws/types.ts b/app-expo/src/core/ws/types.ts index 5a5590a..abd34a2 100644 --- a/app-expo/src/core/ws/types.ts +++ b/app-expo/src/core/ws/types.ts @@ -63,7 +63,8 @@ export interface AgentResponseEvent { export interface TtsAudioReceivedEvent { kind: 'tts_audio_received'; conversationId: string; - audioBase64: string; + /** 兼容旧 WS payload;标准链路使用 audioUrl。 */ + audioBase64?: string; audioUrl?: string; index?: number; total?: number; diff --git a/app-expo/src/features/conversation/conversation-ws-background-pool.ts b/app-expo/src/features/conversation/conversation-ws-background-pool.ts index 858b680..928dcec 100644 --- a/app-expo/src/features/conversation/conversation-ws-background-pool.ts +++ b/app-expo/src/features/conversation/conversation-ws-background-pool.ts @@ -1,7 +1,10 @@ import type { QueryClient } from '@tanstack/react-query'; import { AppState, type AppStateStatus } from 'react-native'; -import { RealtimeSession } from './realtime-session'; +import { + RealtimeSession, + type RealtimeSessionUiOwner, +} from './realtime-session'; type Slot = { conversationId: string; session: RealtimeSession }; @@ -34,8 +37,11 @@ const offScreenUi = { }; /** 离屏:保持 WebSocket,去掉 UI 回调,避免列表页播 TTS 或对已卸载组件 setState */ -export function releaseConversationWsUi(session: RealtimeSession): void { - session.attachUiCallbacks({ +export function releaseConversationWsUi( + session: RealtimeSession, + owner: RealtimeSessionUiOwner, +): void { + session.releaseUiCallbacks(owner, { onStreamingText: offScreenUi.onStreamingText, onTtsSegment: offScreenUi.onTtsSegment, onError: offScreenUi.onError, diff --git a/app-expo/src/features/conversation/entry-warmup.ts b/app-expo/src/features/conversation/entry-warmup.ts index 579b38f..d88f123 100644 --- a/app-expo/src/features/conversation/entry-warmup.ts +++ b/app-expo/src/features/conversation/entry-warmup.ts @@ -1,6 +1,5 @@ import type { QueryClient } from '@tanstack/react-query'; -import { acquireBackgroundConversationWs } from './conversation-ws-background-pool'; import { conversationMessagesRepository } from './conversation-messages-repository'; import { conversationKeys } from './query-keys'; import { registerPreparedRealtimeSession } from './prepared-session-registry'; @@ -52,17 +51,13 @@ export async function prefetchConversationMessages( }); } -const offscreenUiCallbacks = { - onStreamingText: () => {}, - onTtsSegment: () => {}, - onError: () => {}, - onStateChange: () => {}, -}; - const inflightPrewarms = new Set(); /** - * 列表页/卡片按下时的预热:保持后台 WS 连接,并触发消息缓存填充。 + * 列表页/卡片按下时的预热:只填充消息缓存,不建立后台 WS。 + * + * 后端当前以 conversation_id 记录 active WebSocket;离屏 WS 会覆盖聊天页连接, + * 导致本轮 TTS/agent_response 发到 offscreen session,页面停在「回复中」。 * 与 `warmupConversationOpening` 不同:不等待开场白、不阻塞调用方,仅适用于"已有消息"的会话。 */ export function prewarmConversationSession( @@ -70,13 +65,6 @@ export function prewarmConversationSession( conversationId: string, ): void { if (!conversationId) return; - const session = acquireBackgroundConversationWs( - conversationId, - queryClient, - null, - ); - // 预热阶段没有挂载的 UI,先用空回调占位;聊天页 mount 时会重新 attach。 - session.attachUiCallbacks(offscreenUiCallbacks); if (inflightPrewarms.has(conversationId)) return; const cached = queryClient.getQueryData( conversationKeys.messages(conversationId), diff --git a/app-expo/src/features/conversation/hooks.ts b/app-expo/src/features/conversation/hooks.ts index 1457807..9fbb416 100644 --- a/app-expo/src/features/conversation/hooks.ts +++ b/app-expo/src/features/conversation/hooks.ts @@ -18,6 +18,7 @@ import { conversationKeys } from './query-keys'; import { takePreparedRealtimeSession } from './prepared-session-registry'; import { type ErrorCallback, + type RealtimeSessionUiOwner, type StreamingTextCallback, type TtsSegmentPayload, type RealtimeSession, @@ -219,6 +220,9 @@ export function useRealtimeSession({ }: UseRealtimeSessionOptions): RealtimeSessionState { const queryClient = useQueryClient(); const sessionRef = useRef(null); + const uiOwnerRef = useRef( + Symbol('conversation-screen-ui'), + ); const uiRef = useRef({ handleStreamingText: (() => {}) as StreamingTextCallback, handleError: (() => {}) as ErrorCallback, @@ -300,20 +304,23 @@ export function useRealtimeSession({ prepared, ); - session.attachUiCallbacks({ - onStreamingText: (text, isComplete) => { - uiRef.current.handleStreamingText(text, isComplete); + session.attachUiCallbacks( + { + onStreamingText: (text, isComplete) => { + uiRef.current.handleStreamingText(text, isComplete); + }, + onTtsSegment: (payload) => uiRef.current.onTtsSegment?.(payload), + onError: (message, code) => uiRef.current.handleError(message, code), + onStateChange: setConnectionState, }, - onTtsSegment: (payload) => uiRef.current.onTtsSegment?.(payload), - onError: (message, code) => uiRef.current.handleError(message, code), - onStateChange: setConnectionState, - }); + uiOwnerRef.current, + ); sessionRef.current = session; setConnectionState(session.getConnectionState()); return () => { - releaseConversationWsUi(session); + releaseConversationWsUi(session, uiOwnerRef.current); sessionRef.current = null; setConnectionState('disconnected'); setStreamingMessage(null); diff --git a/app-expo/src/features/conversation/realtime-session.ts b/app-expo/src/features/conversation/realtime-session.ts index 3e74663..7a5ae08 100644 --- a/app-expo/src/features/conversation/realtime-session.ts +++ b/app-expo/src/features/conversation/realtime-session.ts @@ -21,6 +21,7 @@ function looksLikeUuidAssistantMessageId(id: string): boolean { export type StreamingTextCallback = (text: string, isComplete: boolean) => void; export type ErrorCallback = (message: string, code?: string) => void; +export type RealtimeSessionUiOwner = symbol; /** WebSocket `tts_audio`:服务端可能只带 base64、只带 COS URL,或两者都有 */ export type TtsSegmentPayload = { @@ -32,6 +33,8 @@ export type TtsSegmentPayload = { assistantMessageId?: string; /** 用户点喇叭按需下发时为 true,应加入播放队列(即使未开「本轮朗读」) */ manual?: boolean; + /** 本段属于用户显式打开 Speak 的自动朗读轮次。 */ + autoPlay?: boolean; }; interface RealtimeSessionOptions { @@ -63,6 +66,7 @@ export class RealtimeSession { private onTtsSegment?: (payload: TtsSegmentPayload) => void; private onError?: ErrorCallback; private uiStateListener?: WsStateListener; + private uiOwner: RealtimeSessionUiOwner | null = null; private unsubEvent: (() => void) | null = null; private unsubState: (() => void) | null = null; @@ -75,6 +79,18 @@ export class RealtimeSession { private assistantTurnTtsSync = false; private pendingTtsByKey = new Map(); + /** `agent_response` 早于 `tts_audio` 时延后落库,超时后露字且无播放 */ + private readonly SYNC_REVEAL_TIMEOUT_MS = 3000; + private deferredSyncCommits = new Map< + string, + { + timeoutId: ReturnType; + commit: () => void; + index: number; + total: number; + } + >(); + private static bufferedTtsKey( assistantMessageId: string | undefined, index: number, @@ -99,13 +115,20 @@ export class RealtimeSession { } /** 列表预热接棒或刷新 UI 订阅时替换回调,不重建 WebSocket */ - attachUiCallbacks(options: { - onStreamingText?: StreamingTextCallback; - onTtsSegment?: (payload: TtsSegmentPayload) => void; - onError?: ErrorCallback; - onStateChange?: WsStateListener; - }): void { + attachUiCallbacks( + options: { + onStreamingText?: StreamingTextCallback; + onTtsSegment?: (payload: TtsSegmentPayload) => void; + onError?: ErrorCallback; + onStateChange?: WsStateListener; + }, + owner?: RealtimeSessionUiOwner, + ): void { if (this.destroyed) return; + if (this.uiOwner && owner !== this.uiOwner) { + return; + } + this.uiOwner = owner ?? null; if (options.onStreamingText !== undefined) { this.onStreamingText = options.onStreamingText; } @@ -124,6 +147,23 @@ export class RealtimeSession { } } + releaseUiCallbacks( + owner: RealtimeSessionUiOwner, + options: { + onStreamingText?: StreamingTextCallback; + onTtsSegment?: (payload: TtsSegmentPayload) => void; + onError?: ErrorCallback; + onStateChange?: WsStateListener; + }, + ): void { + if (this.destroyed) return; + if (this.uiOwner !== owner) { + return; + } + this.uiOwner = null; + this.attachUiCallbacks(options); + } + async connect(): Promise { await this.client.connect(); } @@ -144,6 +184,7 @@ export class RealtimeSession { /** Returns true if the message was sent over the socket. */ sendText(text: string, options?: { ttsThisTurn?: boolean }): boolean { + this.beginNewOutboundUserTurnCleanup(); const tts = !!options?.ttsThisTurn; this.assistantTurnTtsSync = tts; return this.client.sendText(text, { ttsThisTurn: tts }); @@ -160,6 +201,7 @@ export class RealtimeSession { ttsThisTurn?: boolean; }, ): boolean { + this.beginNewOutboundUserTurnCleanup(); const tts = !!options?.ttsThisTurn; this.assistantTurnTtsSync = tts; return this.client.send({ @@ -198,32 +240,99 @@ export class RealtimeSession { return this.client.sendTtsRequest(body); } + /** 新开一轮用户发到 WS 之前:清空上轮 sync 残留的缓冲,避免占位 / 错乱 */ + private beginNewOutboundUserTurnCleanup(): void { + if (this.deferredSyncCommits.size === 0 && this.pendingTtsByKey.size === 0) { + return; + } + for (const def of this.deferredSyncCommits.values()) { + clearTimeout(def.timeoutId); + try { + def.commit(); + } catch { + /* best-effort */ + } + } + this.deferredSyncCommits.clear(); + this.pendingTtsByKey.clear(); + } + // ─── Internal ─── private resetAssistantTtsSyncState(): void { + for (const def of this.deferredSyncCommits.values()) { + clearTimeout(def.timeoutId); + try { + def.commit(); + } catch { + /* 取消打断时尽量不丢正文 */ + } + } + this.deferredSyncCommits.clear(); this.assistantTurnTtsSync = false; this.pendingTtsByKey.clear(); } - private flushBufferedTtsIfSync( + /** sync 模式下取出缓冲的该段 TTS;调用方需先落缓存再转发,方便 UI 绑定 URL */ + private takeBufferedTtsIfSync( assistantMessageId: string | undefined, index: number, - ): void { - if (!this.assistantTurnTtsSync) return; + ): TtsSegmentPayload | null { + if (!this.assistantTurnTtsSync) return null; const key = RealtimeSession.bufferedTtsKey(assistantMessageId, index); const payload = this.pendingTtsByKey.get(key); - if (payload) { + if (!payload) return null; + this.pendingTtsByKey.delete(key); + return payload; + } + + /** + * 正文已先于音频到达:`commit` 延至收到 `tts_audio` 或超时(无音频路径则照常露字) + */ + private scheduleDeferredSyncCommit( + key: string, + index: number, + total: number, + commit: () => void, + ): void { + const timeoutId = setTimeout(() => { + const def = this.deferredSyncCommits.get(key); + if (!def || def.timeoutId !== timeoutId) return; + this.deferredSyncCommits.delete(key); this.pendingTtsByKey.delete(key); - this.onTtsSegment?.(payload); - } + def.commit(); + this.finishAssistantTurnIfLastSegment(def.index, def.total); + }, this.SYNC_REVEAL_TIMEOUT_MS); + this.deferredSyncCommits.set(key, { timeoutId, commit, index, total }); + } + + /** 迟到 `tts_audio` 与延后落库会合:先写缓存再入队播放,确保 URL 能绑定到气泡 */ + private tryResolveDeferredSyncWithIncomingTts( + key: string, + incoming: TtsSegmentPayload, + ): boolean { + const def = this.deferredSyncCommits.get(key); + if (!def) return false; + clearTimeout(def.timeoutId); + this.deferredSyncCommits.delete(key); + def.commit(); + this.onTtsSegment?.(incoming); + this.finishAssistantTurnIfLastSegment(def.index, def.total); + return true; } private finishAssistantTurnIfLastSegment(index: number, total: number): void { if (index >= total - 1) { - this.resetAssistantTtsSyncState(); + this.assistantTurnTtsSync = false; + this.pendingTtsByKey.clear(); } } + /** sync 多段回复不走 footer 流式展示,但仍要清掉「正在回复」占位气泡。 */ + private clearAssistantPendingUi(): void { + this.onStreamingText?.('', true); + } + private handleEvent: WsEventListener = (event: WsEvent) => { if (event.kind === 'agent_response') { this.handleAgentChunk(event); @@ -244,12 +353,21 @@ export class RealtimeSession { assistantMessageId: event.assistantMessageId, manual: event.manual, }; + if (this.assistantTurnTtsSync && !payload.manual) { const idx = event.index ?? 0; const key = RealtimeSession.bufferedTtsKey( event.assistantMessageId, idx, ); + payload.autoPlay = true; + const resolvedDeferred = this.tryResolveDeferredSyncWithIncomingTts( + key, + payload, + ); + if (resolvedDeferred) { + return; + } this.pendingTtsByKey.set(key, payload); } else { this.onTtsSegment?.(payload); @@ -292,11 +410,32 @@ export class RealtimeSession { ? assistantSegmentMessageId(event.assistantMessageId, index) : `${this.conversationId}_agent_${Date.now()}_${index}`; if (sync) { - this.flushBufferedTtsIfSync(event.assistantMessageId, index); + const bufferedTts = this.takeBufferedTtsIfSync( + event.assistantMessageId, + index, + ); + if (bufferedTts) { + this.commitOneAssistantMessage(event.text, id); + this.clearAssistantPendingUi(); + this.onTtsSegment?.(bufferedTts); + this.finishAssistantTurnIfLastSegment(index, total); + } else { + const key = RealtimeSession.bufferedTtsKey( + event.assistantMessageId, + index, + ); + const textCaptured = event.text; + const idCaptured = id; + this.scheduleDeferredSyncCommit(key, index, total, () => { + this.commitOneAssistantMessage(textCaptured, idCaptured); + this.clearAssistantPendingUi(); + }); + } + } else { + this.commitOneAssistantMessage(event.text, id); + this.onStreamingText?.(event.text, true); + this.finishAssistantTurnIfLastSegment(index, total); } - this.commitOneAssistantMessage(event.text, id); - this.onStreamingText?.(event.text, true); - this.finishAssistantTurnIfLastSegment(index, total); return; } @@ -317,18 +456,40 @@ export class RealtimeSession { const id = this.pendingAssistantMessageId ?? `${this.conversationId}_agent_${Date.now()}`; + let finishSyncTurnNow = false; if (sync) { - this.flushBufferedTtsIfSync(assistantId ?? undefined, 0); - this.commitStreamingBufferWithId(id); - const visible = - this.streamingBuffer.trim().length > 0 ? this.streamingBuffer : '…'; - this.onStreamingText?.(visible, true); + const bufferedTts = this.takeBufferedTtsIfSync( + assistantId ?? undefined, + 0, + ); + if (bufferedTts) { + this.commitStreamingBufferWithId(id); + const visible = + this.streamingBuffer.trim().length > 0 ? this.streamingBuffer : '…'; + this.onStreamingText?.(visible, true); + this.onTtsSegment?.(bufferedTts); + finishSyncTurnNow = true; + } else { + const snapshot = this.streamingBuffer; + const key = RealtimeSession.bufferedTtsKey(assistantId ?? undefined, 0); + const idCaptured = id; + this.scheduleDeferredSyncCommit(key, 0, 1, () => { + this.streamingBuffer = snapshot; + this.commitStreamingBufferWithId(idCaptured); + this.streamingBuffer = ''; + const visible = snapshot.trim().length > 0 ? snapshot : '…'; + this.onStreamingText?.(visible, true); + }); + } } else { this.commitStreamingBufferWithId(id); + finishSyncTurnNow = true; } this.streamingBuffer = ''; this.pendingAssistantMessageId = null; - this.finishAssistantTurnIfLastSegment(0, 1); + if (!sync || finishSyncTurnNow) { + this.finishAssistantTurnIfLastSegment(0, 1); + } } } diff --git a/app-expo/src/features/voice/hooks/use-player.ts b/app-expo/src/features/voice/hooks/use-player.ts index 8bb257f..1c2c7c7 100644 --- a/app-expo/src/features/voice/hooks/use-player.ts +++ b/app-expo/src/features/voice/hooks/use-player.ts @@ -94,6 +94,12 @@ export function usePlayer(): UsePlayerResult { const acquired = await audioFocus.acquireForPlayback(); if (!acquired) { + /** + * 录音占用时 acquire 失败且队列尚未 shift;若用户进入会话前焦点已在 + * `recorder`,可能不会再次触发 `onOwnerChange('recorder')`,旧的 + * `wasBlockedByRecorderRef` 不会被置位,录音结束后也不会重试 playNext。 + */ + wasBlockedByRecorderRef.current = true; setStatus('idle'); return; } @@ -172,14 +178,17 @@ export function usePlayer(): UsePlayerResult { if (owner === null && wasBlockedByRecorderRef.current) { wasBlockedByRecorderRef.current = false; - if (queueRef.current.length > 0 && status === 'idle') { - playNext(); + if ( + queueRef.current.length > 0 && + playbackActiveUriRef.current === null + ) { + void playNext(); } } }); return unsub; - }, [status, currentSource, playNext]); + }, [currentSource, playNext]); const enqueue = useCallback( async (item: PlaybackItem) => { diff --git a/app-expo/tests/core/ws/client.test.ts b/app-expo/tests/core/ws/client.test.ts index 220d2dc..d60e61a 100644 --- a/app-expo/tests/core/ws/client.test.ts +++ b/app-expo/tests/core/ws/client.test.ts @@ -118,6 +118,45 @@ describe('WsClient', () => { client.dispose(); }); + test('maps tts audio with base64 and url playback channels', async () => { + const client = new WsClient('conv-123'); + const events: WsEvent[] = []; + client.onEvent((e) => events.push(e)); + + await client.connect(); + await new Promise((r) => setTimeout(r, 10)); + + const ws = (client as unknown as { ws: MockWebSocket }).ws; + ws.simulateMessage({ + type: 'tts_audio', + conversation_id: 'conv-123', + data: { + audio_base64: 'ZmFrZS1tcDM=', + audio_url: 'https://example.com/tts.mp3', + index: 0, + total: 1, + assistant_message_id: 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa', + manual: true, + }, + timestamp: '2026-01-01T00:00:00Z', + }); + + expect(events).toEqual([ + { + kind: 'tts_audio_received', + conversationId: 'conv-123', + audioBase64: 'ZmFrZS1tcDM=', + audioUrl: 'https://example.com/tts.mp3', + index: 0, + total: 1, + assistantMessageId: 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa', + manual: true, + }, + ]); + + client.dispose(); + }); + test('sends text messages', async () => { const client = new WsClient('conv-123'); diff --git a/app-expo/tests/features/conversation/entry-warmup.test.ts b/app-expo/tests/features/conversation/entry-warmup.test.ts index ce9d2f4..b191588 100644 --- a/app-expo/tests/features/conversation/entry-warmup.test.ts +++ b/app-expo/tests/features/conversation/entry-warmup.test.ts @@ -1,6 +1,7 @@ import { QueryClient } from '@tanstack/react-query'; import { + prewarmConversationSession, prefetchConversationMessages, warmupConversationOpening, } from '@/features/conversation/entry-warmup'; @@ -90,6 +91,20 @@ describe('conversation entry warmup', () => { ).resolves.toBeUndefined(); }); + test('prewarms existing conversations without opening an offscreen websocket', async () => { + const existing = assistantMessage(); + mockLoadMessages.mockResolvedValueOnce([existing]); + + prewarmConversationSession(queryClient, 'conv-1'); + await new Promise((r) => setImmediate(r)); + + expect(mockLoadMessages).toHaveBeenCalledWith('conv-1'); + expect(mockSessions).toHaveLength(0); + expect( + queryClient.getQueryData(conversationKeys.messages('conv-1')), + ).toEqual([existing]); + }); + test('uses refreshed history and skips websocket when opening is already cached', async () => { const existing = assistantMessage(); mockLoadMessages.mockResolvedValueOnce([existing]); diff --git a/app-expo/tests/features/conversation/realtime-session-sync-order.test.ts b/app-expo/tests/features/conversation/realtime-session-sync-order.test.ts new file mode 100644 index 0000000..a3778b1 --- /dev/null +++ b/app-expo/tests/features/conversation/realtime-session-sync-order.test.ts @@ -0,0 +1,253 @@ +import { QueryClient } from '@tanstack/react-query'; + +import { RealtimeSession } from '@/features/conversation/realtime-session'; +import { conversationKeys } from '@/features/conversation/query-keys'; +import type { MessageItem } from '@/features/conversation/types'; + +jest.mock('@/core/auth/token-manager', () => ({ + tokenManager: { + getAccessToken: jest.fn().mockResolvedValue('test-token'), + }, +})); + +jest.mock('@/core/config', () => ({ + config: { + wsBaseUrl: 'ws://localhost:8000/', + ws: { + reconnectMaxRetries: 3, + reconnectBaseDelayMs: 10, + reconnectMaxDelayMs: 100, + heartbeatIntervalMs: 600000, + }, + }, +})); + +class MockWebSocket { + static OPEN = 1; + static CLOSED = 3; + static instances: MockWebSocket[] = []; + + readyState = MockWebSocket.OPEN; + onopen: (() => void) | null = null; + onmessage: ((event: { data: string }) => void) | null = null; + onclose: (() => void) | null = null; + onerror: (() => void) | null = null; + + constructor(public url: string) { + MockWebSocket.instances.push(this); + queueMicrotask(() => this.onopen?.()); + } + + send(): void {} + + close(): void { + this.readyState = MockWebSocket.CLOSED; + } + + simulateMessage(data: Record): void { + this.onmessage?.({ data: JSON.stringify(data) }); + } +} + +(global as Record).WebSocket = MockWebSocket; + +function msgs(qc: QueryClient, cid: string): MessageItem[] { + return qc.getQueryData(conversationKeys.messages(cid)) ?? []; +} + +describe('RealtimeSession sync TTS / agent ordering', () => { + let qc: QueryClient; + + beforeEach(() => { + jest.clearAllMocks(); + MockWebSocket.instances = []; + qc = new QueryClient(); + qc.setQueryData(conversationKeys.messages('conv-x'), []); + }); + + afterEach(async () => { + await new Promise((r) => setImmediate(r)); + }); + + it('defers assistant commit when agent_response arrives before tts_audio (single segment)', async () => { + const aid = 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa'; + const onTts = jest.fn(() => { + expect(msgs(qc, 'conv-x').some((m) => m.id === aid)).toBe(true); + }); + const onStream = jest.fn(); + const session = new RealtimeSession({ + conversationId: 'conv-x', + queryClient: qc, + onStreamingText: onStream, + onTtsSegment: onTts, + }); + + await session.connect(); + await new Promise((r) => setImmediate(r)); + + const ws = MockWebSocket.instances[0]!; + expect(session.sendText('hi', { ttsThisTurn: true })).toBe(true); + + ws.simulateMessage({ + type: 'agent_response', + conversation_id: 'conv-x', + data: { + text: 'Hello segment', + index: 0, + total: 1, + assistant_message_id: aid, + }, + timestamp: new Date().toISOString(), + }); + + const afterAgentOnly = msgs(qc, 'conv-x').filter( + (m) => m.senderType === 'assistant', + ); + expect(afterAgentOnly).toHaveLength(0); + + ws.simulateMessage({ + type: 'tts_audio', + conversation_id: 'conv-x', + data: { + audio_url: 'https://example.com/tts-a.mp3', + index: 0, + total: 1, + assistant_message_id: aid, + }, + timestamp: new Date().toISOString(), + }); + + expect(onTts).toHaveBeenCalledTimes(1); + const committed = msgs(qc, 'conv-x').filter( + (m) => m.senderType === 'assistant', + ); + expect(committed).toHaveLength(1); + expect(committed[0]!.content).toContain('Hello segment'); + + session.dispose(); + }); + + it('multi-segment sync clears pending UI without streaming footer text', async () => { + const aid = 'bb22bb22-bbbb-bbbb-bbbb-bbbbbbbbbbbb'; + const onTts = jest.fn(() => { + expect( + msgs(qc, 'conv-x').some((m) => m.id === `${aid}_seg_0`), + ).toBe(true); + }); + const onStream = jest.fn(); + const session = new RealtimeSession({ + conversationId: 'conv-x', + queryClient: qc, + onStreamingText: onStream, + onTtsSegment: onTts, + }); + + await session.connect(); + await new Promise((r) => setImmediate(r)); + + const ws = MockWebSocket.instances[0]!; + session.sendText('hi', { ttsThisTurn: true }); + + ws.simulateMessage({ + type: 'tts_audio', + conversation_id: 'conv-x', + data: { + audio_url: 'https://example.com/tts-b.mp3', + index: 0, + total: 2, + assistant_message_id: aid, + }, + timestamp: new Date().toISOString(), + }); + + ws.simulateMessage({ + type: 'agent_response', + conversation_id: 'conv-x', + data: { + text: 'Part A', + index: 0, + total: 2, + assistant_message_id: aid, + }, + timestamp: new Date().toISOString(), + }); + + expect(onStream).toHaveBeenCalledWith('', true); + expect(onStream).not.toHaveBeenCalledWith('Part A', true); + expect(onTts).toHaveBeenCalled(); + session.dispose(); + }); + + it('keeps active screen TTS callback when stale offscreen attach runs later', async () => { + const aid = 'cc33cc33-cccc-cccc-cccc-cccccccccccc'; + const screenOnTts = jest.fn(); + const offscreenOnTts = jest.fn(); + const session = new RealtimeSession({ + conversationId: 'conv-x', + queryClient: qc, + }); + const owner = Symbol('screen-owner'); + + session.attachUiCallbacks({ onTtsSegment: screenOnTts }, owner); + session.attachUiCallbacks({ onTtsSegment: offscreenOnTts }); + + await session.connect(); + await new Promise((r) => setImmediate(r)); + + const ws = MockWebSocket.instances[0]!; + ws.simulateMessage({ + type: 'tts_audio', + conversation_id: 'conv-x', + data: { + audio_base64: 'ZmFrZS1tcDM=', + audio_url: 'https://example.com/tts-c.mp3', + index: 0, + total: 1, + assistant_message_id: aid, + manual: true, + }, + timestamp: new Date().toISOString(), + }); + + expect(screenOnTts).toHaveBeenCalledTimes(1); + expect(offscreenOnTts).not.toHaveBeenCalled(); + session.dispose(); + }); + + it('keeps active screen TTS callback when a stale screen owner attaches later', async () => { + const aid = 'dd44dd44-dddd-dddd-dddd-dddddddddddd'; + const screenOnTts = jest.fn(); + const staleScreenOnTts = jest.fn(); + const session = new RealtimeSession({ + conversationId: 'conv-x', + queryClient: qc, + }); + const activeOwner = Symbol('active-screen-owner'); + const staleOwner = Symbol('stale-screen-owner'); + + session.attachUiCallbacks({ onTtsSegment: screenOnTts }, activeOwner); + session.attachUiCallbacks({ onTtsSegment: staleScreenOnTts }, staleOwner); + + await session.connect(); + await new Promise((r) => setImmediate(r)); + + const ws = MockWebSocket.instances[0]!; + ws.simulateMessage({ + type: 'tts_audio', + conversation_id: 'conv-x', + data: { + audio_base64: 'ZmFrZS1tcDM=', + audio_url: 'https://example.com/tts-d.mp3', + index: 0, + total: 1, + assistant_message_id: aid, + manual: true, + }, + timestamp: new Date().toISOString(), + }); + + expect(screenOnTts).toHaveBeenCalledTimes(1); + expect(staleScreenOnTts).not.toHaveBeenCalled(); + session.dispose(); + }); +}); diff --git a/app-expo/tests/features/voice/use-player.test.tsx b/app-expo/tests/features/voice/use-player.test.tsx index 15f90c1..fe5e47a 100644 --- a/app-expo/tests/features/voice/use-player.test.tsx +++ b/app-expo/tests/features/voice/use-player.test.tsx @@ -37,6 +37,7 @@ describe('usePlayer', () => { }); jest.mocked(audioFocus.acquireForPlayback).mockResolvedValue(true); jest.mocked(audioFocus.releaseIfOwnedBy).mockResolvedValue(undefined); + jest.mocked(audioFocus.onOwnerChange).mockImplementation(() => jest.fn()); }); test('keeps the native audio session active while app-level audio focus owns teardown', () => { @@ -127,4 +128,43 @@ describe('usePlayer', () => { expect(pause).not.toHaveBeenCalled(); expect(result.current.status).toBe('idle'); }); + + test('retries queued audio after acquire fails once then audio focus frees', async () => { + const acquire = jest.mocked(audioFocus.acquireForPlayback); + acquire.mockResolvedValueOnce(false).mockResolvedValue(true); + + let ownerListener: ((owner: null | string) => void) | undefined; + jest.mocked(audioFocus.onOwnerChange).mockImplementation((cb) => { + ownerListener = cb as (owner: null | string) => void; + return jest.fn(); + }); + + mockUseAudioPlayerStatus.mockReturnValue({ + isLoaded: true, + playing: false, + currentTime: 0, + duration: 10, + }); + const play = jest.fn(); + mockUseAudioPlayer.mockReturnValue({ pause: jest.fn(), play }); + + const { result } = renderHook(() => usePlayer()); + + await act(async () => { + await result.current.enqueue({ + uri: 'file:///queued.mp3', + kind: 'tts_auto', + }); + }); + + expect(acquire).toHaveBeenCalledTimes(1); + expect(result.current.status).toBe('idle'); + + await act(async () => { + ownerListener?.(null); + }); + + expect(acquire).toHaveBeenCalledTimes(2); + expect(play).toHaveBeenCalled(); + }); });