From 3d01085442bf6ffec53d45d36a5b5c8f6de3f83b Mon Sep 17 00:00:00 2001
From: Kevin <kevin@brighteng.org>
Date: Tue, 12 May 2026 10:42:44 +0800
Subject: [PATCH] =?UTF-8?q?fix(conversation):=20=E4=BF=AE=E5=A4=8D?=
 =?UTF-8?q?=E5=AE=9E=E6=97=B6=E4=BC=9A=E8=AF=9D=20TTS/=E5=9B=9E=E5=A4=8D?=
 =?UTF-8?q?=E8=A2=AB=E7=A6=BB=E5=B1=8F=20WS=20=E6=8A=A2=E5=8D=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 列表预热仅预取消息缓存，避免后台 WebSocket 覆盖服务端连接
- RealtimeSession UI 回调按 owner 独占，防止 offscreen 覆盖聊天页
- 列表页聚焦时再 prewarm，会话页 TTS 入队优先 base64
- 管线下发 TTS 同时带 audio_base64 与 audio_url；协议说明同步
- 移除 TTS 排查用前后端调试日志，保留错误/告警
- 补充 WS / RealtimeSession / entry-warmup / 播放器相关单测

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 api/app/adapters/tts/tencent_tts.py           |  33 ---
 api/app/features/conversation/ws/pipeline.py  | 118 +-------
 api/app/features/conversation/ws/protocol.md  |   2 +-
 api/app/features/conversation/ws/router.py    |  42 ---
 app-expo/src/app/(main)/conversation/[id].tsx |  53 +++-
 app-expo/src/app/(tabs)/index.tsx             |   5 +-
 app-expo/src/core/audio/audio-focus.ts        |  12 +-
 app-expo/src/core/settings/app-settings.ts    |  12 +
 app-expo/src/core/ws/types.ts                 |   3 +-
 .../conversation-ws-background-pool.ts        |  12 +-
 .../src/features/conversation/entry-warmup.ts |  20 +-
 app-expo/src/features/conversation/hooks.ts   |  23 +-
 .../features/conversation/realtime-session.ts | 207 ++++++++++++--
 .../src/features/voice/hooks/use-player.ts    |  15 +-
 app-expo/tests/core/ws/client.test.ts         |  39 +++
 .../conversation/entry-warmup.test.ts         |  15 ++
 .../realtime-session-sync-order.test.ts       | 253 ++++++++++++++++++
 .../tests/features/voice/use-player.test.tsx  |  40 +++
 18 files changed, 643 insertions(+), 261 deletions(-)
 create mode 100644 app-expo/tests/features/conversation/realtime-session-sync-order.test.ts

diff --git a/api/app/adapters/tts/tencent_tts.py b/api/app/adapters/tts/tencent_tts.py
index 501cdd1..2377fa3 100644
--- a/api/app/adapters/tts/tencent_tts.py
+++ b/api/app/adapters/tts/tencent_tts.py
@@ -137,18 +137,6 @@ class TencentTTSProvider:
             # 显式声明使用新模型；大模型音色（501xxx）若不带该字段会被旧模型拒绝并静默返回空音频。
             req.ModelType = MODEL_TYPE_LLM
 
-            # 长期保留 INFO：TTS 实际请求腾讯云 SDK 时的关键参数
-            logger.info(
-                "tencent_tts._synthesize_sync request voice_type={} primary_language={} "
-                "model_type={} sample_rate={} codec={} text_len={}",
-                voice_type,
-                primary_language,
-                MODEL_TYPE_LLM,
-                req.SampleRate,
-                self._codec,
-                len(text or ""),
-            )
-
             resp = client.TextToVoice(req)
             request_id = getattr(resp, "RequestId", None) if resp is not None else None
             audio_b64 = getattr(resp, "Audio", "") if resp is not None else ""
@@ -163,15 +151,6 @@ class TencentTTSProvider:
                 )
                 return b""
             audio_bytes = base64.b64decode(audio_b64)
-            # 长期保留 INFO：腾讯云 SDK 返回的 request_id + 音频字节数（用户排查必需）
-            logger.info(
-                "tencent_tts._synthesize_sync response request_id={} audio_bytes_len={} "
-                "voice_type={} primary_language={}",
-                request_id,
-                len(audio_bytes),
-                voice_type,
-                primary_language,
-            )
             return audio_bytes
         except TencentCloudSDKException as e:
             logger.error(
@@ -225,18 +204,6 @@ class TencentTTSProvider:
         else:
             voice_type = VOICE_MAP.get(v, default_voice)
         chunks = _chunk_text(text, max_chars=max_chars)
-        # 长期保留 INFO：adapter 入口的 language / voice_type / chunk_count（排查必需）
-        logger.info(
-            "tencent_tts.synthesize entry language={} voice_arg={} resolved_voice_type={} "
-            "primary_language={} max_chars={} text_len={} chunk_count={}",
-            language,
-            voice,
-            voice_type,
-            primary_language,
-            max_chars,
-            len(text or ""),
-            len(chunks),
-        )
         if not chunks:
             return b""
 
diff --git a/api/app/features/conversation/ws/pipeline.py b/api/app/features/conversation/ws/pipeline.py
index 6270422..9f5090e 100644
--- a/api/app/features/conversation/ws/pipeline.py
+++ b/api/app/features/conversation/ws/pipeline.py
@@ -99,41 +99,10 @@ async def _send_tts_audio(
 ) -> str | None:
     """Synthesize TTS, upload to COS, append Redis, send TTS_AUDIO. Returns public URL or None."""
     current_epoch = _tts_epoch_value(conversation_id)
-    # 长期保留 INFO：TTS 决策与执行链路必须在 INFO 级别全程可见
-    logger.info(
-        "pipeline._send_tts_audio entry conversation_id={} chunk_index={} chunk_total={} "
-        "text_len={} language={} manual={} tts_epoch_start={} current_epoch={} "
-        "enable_tts={} provider={}",
-        conversation_id,
-        chunk_index,
-        chunk_total,
-        len(text or ""),
-        language,
-        manual,
-        tts_epoch_start,
-        current_epoch,
-        settings.enable_tts,
-        settings.tts_provider,
-    )
     # enable_tts：仅禁用「助手回复自动生成 TTS」（want_tts 路径）；用户点喇叭（manual=True）仍可合成。
     if not manual and not settings.enable_tts:
-        logger.info(
-            "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
-            "url_set=False audio_bytes_len=0 reason=enable_tts_false",
-            conversation_id,
-            chunk_index,
-        )
         return None
     if current_epoch != tts_epoch_start:
-        logger.info(
-            "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
-            "url_set=False audio_bytes_len=0 reason=epoch_mismatch_pre_synth "
-            "tts_epoch_start={} current_epoch={}",
-            conversation_id,
-            chunk_index,
-            tts_epoch_start,
-            current_epoch,
-        )
         return None
     try:
         tts = get_tts_provider()
@@ -148,50 +117,19 @@ async def _send_tts_audio(
                 (text or "")[:30],
                 settings.tts_provider,
             )
-            logger.info(
-                "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
-                "url_set=False audio_bytes_len=0 reason=synthesize_empty",
-                conversation_id,
-                chunk_index,
-            )
             return None
         if _tts_epoch_value(conversation_id) != tts_epoch_start:
-            logger.info(
-                "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
-                "url_set=False audio_bytes_len={} reason=epoch_mismatch_post_synth",
-                conversation_id,
-                chunk_index,
-                len(audio_bytes),
-            )
             return None
         ext = _tts_object_ext(settings.tts_codec)
         content_type = _tts_codec_to_content_type(settings.tts_codec)
         storage = get_object_storage()
         key = f"conversations/{conversation_id}/tts/{uuid.uuid4().hex}.{ext}"
-        upload_started = time.perf_counter()
-        logger.debug(
-            "pipeline._send_tts_audio uploading key={} audio_bytes_len={} content_type={}",
-            key,
-            len(audio_bytes),
-            content_type,
-        )
         public_url = storage.upload(key, audio_bytes, content_type)
-        upload_ms = (time.perf_counter() - upload_started) * 1000
         # 与 `tts_delivery.apply_presigned_tts_urls_to_messages` / 回忆录图片 presign 一致：下发可播 URL
         playback_url = storage.get_url(key, expires=TTS_PRESIGNED_EXPIRES_SEC)
-        logger.debug(
-            "pipeline._send_tts_audio uploaded key={} audio_bytes_len={} upload_ms={:.2f} "
-            "public_url_set={} playback_url_set={}",
-            key,
-            len(audio_bytes),
-            upload_ms,
-            bool(public_url),
-            bool(playback_url),
-        )
-        audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
         payload_data: Dict[str, Any] = {
-            "audio_base64": audio_b64,
             "format": settings.tts_codec,
+            "audio_base64": base64.b64encode(audio_bytes).decode("utf-8"),
             "audio_url": playback_url,
             "index": chunk_index,
             "total": chunk_total,
@@ -200,16 +138,6 @@ async def _send_tts_audio(
             payload_data["assistant_message_id"] = assistant_message_id
         if manual:
             payload_data["manual"] = True
-        logger.debug(
-            "pipeline._send_tts_audio sending TTS_AUDIO conversation_id={} chunk_index={} "
-            "chunk_total={} payload_fields={} audio_b64_len={} manual={}",
-            conversation_id,
-            chunk_index,
-            chunk_total,
-            sorted(payload_data.keys()),
-            len(audio_b64),
-            manual,
-        )
         await manager.send_message(
             conversation_id,
             {
@@ -219,16 +147,6 @@ async def _send_tts_audio(
                 "timestamp": datetime.now(timezone.utc).isoformat(),
             },
         )
-        logger.info(
-            "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=True "
-            "url_set={} audio_bytes_len={} upload_ms={:.2f} manual={}",
-            conversation_id,
-            chunk_index,
-            bool(public_url),
-            len(audio_bytes),
-            upload_ms,
-            manual,
-        )
         return public_url
     except Exception as e:
         err_str = str(e)
@@ -239,13 +157,6 @@ async def _send_tts_audio(
             )
         else:
             logger.error("TTS synthesize failed: {}", e)
-        logger.info(
-            "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
-            "url_set=False audio_bytes_len=0 reason=exception err={}",
-            conversation_id,
-            chunk_index,
-            type(e).__name__,
-        )
         return None
 
 
@@ -1035,18 +946,6 @@ async def process_user_message(
             segment.id,
             len(user_message or ""),
         )
-        # 长期保留：TTS 决策入口（pipeline 层）；INFO 级别可见所有控制位
-        logger.info(
-            "pipeline.process_user_message entry conversation_id={} segment_id={} "
-            "tts_this_turn={} force_skip_tts={} enable_tts={} provider={} user_language={}",
-            conversation_id,
-            segment.id,
-            tts_this_turn,
-            force_skip_tts,
-            settings.enable_tts,
-            settings.tts_provider,
-            user_language,
-        )
         is_from_voice = bool(segment.audio_url)
         voice_session_id = _voice_session_id_from_audio_url(segment.audio_url)
         audio_dur = getattr(segment, "audio_duration_seconds", None)
@@ -1074,21 +973,6 @@ async def process_user_message(
         skip_tts = bool(turn.skip_tts)
         want_voice = bool(tts_this_turn) if tts_this_turn is not None else False
         want_tts = want_voice and settings.enable_tts and not skip_tts
-        # 长期保留 INFO：TTS 决策最终结论；不再被 agent_summary_enabled 门控
-        logger.info(
-            "pipeline.process_user_message tts_decision conversation_id={} segment_id={} "
-            "tts_this_turn={} force_skip_tts={} enable_tts={} skip_tts_from_turn={} "
-            "want_voice={} want_tts={} response_segments={}",
-            conversation_id,
-            segment.id,
-            tts_this_turn,
-            force_skip_tts,
-            settings.enable_tts,
-            skip_tts,
-            want_voice,
-            want_tts,
-            len(turn.messages),
-        )
         if agent_summary_enabled():
             logger.info(
                 "pipeline.process_user_message duration_ms={:.2f} "
diff --git a/api/app/features/conversation/ws/protocol.md b/api/app/features/conversation/ws/protocol.md
index c198c8a..1b03b0c 100644
--- a/api/app/features/conversation/ws/protocol.md
+++ b/api/app/features/conversation/ws/protocol.md
@@ -20,7 +20,7 @@
 
 - `TRANSCRIPT`: ASR 转写结果
 - `AGENT_RESPONSE`: AI 回复文本分段
-- `TTS_AUDIO`: 语音合成结果（可与 `COS` 签名 URL、`base64` 并存）。按需朗读成功时 `data.manual` 可为 `true`，提示客户端应播放（即使用户未开「本轮 Speak」）。
+- `TTS_AUDIO`: 语音合成结果。标准链路下发 `data.audio_base64` 作为本轮即时播放通道，并同时下发 `data.audio_url`（COS 预签名可播放 URL）供客户端绑定气泡、退出重进后复播。按需朗读成功时 `data.manual` 可为 `true`，提示客户端应播放（即使用户未开「本轮 Speak」）。
 - `MEMOIR_UPDATE`: 回忆录更新通知
 - `ERROR`: 错误信息
 
diff --git a/api/app/features/conversation/ws/router.py b/api/app/features/conversation/ws/router.py
index fbb32e3..cb57568 100644
--- a/api/app/features/conversation/ws/router.py
+++ b/api/app/features/conversation/ws/router.py
@@ -289,13 +289,6 @@ async def websocket_endpoint(
                         data = message.get("data") or {}
                         text_message = data.get("text", "")
                         tts_this_turn = bool(data.get("tts_this_turn"))
-                        # 长期保留：TTS 决策入口可见性（INFO 级别即可定位 FE 是否带 tts_this_turn）
-                        logger.info(
-                            "ws.user_message tts_this_turn={} conversation_id={} text_len={}",
-                            tts_this_turn,
-                            conversation_id,
-                            len(text_message or ""),
-                        )
 
                         if text_message:
                             can_send, quota_msg = await check_ws_quota(
@@ -397,16 +390,6 @@ async def websocket_endpoint(
                         is_last = bool(data.get("is_last", False))
                         audio_duration = int(data.get("duration", 0) or 0)
                         tts_this_turn_segment = bool(data.get("tts_this_turn"))
-                        # 长期保留：分段语音轮的 TTS 决策入口可见性
-                        logger.info(
-                            "ws.audio_segment tts_this_turn={} is_last={} "
-                            "conversation_id={} voice_session_id={} segment_index_raw={}",
-                            tts_this_turn_segment,
-                            is_last,
-                            conversation_id,
-                            voice_session_id,
-                            segment_index_raw,
-                        )
 
                         if not audio_base64:
                             await manager.send_message(
@@ -524,13 +507,6 @@ async def websocket_endpoint(
                         audio_base64 = data.get("audio_base64", "")
                         audio_duration = data.get("duration", 0)
                         tts_this_turn = bool(data.get("tts_this_turn"))
-                        # 长期保留：单次整段音频路径的 TTS 决策入口可见性
-                        logger.info(
-                            "ws.audio_message tts_this_turn={} conversation_id={} duration_s={}",
-                            tts_this_turn,
-                            conversation_id,
-                            audio_duration,
-                        )
 
                         if audio_base64:
                             can_send, quota_msg = await check_ws_quota(
@@ -719,15 +695,6 @@ async def websocket_endpoint(
                             st_val = None
                         else:
                             st_val = str(st).strip() or None
-                        logger.info(
-                            "ws.TTS_REQUEST received conversation_id={} user_id={} "
-                            "assistant_message_id={} segment_index={} segment_text_len={}",
-                            conversation_id,
-                            user_id,
-                            str(aid).strip(),
-                            seg_idx,
-                            len(st_val or ""),
-                        )
                         ok, err_msg = await handle_tts_request_on_demand(
                             conversation_id=conversation_id,
                             user_id=user_id,
@@ -736,15 +703,6 @@ async def websocket_endpoint(
                             segment_text=st_val,
                             db=db,
                         )
-                        logger.info(
-                            "ws.TTS_REQUEST handled conversation_id={} assistant_message_id={} "
-                            "segment_index={} ok={} err_msg={}",
-                            conversation_id,
-                            str(aid).strip(),
-                            seg_idx,
-                            ok,
-                            err_msg,
-                        )
                         if not ok:
                             await manager.send_message(
                                 conversation_id,
diff --git a/app-expo/src/app/(main)/conversation/[id].tsx b/app-expo/src/app/(main)/conversation/[id].tsx
index b460c9b..45a871a 100644
--- a/app-expo/src/app/(main)/conversation/[id].tsx
+++ b/app-expo/src/app/(main)/conversation/[id].tsx
@@ -46,6 +46,10 @@ import { Icon } from '@/components/ui/icon';
 import { Text } from '@/components/ui/text';
 import { ScreenHeader } from '@/components/screen-header';
 import { resolveApiMediaUrl } from '@/core/api/media-url';
+import {
+  getTtsSpeakDefault,
+  setTtsSpeakDefault,
+} from '@/core/settings/app-settings';
 import { useAppSettings } from '@/hooks/use-app-settings';
 import { useThemeColors } from '@/hooks/use-theme-colors';
 import { useTypography } from '@/core/typography-context';
@@ -1238,15 +1242,14 @@ export default function ConversationScreen() {
 
   const handleTtsSegment = useCallback(
     (p: TtsSegmentPayload) => {
-      // 闸门用于丢弃「用户已打断后」迟到的自动 TTS；按需朗读 (manual) 是当前明确操作，必须放行。
-      const allowByGate =
-        p.manual === true || ttsGate.current.shouldAcceptIncomingTts();
-      if (!allowByGate) return;
       const convId = id ?? '';
       const cosUrl = p.audioUrl?.trim();
+      const isManualPlayback = !!p.manual;
+      const shouldAutoPlay = !!p.autoPlay;
       /**
-       * 播放走 WS，但「再读」依赖 MessageItem.ttsAudioUrls。乐观提交的消息没有 URL，
-       * 服务端 attach 要等整轮结束；收到 COS URL 时写入缓存，按钮才能用。
+       * COS URL 先写入缓存：**与Speak无关**，否则录音/打断后闸门关闭时会跳过 merge，
+       * 按需朗读下发的 `tts_audio` 无法绑定到气泡，喇叭表现为「不可用」。
+       * 闸门仅约束是否入队播放（迟到自动朗读），不禁用 URL 附着。
        */
       if (cosUrl && convId) {
         queryClient.setQueryData<MessageItem[]>(
@@ -1307,8 +1310,13 @@ export default function ConversationScreen() {
         );
       }
 
+      const gateAllowsPlayback =
+        isManualPlayback || ttsGate.current.shouldAcceptIncomingTts();
       const shouldEnqueue =
-        p.manual === true || lastUserMessageRequestedTtsRef.current;
+        isManualPlayback ||
+        shouldAutoPlay ||
+        lastUserMessageRequestedTtsRef.current;
+      if (!gateAllowsPlayback) return;
       if (!shouldEnqueue) return;
 
       const listKey =
@@ -1325,8 +1333,8 @@ export default function ConversationScreen() {
           ...shared,
           uri: `data:audio/mp3;base64,${p.audioBase64}`,
         });
-      } else if (p.audioUrl) {
-        void enqueue({ ...shared, uri: p.audioUrl });
+      } else if (cosUrl) {
+        void enqueue({ ...shared, uri: cosUrl });
       }
     },
     [enqueue, id, queryClient],
@@ -1399,8 +1407,9 @@ export default function ConversationScreen() {
 
   const [input, setInput] = useState('');
   const [inputResetKey, setInputResetKey] = useState(0);
-  /** 本条发出的用户消息是否请求助手朗读（先 TTS 再出字） */
+  /** 本条发出的用户消息是否请求助手朗读（先 TTS 再出字）；默认值从存储恢复 */
   const [ttsThisTurn, setTtsThisTurn] = useState(false);
+  const [ttsSpeakPrefReady, setTtsSpeakPrefReady] = useState(false);
   const [inputMode, setInputMode] = useState<InputMode>('text');
   const [isKeyboardVisible, setIsKeyboardVisible] = useState(false);
   const inputModeRef = useRef<InputMode>('text');
@@ -1489,6 +1498,24 @@ export default function ConversationScreen() {
     inputModeRef.current = inputMode;
   }, [inputMode]);
 
+  useEffect(() => {
+    let cancelled = false;
+    void (async () => {
+      try {
+        const v = await getTtsSpeakDefault();
+        if (!cancelled) {
+          setTtsThisTurn(v);
+          setTtsSpeakPrefReady(true);
+        }
+      } catch {
+        if (!cancelled) setTtsSpeakPrefReady(true);
+      }
+    })();
+    return () => {
+      cancelled = true;
+    };
+  }, []);
+
   useEffect(() => {
     const onKeyboardWillShow = () => {
       if (inputModeRef.current !== 'text') return;
@@ -1687,7 +1714,11 @@ export default function ConversationScreen() {
             <Switch
               accessibilityLabel={t('ttsThisTurnAccessibility')}
               value={ttsThisTurn}
-              onValueChange={setTtsThisTurn}
+              disabled={!ttsSpeakPrefReady}
+              onValueChange={(v) => {
+                setTtsThisTurn(v);
+                void setTtsSpeakDefault(v);
+              }}
               trackColor={{
                 false: CHAT_COLORS.outline,
                 true: CHAT_COLORS.secondaryContainer,
diff --git a/app-expo/src/app/(tabs)/index.tsx b/app-expo/src/app/(tabs)/index.tsx
index ff5235b..99b4906 100644
--- a/app-expo/src/app/(tabs)/index.tsx
+++ b/app-expo/src/app/(tabs)/index.tsx
@@ -2,6 +2,7 @@ import { Image } from 'expo-image';
 import { router } from 'expo-router';
 import React, { useEffect, useRef, useState } from 'react';
 import { useQueryClient } from '@tanstack/react-query';
+import { useIsFocused } from '@react-navigation/native';
 import {
   Alert,
   AppState,
@@ -302,6 +303,7 @@ function findTodayConversationToResume(
 export default function ConversationsScreen() {
   const { t } = useTranslation('conversation');
   const queryClient = useQueryClient();
+  const isFocused = useIsFocused();
 
   const { data: conversations = [], isLoading } = useConversations();
   const createConversation = useCreateConversation();
@@ -448,12 +450,13 @@ export default function ConversationsScreen() {
    * 单槽连接池：换会话会自动 dispose 旧槽，所以这里只挑一条最像即将被点的。
    */
   useEffect(() => {
+    if (!isFocused) return;
     if (isLoading) return;
     const candidate =
       todayConversation ?? conversations.find(conversationHasAnyMessage);
     if (!candidate) return;
     prewarmConversationSession(queryClient, candidate.id);
-  }, [isLoading, conversations, todayConversation, queryClient]);
+  }, [isFocused, isLoading, conversations, todayConversation, queryClient]);
 
   return (
     <View className="flex-1 bg-background">
diff --git a/app-expo/src/core/audio/audio-focus.ts b/app-expo/src/core/audio/audio-focus.ts
index 8241782..5c40395 100644
--- a/app-expo/src/core/audio/audio-focus.ts
+++ b/app-expo/src/core/audio/audio-focus.ts
@@ -20,7 +20,10 @@ function notify() {
  */
 export const audioFocus = {
   async acquireForRecording(): Promise<boolean> {
-    if (currentOwner === 'recorder') return true;
+    if (currentOwner === 'recorder') {
+      await setIsAudioActiveAsync(true);
+      return true;
+    }
 
     if (currentOwner === 'player') {
       await this.releaseIfOwnedBy('player');
@@ -30,6 +33,7 @@ export const audioFocus = {
       playsInSilentMode: true,
       allowsRecording: true,
     });
+    await setIsAudioActiveAsync(true);
 
     currentOwner = 'recorder';
     notify();
@@ -37,7 +41,10 @@ export const audioFocus = {
   },
 
   async acquireForPlayback(): Promise<boolean> {
-    if (currentOwner === 'player') return true;
+    if (currentOwner === 'player') {
+      await setIsAudioActiveAsync(true);
+      return true;
+    }
 
     if (currentOwner === 'recorder') {
       return false;
@@ -47,6 +54,7 @@ export const audioFocus = {
       playsInSilentMode: true,
       allowsRecording: false,
     });
+    await setIsAudioActiveAsync(true);
 
     currentOwner = 'player';
     notify();
diff --git a/app-expo/src/core/settings/app-settings.ts b/app-expo/src/core/settings/app-settings.ts
index 17ddec1..12854bd 100644
--- a/app-expo/src/core/settings/app-settings.ts
+++ b/app-expo/src/core/settings/app-settings.ts
@@ -14,6 +14,7 @@ const KEY_LANGUAGE = 'app_settings_language';
 const KEY_LARGE_TEXT = 'app_settings_large_text';
 const KEY_DARK_MODE = 'app_settings_dark_mode';
 const KEY_THEME_NAME = 'app_settings_theme_name';
+const KEY_TTS_SPEAK_DEFAULT = 'app_settings_tts_speak_default';
 
 const webFallback: Record<string, string> = {};
 
@@ -83,5 +84,16 @@ export async function setThemeName(value: ThemeName): Promise<void> {
   await setStored(KEY_THEME_NAME, value);
 }
 
+/** 会话页「Speak / 本轮朗读」开关是否默认开启（跨会话记忆） */
+export async function getTtsSpeakDefault(): Promise<boolean> {
+  const v = await getStored(KEY_TTS_SPEAK_DEFAULT);
+  if (v == null || v === '') return false;
+  return v === 'true';
+}
+
+export async function setTtsSpeakDefault(value: boolean): Promise<void> {
+  await setStored(KEY_TTS_SPEAK_DEFAULT, value ? 'true' : 'false');
+}
+
 export { supportedLanguages, THEME_NAMES };
 export type { AppLanguage, ThemeName };
diff --git a/app-expo/src/core/ws/types.ts b/app-expo/src/core/ws/types.ts
index 5a5590a..abd34a2 100644
--- a/app-expo/src/core/ws/types.ts
+++ b/app-expo/src/core/ws/types.ts
@@ -63,7 +63,8 @@ export interface AgentResponseEvent {
 export interface TtsAudioReceivedEvent {
   kind: 'tts_audio_received';
   conversationId: string;
-  audioBase64: string;
+  /** 兼容旧 WS payload；标准链路使用 audioUrl。 */
+  audioBase64?: string;
   audioUrl?: string;
   index?: number;
   total?: number;
diff --git a/app-expo/src/features/conversation/conversation-ws-background-pool.ts b/app-expo/src/features/conversation/conversation-ws-background-pool.ts
index 858b680..928dcec 100644
--- a/app-expo/src/features/conversation/conversation-ws-background-pool.ts
+++ b/app-expo/src/features/conversation/conversation-ws-background-pool.ts
@@ -1,7 +1,10 @@
 import type { QueryClient } from '@tanstack/react-query';
 import { AppState, type AppStateStatus } from 'react-native';
 
-import { RealtimeSession } from './realtime-session';
+import {
+  RealtimeSession,
+  type RealtimeSessionUiOwner,
+} from './realtime-session';
 
 type Slot = { conversationId: string; session: RealtimeSession };
 
@@ -34,8 +37,11 @@ const offScreenUi = {
 };
 
 /** 离屏：保持 WebSocket，去掉 UI 回调，避免列表页播 TTS 或对已卸载组件 setState */
-export function releaseConversationWsUi(session: RealtimeSession): void {
-  session.attachUiCallbacks({
+export function releaseConversationWsUi(
+  session: RealtimeSession,
+  owner: RealtimeSessionUiOwner,
+): void {
+  session.releaseUiCallbacks(owner, {
     onStreamingText: offScreenUi.onStreamingText,
     onTtsSegment: offScreenUi.onTtsSegment,
     onError: offScreenUi.onError,
diff --git a/app-expo/src/features/conversation/entry-warmup.ts b/app-expo/src/features/conversation/entry-warmup.ts
index 579b38f..d88f123 100644
--- a/app-expo/src/features/conversation/entry-warmup.ts
+++ b/app-expo/src/features/conversation/entry-warmup.ts
@@ -1,6 +1,5 @@
 import type { QueryClient } from '@tanstack/react-query';
 
-import { acquireBackgroundConversationWs } from './conversation-ws-background-pool';
 import { conversationMessagesRepository } from './conversation-messages-repository';
 import { conversationKeys } from './query-keys';
 import { registerPreparedRealtimeSession } from './prepared-session-registry';
@@ -52,17 +51,13 @@ export async function prefetchConversationMessages(
   });
 }
 
-const offscreenUiCallbacks = {
-  onStreamingText: () => {},
-  onTtsSegment: () => {},
-  onError: () => {},
-  onStateChange: () => {},
-};
-
 const inflightPrewarms = new Set<string>();
 
 /**
- * 列表页/卡片按下时的预热：保持后台 WS 连接，并触发消息缓存填充。
+ * 列表页/卡片按下时的预热：只填充消息缓存，不建立后台 WS。
+ *
+ * 后端当前以 conversation_id 记录 active WebSocket；离屏 WS 会覆盖聊天页连接，
+ * 导致本轮 TTS/agent_response 发到 offscreen session，页面停在「回复中」。
  * 与 `warmupConversationOpening` 不同：不等待开场白、不阻塞调用方，仅适用于"已有消息"的会话。
  */
 export function prewarmConversationSession(
@@ -70,13 +65,6 @@ export function prewarmConversationSession(
   conversationId: string,
 ): void {
   if (!conversationId) return;
-  const session = acquireBackgroundConversationWs(
-    conversationId,
-    queryClient,
-    null,
-  );
-  // 预热阶段没有挂载的 UI，先用空回调占位；聊天页 mount 时会重新 attach。
-  session.attachUiCallbacks(offscreenUiCallbacks);
   if (inflightPrewarms.has(conversationId)) return;
   const cached = queryClient.getQueryData<MessageItem[]>(
     conversationKeys.messages(conversationId),
diff --git a/app-expo/src/features/conversation/hooks.ts b/app-expo/src/features/conversation/hooks.ts
index 1457807..9fbb416 100644
--- a/app-expo/src/features/conversation/hooks.ts
+++ b/app-expo/src/features/conversation/hooks.ts
@@ -18,6 +18,7 @@ import { conversationKeys } from './query-keys';
 import { takePreparedRealtimeSession } from './prepared-session-registry';
 import {
   type ErrorCallback,
+  type RealtimeSessionUiOwner,
   type StreamingTextCallback,
   type TtsSegmentPayload,
   type RealtimeSession,
@@ -219,6 +220,9 @@ export function useRealtimeSession({
 }: UseRealtimeSessionOptions): RealtimeSessionState {
   const queryClient = useQueryClient();
   const sessionRef = useRef<RealtimeSession | null>(null);
+  const uiOwnerRef = useRef<RealtimeSessionUiOwner>(
+    Symbol('conversation-screen-ui'),
+  );
   const uiRef = useRef({
     handleStreamingText: (() => {}) as StreamingTextCallback,
     handleError: (() => {}) as ErrorCallback,
@@ -300,20 +304,23 @@ export function useRealtimeSession({
       prepared,
     );
 
-    session.attachUiCallbacks({
-      onStreamingText: (text, isComplete) => {
-        uiRef.current.handleStreamingText(text, isComplete);
+    session.attachUiCallbacks(
+      {
+        onStreamingText: (text, isComplete) => {
+          uiRef.current.handleStreamingText(text, isComplete);
+        },
+        onTtsSegment: (payload) => uiRef.current.onTtsSegment?.(payload),
+        onError: (message, code) => uiRef.current.handleError(message, code),
+        onStateChange: setConnectionState,
       },
-      onTtsSegment: (payload) => uiRef.current.onTtsSegment?.(payload),
-      onError: (message, code) => uiRef.current.handleError(message, code),
-      onStateChange: setConnectionState,
-    });
+      uiOwnerRef.current,
+    );
 
     sessionRef.current = session;
     setConnectionState(session.getConnectionState());
 
     return () => {
-      releaseConversationWsUi(session);
+      releaseConversationWsUi(session, uiOwnerRef.current);
       sessionRef.current = null;
       setConnectionState('disconnected');
       setStreamingMessage(null);
diff --git a/app-expo/src/features/conversation/realtime-session.ts b/app-expo/src/features/conversation/realtime-session.ts
index 3e74663..7a5ae08 100644
--- a/app-expo/src/features/conversation/realtime-session.ts
+++ b/app-expo/src/features/conversation/realtime-session.ts
@@ -21,6 +21,7 @@ function looksLikeUuidAssistantMessageId(id: string): boolean {
 
 export type StreamingTextCallback = (text: string, isComplete: boolean) => void;
 export type ErrorCallback = (message: string, code?: string) => void;
+export type RealtimeSessionUiOwner = symbol;
 
 /** WebSocket `tts_audio`：服务端可能只带 base64、只带 COS URL，或两者都有 */
 export type TtsSegmentPayload = {
@@ -32,6 +33,8 @@ export type TtsSegmentPayload = {
   assistantMessageId?: string;
   /** 用户点喇叭按需下发时为 true，应加入播放队列（即使未开「本轮朗读」） */
   manual?: boolean;
+  /** 本段属于用户显式打开 Speak 的自动朗读轮次。 */
+  autoPlay?: boolean;
 };
 
 interface RealtimeSessionOptions {
@@ -63,6 +66,7 @@ export class RealtimeSession {
   private onTtsSegment?: (payload: TtsSegmentPayload) => void;
   private onError?: ErrorCallback;
   private uiStateListener?: WsStateListener;
+  private uiOwner: RealtimeSessionUiOwner | null = null;
   private unsubEvent: (() => void) | null = null;
   private unsubState: (() => void) | null = null;
 
@@ -75,6 +79,18 @@ export class RealtimeSession {
   private assistantTurnTtsSync = false;
   private pendingTtsByKey = new Map<string, TtsSegmentPayload>();
 
+  /** `agent_response` 早于 `tts_audio` 时延后落库，超时后露字且无播放 */
+  private readonly SYNC_REVEAL_TIMEOUT_MS = 3000;
+  private deferredSyncCommits = new Map<
+    string,
+    {
+      timeoutId: ReturnType<typeof setTimeout>;
+      commit: () => void;
+      index: number;
+      total: number;
+    }
+  >();
+
   private static bufferedTtsKey(
     assistantMessageId: string | undefined,
     index: number,
@@ -99,13 +115,20 @@ export class RealtimeSession {
   }
 
   /** 列表预热接棒或刷新 UI 订阅时替换回调，不重建 WebSocket */
-  attachUiCallbacks(options: {
-    onStreamingText?: StreamingTextCallback;
-    onTtsSegment?: (payload: TtsSegmentPayload) => void;
-    onError?: ErrorCallback;
-    onStateChange?: WsStateListener;
-  }): void {
+  attachUiCallbacks(
+    options: {
+      onStreamingText?: StreamingTextCallback;
+      onTtsSegment?: (payload: TtsSegmentPayload) => void;
+      onError?: ErrorCallback;
+      onStateChange?: WsStateListener;
+    },
+    owner?: RealtimeSessionUiOwner,
+  ): void {
     if (this.destroyed) return;
+    if (this.uiOwner && owner !== this.uiOwner) {
+      return;
+    }
+    this.uiOwner = owner ?? null;
     if (options.onStreamingText !== undefined) {
       this.onStreamingText = options.onStreamingText;
     }
@@ -124,6 +147,23 @@ export class RealtimeSession {
     }
   }
 
+  releaseUiCallbacks(
+    owner: RealtimeSessionUiOwner,
+    options: {
+      onStreamingText?: StreamingTextCallback;
+      onTtsSegment?: (payload: TtsSegmentPayload) => void;
+      onError?: ErrorCallback;
+      onStateChange?: WsStateListener;
+    },
+  ): void {
+    if (this.destroyed) return;
+    if (this.uiOwner !== owner) {
+      return;
+    }
+    this.uiOwner = null;
+    this.attachUiCallbacks(options);
+  }
+
   async connect(): Promise<void> {
     await this.client.connect();
   }
@@ -144,6 +184,7 @@ export class RealtimeSession {
 
   /** Returns true if the message was sent over the socket. */
   sendText(text: string, options?: { ttsThisTurn?: boolean }): boolean {
+    this.beginNewOutboundUserTurnCleanup();
     const tts = !!options?.ttsThisTurn;
     this.assistantTurnTtsSync = tts;
     return this.client.sendText(text, { ttsThisTurn: tts });
@@ -160,6 +201,7 @@ export class RealtimeSession {
       ttsThisTurn?: boolean;
     },
   ): boolean {
+    this.beginNewOutboundUserTurnCleanup();
     const tts = !!options?.ttsThisTurn;
     this.assistantTurnTtsSync = tts;
     return this.client.send({
@@ -198,32 +240,99 @@ export class RealtimeSession {
     return this.client.sendTtsRequest(body);
   }
 
+  /** 新开一轮用户发到 WS 之前：清空上轮 sync 残留的缓冲，避免占位 / 错乱 */
+  private beginNewOutboundUserTurnCleanup(): void {
+    if (this.deferredSyncCommits.size === 0 && this.pendingTtsByKey.size === 0) {
+      return;
+    }
+    for (const def of this.deferredSyncCommits.values()) {
+      clearTimeout(def.timeoutId);
+      try {
+        def.commit();
+      } catch {
+        /* best-effort */
+      }
+    }
+    this.deferredSyncCommits.clear();
+    this.pendingTtsByKey.clear();
+  }
+
   // ─── Internal ───
 
   private resetAssistantTtsSyncState(): void {
+    for (const def of this.deferredSyncCommits.values()) {
+      clearTimeout(def.timeoutId);
+      try {
+        def.commit();
+      } catch {
+        /* 取消打断时尽量不丢正文 */
+      }
+    }
+    this.deferredSyncCommits.clear();
     this.assistantTurnTtsSync = false;
     this.pendingTtsByKey.clear();
   }
 
-  private flushBufferedTtsIfSync(
+  /** sync 模式下取出缓冲的该段 TTS；调用方需先落缓存再转发，方便 UI 绑定 URL */
+  private takeBufferedTtsIfSync(
     assistantMessageId: string | undefined,
     index: number,
-  ): void {
-    if (!this.assistantTurnTtsSync) return;
+  ): TtsSegmentPayload | null {
+    if (!this.assistantTurnTtsSync) return null;
     const key = RealtimeSession.bufferedTtsKey(assistantMessageId, index);
     const payload = this.pendingTtsByKey.get(key);
-    if (payload) {
+    if (!payload) return null;
+    this.pendingTtsByKey.delete(key);
+    return payload;
+  }
+
+  /**
+   * 正文已先于音频到达：`commit` 延至收到 `tts_audio` 或超时（无音频路径则照常露字）
+   */
+  private scheduleDeferredSyncCommit(
+    key: string,
+    index: number,
+    total: number,
+    commit: () => void,
+  ): void {
+    const timeoutId = setTimeout(() => {
+      const def = this.deferredSyncCommits.get(key);
+      if (!def || def.timeoutId !== timeoutId) return;
+      this.deferredSyncCommits.delete(key);
       this.pendingTtsByKey.delete(key);
-      this.onTtsSegment?.(payload);
-    }
+      def.commit();
+      this.finishAssistantTurnIfLastSegment(def.index, def.total);
+    }, this.SYNC_REVEAL_TIMEOUT_MS);
+    this.deferredSyncCommits.set(key, { timeoutId, commit, index, total });
+  }
+
+  /** 迟到 `tts_audio` 与延后落库会合：先写缓存再入队播放，确保 URL 能绑定到气泡 */
+  private tryResolveDeferredSyncWithIncomingTts(
+    key: string,
+    incoming: TtsSegmentPayload,
+  ): boolean {
+    const def = this.deferredSyncCommits.get(key);
+    if (!def) return false;
+    clearTimeout(def.timeoutId);
+    this.deferredSyncCommits.delete(key);
+    def.commit();
+    this.onTtsSegment?.(incoming);
+    this.finishAssistantTurnIfLastSegment(def.index, def.total);
+    return true;
   }
 
   private finishAssistantTurnIfLastSegment(index: number, total: number): void {
     if (index >= total - 1) {
-      this.resetAssistantTtsSyncState();
+      this.assistantTurnTtsSync = false;
+      this.pendingTtsByKey.clear();
     }
   }
 
+  /** sync 多段回复不走 footer 流式展示，但仍要清掉「正在回复」占位气泡。 */
+  private clearAssistantPendingUi(): void {
+    this.onStreamingText?.('', true);
+  }
+
   private handleEvent: WsEventListener = (event: WsEvent) => {
     if (event.kind === 'agent_response') {
       this.handleAgentChunk(event);
@@ -244,12 +353,21 @@ export class RealtimeSession {
         assistantMessageId: event.assistantMessageId,
         manual: event.manual,
       };
+
       if (this.assistantTurnTtsSync && !payload.manual) {
         const idx = event.index ?? 0;
         const key = RealtimeSession.bufferedTtsKey(
           event.assistantMessageId,
           idx,
         );
+        payload.autoPlay = true;
+        const resolvedDeferred = this.tryResolveDeferredSyncWithIncomingTts(
+          key,
+          payload,
+        );
+        if (resolvedDeferred) {
+          return;
+        }
         this.pendingTtsByKey.set(key, payload);
       } else {
         this.onTtsSegment?.(payload);
@@ -292,11 +410,32 @@ export class RealtimeSession {
           ? assistantSegmentMessageId(event.assistantMessageId, index)
           : `${this.conversationId}_agent_${Date.now()}_${index}`;
       if (sync) {
-        this.flushBufferedTtsIfSync(event.assistantMessageId, index);
+        const bufferedTts = this.takeBufferedTtsIfSync(
+          event.assistantMessageId,
+          index,
+        );
+        if (bufferedTts) {
+          this.commitOneAssistantMessage(event.text, id);
+          this.clearAssistantPendingUi();
+          this.onTtsSegment?.(bufferedTts);
+          this.finishAssistantTurnIfLastSegment(index, total);
+        } else {
+          const key = RealtimeSession.bufferedTtsKey(
+            event.assistantMessageId,
+            index,
+          );
+          const textCaptured = event.text;
+          const idCaptured = id;
+          this.scheduleDeferredSyncCommit(key, index, total, () => {
+            this.commitOneAssistantMessage(textCaptured, idCaptured);
+            this.clearAssistantPendingUi();
+          });
+        }
+      } else {
+        this.commitOneAssistantMessage(event.text, id);
+        this.onStreamingText?.(event.text, true);
+        this.finishAssistantTurnIfLastSegment(index, total);
       }
-      this.commitOneAssistantMessage(event.text, id);
-      this.onStreamingText?.(event.text, true);
-      this.finishAssistantTurnIfLastSegment(index, total);
       return;
     }
 
@@ -317,18 +456,40 @@ export class RealtimeSession {
       const id =
         this.pendingAssistantMessageId ??
         `${this.conversationId}_agent_${Date.now()}`;
+      let finishSyncTurnNow = false;
       if (sync) {
-        this.flushBufferedTtsIfSync(assistantId ?? undefined, 0);
-        this.commitStreamingBufferWithId(id);
-        const visible =
-          this.streamingBuffer.trim().length > 0 ? this.streamingBuffer : '…';
-        this.onStreamingText?.(visible, true);
+        const bufferedTts = this.takeBufferedTtsIfSync(
+          assistantId ?? undefined,
+          0,
+        );
+        if (bufferedTts) {
+          this.commitStreamingBufferWithId(id);
+          const visible =
+            this.streamingBuffer.trim().length > 0 ? this.streamingBuffer : '…';
+          this.onStreamingText?.(visible, true);
+          this.onTtsSegment?.(bufferedTts);
+          finishSyncTurnNow = true;
+        } else {
+          const snapshot = this.streamingBuffer;
+          const key = RealtimeSession.bufferedTtsKey(assistantId ?? undefined, 0);
+          const idCaptured = id;
+          this.scheduleDeferredSyncCommit(key, 0, 1, () => {
+            this.streamingBuffer = snapshot;
+            this.commitStreamingBufferWithId(idCaptured);
+            this.streamingBuffer = '';
+            const visible = snapshot.trim().length > 0 ? snapshot : '…';
+            this.onStreamingText?.(visible, true);
+          });
+        }
       } else {
         this.commitStreamingBufferWithId(id);
+        finishSyncTurnNow = true;
       }
       this.streamingBuffer = '';
       this.pendingAssistantMessageId = null;
-      this.finishAssistantTurnIfLastSegment(0, 1);
+      if (!sync || finishSyncTurnNow) {
+        this.finishAssistantTurnIfLastSegment(0, 1);
+      }
     }
   }
 
diff --git a/app-expo/src/features/voice/hooks/use-player.ts b/app-expo/src/features/voice/hooks/use-player.ts
index 8bb257f..1c2c7c7 100644
--- a/app-expo/src/features/voice/hooks/use-player.ts
+++ b/app-expo/src/features/voice/hooks/use-player.ts
@@ -94,6 +94,12 @@ export function usePlayer(): UsePlayerResult {
 
       const acquired = await audioFocus.acquireForPlayback();
       if (!acquired) {
+        /**
+         * 录音占用时 acquire 失败且队列尚未 shift；若用户进入会话前焦点已在
+         * `recorder`，可能不会再次触发 `onOwnerChange('recorder')`，旧的
+         * `wasBlockedByRecorderRef` 不会被置位，录音结束后也不会重试 playNext。
+         */
+        wasBlockedByRecorderRef.current = true;
         setStatus('idle');
         return;
       }
@@ -172,14 +178,17 @@ export function usePlayer(): UsePlayerResult {
 
       if (owner === null && wasBlockedByRecorderRef.current) {
         wasBlockedByRecorderRef.current = false;
-        if (queueRef.current.length > 0 && status === 'idle') {
-          playNext();
+        if (
+          queueRef.current.length > 0 &&
+          playbackActiveUriRef.current === null
+        ) {
+          void playNext();
         }
       }
     });
 
     return unsub;
-  }, [status, currentSource, playNext]);
+  }, [currentSource, playNext]);
 
   const enqueue = useCallback(
     async (item: PlaybackItem) => {
diff --git a/app-expo/tests/core/ws/client.test.ts b/app-expo/tests/core/ws/client.test.ts
index 220d2dc..d60e61a 100644
--- a/app-expo/tests/core/ws/client.test.ts
+++ b/app-expo/tests/core/ws/client.test.ts
@@ -118,6 +118,45 @@ describe('WsClient', () => {
     client.dispose();
   });
 
+  test('maps tts audio with base64 and url playback channels', async () => {
+    const client = new WsClient('conv-123');
+    const events: WsEvent[] = [];
+    client.onEvent((e) => events.push(e));
+
+    await client.connect();
+    await new Promise((r) => setTimeout(r, 10));
+
+    const ws = (client as unknown as { ws: MockWebSocket }).ws;
+    ws.simulateMessage({
+      type: 'tts_audio',
+      conversation_id: 'conv-123',
+      data: {
+        audio_base64: 'ZmFrZS1tcDM=',
+        audio_url: 'https://example.com/tts.mp3',
+        index: 0,
+        total: 1,
+        assistant_message_id: 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa',
+        manual: true,
+      },
+      timestamp: '2026-01-01T00:00:00Z',
+    });
+
+    expect(events).toEqual([
+      {
+        kind: 'tts_audio_received',
+        conversationId: 'conv-123',
+        audioBase64: 'ZmFrZS1tcDM=',
+        audioUrl: 'https://example.com/tts.mp3',
+        index: 0,
+        total: 1,
+        assistantMessageId: 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa',
+        manual: true,
+      },
+    ]);
+
+    client.dispose();
+  });
+
   test('sends text messages', async () => {
     const client = new WsClient('conv-123');
 
diff --git a/app-expo/tests/features/conversation/entry-warmup.test.ts b/app-expo/tests/features/conversation/entry-warmup.test.ts
index ce9d2f4..b191588 100644
--- a/app-expo/tests/features/conversation/entry-warmup.test.ts
+++ b/app-expo/tests/features/conversation/entry-warmup.test.ts
@@ -1,6 +1,7 @@
 import { QueryClient } from '@tanstack/react-query';
 
 import {
+  prewarmConversationSession,
   prefetchConversationMessages,
   warmupConversationOpening,
 } from '@/features/conversation/entry-warmup';
@@ -90,6 +91,20 @@ describe('conversation entry warmup', () => {
     ).resolves.toBeUndefined();
   });
 
+  test('prewarms existing conversations without opening an offscreen websocket', async () => {
+    const existing = assistantMessage();
+    mockLoadMessages.mockResolvedValueOnce([existing]);
+
+    prewarmConversationSession(queryClient, 'conv-1');
+    await new Promise((r) => setImmediate(r));
+
+    expect(mockLoadMessages).toHaveBeenCalledWith('conv-1');
+    expect(mockSessions).toHaveLength(0);
+    expect(
+      queryClient.getQueryData(conversationKeys.messages('conv-1')),
+    ).toEqual([existing]);
+  });
+
   test('uses refreshed history and skips websocket when opening is already cached', async () => {
     const existing = assistantMessage();
     mockLoadMessages.mockResolvedValueOnce([existing]);
diff --git a/app-expo/tests/features/conversation/realtime-session-sync-order.test.ts b/app-expo/tests/features/conversation/realtime-session-sync-order.test.ts
new file mode 100644
index 0000000..a3778b1
--- /dev/null
+++ b/app-expo/tests/features/conversation/realtime-session-sync-order.test.ts
@@ -0,0 +1,253 @@
+import { QueryClient } from '@tanstack/react-query';
+
+import { RealtimeSession } from '@/features/conversation/realtime-session';
+import { conversationKeys } from '@/features/conversation/query-keys';
+import type { MessageItem } from '@/features/conversation/types';
+
+jest.mock('@/core/auth/token-manager', () => ({
+  tokenManager: {
+    getAccessToken: jest.fn().mockResolvedValue('test-token'),
+  },
+}));
+
+jest.mock('@/core/config', () => ({
+  config: {
+    wsBaseUrl: 'ws://localhost:8000/',
+    ws: {
+      reconnectMaxRetries: 3,
+      reconnectBaseDelayMs: 10,
+      reconnectMaxDelayMs: 100,
+      heartbeatIntervalMs: 600000,
+    },
+  },
+}));
+
+class MockWebSocket {
+  static OPEN = 1;
+  static CLOSED = 3;
+  static instances: MockWebSocket[] = [];
+
+  readyState = MockWebSocket.OPEN;
+  onopen: (() => void) | null = null;
+  onmessage: ((event: { data: string }) => void) | null = null;
+  onclose: (() => void) | null = null;
+  onerror: (() => void) | null = null;
+
+  constructor(public url: string) {
+    MockWebSocket.instances.push(this);
+    queueMicrotask(() => this.onopen?.());
+  }
+
+  send(): void {}
+
+  close(): void {
+    this.readyState = MockWebSocket.CLOSED;
+  }
+
+  simulateMessage(data: Record<string, unknown>): void {
+    this.onmessage?.({ data: JSON.stringify(data) });
+  }
+}
+
+(global as Record<string, unknown>).WebSocket = MockWebSocket;
+
+function msgs(qc: QueryClient, cid: string): MessageItem[] {
+  return qc.getQueryData<MessageItem[]>(conversationKeys.messages(cid)) ?? [];
+}
+
+describe('RealtimeSession sync TTS / agent ordering', () => {
+  let qc: QueryClient;
+
+  beforeEach(() => {
+    jest.clearAllMocks();
+    MockWebSocket.instances = [];
+    qc = new QueryClient();
+    qc.setQueryData(conversationKeys.messages('conv-x'), []);
+  });
+
+  afterEach(async () => {
+    await new Promise((r) => setImmediate(r));
+  });
+
+  it('defers assistant commit when agent_response arrives before tts_audio (single segment)', async () => {
+    const aid = 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa';
+    const onTts = jest.fn(() => {
+      expect(msgs(qc, 'conv-x').some((m) => m.id === aid)).toBe(true);
+    });
+    const onStream = jest.fn();
+    const session = new RealtimeSession({
+      conversationId: 'conv-x',
+      queryClient: qc,
+      onStreamingText: onStream,
+      onTtsSegment: onTts,
+    });
+
+    await session.connect();
+    await new Promise((r) => setImmediate(r));
+
+    const ws = MockWebSocket.instances[0]!;
+    expect(session.sendText('hi', { ttsThisTurn: true })).toBe(true);
+
+    ws.simulateMessage({
+      type: 'agent_response',
+      conversation_id: 'conv-x',
+      data: {
+        text: 'Hello segment',
+        index: 0,
+        total: 1,
+        assistant_message_id: aid,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    const afterAgentOnly = msgs(qc, 'conv-x').filter(
+      (m) => m.senderType === 'assistant',
+    );
+    expect(afterAgentOnly).toHaveLength(0);
+
+    ws.simulateMessage({
+      type: 'tts_audio',
+      conversation_id: 'conv-x',
+      data: {
+        audio_url: 'https://example.com/tts-a.mp3',
+        index: 0,
+        total: 1,
+        assistant_message_id: aid,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    expect(onTts).toHaveBeenCalledTimes(1);
+    const committed = msgs(qc, 'conv-x').filter(
+      (m) => m.senderType === 'assistant',
+    );
+    expect(committed).toHaveLength(1);
+    expect(committed[0]!.content).toContain('Hello segment');
+
+    session.dispose();
+  });
+
+  it('multi-segment sync clears pending UI without streaming footer text', async () => {
+    const aid = 'bb22bb22-bbbb-bbbb-bbbb-bbbbbbbbbbbb';
+    const onTts = jest.fn(() => {
+      expect(
+        msgs(qc, 'conv-x').some((m) => m.id === `${aid}_seg_0`),
+      ).toBe(true);
+    });
+    const onStream = jest.fn();
+    const session = new RealtimeSession({
+      conversationId: 'conv-x',
+      queryClient: qc,
+      onStreamingText: onStream,
+      onTtsSegment: onTts,
+    });
+
+    await session.connect();
+    await new Promise((r) => setImmediate(r));
+
+    const ws = MockWebSocket.instances[0]!;
+    session.sendText('hi', { ttsThisTurn: true });
+
+    ws.simulateMessage({
+      type: 'tts_audio',
+      conversation_id: 'conv-x',
+      data: {
+        audio_url: 'https://example.com/tts-b.mp3',
+        index: 0,
+        total: 2,
+        assistant_message_id: aid,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    ws.simulateMessage({
+      type: 'agent_response',
+      conversation_id: 'conv-x',
+      data: {
+        text: 'Part A',
+        index: 0,
+        total: 2,
+        assistant_message_id: aid,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    expect(onStream).toHaveBeenCalledWith('', true);
+    expect(onStream).not.toHaveBeenCalledWith('Part A', true);
+    expect(onTts).toHaveBeenCalled();
+    session.dispose();
+  });
+
+  it('keeps active screen TTS callback when stale offscreen attach runs later', async () => {
+    const aid = 'cc33cc33-cccc-cccc-cccc-cccccccccccc';
+    const screenOnTts = jest.fn();
+    const offscreenOnTts = jest.fn();
+    const session = new RealtimeSession({
+      conversationId: 'conv-x',
+      queryClient: qc,
+    });
+    const owner = Symbol('screen-owner');
+
+    session.attachUiCallbacks({ onTtsSegment: screenOnTts }, owner);
+    session.attachUiCallbacks({ onTtsSegment: offscreenOnTts });
+
+    await session.connect();
+    await new Promise((r) => setImmediate(r));
+
+    const ws = MockWebSocket.instances[0]!;
+    ws.simulateMessage({
+      type: 'tts_audio',
+      conversation_id: 'conv-x',
+      data: {
+        audio_base64: 'ZmFrZS1tcDM=',
+        audio_url: 'https://example.com/tts-c.mp3',
+        index: 0,
+        total: 1,
+        assistant_message_id: aid,
+        manual: true,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    expect(screenOnTts).toHaveBeenCalledTimes(1);
+    expect(offscreenOnTts).not.toHaveBeenCalled();
+    session.dispose();
+  });
+
+  it('keeps active screen TTS callback when a stale screen owner attaches later', async () => {
+    const aid = 'dd44dd44-dddd-dddd-dddd-dddddddddddd';
+    const screenOnTts = jest.fn();
+    const staleScreenOnTts = jest.fn();
+    const session = new RealtimeSession({
+      conversationId: 'conv-x',
+      queryClient: qc,
+    });
+    const activeOwner = Symbol('active-screen-owner');
+    const staleOwner = Symbol('stale-screen-owner');
+
+    session.attachUiCallbacks({ onTtsSegment: screenOnTts }, activeOwner);
+    session.attachUiCallbacks({ onTtsSegment: staleScreenOnTts }, staleOwner);
+
+    await session.connect();
+    await new Promise((r) => setImmediate(r));
+
+    const ws = MockWebSocket.instances[0]!;
+    ws.simulateMessage({
+      type: 'tts_audio',
+      conversation_id: 'conv-x',
+      data: {
+        audio_base64: 'ZmFrZS1tcDM=',
+        audio_url: 'https://example.com/tts-d.mp3',
+        index: 0,
+        total: 1,
+        assistant_message_id: aid,
+        manual: true,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    expect(screenOnTts).toHaveBeenCalledTimes(1);
+    expect(staleScreenOnTts).not.toHaveBeenCalled();
+    session.dispose();
+  });
+});
diff --git a/app-expo/tests/features/voice/use-player.test.tsx b/app-expo/tests/features/voice/use-player.test.tsx
index 15f90c1..fe5e47a 100644
--- a/app-expo/tests/features/voice/use-player.test.tsx
+++ b/app-expo/tests/features/voice/use-player.test.tsx
@@ -37,6 +37,7 @@ describe('usePlayer', () => {
     });
     jest.mocked(audioFocus.acquireForPlayback).mockResolvedValue(true);
     jest.mocked(audioFocus.releaseIfOwnedBy).mockResolvedValue(undefined);
+    jest.mocked(audioFocus.onOwnerChange).mockImplementation(() => jest.fn());
   });
 
   test('keeps the native audio session active while app-level audio focus owns teardown', () => {
@@ -127,4 +128,43 @@ describe('usePlayer', () => {
     expect(pause).not.toHaveBeenCalled();
     expect(result.current.status).toBe('idle');
   });
+
+  test('retries queued audio after acquire fails once then audio focus frees', async () => {
+    const acquire = jest.mocked(audioFocus.acquireForPlayback);
+    acquire.mockResolvedValueOnce(false).mockResolvedValue(true);
+
+    let ownerListener: ((owner: null | string) => void) | undefined;
+    jest.mocked(audioFocus.onOwnerChange).mockImplementation((cb) => {
+      ownerListener = cb as (owner: null | string) => void;
+      return jest.fn();
+    });
+
+    mockUseAudioPlayerStatus.mockReturnValue({
+      isLoaded: true,
+      playing: false,
+      currentTime: 0,
+      duration: 10,
+    });
+    const play = jest.fn();
+    mockUseAudioPlayer.mockReturnValue({ pause: jest.fn(), play });
+
+    const { result } = renderHook(() => usePlayer());
+
+    await act(async () => {
+      await result.current.enqueue({
+        uri: 'file:///queued.mp3',
+        kind: 'tts_auto',
+      });
+    });
+
+    expect(acquire).toHaveBeenCalledTimes(1);
+    expect(result.current.status).toBe('idle');
+
+    await act(async () => {
+      ownerListener?.(null);
+    });
+
+    expect(acquire).toHaveBeenCalledTimes(2);
+    expect(play).toHaveBeenCalled();
+  });
 });