fix(conversation): 修复实时会话 TTS/回复被离屏 WS 抢占

- 列表预热仅预取消息缓存，避免后台 WebSocket 覆盖服务端连接
- RealtimeSession UI 回调按 owner 独占，防止 offscreen 覆盖聊天页
- 列表页聚焦时再 prewarm，会话页 TTS 入队优先 base64
- 管线下发 TTS 同时带 audio_base64 与 audio_url；协议说明同步
- 移除 TTS 排查用前后端调试日志，保留错误/告警
- 补充 WS / RealtimeSession / entry-warmup / 播放器相关单测

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 10:42:44 +08:00
parent 93be60f74c
commit 3d01085442
18 changed files with 643 additions and 261 deletions
--- a/app-expo/src/app/(main)/conversation/[id].tsx
+++ b/app-expo/src/app/(main)/conversation/[id].tsx
@@ -46,6 +46,10 @@ import { Icon } from '@/components/ui/icon';
 import { Text } from '@/components/ui/text';
 import { ScreenHeader } from '@/components/screen-header';
 import { resolveApiMediaUrl } from '@/core/api/media-url';
+import {
+  getTtsSpeakDefault,
+  setTtsSpeakDefault,
+} from '@/core/settings/app-settings';
 import { useAppSettings } from '@/hooks/use-app-settings';
 import { useThemeColors } from '@/hooks/use-theme-colors';
 import { useTypography } from '@/core/typography-context';
@@ -1238,15 +1242,14 @@ export default function ConversationScreen() {

  const handleTtsSegment = useCallback(
    (p: TtsSegmentPayload) => {
-      // 闸门用于丢弃「用户已打断后」迟到的自动 TTS；按需朗读 (manual) 是当前明确操作，必须放行。
-      const allowByGate =
-        p.manual === true || ttsGate.current.shouldAcceptIncomingTts();
-      if (!allowByGate) return;
      const convId = id ?? '';
      const cosUrl = p.audioUrl?.trim();
+      const isManualPlayback = !!p.manual;
+      const shouldAutoPlay = !!p.autoPlay;
      /**
-       * 播放走 WS，但「再读」依赖 MessageItem.ttsAudioUrls。乐观提交的消息没有 URL，
-       * 服务端 attach 要等整轮结束；收到 COS URL 时写入缓存，按钮才能用。
+       * COS URL 先写入缓存：**与Speak无关**，否则录音/打断后闸门关闭时会跳过 merge，
+       * 按需朗读下发的 `tts_audio` 无法绑定到气泡，喇叭表现为「不可用」。
+       * 闸门仅约束是否入队播放（迟到自动朗读），不禁用 URL 附着。
       */
      if (cosUrl && convId) {
        queryClient.setQueryData<MessageItem[]>(
@@ -1307,8 +1310,13 @@ export default function ConversationScreen() {
        );
      }

+      const gateAllowsPlayback =
+        isManualPlayback || ttsGate.current.shouldAcceptIncomingTts();
      const shouldEnqueue =
-        p.manual === true || lastUserMessageRequestedTtsRef.current;
+        isManualPlayback ||
+        shouldAutoPlay ||
+        lastUserMessageRequestedTtsRef.current;
+      if (!gateAllowsPlayback) return;
      if (!shouldEnqueue) return;

      const listKey =
@@ -1325,8 +1333,8 @@ export default function ConversationScreen() {
          ...shared,
          uri: `data:audio/mp3;base64,${p.audioBase64}`,
        });
-      } else if (p.audioUrl) {
-        void enqueue({ ...shared, uri: p.audioUrl });
+      } else if (cosUrl) {
+        void enqueue({ ...shared, uri: cosUrl });
      }
    },
    [enqueue, id, queryClient],
@@ -1399,8 +1407,9 @@ export default function ConversationScreen() {

  const [input, setInput] = useState('');
  const [inputResetKey, setInputResetKey] = useState(0);
-  /** 本条发出的用户消息是否请求助手朗读（先 TTS 再出字） */
+  /** 本条发出的用户消息是否请求助手朗读（先 TTS 再出字）；默认值从存储恢复 */
  const [ttsThisTurn, setTtsThisTurn] = useState(false);
+  const [ttsSpeakPrefReady, setTtsSpeakPrefReady] = useState(false);
  const [inputMode, setInputMode] = useState<InputMode>('text');
  const [isKeyboardVisible, setIsKeyboardVisible] = useState(false);
  const inputModeRef = useRef<InputMode>('text');
@@ -1489,6 +1498,24 @@ export default function ConversationScreen() {
    inputModeRef.current = inputMode;
  }, [inputMode]);

+  useEffect(() => {
+    let cancelled = false;
+    void (async () => {
+      try {
+        const v = await getTtsSpeakDefault();
+        if (!cancelled) {
+          setTtsThisTurn(v);
+          setTtsSpeakPrefReady(true);
+        }
+      } catch {
+        if (!cancelled) setTtsSpeakPrefReady(true);
+      }
+    })();
+    return () => {
+      cancelled = true;
+    };
+  }, []);
+
  useEffect(() => {
    const onKeyboardWillShow = () => {
      if (inputModeRef.current !== 'text') return;
@@ -1687,7 +1714,11 @@ export default function ConversationScreen() {
            <Switch
              accessibilityLabel={t('ttsThisTurnAccessibility')}
              value={ttsThisTurn}
-              onValueChange={setTtsThisTurn}
+              disabled={!ttsSpeakPrefReady}
+              onValueChange={(v) => {
+                setTtsThisTurn(v);
+                void setTtsSpeakDefault(v);
+              }}
              trackColor={{
                false: CHAT_COLORS.outline,
                true: CHAT_COLORS.secondaryContainer,
--- a/app-expo/src/app/(tabs)/index.tsx
+++ b/app-expo/src/app/(tabs)/index.tsx
@@ -2,6 +2,7 @@ import { Image } from 'expo-image';
 import { router } from 'expo-router';
 import React, { useEffect, useRef, useState } from 'react';
 import { useQueryClient } from '@tanstack/react-query';
+import { useIsFocused } from '@react-navigation/native';
 import {
  Alert,
  AppState,
@@ -302,6 +303,7 @@ function findTodayConversationToResume(
 export default function ConversationsScreen() {
  const { t } = useTranslation('conversation');
  const queryClient = useQueryClient();
+  const isFocused = useIsFocused();

  const { data: conversations = [], isLoading } = useConversations();
  const createConversation = useCreateConversation();
@@ -448,12 +450,13 @@ export default function ConversationsScreen() {
   * 单槽连接池：换会话会自动 dispose 旧槽，所以这里只挑一条最像即将被点的。
   */
  useEffect(() => {
+    if (!isFocused) return;
    if (isLoading) return;
    const candidate =
      todayConversation ?? conversations.find(conversationHasAnyMessage);
    if (!candidate) return;
    prewarmConversationSession(queryClient, candidate.id);
-  }, [isLoading, conversations, todayConversation, queryClient]);
+  }, [isFocused, isLoading, conversations, todayConversation, queryClient]);

  return (
    <View className="flex-1 bg-background">
--- a/app-expo/src/core/audio/audio-focus.ts
+++ b/app-expo/src/core/audio/audio-focus.ts
@@ -20,7 +20,10 @@ function notify() {
 */
 export const audioFocus = {
  async acquireForRecording(): Promise<boolean> {
-    if (currentOwner === 'recorder') return true;
+    if (currentOwner === 'recorder') {
+      await setIsAudioActiveAsync(true);
+      return true;
+    }

    if (currentOwner === 'player') {
      await this.releaseIfOwnedBy('player');
@@ -30,6 +33,7 @@ export const audioFocus = {
      playsInSilentMode: true,
      allowsRecording: true,
    });
+    await setIsAudioActiveAsync(true);

    currentOwner = 'recorder';
    notify();
@@ -37,7 +41,10 @@ export const audioFocus = {
  },

  async acquireForPlayback(): Promise<boolean> {
-    if (currentOwner === 'player') return true;
+    if (currentOwner === 'player') {
+      await setIsAudioActiveAsync(true);
+      return true;
+    }

    if (currentOwner === 'recorder') {
      return false;
@@ -47,6 +54,7 @@ export const audioFocus = {
      playsInSilentMode: true,
      allowsRecording: false,
    });
+    await setIsAudioActiveAsync(true);

    currentOwner = 'player';
    notify();
--- a/app-expo/src/core/settings/app-settings.ts
+++ b/app-expo/src/core/settings/app-settings.ts
@@ -14,6 +14,7 @@ const KEY_LANGUAGE = 'app_settings_language';
 const KEY_LARGE_TEXT = 'app_settings_large_text';
 const KEY_DARK_MODE = 'app_settings_dark_mode';
 const KEY_THEME_NAME = 'app_settings_theme_name';
+const KEY_TTS_SPEAK_DEFAULT = 'app_settings_tts_speak_default';

 const webFallback: Record<string, string> = {};

@@ -83,5 +84,16 @@ export async function setThemeName(value: ThemeName): Promise<void> {
  await setStored(KEY_THEME_NAME, value);
 }

+/** Whether the conversation screen's "Speak / read this turn aloud" switch defaults to on (remembered across conversations). */
+export async function getTtsSpeakDefault(): Promise<boolean> {
+  const v = await getStored(KEY_TTS_SPEAK_DEFAULT);
+  if (v == null || v === '') return false; // missing or empty stored value: default to off
+  return v === 'true'; // only the literal string 'true' counts as enabled
+}
+
+export async function setTtsSpeakDefault(value: boolean): Promise<void> {
+  await setStored(KEY_TTS_SPEAK_DEFAULT, value ? 'true' : 'false'); // stored as the literal string 'true' or 'false'
+}
+
 export { supportedLanguages, THEME_NAMES };
 export type { AppLanguage, ThemeName };
--- a/app-expo/src/core/ws/types.ts
+++ b/app-expo/src/core/ws/types.ts
@@ -63,7 +63,8 @@ export interface AgentResponseEvent {
 export interface TtsAudioReceivedEvent {
  kind: 'tts_audio_received';
  conversationId: string;
-  audioBase64: string;
+  /** 兼容旧 WS payload；标准链路使用 audioUrl。 */
+  audioBase64?: string;
  audioUrl?: string;
  index?: number;
  total?: number;
--- a/app-expo/src/features/conversation/conversation-ws-background-pool.ts
+++ b/app-expo/src/features/conversation/conversation-ws-background-pool.ts
@@ -1,7 +1,10 @@
 import type { QueryClient } from '@tanstack/react-query';
 import { AppState, type AppStateStatus } from 'react-native';

-import { RealtimeSession } from './realtime-session';
+import {
+  RealtimeSession,
+  type RealtimeSessionUiOwner,
+} from './realtime-session';

 type Slot = { conversationId: string; session: RealtimeSession };

@@ -34,8 +37,11 @@ const offScreenUi = {
 };

 /** 离屏：保持 WebSocket，去掉 UI 回调，避免列表页播 TTS 或对已卸载组件 setState */
-export function releaseConversationWsUi(session: RealtimeSession): void {
-  session.attachUiCallbacks({
+export function releaseConversationWsUi(
+  session: RealtimeSession,
+  owner: RealtimeSessionUiOwner,
+): void {
+  session.releaseUiCallbacks(owner, {
    onStreamingText: offScreenUi.onStreamingText,
    onTtsSegment: offScreenUi.onTtsSegment,
    onError: offScreenUi.onError,
--- a/app-expo/src/features/conversation/entry-warmup.ts
+++ b/app-expo/src/features/conversation/entry-warmup.ts
@@ -1,6 +1,5 @@
 import type { QueryClient } from '@tanstack/react-query';

-import { acquireBackgroundConversationWs } from './conversation-ws-background-pool';
 import { conversationMessagesRepository } from './conversation-messages-repository';
 import { conversationKeys } from './query-keys';
 import { registerPreparedRealtimeSession } from './prepared-session-registry';
@@ -52,17 +51,13 @@ export async function prefetchConversationMessages(
  });
 }

-const offscreenUiCallbacks = {
-  onStreamingText: () => {},
-  onTtsSegment: () => {},
-  onError: () => {},
-  onStateChange: () => {},
-};
-
 const inflightPrewarms = new Set<string>();

 /**
- * 列表页/卡片按下时的预热：保持后台 WS 连接，并触发消息缓存填充。
+ * 列表页/卡片按下时的预热：只填充消息缓存，不建立后台 WS。
+ *
+ * 后端当前以 conversation_id 记录 active WebSocket；离屏 WS 会覆盖聊天页连接，
+ * 导致本轮 TTS/agent_response 发到 offscreen session，页面停在「回复中」。
 * 与 `warmupConversationOpening` 不同：不等待开场白、不阻塞调用方，仅适用于"已有消息"的会话。
 */
 export function prewarmConversationSession(
@@ -70,13 +65,6 @@ export function prewarmConversationSession(
  conversationId: string,
 ): void {
  if (!conversationId) return;
-  const session = acquireBackgroundConversationWs(
-    conversationId,
-    queryClient,
-    null,
-  );
-  // 预热阶段没有挂载的 UI，先用空回调占位；聊天页 mount 时会重新 attach。
-  session.attachUiCallbacks(offscreenUiCallbacks);
  if (inflightPrewarms.has(conversationId)) return;
  const cached = queryClient.getQueryData<MessageItem[]>(
    conversationKeys.messages(conversationId),
--- a/app-expo/src/features/conversation/hooks.ts
+++ b/app-expo/src/features/conversation/hooks.ts
@@ -18,6 +18,7 @@ import { conversationKeys } from './query-keys';
 import { takePreparedRealtimeSession } from './prepared-session-registry';
 import {
  type ErrorCallback,
+  type RealtimeSessionUiOwner,
  type StreamingTextCallback,
  type TtsSegmentPayload,
  type RealtimeSession,
@@ -219,6 +220,9 @@ export function useRealtimeSession({
 }: UseRealtimeSessionOptions): RealtimeSessionState {
  const queryClient = useQueryClient();
  const sessionRef = useRef<RealtimeSession | null>(null);
+  const uiOwnerRef = useRef<RealtimeSessionUiOwner>(
+    Symbol('conversation-screen-ui'),
+  );
  const uiRef = useRef({
    handleStreamingText: (() => {}) as StreamingTextCallback,
    handleError: (() => {}) as ErrorCallback,
@@ -300,20 +304,23 @@ export function useRealtimeSession({
      prepared,
    );

-    session.attachUiCallbacks({
-      onStreamingText: (text, isComplete) => {
-        uiRef.current.handleStreamingText(text, isComplete);
+    session.attachUiCallbacks(
+      {
+        onStreamingText: (text, isComplete) => {
+          uiRef.current.handleStreamingText(text, isComplete);
+        },
+        onTtsSegment: (payload) => uiRef.current.onTtsSegment?.(payload),
+        onError: (message, code) => uiRef.current.handleError(message, code),
+        onStateChange: setConnectionState,
      },
-      onTtsSegment: (payload) => uiRef.current.onTtsSegment?.(payload),
-      onError: (message, code) => uiRef.current.handleError(message, code),
-      onStateChange: setConnectionState,
-    });
+      uiOwnerRef.current,
+    );

    sessionRef.current = session;
    setConnectionState(session.getConnectionState());

    return () => {
-      releaseConversationWsUi(session);
+      releaseConversationWsUi(session, uiOwnerRef.current);
      sessionRef.current = null;
      setConnectionState('disconnected');
      setStreamingMessage(null);
--- a/app-expo/src/features/conversation/realtime-session.ts
+++ b/app-expo/src/features/conversation/realtime-session.ts
@@ -21,6 +21,7 @@ function looksLikeUuidAssistantMessageId(id: string): boolean {

 export type StreamingTextCallback = (text: string, isComplete: boolean) => void;
 export type ErrorCallback = (message: string, code?: string) => void;
+export type RealtimeSessionUiOwner = symbol;

 /** WebSocket `tts_audio`：服务端可能只带 base64、只带 COS URL，或两者都有 */
 export type TtsSegmentPayload = {
@@ -32,6 +33,8 @@ export type TtsSegmentPayload = {
  assistantMessageId?: string;
  /** 用户点喇叭按需下发时为 true，应加入播放队列（即使未开「本轮朗读」） */
  manual?: boolean;
+  /** 本段属于用户显式打开 Speak 的自动朗读轮次。 */
+  autoPlay?: boolean;
 };

 interface RealtimeSessionOptions {
@@ -63,6 +66,7 @@ export class RealtimeSession {
  private onTtsSegment?: (payload: TtsSegmentPayload) => void;
  private onError?: ErrorCallback;
  private uiStateListener?: WsStateListener;
+  private uiOwner: RealtimeSessionUiOwner | null = null;
  private unsubEvent: (() => void) | null = null;
  private unsubState: (() => void) | null = null;

@@ -75,6 +79,18 @@ export class RealtimeSession {
  private assistantTurnTtsSync = false;
  private pendingTtsByKey = new Map<string, TtsSegmentPayload>();

+  /** When `agent_response` arrives before `tts_audio`, committing the message is deferred; after this timeout the text is revealed without playback. */
+  private readonly SYNC_REVEAL_TIMEOUT_MS = 3000;
+  private deferredSyncCommits = new Map<
+    string, // key built by RealtimeSession.bufferedTtsKey
+    {
+      timeoutId: ReturnType<typeof setTimeout>;
+      commit: () => void;
+      index: number;
+      total: number;
+    }
+  >();
+
  private static bufferedTtsKey(
    assistantMessageId: string | undefined,
    index: number,
@@ -99,13 +115,20 @@ export class RealtimeSession {
  }

  /** 列表预热接棒或刷新 UI 订阅时替换回调，不重建 WebSocket */
-  attachUiCallbacks(options: {
-    onStreamingText?: StreamingTextCallback;
-    onTtsSegment?: (payload: TtsSegmentPayload) => void;
-    onError?: ErrorCallback;
-    onStateChange?: WsStateListener;
-  }): void {
+  attachUiCallbacks(
+    options: {
+      onStreamingText?: StreamingTextCallback;
+      onTtsSegment?: (payload: TtsSegmentPayload) => void;
+      onError?: ErrorCallback;
+      onStateChange?: WsStateListener;
+    },
+    owner?: RealtimeSessionUiOwner,
+  ): void {
    if (this.destroyed) return;
+    if (this.uiOwner && owner !== this.uiOwner) {
+      return;
+    }
+    this.uiOwner = owner ?? null;
    if (options.onStreamingText !== undefined) {
      this.onStreamingText = options.onStreamingText;
    }
@@ -124,6 +147,23 @@ export class RealtimeSession {
    }
  }

+  releaseUiCallbacks(
+    owner: RealtimeSessionUiOwner,
+    options: {
+      onStreamingText?: StreamingTextCallback;
+      onTtsSegment?: (payload: TtsSegmentPayload) => void;
+      onError?: ErrorCallback;
+      onStateChange?: WsStateListener;
+    },
+  ): void {
+    if (this.destroyed) return;
+    if (this.uiOwner !== owner) { // only the current owner may release; any other caller is ignored
+      return;
+    }
+    this.uiOwner = null; // drop ownership first so the owner-less attach below is not rejected
+    this.attachUiCallbacks(options); // install the replacement callbacks supplied in `options`
+  }
+
  async connect(): Promise<void> {
    await this.client.connect();
  }
@@ -144,6 +184,7 @@ export class RealtimeSession {

  /** Returns true if the message was sent over the socket. */
  sendText(text: string, options?: { ttsThisTurn?: boolean }): boolean {
+    this.beginNewOutboundUserTurnCleanup();
    const tts = !!options?.ttsThisTurn;
    this.assistantTurnTtsSync = tts;
    return this.client.sendText(text, { ttsThisTurn: tts });
@@ -160,6 +201,7 @@ export class RealtimeSession {
      ttsThisTurn?: boolean;
    },
  ): boolean {
+    this.beginNewOutboundUserTurnCleanup();
    const tts = !!options?.ttsThisTurn;
    this.assistantTurnTtsSync = tts;
    return this.client.send({
@@ -198,32 +240,99 @@ export class RealtimeSession {
    return this.client.sendTtsRequest(body);
  }

+  /** Before a new user turn is sent over the WS: clear the buffers left over from the previous sync turn, to avoid stale placeholders / mix-ups. */
+  private beginNewOutboundUserTurnCleanup(): void {
+    if (this.deferredSyncCommits.size === 0 && this.pendingTtsByKey.size === 0) {
+      return;
+    }
+    for (const def of this.deferredSyncCommits.values()) {
+      clearTimeout(def.timeoutId);
+      try {
+        def.commit(); // run the deferred commit now instead of dropping it
+      } catch {
+        /* best-effort */
+      }
+    }
+    this.deferredSyncCommits.clear();
+    this.pendingTtsByKey.clear();
+  }
+
  // ─── Internal ───

  private resetAssistantTtsSyncState(): void {
+    for (const def of this.deferredSyncCommits.values()) {
+      clearTimeout(def.timeoutId);
+      try {
+        def.commit();
+      } catch {
+        /* on cancel / interrupt, try not to lose the reply text */
+      }
+    }
+    this.deferredSyncCommits.clear();
    this.assistantTurnTtsSync = false;
    this.pendingTtsByKey.clear();
  }

-  private flushBufferedTtsIfSync(
+  /** In sync mode, remove and return the buffered TTS for this segment; the caller should commit to the cache first and forward it afterwards so the UI can bind the URL. */
+  private takeBufferedTtsIfSync(
    assistantMessageId: string | undefined,
    index: number,
-  ): void {
-    if (!this.assistantTurnTtsSync) return;
+  ): TtsSegmentPayload | null {
+    if (!this.assistantTurnTtsSync) return null;
    const key = RealtimeSession.bufferedTtsKey(assistantMessageId, index);
    const payload = this.pendingTtsByKey.get(key);
-    if (payload) {
+    if (!payload) return null;
+    this.pendingTtsByKey.delete(key);
+    return payload;
+  }
+
+  /**
+   * The text arrived before its audio: `commit` is deferred until `tts_audio` arrives or the timeout fires (with no audio, the text is still revealed).
+   */
+  private scheduleDeferredSyncCommit(
+    key: string,
+    index: number,
+    total: number,
+    commit: () => void,
+  ): void {
+    const timeoutId = setTimeout(() => {
+      const def = this.deferredSyncCommits.get(key);
+      if (!def || def.timeoutId !== timeoutId) return; // entry already resolved or replaced by a newer schedule
+      this.deferredSyncCommits.delete(key);
      this.pendingTtsByKey.delete(key);
-      this.onTtsSegment?.(payload);
-    }
+      def.commit();
+      this.finishAssistantTurnIfLastSegment(def.index, def.total);
+    }, this.SYNC_REVEAL_TIMEOUT_MS);
+    this.deferredSyncCommits.set(key, { timeoutId, commit, index, total });
+  }
+
+  /** A late `tts_audio` meets a deferred commit: run the commit first, then forward the segment to `onTtsSegment`, so the URL can bind to the bubble. */
+  private tryResolveDeferredSyncWithIncomingTts(
+    key: string,
+    incoming: TtsSegmentPayload,
+  ): boolean {
+    const def = this.deferredSyncCommits.get(key);
+    if (!def) return false;
+    clearTimeout(def.timeoutId);
+    this.deferredSyncCommits.delete(key);
+    def.commit();
+    this.onTtsSegment?.(incoming);
+    this.finishAssistantTurnIfLastSegment(def.index, def.total);
+    return true;
+  }

  private finishAssistantTurnIfLastSegment(index: number, total: number): void {
    if (index >= total - 1) {
-      this.resetAssistantTtsSyncState();
+      this.assistantTurnTtsSync = false;
+      this.pendingTtsByKey.clear();
    }
  }

+  /** Multi-segment sync replies skip the footer streaming display, but the "replying…" placeholder bubble must still be cleared. */
+  private clearAssistantPendingUi(): void {
+    this.onStreamingText?.('', true);
+  }
+
  private handleEvent: WsEventListener = (event: WsEvent) => {
    if (event.kind === 'agent_response') {
      this.handleAgentChunk(event);
@@ -244,12 +353,21 @@ export class RealtimeSession {
        assistantMessageId: event.assistantMessageId,
        manual: event.manual,
      };
+
      if (this.assistantTurnTtsSync && !payload.manual) {
        const idx = event.index ?? 0;
        const key = RealtimeSession.bufferedTtsKey(
          event.assistantMessageId,
          idx,
        );
+        payload.autoPlay = true;
+        const resolvedDeferred = this.tryResolveDeferredSyncWithIncomingTts(
+          key,
+          payload,
+        );
+        if (resolvedDeferred) {
+          return;
+        }
        this.pendingTtsByKey.set(key, payload);
      } else {
        this.onTtsSegment?.(payload);
@@ -292,11 +410,32 @@ export class RealtimeSession {
          ? assistantSegmentMessageId(event.assistantMessageId, index)
          : `${this.conversationId}_agent_${Date.now()}_${index}`;
      if (sync) {
-        this.flushBufferedTtsIfSync(event.assistantMessageId, index);
+        const bufferedTts = this.takeBufferedTtsIfSync(
+          event.assistantMessageId,
+          index,
+        );
+        if (bufferedTts) {
+          this.commitOneAssistantMessage(event.text, id);
+          this.clearAssistantPendingUi();
+          this.onTtsSegment?.(bufferedTts);
+          this.finishAssistantTurnIfLastSegment(index, total);
+        } else {
+          const key = RealtimeSession.bufferedTtsKey(
+            event.assistantMessageId,
+            index,
+          );
+          const textCaptured = event.text;
+          const idCaptured = id;
+          this.scheduleDeferredSyncCommit(key, index, total, () => {
+            this.commitOneAssistantMessage(textCaptured, idCaptured);
+            this.clearAssistantPendingUi();
+          });
+        }
+      } else {
+        this.commitOneAssistantMessage(event.text, id);
+        this.onStreamingText?.(event.text, true);
+        this.finishAssistantTurnIfLastSegment(index, total);
      }
-      this.commitOneAssistantMessage(event.text, id);
-      this.onStreamingText?.(event.text, true);
-      this.finishAssistantTurnIfLastSegment(index, total);
      return;
    }

@@ -317,18 +456,40 @@ export class RealtimeSession {
      const id =
        this.pendingAssistantMessageId ??
        `${this.conversationId}_agent_${Date.now()}`;
+      let finishSyncTurnNow = false;
      if (sync) {
-        this.flushBufferedTtsIfSync(assistantId ?? undefined, 0);
-        this.commitStreamingBufferWithId(id);
-        const visible =
-          this.streamingBuffer.trim().length > 0 ? this.streamingBuffer : '…';
-        this.onStreamingText?.(visible, true);
+        const bufferedTts = this.takeBufferedTtsIfSync(
+          assistantId ?? undefined,
+          0,
+        );
+        if (bufferedTts) {
+          this.commitStreamingBufferWithId(id);
+          const visible =
+            this.streamingBuffer.trim().length > 0 ? this.streamingBuffer : '…';
+          this.onStreamingText?.(visible, true);
+          this.onTtsSegment?.(bufferedTts);
+          finishSyncTurnNow = true;
+        } else {
+          const snapshot = this.streamingBuffer;
+          const key = RealtimeSession.bufferedTtsKey(assistantId ?? undefined, 0);
+          const idCaptured = id;
+          this.scheduleDeferredSyncCommit(key, 0, 1, () => {
+            this.streamingBuffer = snapshot;
+            this.commitStreamingBufferWithId(idCaptured);
+            this.streamingBuffer = '';
+            const visible = snapshot.trim().length > 0 ? snapshot : '…';
+            this.onStreamingText?.(visible, true);
+          });
+        }
      } else {
        this.commitStreamingBufferWithId(id);
+        finishSyncTurnNow = true;
      }
      this.streamingBuffer = '';
      this.pendingAssistantMessageId = null;
-      this.finishAssistantTurnIfLastSegment(0, 1);
+      if (!sync || finishSyncTurnNow) {
+        this.finishAssistantTurnIfLastSegment(0, 1);
+      }
    }
  }

--- a/app-expo/src/features/voice/hooks/use-player.ts
+++ b/app-expo/src/features/voice/hooks/use-player.ts
@@ -94,6 +94,12 @@ export function usePlayer(): UsePlayerResult {

      const acquired = await audioFocus.acquireForPlayback();
      if (!acquired) {
+        /**
+         * 录音占用时 acquire 失败且队列尚未 shift；若用户进入会话前焦点已在
+         * `recorder`，可能不会再次触发 `onOwnerChange('recorder')`，旧的
+         * `wasBlockedByRecorderRef` 不会被置位，录音结束后也不会重试 playNext。
+         */
+        wasBlockedByRecorderRef.current = true;
        setStatus('idle');
        return;
      }
@@ -172,14 +178,17 @@ export function usePlayer(): UsePlayerResult {

      if (owner === null && wasBlockedByRecorderRef.current) {
        wasBlockedByRecorderRef.current = false;
-        if (queueRef.current.length > 0 && status === 'idle') {
-          playNext();
+        if (
+          queueRef.current.length > 0 &&
+          playbackActiveUriRef.current === null
+        ) {
+          void playNext();
        }
      }
    });

    return unsub;
-  }, [status, currentSource, playNext]);
+  }, [currentSource, playNext]);

  const enqueue = useCallback(
    async (item: PlaybackItem) => {
--- a/app-expo/tests/core/ws/client.test.ts
+++ b/app-expo/tests/core/ws/client.test.ts
@@ -118,6 +118,45 @@ describe('WsClient', () => {
    client.dispose();
  });

+  test('maps tts audio with base64 and url playback channels', async () => {
+    const client = new WsClient('conv-123');
+    const events: WsEvent[] = [];
+    client.onEvent((e) => events.push(e));
+
+    await client.connect();
+    await new Promise((r) => setTimeout(r, 10));
+
+    const ws = (client as unknown as { ws: MockWebSocket }).ws;
+    ws.simulateMessage({
+      type: 'tts_audio',
+      conversation_id: 'conv-123',
+      data: {
+        audio_base64: 'ZmFrZS1tcDM=',
+        audio_url: 'https://example.com/tts.mp3',
+        index: 0,
+        total: 1,
+        assistant_message_id: 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa',
+        manual: true,
+      },
+      timestamp: '2026-01-01T00:00:00Z',
+    });
+
+    expect(events).toEqual([
+      {
+        kind: 'tts_audio_received',
+        conversationId: 'conv-123',
+        audioBase64: 'ZmFrZS1tcDM=',
+        audioUrl: 'https://example.com/tts.mp3',
+        index: 0,
+        total: 1,
+        assistantMessageId: 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa',
+        manual: true,
+      },
+    ]);
+
+    client.dispose();
+  });
+
  test('sends text messages', async () => {
    const client = new WsClient('conv-123');

--- a/app-expo/tests/features/conversation/entry-warmup.test.ts
+++ b/app-expo/tests/features/conversation/entry-warmup.test.ts
@@ -1,6 +1,7 @@
 import { QueryClient } from '@tanstack/react-query';

 import {
+  prewarmConversationSession,
  prefetchConversationMessages,
  warmupConversationOpening,
 } from '@/features/conversation/entry-warmup';
@@ -90,6 +91,20 @@ describe('conversation entry warmup', () => {
    ).resolves.toBeUndefined();
  });

+  test('prewarms existing conversations without opening an offscreen websocket', async () => {
+    const existing = assistantMessage();
+    mockLoadMessages.mockResolvedValueOnce([existing]);
+
+    prewarmConversationSession(queryClient, 'conv-1');
+    await new Promise((r) => setImmediate(r));
+
+    expect(mockLoadMessages).toHaveBeenCalledWith('conv-1');
+    expect(mockSessions).toHaveLength(0);
+    expect(
+      queryClient.getQueryData(conversationKeys.messages('conv-1')),
+    ).toEqual([existing]);
+  });
+
  test('uses refreshed history and skips websocket when opening is already cached', async () => {
    const existing = assistantMessage();
    mockLoadMessages.mockResolvedValueOnce([existing]);
--- a/app-expo/tests/features/conversation/realtime-session-sync-order.test.ts
+++ b/app-expo/tests/features/conversation/realtime-session-sync-order.test.ts
@@ -0,0 +1,253 @@
+import { QueryClient } from '@tanstack/react-query';
+
+import { RealtimeSession } from '@/features/conversation/realtime-session';
+import { conversationKeys } from '@/features/conversation/query-keys';
+import type { MessageItem } from '@/features/conversation/types';
+
+jest.mock('@/core/auth/token-manager', () => ({
+  tokenManager: {
+    getAccessToken: jest.fn().mockResolvedValue('test-token'),
+  },
+}));
+
+jest.mock('@/core/config', () => ({
+  config: {
+    wsBaseUrl: 'ws://localhost:8000/',
+    ws: {
+      reconnectMaxRetries: 3,
+      reconnectBaseDelayMs: 10,
+      reconnectMaxDelayMs: 100,
+      heartbeatIntervalMs: 600000,
+    },
+  },
+}));
+
+class MockWebSocket {
+  static OPEN = 1;
+  static CLOSED = 3;
+  static instances: MockWebSocket[] = []; // every socket constructed so far, in creation order
+
+  readyState = MockWebSocket.OPEN;
+  onopen: (() => void) | null = null;
+  onmessage: ((event: { data: string }) => void) | null = null;
+  onclose: (() => void) | null = null;
+  onerror: (() => void) | null = null;
+
+  constructor(public url: string) {
+    MockWebSocket.instances.push(this);
+    queueMicrotask(() => this.onopen?.()); // invokes `onopen` (if set) on a microtask, not synchronously
+  }
+
+  send(): void {} // no-op: outgoing frames are ignored
+
+  close(): void {
+    this.readyState = MockWebSocket.CLOSED;
+  }
+
+  simulateMessage(data: Record<string, unknown>): void {
+    this.onmessage?.({ data: JSON.stringify(data) }); // delivers `data` to `onmessage` as a JSON string
+  }
+}
+
+(global as Record<string, unknown>).WebSocket = MockWebSocket;
+
+function msgs(qc: QueryClient, cid: string): MessageItem[] { // cached messages for conversation `cid`
+  return qc.getQueryData<MessageItem[]>(conversationKeys.messages(cid)) ?? []; // [] when nothing is cached
+}
+
+describe('RealtimeSession sync TTS / agent ordering', () => {
+  let qc: QueryClient;
+
+  beforeEach(() => {
+    jest.clearAllMocks();
+    MockWebSocket.instances = [];
+    qc = new QueryClient();
+    qc.setQueryData(conversationKeys.messages('conv-x'), []);
+  });
+
+  afterEach(async () => {
+    await new Promise((r) => setImmediate(r));
+  });
+
+  it('defers assistant commit when agent_response arrives before tts_audio (single segment)', async () => {
+    const aid = 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa';
+    const onTts = jest.fn(() => {
+      expect(msgs(qc, 'conv-x').some((m) => m.id === aid)).toBe(true);
+    });
+    const onStream = jest.fn();
+    const session = new RealtimeSession({
+      conversationId: 'conv-x',
+      queryClient: qc,
+      onStreamingText: onStream,
+      onTtsSegment: onTts,
+    });
+
+    await session.connect();
+    await new Promise((r) => setImmediate(r));
+
+    const ws = MockWebSocket.instances[0]!;
+    expect(session.sendText('hi', { ttsThisTurn: true })).toBe(true);
+
+    ws.simulateMessage({
+      type: 'agent_response',
+      conversation_id: 'conv-x',
+      data: {
+        text: 'Hello segment',
+        index: 0,
+        total: 1,
+        assistant_message_id: aid,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    const afterAgentOnly = msgs(qc, 'conv-x').filter(
+      (m) => m.senderType === 'assistant',
+    );
+    expect(afterAgentOnly).toHaveLength(0);
+
+    ws.simulateMessage({
+      type: 'tts_audio',
+      conversation_id: 'conv-x',
+      data: {
+        audio_url: 'https://example.com/tts-a.mp3',
+        index: 0,
+        total: 1,
+        assistant_message_id: aid,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    expect(onTts).toHaveBeenCalledTimes(1);
+    const committed = msgs(qc, 'conv-x').filter(
+      (m) => m.senderType === 'assistant',
+    );
+    expect(committed).toHaveLength(1);
+    expect(committed[0]!.content).toContain('Hello segment');
+
+    session.dispose();
+  });
+
+  it('multi-segment sync clears pending UI without streaming footer text', async () => {
+    const aid = 'bb22bb22-bbbb-bbbb-bbbb-bbbbbbbbbbbb';
+    const onTts = jest.fn(() => {
+      expect(
+        msgs(qc, 'conv-x').some((m) => m.id === `${aid}_seg_0`),
+      ).toBe(true);
+    });
+    const onStream = jest.fn();
+    const session = new RealtimeSession({
+      conversationId: 'conv-x',
+      queryClient: qc,
+      onStreamingText: onStream,
+      onTtsSegment: onTts,
+    });
+
+    await session.connect();
+    await new Promise((r) => setImmediate(r));
+
+    const ws = MockWebSocket.instances[0]!;
+    session.sendText('hi', { ttsThisTurn: true });
+
+    ws.simulateMessage({
+      type: 'tts_audio',
+      conversation_id: 'conv-x',
+      data: {
+        audio_url: 'https://example.com/tts-b.mp3',
+        index: 0,
+        total: 2,
+        assistant_message_id: aid,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    ws.simulateMessage({
+      type: 'agent_response',
+      conversation_id: 'conv-x',
+      data: {
+        text: 'Part A',
+        index: 0,
+        total: 2,
+        assistant_message_id: aid,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    expect(onStream).toHaveBeenCalledWith('', true);
+    expect(onStream).not.toHaveBeenCalledWith('Part A', true);
+    expect(onTts).toHaveBeenCalled();
+    session.dispose();
+  });
+
+  it('keeps active screen TTS callback when stale offscreen attach runs later', async () => {
+    const aid = 'cc33cc33-cccc-cccc-cccc-cccccccccccc';
+    const screenOnTts = jest.fn();
+    const offscreenOnTts = jest.fn();
+    const session = new RealtimeSession({
+      conversationId: 'conv-x',
+      queryClient: qc,
+    });
+    const owner = Symbol('screen-owner');
+
+    session.attachUiCallbacks({ onTtsSegment: screenOnTts }, owner);
+    session.attachUiCallbacks({ onTtsSegment: offscreenOnTts });
+
+    await session.connect();
+    await new Promise((r) => setImmediate(r));
+
+    const ws = MockWebSocket.instances[0]!;
+    ws.simulateMessage({
+      type: 'tts_audio',
+      conversation_id: 'conv-x',
+      data: {
+        audio_base64: 'ZmFrZS1tcDM=',
+        audio_url: 'https://example.com/tts-c.mp3',
+        index: 0,
+        total: 1,
+        assistant_message_id: aid,
+        manual: true,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    expect(screenOnTts).toHaveBeenCalledTimes(1);
+    expect(offscreenOnTts).not.toHaveBeenCalled();
+    session.dispose();
+  });
+
+  it('keeps active screen TTS callback when a stale screen owner attaches later', async () => {
+    const aid = 'dd44dd44-dddd-dddd-dddd-dddddddddddd';
+    const screenOnTts = jest.fn();
+    const staleScreenOnTts = jest.fn();
+    const session = new RealtimeSession({
+      conversationId: 'conv-x',
+      queryClient: qc,
+    });
+    const activeOwner = Symbol('active-screen-owner');
+    const staleOwner = Symbol('stale-screen-owner');
+
+    session.attachUiCallbacks({ onTtsSegment: screenOnTts }, activeOwner);
+    session.attachUiCallbacks({ onTtsSegment: staleScreenOnTts }, staleOwner);
+
+    await session.connect();
+    await new Promise((r) => setImmediate(r));
+
+    const ws = MockWebSocket.instances[0]!;
+    ws.simulateMessage({
+      type: 'tts_audio',
+      conversation_id: 'conv-x',
+      data: {
+        audio_base64: 'ZmFrZS1tcDM=',
+        audio_url: 'https://example.com/tts-d.mp3',
+        index: 0,
+        total: 1,
+        assistant_message_id: aid,
+        manual: true,
+      },
+      timestamp: new Date().toISOString(),
+    });
+
+    expect(screenOnTts).toHaveBeenCalledTimes(1);
+    expect(staleScreenOnTts).not.toHaveBeenCalled();
+    session.dispose();
+  });
+});
--- a/app-expo/tests/features/voice/use-player.test.tsx
+++ b/app-expo/tests/features/voice/use-player.test.tsx
@@ -37,6 +37,7 @@ describe('usePlayer', () => {
    });
    jest.mocked(audioFocus.acquireForPlayback).mockResolvedValue(true);
    jest.mocked(audioFocus.releaseIfOwnedBy).mockResolvedValue(undefined);
+    jest.mocked(audioFocus.onOwnerChange).mockImplementation(() => jest.fn());
  });

  test('keeps the native audio session active while app-level audio focus owns teardown', () => {
@@ -127,4 +128,43 @@ describe('usePlayer', () => {
    expect(pause).not.toHaveBeenCalled();
    expect(result.current.status).toBe('idle');
  });
+
+  test('retries queued audio after acquire fails once then audio focus frees', async () => {
+    const acquire = jest.mocked(audioFocus.acquireForPlayback);
+    acquire.mockResolvedValueOnce(false).mockResolvedValue(true);
+
+    let ownerListener: ((owner: null | string) => void) | undefined;
+    jest.mocked(audioFocus.onOwnerChange).mockImplementation((cb) => {
+      ownerListener = cb as (owner: null | string) => void;
+      return jest.fn();
+    });
+
+    mockUseAudioPlayerStatus.mockReturnValue({
+      isLoaded: true,
+      playing: false,
+      currentTime: 0,
+      duration: 10,
+    });
+    const play = jest.fn();
+    mockUseAudioPlayer.mockReturnValue({ pause: jest.fn(), play });
+
+    const { result } = renderHook(() => usePlayer());
+
+    await act(async () => {
+      await result.current.enqueue({
+        uri: 'file:///queued.mp3',
+        kind: 'tts_auto',
+      });
+    });
+
+    expect(acquire).toHaveBeenCalledTimes(1);
+    expect(result.current.status).toBe('idle');
+
+    await act(async () => {
+      ownerListener?.(null);
+    });
+
+    expect(acquire).toHaveBeenCalledTimes(2);
+    expect(play).toHaveBeenCalled();
+  });
 });