feat(conversation): TTS 投递与 WebSocket 管线；客户端播放门禁与会话页联动；COS 键与迁移脚本调整

2026-03-26 15:51:24 +08:00
parent c23931ec91
commit d990399112
22 changed files with 630 additions and 74 deletions
--- a/app-expo/src/app/(main)/conversation/[id].tsx
+++ b/app-expo/src/app/(main)/conversation/[id].tsx
@@ -1,6 +1,15 @@
 import { Image } from 'expo-image';
 import { useLocalSearchParams } from 'expo-router';
-import { Mic, Pause, Play, PlusCircle, Type, X } from 'lucide-react-native';
+import {
+  Mic,
+  Pause,
+  Play,
+  PlusCircle,
+  Square,
+  Type,
+  Volume2,
+  X,
+} from 'lucide-react-native';
 import React, { useCallback, useEffect, useRef, useState } from 'react';
 import type {
  LayoutChangeEvent,
@@ -23,18 +32,23 @@ import {
 import { KeyboardAvoidingView as KeyboardControllerAvoidingView } from 'react-native-keyboard-controller';
 import { useSafeAreaInsets } from 'react-native-safe-area-context';
 import { useTranslation } from 'react-i18next';
+import { useQueryClient } from '@tanstack/react-query';

 import { Icon } from '@/components/ui/icon';
 import { Text } from '@/components/ui/text';
 import { ScreenHeader } from '@/components/screen-header';
 import { useThemeColors } from '@/hooks/use-theme-colors';
 import { useMessages, useRealtimeSession } from '@/features/conversation/hooks';
+import type { TtsSegmentPayload } from '@/features/conversation/realtime-session';
+import { conversationKeys } from '@/features/conversation/query-keys';
 import {
  splitMessageParts,
  splitStreamingSegments,
 } from '@/features/conversation/message-split';
 import type { MessageItem } from '@/features/conversation/types';
 import { isVoiceMessage } from '@/features/conversation/types';
+import type { PlaybackItem } from '@/features/voice/types';
+import { createTtsPlaybackGate } from '@/features/voice/tts-playback-gate';
 import { usePlayer } from '@/features/voice/hooks/use-player';
 import { useRecorder } from '@/features/voice/hooks/use-recorder';

@@ -65,6 +79,14 @@ const USER_AVATAR =

 type InputMode = 'text' | 'voice';

+/** Shared `PlaybackItem.messageRef.listKey` that ties the streaming assistant area to auto TTS playback, so tapping that area can stop read-aloud. */
+const TTS_STREAMING_LIST_KEY = '__tts_streaming__';
+
+/** After a message is split into parts, only the first part shows the read-aloud control (all parts share the message's `ttsAudioUrls`); true when `listKey` is the message id itself or its `_part_0` key. */
+function isFirstAssistantTextPart(listKey: string, messageId: string): boolean {
+  return listKey === messageId || listKey === `${messageId}_part_0`;
+}
+
 /** 展平消息列表：assistant 消息按 [SPLIT] 边界拆成多条，每条一个 listKey */
 function flattenMessagesForList(
  messages: MessageItem[],
@@ -96,23 +118,119 @@ function flattenMessagesForList(

 function MessageBubble({
  item,
+  listKey,
  agentName,
  meLabel,
  currentPlaybackUri,
+  currentPlaybackItem,
  playbackIsPlaying,
  onPlayVoiceExclusive,
  onPausePlayback,
+  onInterruptAssistantTts,
+  onReplayAssistantTts,
 }: {
  item: MessageItem;
+  listKey: string;
  agentName: string;
  meLabel: string;
  currentPlaybackUri: string | null;
+  currentPlaybackItem: PlaybackItem | null;
  playbackIsPlaying: boolean;
  onPlayVoiceExclusive: (uri: string) => void;
  onPausePlayback: () => void;
+  onInterruptAssistantTts: () => void;
+  onReplayAssistantTts: (messageId: string, urls: string[]) => void;
 }) {
+  const { t } = useTranslation('conversation');
  const isUser = item.senderType === 'user';
  const isVoice = isVoiceMessage(item);
+  const ttsUrls =
+    Array.isArray(item.ttsAudioUrls) && item.ttsAudioUrls.length > 0
+      ? item.ttsAudioUrls.filter(
+          (u): u is string => typeof u === 'string' && u.trim().length > 0,
+        )
+      : [];
+
+  const isAssistantTextFirstPart =
+    !isUser && !isVoice && isFirstAssistantTextPart(listKey, item.id);
+
+  const isThisBubbleTtsTarget =
+    !isUser &&
+    !isVoice &&
+    playbackIsPlaying &&
+    currentPlaybackItem?.kind !== 'voice' &&
+    currentPlaybackItem?.messageRef?.listKey === item.id;
+
+  const isAssistantTtsHighlight = isThisBubbleTtsTarget;
+
+  const assistantTextBubbleBody = (
+    <View
+      style={[
+        styles.bubble,
+        styles.bubbleAgent,
+        isAssistantTtsHighlight && styles.bubbleAgentTtsActive,
+      ]}
+    >
+      <Text selectable style={[styles.bubbleText, styles.bubbleTextAgent]}>
+        {item.content}
+      </Text>
+      {isAssistantTtsHighlight ? (
+        <Text style={styles.readingAloudCaption}>{t('readingAloud')}</Text>
+      ) : null}
+      {isAssistantTextFirstPart ? (
+        <View style={styles.readAloudRow}>
+          {isThisBubbleTtsTarget ? (
+            <View
+              style={styles.readAloudButtonInner}
+              accessibilityElementsHidden
+              importantForAccessibility="no-hide-descendants"
+            >
+              <Icon as={Square} size={16} color={CHAT_COLORS.primary} />
+              <Text style={styles.readAloudButtonLabel}>
+                {t('stopReadingAloud')}
+              </Text>
+            </View>
+          ) : (
+            <Pressable
+              onPress={() => {
+                if (ttsUrls.length) {
+                  onReplayAssistantTts(item.id, ttsUrls);
+                }
+              }}
+              disabled={!ttsUrls.length}
+              style={({ pressed }) => [
+                styles.readAloudButton,
+                !ttsUrls.length && styles.readAloudButtonDisabled,
+                pressed && ttsUrls.length && { opacity: 0.85 },
+              ]}
+              accessibilityRole="button"
+              accessibilityLabel={
+                ttsUrls.length ? t('readAloudAgain') : t('cannotReadAloud')
+              }
+            >
+              <Icon
+                as={Volume2}
+                size={16}
+                color={
+                  ttsUrls.length
+                    ? CHAT_COLORS.primary
+                    : CHAT_COLORS.onSurfaceVariant
+                }
+              />
+              <Text
+                style={[
+                  styles.readAloudButtonLabel,
+                  !ttsUrls.length && styles.readAloudButtonLabelDisabled,
+                ]}
+              >
+                {ttsUrls.length ? t('readAloudAgain') : t('cannotReadAloud')}
+              </Text>
+            </Pressable>
+          )}
+        </View>
+      ) : null}
+    </View>
+  );

  return (
    <View style={[styles.messageRow, isUser && styles.messageRowReverse]}>
@@ -157,23 +275,23 @@ function MessageBubble({
              }}
            />
          </View>
-        ) : (
-          <View
-            style={[
-              styles.bubble,
-              isUser ? styles.bubbleUser : styles.bubbleAgent,
-            ]}
-          >
-            <Text
-              selectable
-              style={[
-                styles.bubbleText,
-                isUser ? styles.bubbleTextUser : styles.bubbleTextAgent,
-              ]}
-            >
+        ) : isUser ? (
+          <View style={[styles.bubble, styles.bubbleUser]}>
+            <Text selectable style={[styles.bubbleText, styles.bubbleTextUser]}>
              {item.content}
            </Text>
          </View>
+        ) : isThisBubbleTtsTarget ? (
+          <Pressable
+            onPress={onInterruptAssistantTts}
+            style={({ pressed }) => [pressed && { opacity: 0.92 }]}
+            accessibilityRole="button"
+            accessibilityLabel={t('stopReadingAloud')}
+          >
+            {assistantTextBubbleBody}
+          </Pressable>
+        ) : (
+          assistantTextBubbleBody
        )}
      </View>
    </View>
@@ -184,11 +302,16 @@ function StreamingBubbles({
  streamingText,
  isComplete,
  agentName,
+  streamingTtsActive,
+  onStreamingPress,
 }: {
  streamingText: string;
  isComplete: boolean;
  agentName: string;
+  streamingTtsActive?: boolean;
+  onStreamingPress?: () => void;
 }) {
+  const { t } = useTranslation('conversation');
  const segments = splitStreamingSegments(streamingText);
  const completedParts =
    segments.length > 1
@@ -197,8 +320,8 @@ function StreamingBubbles({
  const streamingPart =
    segments.length > 0 ? segments[segments.length - 1]! : streamingText;

-  return (
-    <View>
+  const inner = (
+    <>
      {completedParts.map((part, i) => (
        <View
          key={`streaming_complete_${i}`}
@@ -214,7 +337,13 @@ function StreamingBubbles({
            />
          </View>
          <View style={[styles.bubbleColumn]}>
-            <View style={[styles.bubble, styles.bubbleAgent]}>
+            <View
+              style={[
+                styles.bubble,
+                styles.bubbleAgent,
+                streamingTtsActive && styles.bubbleAgentTtsActive,
+              ]}
+            >
              <Text
                selectable
                style={[styles.bubbleText, styles.bubbleTextAgent]}
@@ -236,7 +365,13 @@ function StreamingBubbles({
          />
        </View>
        <View style={[styles.bubbleColumn]}>
-          <View style={[styles.bubble, styles.bubbleAgent]}>
+          <View
+            style={[
+              styles.bubble,
+              styles.bubbleAgent,
+              streamingTtsActive && styles.bubbleAgentTtsActive,
+            ]}
+          >
            <Text
              selectable
              style={[styles.bubbleText, styles.bubbleTextAgent]}
@@ -247,8 +382,27 @@ function StreamingBubbles({
          </View>
        </View>
      </View>
-    </View>
+      {streamingTtsActive ? (
+        <View style={styles.streamingTtsCaptionRow}>
+          <Text style={styles.readingAloudCaption}>{t('readingAloud')}</Text>
+        </View>
+      ) : null}
+    </>
  );
+
+  if (streamingTtsActive && onStreamingPress) {
+    return (
+      <Pressable
+        onPress={onStreamingPress}
+        accessibilityRole="button"
+        accessibilityLabel={t('stopReadingAloud')}
+      >
+        {inner}
+      </Pressable>
+    );
+  }
+
+  return <View>{inner}</View>;
 }

 function formatRecordingDuration(seconds: number): string {
@@ -606,35 +760,95 @@ function ChatInputBar({

 export default function ConversationScreen() {
  const { id } = useLocalSearchParams<{ id: string }>();
+  const queryClient = useQueryClient();
  const insets = useSafeAreaInsets();
  const { t } = useTranslation('conversation');
  const { t: tApp } = useTranslation('app');
  const { data: messages } = useMessages(id);
+  const ttsGate = useRef(createTtsPlaybackGate());
  const {
    enqueue,
    enqueueExclusive,
    stop,
    status: playerStatus,
    currentSource,
+    currentPlaybackItem,
  } = usePlayer();

+  const handleTtsPlaybackResume = useCallback(() => {
+    ttsGate.current.onUserMessageSent();
+  }, []);
+
  const handleTtsSegment = useCallback(
-    (p: { audioBase64?: string; audioUrl?: string }) => {
+    (p: TtsSegmentPayload) => {
+      if (!ttsGate.current.shouldAcceptIncomingTts()) return;
+      const convId = id ?? '';
+      const cosUrl = p.audioUrl?.trim();
+      /**
+       * 播放走 WS，但「再读」依赖 MessageItem.ttsAudioUrls。乐观提交的消息没有 URL，
+       * 服务端 attach 要等整轮结束；收到 COS URL 时写入缓存，按钮才能用。
+       */
+      if (cosUrl && convId) {
+        queryClient.setQueryData<MessageItem[]>(
+          conversationKeys.messages(convId),
+          (old) => {
+            if (!old?.length) return old;
+            let idx = -1;
+            if (p.assistantMessageId) {
+              idx = old.findIndex((m) => m.id === p.assistantMessageId);
+            }
+            if (idx < 0) {
+              for (let i = old.length - 1; i >= 0; i--) {
+                const row = old[i]!;
+                if (row.senderType === 'assistant' && !isVoiceMessage(row)) {
+                  idx = i;
+                  break;
+                }
+              }
+            }
+            if (idx < 0) return old;
+            const target = old[idx]!;
+            const prevUrls = target.ttsAudioUrls ?? [];
+            if (prevUrls.includes(cosUrl)) return old;
+            const nextUrls = [...prevUrls, cosUrl];
+            const nextId =
+              p.assistantMessageId &&
+              (target.id.startsWith(`${convId}_agent_`) ||
+                target.id.startsWith('pending'))
+                ? p.assistantMessageId
+                : target.id;
+            const next = [...old];
+            next[idx] = {
+              ...target,
+              id: nextId,
+              ttsAudioUrls: nextUrls,
+            };
+            return next;
+          },
+        );
+      }
+
+      const listKey = p.assistantMessageId ?? TTS_STREAMING_LIST_KEY;
+      const shared = {
+        kind: 'tts_auto' as const,
+        label: 'TTS',
+        messageRef: { listKey },
+      };
      if (p.audioBase64) {
        void enqueue({
+          ...shared,
          uri: `data:audio/mp3;base64,${p.audioBase64}`,
-          label: 'TTS',
        });
      } else if (p.audioUrl) {
-        void enqueue({ uri: p.audioUrl, label: 'TTS' });
+        void enqueue({ ...shared, uri: p.audioUrl });
      }
    },
-    [enqueue],
+    [enqueue, id, queryClient],
  );

  const handlePlayVoiceExclusive = useCallback(
    (uri: string) => {
-      void enqueueExclusive({ uri, label: 'voice' });
+      void enqueueExclusive({ uri, label: 'voice', kind: 'voice' });
    },
    [enqueueExclusive],
  );
@@ -643,12 +857,42 @@ export default function ConversationScreen() {
    void stop();
  }, [stop]);

-  const { connectionState, streamingMessage, sendText, sendVoiceMessage } =
-    useRealtimeSession({
-      conversationId: id ?? '',
-      enabled: !!id,
-      onTtsSegment: handleTtsSegment,
-    });
+  const handleReplayAssistantTts = useCallback(
+    (messageId: string, urls: string[]) => {
+      if (!urls.length) return;
+      void (async () => {
+        await stop();
+        for (const uri of urls) {
+          await enqueue({
+            uri,
+            kind: 'tts_repeat',
+            label: 'TTS',
+            messageRef: { listKey: messageId },
+          });
+        }
+      })();
+    },
+    [enqueue, stop],
+  );
+
+  const {
+    connectionState,
+    streamingMessage,
+    sendText,
+    sendVoiceMessage,
+    sendTtsCancel,
+  } = useRealtimeSession({
+    conversationId: id ?? '',
+    enabled: !!id,
+    onTtsSegment: handleTtsSegment,
+    onTtsPlaybackResume: handleTtsPlaybackResume,
+  });
+
+  const handleInterruptAssistantTts = useCallback(() => {
+    sendTtsCancel();
+    ttsGate.current.interrupt();
+    void stop();
+  }, [sendTtsCancel, stop]);

  const handleRecordingComplete = useCallback(
    (uri: string, durationMs: number) => {
@@ -697,8 +941,12 @@ export default function ConversationScreen() {
    const ok = await startRecording();
    if (!ok) {
      Alert.alert(t('recordingPermissionDenied'));
+      return;
    }
-  }, [startRecording, t]);
+    sendTtsCancel();
+    ttsGate.current.interrupt();
+    void stop();
+  }, [sendTtsCancel, startRecording, stop, t]);

  const scrollListToEndAfterComposerLayout = useCallback(() => {
    InteractionManager.runAfterInteractions(() => {
@@ -872,12 +1120,16 @@ export default function ConversationScreen() {
        renderItem={({ item }) => (
          <MessageBubble
            item={item}
+            listKey={item.listKey}
            agentName={t('agentName')}
            meLabel={t('me')}
            currentPlaybackUri={currentSource}
+            currentPlaybackItem={currentPlaybackItem}
            playbackIsPlaying={playerStatus === 'playing'}
            onPlayVoiceExclusive={handlePlayVoiceExclusive}
            onPausePlayback={handlePausePlayback}
+            onInterruptAssistantTts={handleInterruptAssistantTts}
+            onReplayAssistantTts={handleReplayAssistantTts}
          />
        )}
        onContentSizeChange={() =>
@@ -891,6 +1143,12 @@ export default function ConversationScreen() {
              streamingText={streamingMessage.text}
              isComplete={streamingMessage.isComplete}
              agentName={t('agentName')}
+              streamingTtsActive={
+                !!streamingMessage &&
+                playerStatus === 'playing' &&
+                currentPlaybackItem?.kind === 'tts_auto'
+              }
+              onStreamingPress={handleInterruptAssistantTts}
            />
          ) : null
        }
@@ -1075,6 +1333,53 @@ const styles = StyleSheet.create({
    borderBottomRightRadius: 12,
    borderBottomLeftRadius: 4,
  },
+  bubbleAgentTtsActive: {
+    borderWidth: 1.5,
+    borderColor: 'rgba(129, 119, 166, 0.5)',
+    backgroundColor: 'rgba(231, 222, 255, 0.45)',
+  },
+  readingAloudCaption: {
+    fontSize: 12,
+    lineHeight: 16,
+    marginTop: 6,
+    color: CHAT_COLORS.primary,
+    fontWeight: '500',
+  },
+  streamingTtsCaptionRow: {
+    paddingLeft: 50,
+    marginTop: 4,
+    marginBottom: 8,
+  },
+  readAloudRow: {
+    marginTop: 8,
+    paddingTop: 8,
+    borderTopWidth: StyleSheet.hairlineWidth,
+    borderTopColor: 'rgba(0, 0, 0, 0.08)',
+  },
+  readAloudButton: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    gap: 6,
+    alignSelf: 'flex-start',
+  },
+  readAloudButtonInner: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    gap: 6,
+    alignSelf: 'flex-start',
+  },
+  readAloudButtonLabel: {
+    fontSize: 14,
+    fontWeight: '600',
+    color: CHAT_COLORS.primary,
+  },
+  readAloudButtonLabelDisabled: {
+    color: CHAT_COLORS.onSurfaceVariant,
+    fontWeight: '500',
+  },
+  readAloudButtonDisabled: {
+    opacity: 0.72,
+  },
  bubbleUser: {
    backgroundColor: CHAT_COLORS.primary,
    borderTopLeftRadius: 12,
--- a/app-expo/src/core/ws/client.ts
+++ b/app-expo/src/core/ws/client.ts
@@ -49,6 +49,7 @@ function mapServerMessage(raw: RawServerMessage): WsEvent | null {
        audioUrl: d.audio_url as string | undefined,
        index: d.index as number | undefined,
        total: d.total as number | undefined,
+        assistantMessageId: d.assistant_message_id as string | undefined,
      };

    case 'end_conversation':
@@ -166,6 +167,10 @@ export class WsClient {
    return this.send({ type: 'text', data: { text } });
  }

+  sendTtsCancel(): boolean { // Sends a `tts_cancel` message with an empty `data` payload; returns the result of `send`.
+    return this.send({ type: 'tts_cancel', data: {} });
+  }
+
  sendEndConversation(): boolean {
    return this.send({ type: 'end_conversation', data: {} });
  }
--- a/app-expo/src/core/ws/types.ts
+++ b/app-expo/src/core/ws/types.ts
@@ -14,6 +14,7 @@ export type ClientMessageType =
  | 'audio_segment'
  | 'audio_message'
  | 'transcribe_only'
+  | 'tts_cancel'
  | 'end_conversation';

 export interface RawServerMessage {
@@ -63,6 +64,8 @@ export interface TtsAudioReceivedEvent {
  audioUrl?: string;
  index?: number;
  total?: number;
+  /** 持久化后的助手消息 id（与 REST `messages` 中 `id` 对齐） */
+  assistantMessageId?: string;
 }

 export interface ConversationEndedEvent {
--- a/app-expo/src/features/conversation/hooks.ts
+++ b/app-expo/src/features/conversation/hooks.ts
@@ -117,6 +117,8 @@ interface UseRealtimeSessionOptions {
  conversationId: string;
  enabled?: boolean;
  onTtsSegment?: (payload: TtsSegmentPayload) => void;
+  /** 用户发出下一条文本/语音成功后调用，用于恢复接受 TTS 片段（打断后丢弃迟到片段） */
+  onTtsPlaybackResume?: () => void;
 }

 const MIN_RECORDING_DURATION_SEC = 1;
@@ -136,12 +138,14 @@ interface RealtimeSessionState {
  sendText: (text: string) => void;
  sendVoiceMessage: (uri: string, durationMs: number) => Promise<boolean>;
  sendEndConversation: () => void;
+  sendTtsCancel: () => void;
 }

 export function useRealtimeSession({
  conversationId,
  enabled = true,
  onTtsSegment,
+  onTtsPlaybackResume,
 }: UseRealtimeSessionOptions): RealtimeSessionState {
  const queryClient = useQueryClient();
  const sessionRef = useRef<RealtimeSession | null>(null);
@@ -207,6 +211,8 @@ export function useRealtimeSession({
        return;
      }

+      onTtsPlaybackResume?.();
+
      const localId = `pending_${Date.now()}`;

      queryClient.setQueryData<MessageItem[]>(
@@ -224,7 +230,7 @@ export function useRealtimeSession({
        },
      );
    },
-    [conversationId, queryClient],
+    [conversationId, queryClient, onTtsPlaybackResume],
  );

  const sendVoiceMessage = useCallback(
@@ -276,19 +282,24 @@ export function useRealtimeSession({
            return [...(old ?? []), msg];
          },
        );
+        onTtsPlaybackResume?.();
        return true;
      } catch {
        setError('语音文件读取失败');
        return false;
      }
    },
-    [conversationId, queryClient],
+    [conversationId, queryClient, onTtsPlaybackResume],
  );

  const sendEndConversation = useCallback(() => {
    sessionRef.current?.sendEndConversation();
  }, []);

+  const sendTtsCancel = useCallback(() => {
+    sessionRef.current?.sendTtsCancel();
+  }, []);
+
  return {
    connectionState,
    streamingMessage,
@@ -296,5 +307,6 @@ export function useRealtimeSession({
    sendText,
    sendVoiceMessage,
    sendEndConversation,
+    sendTtsCancel,
  };
 }
--- a/app-expo/src/features/conversation/realtime-session.ts
+++ b/app-expo/src/features/conversation/realtime-session.ts
@@ -19,6 +19,10 @@ export type ErrorCallback = (message: string, code?: string) => void;
 export type TtsSegmentPayload = {
   audioBase64?: string;
   audioUrl?: string;
+  index?: number;
+  total?: number;
+  /** Id of the assistant message once the server has persisted it; used to line up with the bubble listKey / message id. */
+  assistantMessageId?: string;
 };

 interface RealtimeSessionOptions {
@@ -116,6 +120,11 @@ export class RealtimeSession {
    return this.client.sendEndConversation();
  }

+  /** Tells the server to stop synthesizing and sending further TTS for the current turn (used together with stopping the client playback queue); forwards to the WS client and returns its result. */
+  sendTtsCancel(): boolean {
+    return this.client.sendTtsCancel();
+  }
+
  getConnectionState(): WsConnectionState {
    return this.client.getState();
  }
@@ -135,6 +144,9 @@ export class RealtimeSession {
        this.onTtsSegment?.({
          audioBase64: b64 || undefined,
          audioUrl: url || undefined,
+          index: event.index,
+          total: event.total,
+          assistantMessageId: event.assistantMessageId,
        });
      }
      return;
--- a/app-expo/src/features/conversation/types.ts
+++ b/app-expo/src/features/conversation/types.ts
@@ -66,6 +66,8 @@ export interface MessageItem {
  durationSeconds?: number;
  /** 语音文件本地 URI，用于回放，仅本地乐观语音条有值 */
  audioUri?: string;
+  /** 助手 TTS 已上传的 COS URL 列表（与后端 `ttsAudioUrls` 一致），用于不重合成重复朗读 */
+  ttsAudioUrls?: string[];
 }

 export interface OrganizeResponse {
--- a/app-expo/src/features/voice/hooks/use-player.ts
+++ b/app-expo/src/features/voice/hooks/use-player.ts
@@ -1,5 +1,5 @@
 import { useAudioPlayer, useAudioPlayerStatus } from 'expo-audio';
-import { useCallback, useEffect, useRef, useState } from 'react';
+import { useCallback, useEffect, useMemo, useRef, useState } from 'react';

 import { audioFocus } from '@/core/audio/audio-focus';

@@ -10,6 +10,8 @@ interface UsePlayerResult {
  queueLength: number;
  /** Current playback source URI (file, https, or data URL). */
  currentSource: string | null;
+  /** 当前正在播放的队列项（含 kind / messageRef），队列为空或未开始为 null */
+  currentPlaybackItem: PlaybackItem | null;
  enqueue: (item: PlaybackItem) => void;
  /** Replace queue and play this item (e.g. user voice bubble vs other sources). */
  enqueueExclusive: (item: PlaybackItem) => Promise<void>;
@@ -29,6 +31,8 @@ export function usePlayer(): UsePlayerResult {
  const [status, setStatus] = useState<PlayerStatus>('idle');
  const [queueLength, setQueueLength] = useState(0);
  const [currentSource, setCurrentSource] = useState<string | null>(null);
+  const [currentPlaybackItem, setCurrentPlaybackItem] =
+    useState<PlaybackItem | null>(null);
  const isPlayingRef = useRef(false);
  const wasBlockedByRecorderRef = useRef(false);
  const isPlayNextInProgressRef = useRef(false);
@@ -37,16 +41,30 @@ export function usePlayer(): UsePlayerResult {
  /** 当前 source 是否已进入过 playing=true，避免换源瞬间 playerStatus 仍带上一首的 duration 而误判「已播完」。 */
  const trackHasPlayedRef = useRef(false);

-  const player = useAudioPlayer(currentSource, { downloadFirst: false });
+  /** 远程 HTTPS 需先下载再解码，否则再读（仅 URL、无 base64）可能无声；本地/data URL 保持 false */
+  const playerOptions = useMemo(() => {
+    const remote =
+      typeof currentSource === 'string' &&
+      (currentSource.startsWith('https://') ||
+        currentSource.startsWith('http://'));
+    return { downloadFirst: remote };
+  }, [currentSource]);
+
+  const player = useAudioPlayer(currentSource, playerOptions);
  const playerStatus = useAudioPlayerStatus(player);

-  // Start playback when a new source is set
+  /**
+   * 必须在 `isLoaded` 之后再 `play()`。
+   * expo-audio 在 `downloadFirst: true` 时先用 null 建 player，再在内部 effect 里异步
+   * `resolveSourceWithDownload` 后 `replace()`（见 node_modules/expo-audio/build/ExpoAudio.js）。
+   * 若仅在 `currentSource` 变化时立刻 `play()`，会在 replace 完成前播放 → 远程 URL（再读）无声。
+   */
  useEffect(() => {
-    if (currentSource && player) {
-      player.play();
-      isPlayingRef.current = true;
-    }
-  }, [currentSource, player]);
+    if (!currentSource || !player) return;
+    if (!playerStatus.isLoaded) return;
+    player.play();
+    isPlayingRef.current = true;
+  }, [currentSource, player, playerStatus.isLoaded]);

  const playNext = useCallback(async () => {
    if (isPlayNextInProgressRef.current) return;
@@ -54,6 +72,7 @@ export function usePlayer(): UsePlayerResult {
    try {
      if (queueRef.current.length === 0) {
        playbackActiveUriRef.current = null;
+        setCurrentPlaybackItem(null);
        setCurrentSource(null);
        setStatus('idle');
        setQueueLength(0);
@@ -74,6 +93,7 @@ export function usePlayer(): UsePlayerResult {
      setStatus('playing');
      trackHasPlayedRef.current = false;
      playbackActiveUriRef.current = next.uri;
+      setCurrentPlaybackItem(next);
      setCurrentSource(next.uri);
    } finally {
      isPlayNextInProgressRef.current = false;
@@ -147,6 +167,7 @@ export function usePlayer(): UsePlayerResult {
        player.pause();
      }
      playbackActiveUriRef.current = null;
+      setCurrentPlaybackItem(null);
      setCurrentSource(null);
      setStatus('idle');
      await audioFocus.release();
@@ -165,6 +186,7 @@ export function usePlayer(): UsePlayerResult {
    }

    playbackActiveUriRef.current = null;
+    setCurrentPlaybackItem(null);
    setCurrentSource(null);
    setStatus('idle');
    await audioFocus.release();
@@ -174,6 +196,7 @@ export function usePlayer(): UsePlayerResult {
    status,
    queueLength,
    currentSource,
+    currentPlaybackItem,
    enqueue,
    enqueueExclusive,
    stop,
--- a/app-expo/src/features/voice/tts-playback-gate.ts
+++ b/app-expo/src/features/voice/tts-playback-gate.ts
@@ -0,0 +1,17 @@
+/**
+ * After TTS is interrupted the server may still push late `tts_audio` segments; they are dropped until the next conversation turn resumes.
+ * `interrupt` is called when recording starts or a bubble is tapped to stop; `onUserMessageSent` is called after the user's next text/voice message is sent successfully.
+ */
+export function createTtsPlaybackGate() {
+  let dropLateSegments = false; // while true, incoming TTS segments are rejected
+
+  return {
+    interrupt: () => {
+      dropLateSegments = true;
+    },
+    onUserMessageSent: () => {
+      dropLateSegments = false;
+    },
+    shouldAcceptIncomingTts: () => !dropLateSegments, // true unless `interrupt` ran more recently than `onUserMessageSent`
+  };
+}
--- a/app-expo/src/features/voice/types.ts
+++ b/app-expo/src/features/voice/types.ts
@@ -31,7 +31,12 @@ export interface SegmentOutboxEntry {

 export type PlayerStatus = 'idle' | 'loading' | 'playing' | 'paused' | 'error';

+export type PlaybackItemKind = 'tts_auto' | 'tts_repeat' | 'voice';
+
 export interface PlaybackItem {
   uri: string;
   label?: string;
+  kind?: PlaybackItemKind;
+  /** Aligned with the `listKey` produced by `flattenMessagesForList`; used to highlight the bubble being read aloud and to stop playback when that bubble is tapped. */
+  messageRef?: { listKey: string };
 }
--- a/app-expo/src/i18n/generated/resources.ts
+++ b/app-expo/src/i18n/generated/resources.ts
@@ -65,6 +65,7 @@ interface Resources {
    agentName: 'Life Echo';
    cancel: 'Cancel';
    cancelRecording: 'Cancel recording';
+    cannotReadAloud: 'Read unavailable';
    chatQueueSendTimeout: 'Connection timed out. Check your network and try again.';
    chatTitle: 'Conversation';
    chatUnavailableConnecting: 'Reconnecting now. You can keep typing and send once the connection is back.';
@@ -83,10 +84,13 @@ interface Resources {
    inputPlaceholder: 'Type a message...';
    inputPlaceholderVoice: 'Type here or hold the mic to speak...';
    me: 'Me';
+    readAloudAgain: 'Play again';
+    readingAloud: 'Reading aloud…';
    recentChats: 'Recent Chats';
    recordingPermissionDenied: 'Microphone permission is required to record';
    send: 'Send';
    startNewSubtitle: 'Capture a new memory or share your thoughts with your companion.';
+    stopReadingAloud: 'Stop reading aloud';
    switchToText: 'Switch to text input';
    switchToVoice: 'Switch to voice input';
    tapToEndRecording: 'Tap to end';
--- a/app-expo/src/i18n/locales/en/conversation.json
+++ b/app-expo/src/i18n/locales/en/conversation.json
@@ -22,6 +22,10 @@
  "inputPlaceholderVoice": "Type here or hold the mic to speak...",
  "me": "Me",
  "recentChats": "Recent Chats",
+  "stopReadingAloud": "Stop reading aloud",
+  "readAloudAgain": "Play again",
+  "cannotReadAloud": "Read unavailable",
+  "readingAloud": "Reading aloud…",
  "recordingPermissionDenied": "Microphone permission is required to record",
  "send": "Send",
  "startNewSubtitle": "Capture a new memory or share your thoughts with your companion.",
--- a/app-expo/src/i18n/locales/zh/conversation.json
+++ b/app-expo/src/i18n/locales/zh/conversation.json
@@ -22,6 +22,10 @@
  "inputPlaceholderVoice": "点击这里输入，或者按住左边说话...",
  "me": "我",
  "recentChats": "最近对话",
+  "stopReadingAloud": "停止朗读",
+  "readAloudAgain": "再读",
+  "cannotReadAloud": "暂无法朗读",
+  "readingAloud": "朗读中…",
  "recordingPermissionDenied": "需要麦克风权限才能录音",
  "send": "发送",
  "startNewSubtitle": "记录新回忆，或与岁月知己分享你的想法。",