fix(conversation): 修复实时会话 TTS/回复被离屏 WS 抢占

- 列表预热仅预取消息缓存,避免后台 WebSocket 覆盖服务端连接
- RealtimeSession UI 回调按 owner 独占,防止 offscreen 覆盖聊天页
- 列表页聚焦时再 prewarm,会话页 TTS 入队优先 base64
- 管线下发 TTS 同时带 audio_base64 与 audio_url;协议说明同步
- 移除 TTS 排查用前后端调试日志,保留错误/告警
- 补充 WS / RealtimeSession / entry-warmup / 播放器相关单测

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Kevin
2026-05-12 10:42:44 +08:00
parent 93be60f74c
commit 3d01085442
18 changed files with 643 additions and 261 deletions

View File

@@ -46,6 +46,10 @@ import { Icon } from '@/components/ui/icon';
import { Text } from '@/components/ui/text';
import { ScreenHeader } from '@/components/screen-header';
import { resolveApiMediaUrl } from '@/core/api/media-url';
import {
getTtsSpeakDefault,
setTtsSpeakDefault,
} from '@/core/settings/app-settings';
import { useAppSettings } from '@/hooks/use-app-settings';
import { useThemeColors } from '@/hooks/use-theme-colors';
import { useTypography } from '@/core/typography-context';
@@ -1238,15 +1242,14 @@ export default function ConversationScreen() {
const handleTtsSegment = useCallback(
(p: TtsSegmentPayload) => {
// 闸门用于丢弃「用户已打断后」迟到的自动 TTS按需朗读 (manual) 是当前明确操作,必须放行。
const allowByGate =
p.manual === true || ttsGate.current.shouldAcceptIncomingTts();
if (!allowByGate) return;
const convId = id ?? '';
const cosUrl = p.audioUrl?.trim();
const isManualPlayback = !!p.manual;
const shouldAutoPlay = !!p.autoPlay;
/**
* 播放走 WS但「再读」依赖 MessageItem.ttsAudioUrls。乐观提交的消息没有 URL
* 服务端 attach 要等整轮结束;收到 COS URL 时写入缓存,按钮才能用
* COS URL 先写入缓存:**与Speak无关**,否则录音/打断后闸门关闭时会跳过 merge
* 按需朗读下发的 `tts_audio` 无法绑定到气泡,喇叭表现为「不可用」
* 闸门仅约束是否入队播放(迟到自动朗读),不禁用 URL 附着。
*/
if (cosUrl && convId) {
queryClient.setQueryData<MessageItem[]>(
@@ -1307,8 +1310,13 @@ export default function ConversationScreen() {
);
}
const gateAllowsPlayback =
isManualPlayback || ttsGate.current.shouldAcceptIncomingTts();
const shouldEnqueue =
p.manual === true || lastUserMessageRequestedTtsRef.current;
isManualPlayback ||
shouldAutoPlay ||
lastUserMessageRequestedTtsRef.current;
if (!gateAllowsPlayback) return;
if (!shouldEnqueue) return;
const listKey =
@@ -1325,8 +1333,8 @@ export default function ConversationScreen() {
...shared,
uri: `data:audio/mp3;base64,${p.audioBase64}`,
});
} else if (p.audioUrl) {
void enqueue({ ...shared, uri: p.audioUrl });
} else if (cosUrl) {
void enqueue({ ...shared, uri: cosUrl });
}
},
[enqueue, id, queryClient],
@@ -1399,8 +1407,9 @@ export default function ConversationScreen() {
const [input, setInput] = useState('');
const [inputResetKey, setInputResetKey] = useState(0);
/** 本条发出的用户消息是否请求助手朗读(先 TTS 再出字) */
/** 本条发出的用户消息是否请求助手朗读(先 TTS 再出字);默认值从存储恢复 */
const [ttsThisTurn, setTtsThisTurn] = useState(false);
const [ttsSpeakPrefReady, setTtsSpeakPrefReady] = useState(false);
const [inputMode, setInputMode] = useState<InputMode>('text');
const [isKeyboardVisible, setIsKeyboardVisible] = useState(false);
const inputModeRef = useRef<InputMode>('text');
@@ -1489,6 +1498,24 @@ export default function ConversationScreen() {
inputModeRef.current = inputMode;
}, [inputMode]);
useEffect(() => {
let cancelled = false;
void (async () => {
try {
const v = await getTtsSpeakDefault();
if (!cancelled) {
setTtsThisTurn(v);
setTtsSpeakPrefReady(true);
}
} catch {
if (!cancelled) setTtsSpeakPrefReady(true);
}
})();
return () => {
cancelled = true;
};
}, []);
useEffect(() => {
const onKeyboardWillShow = () => {
if (inputModeRef.current !== 'text') return;
@@ -1687,7 +1714,11 @@ export default function ConversationScreen() {
<Switch
accessibilityLabel={t('ttsThisTurnAccessibility')}
value={ttsThisTurn}
onValueChange={setTtsThisTurn}
disabled={!ttsSpeakPrefReady}
onValueChange={(v) => {
setTtsThisTurn(v);
void setTtsSpeakDefault(v);
}}
trackColor={{
false: CHAT_COLORS.outline,
true: CHAT_COLORS.secondaryContainer,

View File

@@ -2,6 +2,7 @@ import { Image } from 'expo-image';
import { router } from 'expo-router';
import React, { useEffect, useRef, useState } from 'react';
import { useQueryClient } from '@tanstack/react-query';
import { useIsFocused } from '@react-navigation/native';
import {
Alert,
AppState,
@@ -302,6 +303,7 @@ function findTodayConversationToResume(
export default function ConversationsScreen() {
const { t } = useTranslation('conversation');
const queryClient = useQueryClient();
const isFocused = useIsFocused();
const { data: conversations = [], isLoading } = useConversations();
const createConversation = useCreateConversation();
@@ -448,12 +450,13 @@ export default function ConversationsScreen() {
* 单槽连接池:换会话会自动 dispose 旧槽,所以这里只挑一条最像即将被点的。
*/
useEffect(() => {
if (!isFocused) return;
if (isLoading) return;
const candidate =
todayConversation ?? conversations.find(conversationHasAnyMessage);
if (!candidate) return;
prewarmConversationSession(queryClient, candidate.id);
}, [isLoading, conversations, todayConversation, queryClient]);
}, [isFocused, isLoading, conversations, todayConversation, queryClient]);
return (
<View className="flex-1 bg-background">

View File

@@ -20,7 +20,10 @@ function notify() {
*/
export const audioFocus = {
async acquireForRecording(): Promise<boolean> {
if (currentOwner === 'recorder') return true;
if (currentOwner === 'recorder') {
await setIsAudioActiveAsync(true);
return true;
}
if (currentOwner === 'player') {
await this.releaseIfOwnedBy('player');
@@ -30,6 +33,7 @@ export const audioFocus = {
playsInSilentMode: true,
allowsRecording: true,
});
await setIsAudioActiveAsync(true);
currentOwner = 'recorder';
notify();
@@ -37,7 +41,10 @@ export const audioFocus = {
},
async acquireForPlayback(): Promise<boolean> {
if (currentOwner === 'player') return true;
if (currentOwner === 'player') {
await setIsAudioActiveAsync(true);
return true;
}
if (currentOwner === 'recorder') {
return false;
@@ -47,6 +54,7 @@ export const audioFocus = {
playsInSilentMode: true,
allowsRecording: false,
});
await setIsAudioActiveAsync(true);
currentOwner = 'player';
notify();

View File

@@ -14,6 +14,7 @@ const KEY_LANGUAGE = 'app_settings_language';
const KEY_LARGE_TEXT = 'app_settings_large_text';
const KEY_DARK_MODE = 'app_settings_dark_mode';
const KEY_THEME_NAME = 'app_settings_theme_name';
const KEY_TTS_SPEAK_DEFAULT = 'app_settings_tts_speak_default';
const webFallback: Record<string, string> = {};
@@ -83,5 +84,16 @@ export async function setThemeName(value: ThemeName): Promise<void> {
await setStored(KEY_THEME_NAME, value);
}
/**
 * Reads the persisted default for the conversation screen's "Speak"
 * (read-this-turn-aloud) switch, which is remembered across conversations.
 *
 * @returns `true` only when the stored value is exactly `'true'`; a missing,
 *   empty, or any other stored value yields `false`.
 */
export async function getTtsSpeakDefault(): Promise<boolean> {
  const stored = await getStored(KEY_TTS_SPEAK_DEFAULT);
  return stored === 'true';
}
/**
 * Persists the default for the conversation screen's "Speak" switch.
 *
 * @param value Stored as the string `'true'` or `'false'`.
 */
export async function setTtsSpeakDefault(value: boolean): Promise<void> {
  const serialized = value ? 'true' : 'false';
  await setStored(KEY_TTS_SPEAK_DEFAULT, serialized);
}
export { supportedLanguages, THEME_NAMES };
export type { AppLanguage, ThemeName };

View File

@@ -63,7 +63,8 @@ export interface AgentResponseEvent {
export interface TtsAudioReceivedEvent {
kind: 'tts_audio_received';
conversationId: string;
audioBase64: string;
/** 兼容旧 WS payload标准链路使用 audioUrl。 */
audioBase64?: string;
audioUrl?: string;
index?: number;
total?: number;

View File

@@ -1,7 +1,10 @@
import type { QueryClient } from '@tanstack/react-query';
import { AppState, type AppStateStatus } from 'react-native';
import { RealtimeSession } from './realtime-session';
import {
RealtimeSession,
type RealtimeSessionUiOwner,
} from './realtime-session';
type Slot = { conversationId: string; session: RealtimeSession };
@@ -34,8 +37,11 @@ const offScreenUi = {
};
/** 离屏:保持 WebSocket,去掉 UI 回调,避免列表页播 TTS 或对已卸载组件 setState */
export function releaseConversationWsUi(session: RealtimeSession): void {
session.attachUiCallbacks({
export function releaseConversationWsUi(
session: RealtimeSession,
owner: RealtimeSessionUiOwner,
): void {
session.releaseUiCallbacks(owner, {
onStreamingText: offScreenUi.onStreamingText,
onTtsSegment: offScreenUi.onTtsSegment,
onError: offScreenUi.onError,

View File

@@ -1,6 +1,5 @@
import type { QueryClient } from '@tanstack/react-query';
import { acquireBackgroundConversationWs } from './conversation-ws-background-pool';
import { conversationMessagesRepository } from './conversation-messages-repository';
import { conversationKeys } from './query-keys';
import { registerPreparedRealtimeSession } from './prepared-session-registry';
@@ -52,17 +51,13 @@ export async function prefetchConversationMessages(
});
}
const offscreenUiCallbacks = {
onStreamingText: () => {},
onTtsSegment: () => {},
onError: () => {},
onStateChange: () => {},
};
const inflightPrewarms = new Set<string>();
/**
* 列表页/卡片按下时的预热:保持后台 WS 连接,并触发消息缓存填充
* 列表页/卡片按下时的预热:只填充消息缓存,不建立后台 WS
*
* 后端当前以 conversation_id 记录 active WebSocket离屏 WS 会覆盖聊天页连接,
* 导致本轮 TTS/agent_response 发到 offscreen session页面停在「回复中」。
* 与 `warmupConversationOpening` 不同:不等待开场白、不阻塞调用方,仅适用于"已有消息"的会话。
*/
export function prewarmConversationSession(
@@ -70,13 +65,6 @@ export function prewarmConversationSession(
conversationId: string,
): void {
if (!conversationId) return;
const session = acquireBackgroundConversationWs(
conversationId,
queryClient,
null,
);
// 预热阶段没有挂载的 UI先用空回调占位聊天页 mount 时会重新 attach。
session.attachUiCallbacks(offscreenUiCallbacks);
if (inflightPrewarms.has(conversationId)) return;
const cached = queryClient.getQueryData<MessageItem[]>(
conversationKeys.messages(conversationId),

View File

@@ -18,6 +18,7 @@ import { conversationKeys } from './query-keys';
import { takePreparedRealtimeSession } from './prepared-session-registry';
import {
type ErrorCallback,
type RealtimeSessionUiOwner,
type StreamingTextCallback,
type TtsSegmentPayload,
type RealtimeSession,
@@ -219,6 +220,9 @@ export function useRealtimeSession({
}: UseRealtimeSessionOptions): RealtimeSessionState {
const queryClient = useQueryClient();
const sessionRef = useRef<RealtimeSession | null>(null);
const uiOwnerRef = useRef<RealtimeSessionUiOwner>(
Symbol('conversation-screen-ui'),
);
const uiRef = useRef({
handleStreamingText: (() => {}) as StreamingTextCallback,
handleError: (() => {}) as ErrorCallback,
@@ -300,20 +304,23 @@ export function useRealtimeSession({
prepared,
);
session.attachUiCallbacks({
onStreamingText: (text, isComplete) => {
uiRef.current.handleStreamingText(text, isComplete);
session.attachUiCallbacks(
{
onStreamingText: (text, isComplete) => {
uiRef.current.handleStreamingText(text, isComplete);
},
onTtsSegment: (payload) => uiRef.current.onTtsSegment?.(payload),
onError: (message, code) => uiRef.current.handleError(message, code),
onStateChange: setConnectionState,
},
onTtsSegment: (payload) => uiRef.current.onTtsSegment?.(payload),
onError: (message, code) => uiRef.current.handleError(message, code),
onStateChange: setConnectionState,
});
uiOwnerRef.current,
);
sessionRef.current = session;
setConnectionState(session.getConnectionState());
return () => {
releaseConversationWsUi(session);
releaseConversationWsUi(session, uiOwnerRef.current);
sessionRef.current = null;
setConnectionState('disconnected');
setStreamingMessage(null);

View File

@@ -21,6 +21,7 @@ function looksLikeUuidAssistantMessageId(id: string): boolean {
export type StreamingTextCallback = (text: string, isComplete: boolean) => void;
export type ErrorCallback = (message: string, code?: string) => void;
export type RealtimeSessionUiOwner = symbol;
/** WebSocket `tts_audio`:服务端可能只带 base64、只带 COS URL,或两者都有 */
export type TtsSegmentPayload = {
@@ -32,6 +33,8 @@ export type TtsSegmentPayload = {
assistantMessageId?: string;
/** 用户点喇叭按需下发时为 true应加入播放队列即使未开「本轮朗读」 */
manual?: boolean;
/** 本段属于用户显式打开 Speak 的自动朗读轮次。 */
autoPlay?: boolean;
};
interface RealtimeSessionOptions {
@@ -63,6 +66,7 @@ export class RealtimeSession {
private onTtsSegment?: (payload: TtsSegmentPayload) => void;
private onError?: ErrorCallback;
private uiStateListener?: WsStateListener;
private uiOwner: RealtimeSessionUiOwner | null = null;
private unsubEvent: (() => void) | null = null;
private unsubState: (() => void) | null = null;
@@ -75,6 +79,18 @@ export class RealtimeSession {
private assistantTurnTtsSync = false;
private pendingTtsByKey = new Map<string, TtsSegmentPayload>();
/** `agent_response` 早于 `tts_audio` 时延后落库,超时后露字且无播放 */
private readonly SYNC_REVEAL_TIMEOUT_MS = 3000;
private deferredSyncCommits = new Map<
string,
{
timeoutId: ReturnType<typeof setTimeout>;
commit: () => void;
index: number;
total: number;
}
>();
private static bufferedTtsKey(
assistantMessageId: string | undefined,
index: number,
@@ -99,13 +115,20 @@ export class RealtimeSession {
}
/** 列表预热接棒或刷新 UI 订阅时替换回调,不重建 WebSocket */
attachUiCallbacks(options: {
onStreamingText?: StreamingTextCallback;
onTtsSegment?: (payload: TtsSegmentPayload) => void;
onError?: ErrorCallback;
onStateChange?: WsStateListener;
}): void {
attachUiCallbacks(
options: {
onStreamingText?: StreamingTextCallback;
onTtsSegment?: (payload: TtsSegmentPayload) => void;
onError?: ErrorCallback;
onStateChange?: WsStateListener;
},
owner?: RealtimeSessionUiOwner,
): void {
if (this.destroyed) return;
if (this.uiOwner && owner !== this.uiOwner) {
return;
}
this.uiOwner = owner ?? null;
if (options.onStreamingText !== undefined) {
this.onStreamingText = options.onStreamingText;
}
@@ -124,6 +147,23 @@ export class RealtimeSession {
}
}
/**
 * Gives up UI ownership held by `owner`, then forwards `options` to
 * `attachUiCallbacks` without an owner.
 *
 * Does nothing when the session is destroyed or when `owner` is not the
 * owner currently recorded on the session.
 */
releaseUiCallbacks(
  owner: RealtimeSessionUiOwner,
  options: {
    onStreamingText?: StreamingTextCallback;
    onTtsSegment?: (payload: TtsSegmentPayload) => void;
    onError?: ErrorCallback;
    onStateChange?: WsStateListener;
  },
): void {
  const callerOwnsUi = !this.destroyed && this.uiOwner === owner;
  if (!callerOwnsUi) return;

  // Clear ownership first so the owner-less attach below is not rejected.
  this.uiOwner = null;
  this.attachUiCallbacks(options);
}
async connect(): Promise<void> {
await this.client.connect();
}
@@ -144,6 +184,7 @@ export class RealtimeSession {
/** Returns true if the message was sent over the socket. */
sendText(text: string, options?: { ttsThisTurn?: boolean }): boolean {
this.beginNewOutboundUserTurnCleanup();
const tts = !!options?.ttsThisTurn;
this.assistantTurnTtsSync = tts;
return this.client.sendText(text, { ttsThisTurn: tts });
@@ -160,6 +201,7 @@ export class RealtimeSession {
ttsThisTurn?: boolean;
},
): boolean {
this.beginNewOutboundUserTurnCleanup();
const tts = !!options?.ttsThisTurn;
this.assistantTurnTtsSync = tts;
return this.client.send({
@@ -198,32 +240,99 @@ export class RealtimeSession {
return this.client.sendTtsRequest(body);
}
/**
 * Runs before a new user turn is sent over the WS: flushes whatever the
 * previous sync turn left behind (deferred commits and buffered TTS) so stale
 * placeholders or mismatched segments do not leak into the new turn.
 */
private beginNewOutboundUserTurnCleanup(): void {
  const hasLeftovers =
    this.deferredSyncCommits.size > 0 || this.pendingTtsByKey.size > 0;
  if (!hasLeftovers) return;

  this.deferredSyncCommits.forEach((pending) => {
    clearTimeout(pending.timeoutId);
    try {
      pending.commit();
    } catch {
      /* best-effort */
    }
  });
  this.deferredSyncCommits.clear();
  this.pendingTtsByKey.clear();
}
// ─── Internal ───
/**
 * Ends sync-TTS bookkeeping for the current assistant turn: cancels each
 * deferred commit's timer and runs the commit immediately, then switches sync
 * mode off and drops any buffered TTS payloads.
 */
private resetAssistantTtsSyncState(): void {
  for (const [, entry] of this.deferredSyncCommits) {
    clearTimeout(entry.timeoutId);
    try {
      entry.commit();
    } catch {
      /* on cancel/interrupt, try not to lose the message text */
    }
  }
  this.deferredSyncCommits.clear();
  this.assistantTurnTtsSync = false;
  this.pendingTtsByKey.clear();
}
private flushBufferedTtsIfSync(
/** sync 模式下取出缓冲的该段 TTS调用方需先落缓存再转发方便 UI 绑定 URL */
private takeBufferedTtsIfSync(
assistantMessageId: string | undefined,
index: number,
): void {
if (!this.assistantTurnTtsSync) return;
): TtsSegmentPayload | null {
if (!this.assistantTurnTtsSync) return null;
const key = RealtimeSession.bufferedTtsKey(assistantMessageId, index);
const payload = this.pendingTtsByKey.get(key);
if (payload) {
if (!payload) return null;
this.pendingTtsByKey.delete(key);
return payload;
}
/**
* 正文已先于音频到达:`commit` 延至收到 `tts_audio` 或超时(无音频路径则照常露字)
*/
private scheduleDeferredSyncCommit(
key: string,
index: number,
total: number,
commit: () => void,
): void {
const timeoutId = setTimeout(() => {
const def = this.deferredSyncCommits.get(key);
if (!def || def.timeoutId !== timeoutId) return;
this.deferredSyncCommits.delete(key);
this.pendingTtsByKey.delete(key);
this.onTtsSegment?.(payload);
}
def.commit();
this.finishAssistantTurnIfLastSegment(def.index, def.total);
}, this.SYNC_REVEAL_TIMEOUT_MS);
this.deferredSyncCommits.set(key, { timeoutId, commit, index, total });
}
/**
 * Joins a late `tts_audio` with the deferred commit stored under `key`.
 *
 * The commit runs before the payload is handed to `onTtsSegment`, so the
 * message is written to the cache first and the UI can bind the audio URL to
 * its bubble.
 *
 * @returns `true` when a deferred commit existed for `key` and was resolved;
 *   `false` when there was nothing to resolve.
 */
private tryResolveDeferredSyncWithIncomingTts(
  key: string,
  incoming: TtsSegmentPayload,
): boolean {
  const pending = this.deferredSyncCommits.get(key);
  if (pending === undefined) {
    return false;
  }

  // Stop the reveal timeout so the commit cannot run a second time.
  clearTimeout(pending.timeoutId);
  this.deferredSyncCommits.delete(key);

  pending.commit();
  if (this.onTtsSegment) {
    this.onTtsSegment(incoming);
  }
  this.finishAssistantTurnIfLastSegment(pending.index, pending.total);
  return true;
}
private finishAssistantTurnIfLastSegment(index: number, total: number): void {
if (index >= total - 1) {
this.resetAssistantTtsSyncState();
this.assistantTurnTtsSync = false;
this.pendingTtsByKey.clear();
}
}
/**
 * Multi-segment sync replies are not shown through the streaming footer, but
 * the "replying…" placeholder bubble still has to be cleared: report an empty,
 * completed streaming text to the UI callback when one is attached.
 */
private clearAssistantPendingUi(): void {
  if (this.onStreamingText == null) return;
  this.onStreamingText('', true);
}
private handleEvent: WsEventListener = (event: WsEvent) => {
if (event.kind === 'agent_response') {
this.handleAgentChunk(event);
@@ -244,12 +353,21 @@ export class RealtimeSession {
assistantMessageId: event.assistantMessageId,
manual: event.manual,
};
if (this.assistantTurnTtsSync && !payload.manual) {
const idx = event.index ?? 0;
const key = RealtimeSession.bufferedTtsKey(
event.assistantMessageId,
idx,
);
payload.autoPlay = true;
const resolvedDeferred = this.tryResolveDeferredSyncWithIncomingTts(
key,
payload,
);
if (resolvedDeferred) {
return;
}
this.pendingTtsByKey.set(key, payload);
} else {
this.onTtsSegment?.(payload);
@@ -292,11 +410,32 @@ export class RealtimeSession {
? assistantSegmentMessageId(event.assistantMessageId, index)
: `${this.conversationId}_agent_${Date.now()}_${index}`;
if (sync) {
this.flushBufferedTtsIfSync(event.assistantMessageId, index);
const bufferedTts = this.takeBufferedTtsIfSync(
event.assistantMessageId,
index,
);
if (bufferedTts) {
this.commitOneAssistantMessage(event.text, id);
this.clearAssistantPendingUi();
this.onTtsSegment?.(bufferedTts);
this.finishAssistantTurnIfLastSegment(index, total);
} else {
const key = RealtimeSession.bufferedTtsKey(
event.assistantMessageId,
index,
);
const textCaptured = event.text;
const idCaptured = id;
this.scheduleDeferredSyncCommit(key, index, total, () => {
this.commitOneAssistantMessage(textCaptured, idCaptured);
this.clearAssistantPendingUi();
});
}
} else {
this.commitOneAssistantMessage(event.text, id);
this.onStreamingText?.(event.text, true);
this.finishAssistantTurnIfLastSegment(index, total);
}
this.commitOneAssistantMessage(event.text, id);
this.onStreamingText?.(event.text, true);
this.finishAssistantTurnIfLastSegment(index, total);
return;
}
@@ -317,18 +456,40 @@ export class RealtimeSession {
const id =
this.pendingAssistantMessageId ??
`${this.conversationId}_agent_${Date.now()}`;
let finishSyncTurnNow = false;
if (sync) {
this.flushBufferedTtsIfSync(assistantId ?? undefined, 0);
this.commitStreamingBufferWithId(id);
const visible =
this.streamingBuffer.trim().length > 0 ? this.streamingBuffer : '…';
this.onStreamingText?.(visible, true);
const bufferedTts = this.takeBufferedTtsIfSync(
assistantId ?? undefined,
0,
);
if (bufferedTts) {
this.commitStreamingBufferWithId(id);
const visible =
this.streamingBuffer.trim().length > 0 ? this.streamingBuffer : '…';
this.onStreamingText?.(visible, true);
this.onTtsSegment?.(bufferedTts);
finishSyncTurnNow = true;
} else {
const snapshot = this.streamingBuffer;
const key = RealtimeSession.bufferedTtsKey(assistantId ?? undefined, 0);
const idCaptured = id;
this.scheduleDeferredSyncCommit(key, 0, 1, () => {
this.streamingBuffer = snapshot;
this.commitStreamingBufferWithId(idCaptured);
this.streamingBuffer = '';
const visible = snapshot.trim().length > 0 ? snapshot : '…';
this.onStreamingText?.(visible, true);
});
}
} else {
this.commitStreamingBufferWithId(id);
finishSyncTurnNow = true;
}
this.streamingBuffer = '';
this.pendingAssistantMessageId = null;
this.finishAssistantTurnIfLastSegment(0, 1);
if (!sync || finishSyncTurnNow) {
this.finishAssistantTurnIfLastSegment(0, 1);
}
}
}

View File

@@ -94,6 +94,12 @@ export function usePlayer(): UsePlayerResult {
const acquired = await audioFocus.acquireForPlayback();
if (!acquired) {
/**
* 录音占用时 acquire 失败且队列尚未 shift若用户进入会话前焦点已在
* `recorder`,可能不会再次触发 `onOwnerChange('recorder')`,旧的
* `wasBlockedByRecorderRef` 不会被置位,录音结束后也不会重试 playNext。
*/
wasBlockedByRecorderRef.current = true;
setStatus('idle');
return;
}
@@ -172,14 +178,17 @@ export function usePlayer(): UsePlayerResult {
if (owner === null && wasBlockedByRecorderRef.current) {
wasBlockedByRecorderRef.current = false;
if (queueRef.current.length > 0 && status === 'idle') {
playNext();
if (
queueRef.current.length > 0 &&
playbackActiveUriRef.current === null
) {
void playNext();
}
}
});
return unsub;
}, [status, currentSource, playNext]);
}, [currentSource, playNext]);
const enqueue = useCallback(
async (item: PlaybackItem) => {

View File

@@ -118,6 +118,45 @@ describe('WsClient', () => {
client.dispose();
});
// A `tts_audio` frame that carries both `audio_base64` and `audio_url` must be
// surfaced as exactly one `tts_audio_received` event with camelCase fields.
test('maps tts audio with base64 and url playback channels', async () => {
const client = new WsClient('conv-123');
const events: WsEvent[] = [];
client.onEvent((e) => events.push(e));
await client.connect();
// Short wait after connect() before reaching into the mock socket.
await new Promise((r) => setTimeout(r, 10));
// Reach into the client's private socket so a server frame can be injected.
const ws = (client as unknown as { ws: MockWebSocket }).ws;
ws.simulateMessage({
type: 'tts_audio',
conversation_id: 'conv-123',
data: {
audio_base64: 'ZmFrZS1tcDM=',
audio_url: 'https://example.com/tts.mp3',
index: 0,
total: 1,
assistant_message_id: 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa',
manual: true,
},
timestamp: '2026-01-01T00:00:00Z',
});
// Both playback channels (base64 and URL) are expected on the mapped event.
expect(events).toEqual([
{
kind: 'tts_audio_received',
conversationId: 'conv-123',
audioBase64: 'ZmFrZS1tcDM=',
audioUrl: 'https://example.com/tts.mp3',
index: 0,
total: 1,
assistantMessageId: 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa',
manual: true,
},
]);
client.dispose();
});
test('sends text messages', async () => {
const client = new WsClient('conv-123');

View File

@@ -1,6 +1,7 @@
import { QueryClient } from '@tanstack/react-query';
import {
prewarmConversationSession,
prefetchConversationMessages,
warmupConversationOpening,
} from '@/features/conversation/entry-warmup';
@@ -90,6 +91,20 @@ describe('conversation entry warmup', () => {
).resolves.toBeUndefined();
});
// Prewarm is expected to fill the message cache only.
// NOTE(review): `mockSessions` presumably records created realtime sessions,
// so an empty array means no offscreen WebSocket was opened — confirm against
// the mock setup at the top of this test file.
test('prewarms existing conversations without opening an offscreen websocket', async () => {
const existing = assistantMessage();
mockLoadMessages.mockResolvedValueOnce([existing]);
// Fire-and-forget: prewarm returns void, so the test yields one macrotask below.
prewarmConversationSession(queryClient, 'conv-1');
await new Promise((r) => setImmediate(r));
expect(mockLoadMessages).toHaveBeenCalledWith('conv-1');
expect(mockSessions).toHaveLength(0);
expect(
queryClient.getQueryData(conversationKeys.messages('conv-1')),
).toEqual([existing]);
});
test('uses refreshed history and skips websocket when opening is already cached', async () => {
const existing = assistantMessage();
mockLoadMessages.mockResolvedValueOnce([existing]);

View File

@@ -0,0 +1,253 @@
import { QueryClient } from '@tanstack/react-query';
import { RealtimeSession } from '@/features/conversation/realtime-session';
import { conversationKeys } from '@/features/conversation/query-keys';
import type { MessageItem } from '@/features/conversation/types';
jest.mock('@/core/auth/token-manager', () => ({
tokenManager: {
getAccessToken: jest.fn().mockResolvedValue('test-token'),
},
}));
jest.mock('@/core/config', () => ({
config: {
wsBaseUrl: 'ws://localhost:8000/',
ws: {
reconnectMaxRetries: 3,
reconnectBaseDelayMs: 10,
reconnectMaxDelayMs: 100,
heartbeatIntervalMs: 600000,
},
},
}));
/**
 * Minimal in-memory stand-in for the global `WebSocket`.
 *
 * Each constructed socket is recorded in `MockWebSocket.instances`, starts in
 * the OPEN state, and fires `onopen` on the next microtask. `send` is a no-op;
 * `simulateMessage` lets a test inject a server frame.
 */
class MockWebSocket {
  static OPEN = 1;
  static CLOSED = 3;
  /** Every socket created so far, in construction order. */
  static instances: MockWebSocket[] = [];

  url: string;
  readyState = MockWebSocket.OPEN;
  onopen: (() => void) | null = null;
  onmessage: ((event: { data: string }) => void) | null = null;
  onclose: (() => void) | null = null;
  onerror: (() => void) | null = null;

  constructor(url: string) {
    this.url = url;
    MockWebSocket.instances.push(this);
    // Deferred so the creator can assign `onopen` right after construction.
    queueMicrotask(() => {
      this.onopen?.();
    });
  }

  /** Outgoing frames are ignored by the mock. */
  send(): void {}

  /** Marks the socket as closed; no `onclose` callback is invoked. */
  close(): void {
    this.readyState = MockWebSocket.CLOSED;
  }

  /** Delivers `data`, JSON-encoded, to `onmessage` when a handler is set. */
  simulateMessage(data: Record<string, unknown>): void {
    if (this.onmessage == null) return;
    this.onmessage({ data: JSON.stringify(data) });
  }
}
(global as Record<string, unknown>).WebSocket = MockWebSocket;
/**
 * Returns the message list cached under `conversationKeys.messages(cid)`,
 * or an empty array when the cache holds nothing for that key.
 */
function msgs(qc: QueryClient, cid: string): MessageItem[] {
  const cached = qc.getQueryData<MessageItem[]>(conversationKeys.messages(cid));
  if (cached == null) {
    return [];
  }
  return cached;
}
// Exercises RealtimeSession against the mocked WebSocket: ordering between
// `agent_response` and `tts_audio` in sync-TTS turns, and owner-exclusive UI
// callback attachment.
describe('RealtimeSession sync TTS / agent ordering', () => {
let qc: QueryClient;
beforeEach(() => {
jest.clearAllMocks();
MockWebSocket.instances = [];
qc = new QueryClient();
// Seed an empty message list for the conversation used by every case.
qc.setQueryData(conversationKeys.messages('conv-x'), []);
});
afterEach(async () => {
// Yield one macrotask so work queued by the case can settle before the next one.
await new Promise((r) => setImmediate(r));
});
// Text arrives first: nothing is committed until the matching audio shows up.
it('defers assistant commit when agent_response arrives before tts_audio (single segment)', async () => {
const aid = 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa';
// Asserted inside the callback: the message must already be cached when TTS is delivered.
const onTts = jest.fn(() => {
expect(msgs(qc, 'conv-x').some((m) => m.id === aid)).toBe(true);
});
const onStream = jest.fn();
const session = new RealtimeSession({
conversationId: 'conv-x',
queryClient: qc,
onStreamingText: onStream,
onTtsSegment: onTts,
});
await session.connect();
await new Promise((r) => setImmediate(r));
const ws = MockWebSocket.instances[0]!;
// `ttsThisTurn: true` puts the session into sync-TTS mode for this turn.
expect(session.sendText('hi', { ttsThisTurn: true })).toBe(true);
ws.simulateMessage({
type: 'agent_response',
conversation_id: 'conv-x',
data: {
text: 'Hello segment',
index: 0,
total: 1,
assistant_message_id: aid,
},
timestamp: new Date().toISOString(),
});
// Only the text has arrived so far: no assistant message may be cached yet.
const afterAgentOnly = msgs(qc, 'conv-x').filter(
(m) => m.senderType === 'assistant',
);
expect(afterAgentOnly).toHaveLength(0);
ws.simulateMessage({
type: 'tts_audio',
conversation_id: 'conv-x',
data: {
audio_url: 'https://example.com/tts-a.mp3',
index: 0,
total: 1,
assistant_message_id: aid,
},
timestamp: new Date().toISOString(),
});
// The audio resolves the deferred commit: one TTS callback, one cached message.
expect(onTts).toHaveBeenCalledTimes(1);
const committed = msgs(qc, 'conv-x').filter(
(m) => m.senderType === 'assistant',
);
expect(committed).toHaveLength(1);
expect(committed[0]!.content).toContain('Hello segment');
session.dispose();
});
// Audio arrives first here; the segment text must not be echoed through the streaming callback.
it('multi-segment sync clears pending UI without streaming footer text', async () => {
const aid = 'bb22bb22-bbbb-bbbb-bbbb-bbbbbbbbbbbb';
// The first segment is expected to be cached under `${aid}_seg_0` before TTS is delivered.
const onTts = jest.fn(() => {
expect(
msgs(qc, 'conv-x').some((m) => m.id === `${aid}_seg_0`),
).toBe(true);
});
const onStream = jest.fn();
const session = new RealtimeSession({
conversationId: 'conv-x',
queryClient: qc,
onStreamingText: onStream,
onTtsSegment: onTts,
});
await session.connect();
await new Promise((r) => setImmediate(r));
const ws = MockWebSocket.instances[0]!;
session.sendText('hi', { ttsThisTurn: true });
ws.simulateMessage({
type: 'tts_audio',
conversation_id: 'conv-x',
data: {
audio_url: 'https://example.com/tts-b.mp3',
index: 0,
total: 2,
assistant_message_id: aid,
},
timestamp: new Date().toISOString(),
});
ws.simulateMessage({
type: 'agent_response',
conversation_id: 'conv-x',
data: {
text: 'Part A',
index: 0,
total: 2,
assistant_message_id: aid,
},
timestamp: new Date().toISOString(),
});
// The placeholder is cleared with an empty completed text, never with the segment text.
expect(onStream).toHaveBeenCalledWith('', true);
expect(onStream).not.toHaveBeenCalledWith('Part A', true);
expect(onTts).toHaveBeenCalled();
session.dispose();
});
// An owner-less attach must not replace callbacks held by an owner.
it('keeps active screen TTS callback when stale offscreen attach runs later', async () => {
const aid = 'cc33cc33-cccc-cccc-cccc-cccccccccccc';
const screenOnTts = jest.fn();
const offscreenOnTts = jest.fn();
const session = new RealtimeSession({
conversationId: 'conv-x',
queryClient: qc,
});
const owner = Symbol('screen-owner');
session.attachUiCallbacks({ onTtsSegment: screenOnTts }, owner);
// Second attach carries no owner and is expected to be ignored.
session.attachUiCallbacks({ onTtsSegment: offscreenOnTts });
await session.connect();
await new Promise((r) => setImmediate(r));
const ws = MockWebSocket.instances[0]!;
// `manual: true` segments are forwarded to the callback directly, without sync buffering.
ws.simulateMessage({
type: 'tts_audio',
conversation_id: 'conv-x',
data: {
audio_base64: 'ZmFrZS1tcDM=',
audio_url: 'https://example.com/tts-c.mp3',
index: 0,
total: 1,
assistant_message_id: aid,
manual: true,
},
timestamp: new Date().toISOString(),
});
expect(screenOnTts).toHaveBeenCalledTimes(1);
expect(offscreenOnTts).not.toHaveBeenCalled();
session.dispose();
});
// An attach from a different owner must not replace the current owner's callbacks either.
it('keeps active screen TTS callback when a stale screen owner attaches later', async () => {
const aid = 'dd44dd44-dddd-dddd-dddd-dddddddddddd';
const screenOnTts = jest.fn();
const staleScreenOnTts = jest.fn();
const session = new RealtimeSession({
conversationId: 'conv-x',
queryClient: qc,
});
const activeOwner = Symbol('active-screen-owner');
const staleOwner = Symbol('stale-screen-owner');
session.attachUiCallbacks({ onTtsSegment: screenOnTts }, activeOwner);
// Different symbol than the recorded owner: expected to be ignored.
session.attachUiCallbacks({ onTtsSegment: staleScreenOnTts }, staleOwner);
await session.connect();
await new Promise((r) => setImmediate(r));
const ws = MockWebSocket.instances[0]!;
ws.simulateMessage({
type: 'tts_audio',
conversation_id: 'conv-x',
data: {
audio_base64: 'ZmFrZS1tcDM=',
audio_url: 'https://example.com/tts-d.mp3',
index: 0,
total: 1,
assistant_message_id: aid,
manual: true,
},
timestamp: new Date().toISOString(),
});
expect(screenOnTts).toHaveBeenCalledTimes(1);
expect(staleScreenOnTts).not.toHaveBeenCalled();
session.dispose();
});
});

View File

@@ -37,6 +37,7 @@ describe('usePlayer', () => {
});
jest.mocked(audioFocus.acquireForPlayback).mockResolvedValue(true);
jest.mocked(audioFocus.releaseIfOwnedBy).mockResolvedValue(undefined);
jest.mocked(audioFocus.onOwnerChange).mockImplementation(() => jest.fn());
});
test('keeps the native audio session active while app-level audio focus owns teardown', () => {
@@ -127,4 +128,43 @@ describe('usePlayer', () => {
expect(pause).not.toHaveBeenCalled();
expect(result.current.status).toBe('idle');
});
// When the first focus acquisition fails, the queued item must be retried once
// the audio-focus owner changes to `null`.
test('retries queued audio after acquire fails once then audio focus frees', async () => {
const acquire = jest.mocked(audioFocus.acquireForPlayback);
// First acquisition is refused; every later one succeeds.
acquire.mockResolvedValueOnce(false).mockResolvedValue(true);
// Capture the hook's owner-change listener so the test can fire it by hand.
let ownerListener: ((owner: null | string) => void) | undefined;
jest.mocked(audioFocus.onOwnerChange).mockImplementation((cb) => {
ownerListener = cb as (owner: null | string) => void;
return jest.fn();
});
mockUseAudioPlayerStatus.mockReturnValue({
isLoaded: true,
playing: false,
currentTime: 0,
duration: 10,
});
const play = jest.fn();
mockUseAudioPlayer.mockReturnValue({ pause: jest.fn(), play });
const { result } = renderHook(() => usePlayer());
await act(async () => {
await result.current.enqueue({
uri: 'file:///queued.mp3',
kind: 'tts_auto',
});
});
// Refused once: a single acquire attempt was made and the player stays idle.
expect(acquire).toHaveBeenCalledTimes(1);
expect(result.current.status).toBe('idle');
// Report that focus has been released (owner becomes null).
await act(async () => {
ownerListener?.(null);
});
// The release must trigger a second acquire and actual playback.
expect(acquire).toHaveBeenCalledTimes(2);
expect(play).toHaveBeenCalled();
});
});