fix(conversation): 修复实时会话 TTS/回复被离屏 WS 抢占
- 列表预热仅预取消息缓存,避免后台 WebSocket 覆盖服务端连接
- RealtimeSession UI 回调按 owner 独占,防止 offscreen 覆盖聊天页
- 列表页聚焦时再 prewarm,会话页 TTS 入队优先 base64
- 管线下发 TTS 同时带 audio_base64 与 audio_url;协议说明同步
- 移除 TTS 排查用前后端调试日志,保留错误/告警
- 补充 WS / RealtimeSession / entry-warmup / 播放器相关单测

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -46,6 +46,10 @@ import { Icon } from '@/components/ui/icon';
|
||||
import { Text } from '@/components/ui/text';
|
||||
import { ScreenHeader } from '@/components/screen-header';
|
||||
import { resolveApiMediaUrl } from '@/core/api/media-url';
|
||||
import {
|
||||
getTtsSpeakDefault,
|
||||
setTtsSpeakDefault,
|
||||
} from '@/core/settings/app-settings';
|
||||
import { useAppSettings } from '@/hooks/use-app-settings';
|
||||
import { useThemeColors } from '@/hooks/use-theme-colors';
|
||||
import { useTypography } from '@/core/typography-context';
|
||||
@@ -1238,15 +1242,14 @@ export default function ConversationScreen() {
|
||||
|
||||
const handleTtsSegment = useCallback(
|
||||
(p: TtsSegmentPayload) => {
|
||||
// 闸门用于丢弃「用户已打断后」迟到的自动 TTS;按需朗读 (manual) 是当前明确操作,必须放行。
|
||||
const allowByGate =
|
||||
p.manual === true || ttsGate.current.shouldAcceptIncomingTts();
|
||||
if (!allowByGate) return;
|
||||
const convId = id ?? '';
|
||||
const cosUrl = p.audioUrl?.trim();
|
||||
const isManualPlayback = !!p.manual;
|
||||
const shouldAutoPlay = !!p.autoPlay;
|
||||
/**
|
||||
* 播放走 WS,但「再读」依赖 MessageItem.ttsAudioUrls。乐观提交的消息没有 URL,
|
||||
* 服务端 attach 要等整轮结束;收到 COS URL 时写入缓存,按钮才能用。
|
||||
* COS URL 先写入缓存:**与Speak无关**,否则录音/打断后闸门关闭时会跳过 merge,
|
||||
* 按需朗读下发的 `tts_audio` 无法绑定到气泡,喇叭表现为「不可用」。
|
||||
* 闸门仅约束是否入队播放(迟到自动朗读),不禁用 URL 附着。
|
||||
*/
|
||||
if (cosUrl && convId) {
|
||||
queryClient.setQueryData<MessageItem[]>(
|
||||
@@ -1307,8 +1310,13 @@ export default function ConversationScreen() {
|
||||
);
|
||||
}
|
||||
|
||||
const gateAllowsPlayback =
|
||||
isManualPlayback || ttsGate.current.shouldAcceptIncomingTts();
|
||||
const shouldEnqueue =
|
||||
p.manual === true || lastUserMessageRequestedTtsRef.current;
|
||||
isManualPlayback ||
|
||||
shouldAutoPlay ||
|
||||
lastUserMessageRequestedTtsRef.current;
|
||||
if (!gateAllowsPlayback) return;
|
||||
if (!shouldEnqueue) return;
|
||||
|
||||
const listKey =
|
||||
@@ -1325,8 +1333,8 @@ export default function ConversationScreen() {
|
||||
...shared,
|
||||
uri: `data:audio/mp3;base64,${p.audioBase64}`,
|
||||
});
|
||||
} else if (p.audioUrl) {
|
||||
void enqueue({ ...shared, uri: p.audioUrl });
|
||||
} else if (cosUrl) {
|
||||
void enqueue({ ...shared, uri: cosUrl });
|
||||
}
|
||||
},
|
||||
[enqueue, id, queryClient],
|
||||
@@ -1399,8 +1407,9 @@ export default function ConversationScreen() {
|
||||
|
||||
const [input, setInput] = useState('');
|
||||
const [inputResetKey, setInputResetKey] = useState(0);
|
||||
/** 本条发出的用户消息是否请求助手朗读(先 TTS 再出字) */
|
||||
/** 本条发出的用户消息是否请求助手朗读(先 TTS 再出字);默认值从存储恢复 */
|
||||
const [ttsThisTurn, setTtsThisTurn] = useState(false);
|
||||
const [ttsSpeakPrefReady, setTtsSpeakPrefReady] = useState(false);
|
||||
const [inputMode, setInputMode] = useState<InputMode>('text');
|
||||
const [isKeyboardVisible, setIsKeyboardVisible] = useState(false);
|
||||
const inputModeRef = useRef<InputMode>('text');
|
||||
@@ -1489,6 +1498,24 @@ export default function ConversationScreen() {
|
||||
inputModeRef.current = inputMode;
|
||||
}, [inputMode]);
|
||||
|
||||
// On mount, restore the persisted "Speak" default into `ttsThisTurn` and mark
// the preference as ready once the read has settled (success or failure).
useEffect(() => {
|
||||
// Set by the cleanup below so a late-resolving read does not update state.
let cancelled = false;
|
||||
// Fire-and-forget async IIFE: effect callbacks themselves cannot be async.
void (async () => {
|
||||
try {
|
||||
const v = await getTtsSpeakDefault();
|
||||
if (!cancelled) {
|
||||
setTtsThisTurn(v);
|
||||
setTtsSpeakPrefReady(true);
|
||||
}
|
||||
} catch {
|
||||
// Reading the preference failed: keep the current value but still mark ready.
if (!cancelled) setTtsSpeakPrefReady(true);
|
||||
}
|
||||
})();
|
||||
return () => {
|
||||
cancelled = true;
|
||||
};
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
const onKeyboardWillShow = () => {
|
||||
if (inputModeRef.current !== 'text') return;
|
||||
@@ -1687,7 +1714,11 @@ export default function ConversationScreen() {
|
||||
<Switch
|
||||
accessibilityLabel={t('ttsThisTurnAccessibility')}
|
||||
value={ttsThisTurn}
|
||||
onValueChange={setTtsThisTurn}
|
||||
disabled={!ttsSpeakPrefReady}
|
||||
onValueChange={(v) => {
|
||||
setTtsThisTurn(v);
|
||||
void setTtsSpeakDefault(v);
|
||||
}}
|
||||
trackColor={{
|
||||
false: CHAT_COLORS.outline,
|
||||
true: CHAT_COLORS.secondaryContainer,
|
||||
|
||||
@@ -2,6 +2,7 @@ import { Image } from 'expo-image';
|
||||
import { router } from 'expo-router';
|
||||
import React, { useEffect, useRef, useState } from 'react';
|
||||
import { useQueryClient } from '@tanstack/react-query';
|
||||
import { useIsFocused } from '@react-navigation/native';
|
||||
import {
|
||||
Alert,
|
||||
AppState,
|
||||
@@ -302,6 +303,7 @@ function findTodayConversationToResume(
|
||||
export default function ConversationsScreen() {
|
||||
const { t } = useTranslation('conversation');
|
||||
const queryClient = useQueryClient();
|
||||
const isFocused = useIsFocused();
|
||||
|
||||
const { data: conversations = [], isLoading } = useConversations();
|
||||
const createConversation = useCreateConversation();
|
||||
@@ -448,12 +450,13 @@ export default function ConversationsScreen() {
|
||||
* 单槽连接池:换会话会自动 dispose 旧槽,所以这里只挑一条最像即将被点的。
|
||||
*/
|
||||
useEffect(() => {
|
||||
if (!isFocused) return;
|
||||
if (isLoading) return;
|
||||
const candidate =
|
||||
todayConversation ?? conversations.find(conversationHasAnyMessage);
|
||||
if (!candidate) return;
|
||||
prewarmConversationSession(queryClient, candidate.id);
|
||||
}, [isLoading, conversations, todayConversation, queryClient]);
|
||||
}, [isFocused, isLoading, conversations, todayConversation, queryClient]);
|
||||
|
||||
return (
|
||||
<View className="flex-1 bg-background">
|
||||
|
||||
@@ -20,7 +20,10 @@ function notify() {
|
||||
*/
|
||||
export const audioFocus = {
|
||||
async acquireForRecording(): Promise<boolean> {
|
||||
if (currentOwner === 'recorder') return true;
|
||||
if (currentOwner === 'recorder') {
|
||||
await setIsAudioActiveAsync(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (currentOwner === 'player') {
|
||||
await this.releaseIfOwnedBy('player');
|
||||
@@ -30,6 +33,7 @@ export const audioFocus = {
|
||||
playsInSilentMode: true,
|
||||
allowsRecording: true,
|
||||
});
|
||||
await setIsAudioActiveAsync(true);
|
||||
|
||||
currentOwner = 'recorder';
|
||||
notify();
|
||||
@@ -37,7 +41,10 @@ export const audioFocus = {
|
||||
},
|
||||
|
||||
async acquireForPlayback(): Promise<boolean> {
|
||||
if (currentOwner === 'player') return true;
|
||||
if (currentOwner === 'player') {
|
||||
await setIsAudioActiveAsync(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (currentOwner === 'recorder') {
|
||||
return false;
|
||||
@@ -47,6 +54,7 @@ export const audioFocus = {
|
||||
playsInSilentMode: true,
|
||||
allowsRecording: false,
|
||||
});
|
||||
await setIsAudioActiveAsync(true);
|
||||
|
||||
currentOwner = 'player';
|
||||
notify();
|
||||
|
||||
@@ -14,6 +14,7 @@ const KEY_LANGUAGE = 'app_settings_language';
|
||||
const KEY_LARGE_TEXT = 'app_settings_large_text';
|
||||
const KEY_DARK_MODE = 'app_settings_dark_mode';
|
||||
const KEY_THEME_NAME = 'app_settings_theme_name';
|
||||
const KEY_TTS_SPEAK_DEFAULT = 'app_settings_tts_speak_default';
|
||||
|
||||
const webFallback: Record<string, string> = {};
|
||||
|
||||
@@ -83,5 +84,16 @@ export async function setThemeName(value: ThemeName): Promise<void> {
|
||||
await setStored(KEY_THEME_NAME, value);
|
||||
}
|
||||
|
||||
/**
 * Whether the conversation screen's "Speak / read this turn aloud" switch
 * defaults to on (remembered across conversations).
 *
 * @returns `true` only when the stored value is exactly the string 'true';
 *   a missing or empty value yields `false`.
 */
|
||||
export async function getTtsSpeakDefault(): Promise<boolean> {
|
||||
const v = await getStored(KEY_TTS_SPEAK_DEFAULT);
|
||||
// Nothing persisted yet (null/undefined/empty string): default to off.
if (v == null || v === '') return false;
|
||||
return v === 'true';
|
||||
}
|
||||
|
||||
/**
 * Persists the "Speak" switch default under KEY_TTS_SPEAK_DEFAULT.
 *
 * @param value Stored as the string 'true' or 'false'.
 */
export async function setTtsSpeakDefault(value: boolean): Promise<void> {
|
||||
await setStored(KEY_TTS_SPEAK_DEFAULT, value ? 'true' : 'false');
|
||||
}
|
||||
|
||||
export { supportedLanguages, THEME_NAMES };
|
||||
export type { AppLanguage, ThemeName };
|
||||
|
||||
@@ -63,7 +63,8 @@ export interface AgentResponseEvent {
|
||||
export interface TtsAudioReceivedEvent {
|
||||
kind: 'tts_audio_received';
|
||||
conversationId: string;
|
||||
audioBase64: string;
|
||||
/** 兼容旧 WS payload;标准链路使用 audioUrl。 */
|
||||
audioBase64?: string;
|
||||
audioUrl?: string;
|
||||
index?: number;
|
||||
total?: number;
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import type { QueryClient } from '@tanstack/react-query';
|
||||
import { AppState, type AppStateStatus } from 'react-native';
|
||||
|
||||
import { RealtimeSession } from './realtime-session';
|
||||
import {
|
||||
RealtimeSession,
|
||||
type RealtimeSessionUiOwner,
|
||||
} from './realtime-session';
|
||||
|
||||
type Slot = { conversationId: string; session: RealtimeSession };
|
||||
|
||||
@@ -34,8 +37,11 @@ const offScreenUi = {
|
||||
};
|
||||
|
||||
/** 离屏:保持 WebSocket,去掉 UI 回调,避免列表页播 TTS 或对已卸载组件 setState */
|
||||
export function releaseConversationWsUi(session: RealtimeSession): void {
|
||||
session.attachUiCallbacks({
|
||||
export function releaseConversationWsUi(
|
||||
session: RealtimeSession,
|
||||
owner: RealtimeSessionUiOwner,
|
||||
): void {
|
||||
session.releaseUiCallbacks(owner, {
|
||||
onStreamingText: offScreenUi.onStreamingText,
|
||||
onTtsSegment: offScreenUi.onTtsSegment,
|
||||
onError: offScreenUi.onError,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import type { QueryClient } from '@tanstack/react-query';
|
||||
|
||||
import { acquireBackgroundConversationWs } from './conversation-ws-background-pool';
|
||||
import { conversationMessagesRepository } from './conversation-messages-repository';
|
||||
import { conversationKeys } from './query-keys';
|
||||
import { registerPreparedRealtimeSession } from './prepared-session-registry';
|
||||
@@ -52,17 +51,13 @@ export async function prefetchConversationMessages(
|
||||
});
|
||||
}
|
||||
|
||||
const offscreenUiCallbacks = {
|
||||
onStreamingText: () => {},
|
||||
onTtsSegment: () => {},
|
||||
onError: () => {},
|
||||
onStateChange: () => {},
|
||||
};
|
||||
|
||||
const inflightPrewarms = new Set<string>();
|
||||
|
||||
/**
|
||||
* 列表页/卡片按下时的预热:保持后台 WS 连接,并触发消息缓存填充。
|
||||
* 列表页/卡片按下时的预热:只填充消息缓存,不建立后台 WS。
|
||||
*
|
||||
* 后端当前以 conversation_id 记录 active WebSocket;离屏 WS 会覆盖聊天页连接,
|
||||
* 导致本轮 TTS/agent_response 发到 offscreen session,页面停在「回复中」。
|
||||
* 与 `warmupConversationOpening` 不同:不等待开场白、不阻塞调用方,仅适用于"已有消息"的会话。
|
||||
*/
|
||||
export function prewarmConversationSession(
|
||||
@@ -70,13 +65,6 @@ export function prewarmConversationSession(
|
||||
conversationId: string,
|
||||
): void {
|
||||
if (!conversationId) return;
|
||||
const session = acquireBackgroundConversationWs(
|
||||
conversationId,
|
||||
queryClient,
|
||||
null,
|
||||
);
|
||||
// 预热阶段没有挂载的 UI,先用空回调占位;聊天页 mount 时会重新 attach。
|
||||
session.attachUiCallbacks(offscreenUiCallbacks);
|
||||
if (inflightPrewarms.has(conversationId)) return;
|
||||
const cached = queryClient.getQueryData<MessageItem[]>(
|
||||
conversationKeys.messages(conversationId),
|
||||
|
||||
@@ -18,6 +18,7 @@ import { conversationKeys } from './query-keys';
|
||||
import { takePreparedRealtimeSession } from './prepared-session-registry';
|
||||
import {
|
||||
type ErrorCallback,
|
||||
type RealtimeSessionUiOwner,
|
||||
type StreamingTextCallback,
|
||||
type TtsSegmentPayload,
|
||||
type RealtimeSession,
|
||||
@@ -219,6 +220,9 @@ export function useRealtimeSession({
|
||||
}: UseRealtimeSessionOptions): RealtimeSessionState {
|
||||
const queryClient = useQueryClient();
|
||||
const sessionRef = useRef<RealtimeSession | null>(null);
|
||||
const uiOwnerRef = useRef<RealtimeSessionUiOwner>(
|
||||
Symbol('conversation-screen-ui'),
|
||||
);
|
||||
const uiRef = useRef({
|
||||
handleStreamingText: (() => {}) as StreamingTextCallback,
|
||||
handleError: (() => {}) as ErrorCallback,
|
||||
@@ -300,20 +304,23 @@ export function useRealtimeSession({
|
||||
prepared,
|
||||
);
|
||||
|
||||
session.attachUiCallbacks({
|
||||
onStreamingText: (text, isComplete) => {
|
||||
uiRef.current.handleStreamingText(text, isComplete);
|
||||
session.attachUiCallbacks(
|
||||
{
|
||||
onStreamingText: (text, isComplete) => {
|
||||
uiRef.current.handleStreamingText(text, isComplete);
|
||||
},
|
||||
onTtsSegment: (payload) => uiRef.current.onTtsSegment?.(payload),
|
||||
onError: (message, code) => uiRef.current.handleError(message, code),
|
||||
onStateChange: setConnectionState,
|
||||
},
|
||||
onTtsSegment: (payload) => uiRef.current.onTtsSegment?.(payload),
|
||||
onError: (message, code) => uiRef.current.handleError(message, code),
|
||||
onStateChange: setConnectionState,
|
||||
});
|
||||
uiOwnerRef.current,
|
||||
);
|
||||
|
||||
sessionRef.current = session;
|
||||
setConnectionState(session.getConnectionState());
|
||||
|
||||
return () => {
|
||||
releaseConversationWsUi(session);
|
||||
releaseConversationWsUi(session, uiOwnerRef.current);
|
||||
sessionRef.current = null;
|
||||
setConnectionState('disconnected');
|
||||
setStreamingMessage(null);
|
||||
|
||||
@@ -21,6 +21,7 @@ function looksLikeUuidAssistantMessageId(id: string): boolean {
|
||||
|
||||
export type StreamingTextCallback = (text: string, isComplete: boolean) => void;
|
||||
export type ErrorCallback = (message: string, code?: string) => void;
|
||||
export type RealtimeSessionUiOwner = symbol;
|
||||
|
||||
/** WebSocket `tts_audio`:服务端可能只带 base64、只带 COS URL,或两者都有 */
|
||||
export type TtsSegmentPayload = {
|
||||
@@ -32,6 +33,8 @@ export type TtsSegmentPayload = {
|
||||
assistantMessageId?: string;
|
||||
/** 用户点喇叭按需下发时为 true,应加入播放队列(即使未开「本轮朗读」) */
|
||||
manual?: boolean;
|
||||
/** 本段属于用户显式打开 Speak 的自动朗读轮次。 */
|
||||
autoPlay?: boolean;
|
||||
};
|
||||
|
||||
interface RealtimeSessionOptions {
|
||||
@@ -63,6 +66,7 @@ export class RealtimeSession {
|
||||
private onTtsSegment?: (payload: TtsSegmentPayload) => void;
|
||||
private onError?: ErrorCallback;
|
||||
private uiStateListener?: WsStateListener;
|
||||
private uiOwner: RealtimeSessionUiOwner | null = null;
|
||||
private unsubEvent: (() => void) | null = null;
|
||||
private unsubState: (() => void) | null = null;
|
||||
|
||||
@@ -75,6 +79,18 @@ export class RealtimeSession {
|
||||
private assistantTurnTtsSync = false;
|
||||
private pendingTtsByKey = new Map<string, TtsSegmentPayload>();
|
||||
|
||||
/** `agent_response` 早于 `tts_audio` 时延后落库,超时后露字且无播放 */
|
||||
private readonly SYNC_REVEAL_TIMEOUT_MS = 3000;
|
||||
private deferredSyncCommits = new Map<
|
||||
string,
|
||||
{
|
||||
timeoutId: ReturnType<typeof setTimeout>;
|
||||
commit: () => void;
|
||||
index: number;
|
||||
total: number;
|
||||
}
|
||||
>();
|
||||
|
||||
private static bufferedTtsKey(
|
||||
assistantMessageId: string | undefined,
|
||||
index: number,
|
||||
@@ -99,13 +115,20 @@ export class RealtimeSession {
|
||||
}
|
||||
|
||||
/** 列表预热接棒或刷新 UI 订阅时替换回调,不重建 WebSocket */
|
||||
attachUiCallbacks(options: {
|
||||
onStreamingText?: StreamingTextCallback;
|
||||
onTtsSegment?: (payload: TtsSegmentPayload) => void;
|
||||
onError?: ErrorCallback;
|
||||
onStateChange?: WsStateListener;
|
||||
}): void {
|
||||
attachUiCallbacks(
|
||||
options: {
|
||||
onStreamingText?: StreamingTextCallback;
|
||||
onTtsSegment?: (payload: TtsSegmentPayload) => void;
|
||||
onError?: ErrorCallback;
|
||||
onStateChange?: WsStateListener;
|
||||
},
|
||||
owner?: RealtimeSessionUiOwner,
|
||||
): void {
|
||||
if (this.destroyed) return;
|
||||
if (this.uiOwner && owner !== this.uiOwner) {
|
||||
return;
|
||||
}
|
||||
this.uiOwner = owner ?? null;
|
||||
if (options.onStreamingText !== undefined) {
|
||||
this.onStreamingText = options.onStreamingText;
|
||||
}
|
||||
@@ -124,6 +147,23 @@ export class RealtimeSession {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Gives up UI ownership of this session and installs replacement callbacks.
 *
 * Does nothing when the session is already destroyed or when `owner` is not
 * the owner currently recorded in `uiOwner`.
 *
 * @param owner Token identifying the caller that wants to release.
 * @param options Callbacks to install after release; passed on to
 *   `attachUiCallbacks` with no owner.
 */
releaseUiCallbacks(
|
||||
owner: RealtimeSessionUiOwner,
|
||||
options: {
|
||||
onStreamingText?: StreamingTextCallback;
|
||||
onTtsSegment?: (payload: TtsSegmentPayload) => void;
|
||||
onError?: ErrorCallback;
|
||||
onStateChange?: WsStateListener;
|
||||
},
|
||||
): void {
|
||||
if (this.destroyed) return;
|
||||
// Only the recorded owner may release; any other caller is ignored.
if (this.uiOwner !== owner) {
|
||||
return;
|
||||
}
|
||||
// Clear ownership before re-attaching so the owner-less attach call is accepted.
this.uiOwner = null;
|
||||
this.attachUiCallbacks(options);
|
||||
}
|
||||
|
||||
async connect(): Promise<void> {
|
||||
await this.client.connect();
|
||||
}
|
||||
@@ -144,6 +184,7 @@ export class RealtimeSession {
|
||||
|
||||
/** Returns true if the message was sent over the socket. */
|
||||
sendText(text: string, options?: { ttsThisTurn?: boolean }): boolean {
|
||||
this.beginNewOutboundUserTurnCleanup();
|
||||
const tts = !!options?.ttsThisTurn;
|
||||
this.assistantTurnTtsSync = tts;
|
||||
return this.client.sendText(text, { ttsThisTurn: tts });
|
||||
@@ -160,6 +201,7 @@ export class RealtimeSession {
|
||||
ttsThisTurn?: boolean;
|
||||
},
|
||||
): boolean {
|
||||
this.beginNewOutboundUserTurnCleanup();
|
||||
const tts = !!options?.ttsThisTurn;
|
||||
this.assistantTurnTtsSync = tts;
|
||||
return this.client.send({
|
||||
@@ -198,32 +240,99 @@ export class RealtimeSession {
|
||||
return this.client.sendTtsRequest(body);
|
||||
}
|
||||
|
||||
/**
 * Before a new user turn is sent over the WS: flush the buffers left over
 * from the previous sync turn, to avoid stuck placeholders / mis-ordering.
 */
|
||||
private beginNewOutboundUserTurnCleanup(): void {
|
||||
// Nothing deferred and nothing buffered: no cleanup needed.
if (this.deferredSyncCommits.size === 0 && this.pendingTtsByKey.size === 0) {
|
||||
return;
|
||||
}
|
||||
for (const def of this.deferredSyncCommits.values()) {
|
||||
// Cancel the pending timer and run the deferred commit right away instead.
clearTimeout(def.timeoutId);
|
||||
try {
|
||||
def.commit();
|
||||
} catch {
|
||||
/* best-effort */
|
||||
}
|
||||
}
|
||||
this.deferredSyncCommits.clear();
|
||||
this.pendingTtsByKey.clear();
|
||||
}
|
||||
|
||||
// ─── Internal ───
|
||||
|
||||
/**
 * Resets all sync-TTS bookkeeping: runs every deferred commit immediately
 * (cancelling its timer), then clears the deferred map, the sync flag and the
 * buffered TTS payloads.
 */
private resetAssistantTtsSyncState(): void {
|
||||
for (const def of this.deferredSyncCommits.values()) {
|
||||
clearTimeout(def.timeoutId);
|
||||
try {
|
||||
def.commit();
|
||||
} catch {
|
||||
/* When cancelling / interrupting, try not to lose the reply text */
|
||||
}
|
||||
}
|
||||
this.deferredSyncCommits.clear();
|
||||
this.assistantTurnTtsSync = false;
|
||||
this.pendingTtsByKey.clear();
|
||||
}
|
||||
|
||||
private flushBufferedTtsIfSync(
|
||||
/** sync 模式下取出缓冲的该段 TTS;调用方需先落缓存再转发,方便 UI 绑定 URL */
|
||||
private takeBufferedTtsIfSync(
|
||||
assistantMessageId: string | undefined,
|
||||
index: number,
|
||||
): void {
|
||||
if (!this.assistantTurnTtsSync) return;
|
||||
): TtsSegmentPayload | null {
|
||||
if (!this.assistantTurnTtsSync) return null;
|
||||
const key = RealtimeSession.bufferedTtsKey(assistantMessageId, index);
|
||||
const payload = this.pendingTtsByKey.get(key);
|
||||
if (payload) {
|
||||
if (!payload) return null;
|
||||
this.pendingTtsByKey.delete(key);
|
||||
return payload;
|
||||
}
|
||||
|
||||
/**
|
||||
* 正文已先于音频到达:`commit` 延至收到 `tts_audio` 或超时(无音频路径则照常露字)
|
||||
*/
|
||||
private scheduleDeferredSyncCommit(
|
||||
key: string,
|
||||
index: number,
|
||||
total: number,
|
||||
commit: () => void,
|
||||
): void {
|
||||
const timeoutId = setTimeout(() => {
|
||||
const def = this.deferredSyncCommits.get(key);
|
||||
if (!def || def.timeoutId !== timeoutId) return;
|
||||
this.deferredSyncCommits.delete(key);
|
||||
this.pendingTtsByKey.delete(key);
|
||||
this.onTtsSegment?.(payload);
|
||||
}
|
||||
def.commit();
|
||||
this.finishAssistantTurnIfLastSegment(def.index, def.total);
|
||||
}, this.SYNC_REVEAL_TIMEOUT_MS);
|
||||
this.deferredSyncCommits.set(key, { timeoutId, commit, index, total });
|
||||
}
|
||||
|
||||
/**
 * A late `tts_audio` meets a deferred commit: write the cache first, then
 * enqueue playback, so the URL can be bound to the message bubble.
 *
 * @param key Buffer key of the segment the incoming audio belongs to.
 * @param incoming The TTS payload that just arrived.
 * @returns `true` when a deferred commit existed for `key` and was resolved
 *   here; `false` when there was none.
 */
|
||||
private tryResolveDeferredSyncWithIncomingTts(
|
||||
key: string,
|
||||
incoming: TtsSegmentPayload,
|
||||
): boolean {
|
||||
const def = this.deferredSyncCommits.get(key);
|
||||
if (!def) return false;
|
||||
// The audio arrived in time: cancel the pending timer for this entry.
clearTimeout(def.timeoutId);
|
||||
this.deferredSyncCommits.delete(key);
|
||||
// Order matters: commit the message before forwarding the audio to the UI.
def.commit();
|
||||
this.onTtsSegment?.(incoming);
|
||||
this.finishAssistantTurnIfLastSegment(def.index, def.total);
|
||||
return true;
|
||||
}
|
||||
|
||||
private finishAssistantTurnIfLastSegment(index: number, total: number): void {
|
||||
if (index >= total - 1) {
|
||||
this.resetAssistantTtsSyncState();
|
||||
this.assistantTurnTtsSync = false;
|
||||
this.pendingTtsByKey.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Multi-segment sync replies do not use the footer streaming display, but the
 * "replying…" placeholder bubble still has to be cleared.
 */
|
||||
private clearAssistantPendingUi(): void {
|
||||
// Empty text with isComplete = true.
this.onStreamingText?.('', true);
|
||||
}
|
||||
|
||||
private handleEvent: WsEventListener = (event: WsEvent) => {
|
||||
if (event.kind === 'agent_response') {
|
||||
this.handleAgentChunk(event);
|
||||
@@ -244,12 +353,21 @@ export class RealtimeSession {
|
||||
assistantMessageId: event.assistantMessageId,
|
||||
manual: event.manual,
|
||||
};
|
||||
|
||||
if (this.assistantTurnTtsSync && !payload.manual) {
|
||||
const idx = event.index ?? 0;
|
||||
const key = RealtimeSession.bufferedTtsKey(
|
||||
event.assistantMessageId,
|
||||
idx,
|
||||
);
|
||||
payload.autoPlay = true;
|
||||
const resolvedDeferred = this.tryResolveDeferredSyncWithIncomingTts(
|
||||
key,
|
||||
payload,
|
||||
);
|
||||
if (resolvedDeferred) {
|
||||
return;
|
||||
}
|
||||
this.pendingTtsByKey.set(key, payload);
|
||||
} else {
|
||||
this.onTtsSegment?.(payload);
|
||||
@@ -292,11 +410,32 @@ export class RealtimeSession {
|
||||
? assistantSegmentMessageId(event.assistantMessageId, index)
|
||||
: `${this.conversationId}_agent_${Date.now()}_${index}`;
|
||||
if (sync) {
|
||||
this.flushBufferedTtsIfSync(event.assistantMessageId, index);
|
||||
const bufferedTts = this.takeBufferedTtsIfSync(
|
||||
event.assistantMessageId,
|
||||
index,
|
||||
);
|
||||
if (bufferedTts) {
|
||||
this.commitOneAssistantMessage(event.text, id);
|
||||
this.clearAssistantPendingUi();
|
||||
this.onTtsSegment?.(bufferedTts);
|
||||
this.finishAssistantTurnIfLastSegment(index, total);
|
||||
} else {
|
||||
const key = RealtimeSession.bufferedTtsKey(
|
||||
event.assistantMessageId,
|
||||
index,
|
||||
);
|
||||
const textCaptured = event.text;
|
||||
const idCaptured = id;
|
||||
this.scheduleDeferredSyncCommit(key, index, total, () => {
|
||||
this.commitOneAssistantMessage(textCaptured, idCaptured);
|
||||
this.clearAssistantPendingUi();
|
||||
});
|
||||
}
|
||||
} else {
|
||||
this.commitOneAssistantMessage(event.text, id);
|
||||
this.onStreamingText?.(event.text, true);
|
||||
this.finishAssistantTurnIfLastSegment(index, total);
|
||||
}
|
||||
this.commitOneAssistantMessage(event.text, id);
|
||||
this.onStreamingText?.(event.text, true);
|
||||
this.finishAssistantTurnIfLastSegment(index, total);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -317,18 +456,40 @@ export class RealtimeSession {
|
||||
const id =
|
||||
this.pendingAssistantMessageId ??
|
||||
`${this.conversationId}_agent_${Date.now()}`;
|
||||
let finishSyncTurnNow = false;
|
||||
if (sync) {
|
||||
this.flushBufferedTtsIfSync(assistantId ?? undefined, 0);
|
||||
this.commitStreamingBufferWithId(id);
|
||||
const visible =
|
||||
this.streamingBuffer.trim().length > 0 ? this.streamingBuffer : '…';
|
||||
this.onStreamingText?.(visible, true);
|
||||
const bufferedTts = this.takeBufferedTtsIfSync(
|
||||
assistantId ?? undefined,
|
||||
0,
|
||||
);
|
||||
if (bufferedTts) {
|
||||
this.commitStreamingBufferWithId(id);
|
||||
const visible =
|
||||
this.streamingBuffer.trim().length > 0 ? this.streamingBuffer : '…';
|
||||
this.onStreamingText?.(visible, true);
|
||||
this.onTtsSegment?.(bufferedTts);
|
||||
finishSyncTurnNow = true;
|
||||
} else {
|
||||
const snapshot = this.streamingBuffer;
|
||||
const key = RealtimeSession.bufferedTtsKey(assistantId ?? undefined, 0);
|
||||
const idCaptured = id;
|
||||
this.scheduleDeferredSyncCommit(key, 0, 1, () => {
|
||||
this.streamingBuffer = snapshot;
|
||||
this.commitStreamingBufferWithId(idCaptured);
|
||||
this.streamingBuffer = '';
|
||||
const visible = snapshot.trim().length > 0 ? snapshot : '…';
|
||||
this.onStreamingText?.(visible, true);
|
||||
});
|
||||
}
|
||||
} else {
|
||||
this.commitStreamingBufferWithId(id);
|
||||
finishSyncTurnNow = true;
|
||||
}
|
||||
this.streamingBuffer = '';
|
||||
this.pendingAssistantMessageId = null;
|
||||
this.finishAssistantTurnIfLastSegment(0, 1);
|
||||
if (!sync || finishSyncTurnNow) {
|
||||
this.finishAssistantTurnIfLastSegment(0, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -94,6 +94,12 @@ export function usePlayer(): UsePlayerResult {
|
||||
|
||||
const acquired = await audioFocus.acquireForPlayback();
|
||||
if (!acquired) {
|
||||
/**
|
||||
* 录音占用时 acquire 失败且队列尚未 shift;若用户进入会话前焦点已在
|
||||
* `recorder`,可能不会再次触发 `onOwnerChange('recorder')`,旧的
|
||||
* `wasBlockedByRecorderRef` 不会被置位,录音结束后也不会重试 playNext。
|
||||
*/
|
||||
wasBlockedByRecorderRef.current = true;
|
||||
setStatus('idle');
|
||||
return;
|
||||
}
|
||||
@@ -172,14 +178,17 @@ export function usePlayer(): UsePlayerResult {
|
||||
|
||||
if (owner === null && wasBlockedByRecorderRef.current) {
|
||||
wasBlockedByRecorderRef.current = false;
|
||||
if (queueRef.current.length > 0 && status === 'idle') {
|
||||
playNext();
|
||||
if (
|
||||
queueRef.current.length > 0 &&
|
||||
playbackActiveUriRef.current === null
|
||||
) {
|
||||
void playNext();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return unsub;
|
||||
}, [status, currentSource, playNext]);
|
||||
}, [currentSource, playNext]);
|
||||
|
||||
const enqueue = useCallback(
|
||||
async (item: PlaybackItem) => {
|
||||
|
||||
@@ -118,6 +118,45 @@ describe('WsClient', () => {
|
||||
client.dispose();
|
||||
});
|
||||
|
||||
test('maps tts audio with base64 and url playback channels', async () => {
|
||||
const client = new WsClient('conv-123');
|
||||
const events: WsEvent[] = [];
|
||||
client.onEvent((e) => events.push(e));
|
||||
|
||||
await client.connect();
|
||||
await new Promise((r) => setTimeout(r, 10));
|
||||
|
||||
const ws = (client as unknown as { ws: MockWebSocket }).ws;
|
||||
ws.simulateMessage({
|
||||
type: 'tts_audio',
|
||||
conversation_id: 'conv-123',
|
||||
data: {
|
||||
audio_base64: 'ZmFrZS1tcDM=',
|
||||
audio_url: 'https://example.com/tts.mp3',
|
||||
index: 0,
|
||||
total: 1,
|
||||
assistant_message_id: 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa',
|
||||
manual: true,
|
||||
},
|
||||
timestamp: '2026-01-01T00:00:00Z',
|
||||
});
|
||||
|
||||
expect(events).toEqual([
|
||||
{
|
||||
kind: 'tts_audio_received',
|
||||
conversationId: 'conv-123',
|
||||
audioBase64: 'ZmFrZS1tcDM=',
|
||||
audioUrl: 'https://example.com/tts.mp3',
|
||||
index: 0,
|
||||
total: 1,
|
||||
assistantMessageId: 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa',
|
||||
manual: true,
|
||||
},
|
||||
]);
|
||||
|
||||
client.dispose();
|
||||
});
|
||||
|
||||
test('sends text messages', async () => {
|
||||
const client = new WsClient('conv-123');
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { QueryClient } from '@tanstack/react-query';
|
||||
|
||||
import {
|
||||
prewarmConversationSession,
|
||||
prefetchConversationMessages,
|
||||
warmupConversationOpening,
|
||||
} from '@/features/conversation/entry-warmup';
|
||||
@@ -90,6 +91,20 @@ describe('conversation entry warmup', () => {
|
||||
).resolves.toBeUndefined();
|
||||
});
|
||||
|
||||
// Prewarming must only populate the message cache and must not create a session.
// NOTE(review): `mockSessions`, `mockLoadMessages` and `assistantMessage` are defined
// outside this hunk; presumably `mockSessions` records created sessions — confirm.
test('prewarms existing conversations without opening an offscreen websocket', async () => {
|
||||
const existing = assistantMessage();
|
||||
mockLoadMessages.mockResolvedValueOnce([existing]);
|
||||
|
||||
prewarmConversationSession(queryClient, 'conv-1');
|
||||
// prewarmConversationSession returns void; yield once so its async work can settle.
await new Promise((r) => setImmediate(r));
|
||||
|
||||
expect(mockLoadMessages).toHaveBeenCalledWith('conv-1');
|
||||
expect(mockSessions).toHaveLength(0);
|
||||
expect(
|
||||
queryClient.getQueryData(conversationKeys.messages('conv-1')),
|
||||
).toEqual([existing]);
|
||||
});
|
||||
|
||||
test('uses refreshed history and skips websocket when opening is already cached', async () => {
|
||||
const existing = assistantMessage();
|
||||
mockLoadMessages.mockResolvedValueOnce([existing]);
|
||||
|
||||
@@ -0,0 +1,253 @@
|
||||
import { QueryClient } from '@tanstack/react-query';
|
||||
|
||||
import { RealtimeSession } from '@/features/conversation/realtime-session';
|
||||
import { conversationKeys } from '@/features/conversation/query-keys';
|
||||
import type { MessageItem } from '@/features/conversation/types';
|
||||
|
||||
jest.mock('@/core/auth/token-manager', () => ({
|
||||
tokenManager: {
|
||||
getAccessToken: jest.fn().mockResolvedValue('test-token'),
|
||||
},
|
||||
}));
|
||||
|
||||
jest.mock('@/core/config', () => ({
|
||||
config: {
|
||||
wsBaseUrl: 'ws://localhost:8000/',
|
||||
ws: {
|
||||
reconnectMaxRetries: 3,
|
||||
reconnectBaseDelayMs: 10,
|
||||
reconnectMaxDelayMs: 100,
|
||||
heartbeatIntervalMs: 600000,
|
||||
},
|
||||
},
|
||||
}));
|
||||
|
||||
/**
 * Minimal WebSocket test double: records every constructed instance, reports
 * itself as open, fires `onopen` on a microtask after construction, and lets a
 * test inject incoming messages via `simulateMessage`.
 */
class MockWebSocket {
|
||||
static OPEN = 1;
|
||||
static CLOSED = 3;
|
||||
// Every instance constructed so far, in creation order.
static instances: MockWebSocket[] = [];
|
||||
|
||||
readyState = MockWebSocket.OPEN;
|
||||
onopen: (() => void) | null = null;
|
||||
onmessage: ((event: { data: string }) => void) | null = null;
|
||||
onclose: (() => void) | null = null;
|
||||
onerror: (() => void) | null = null;
|
||||
|
||||
constructor(public url: string) {
|
||||
MockWebSocket.instances.push(this);
|
||||
// Deferred so the creator can assign `onopen` before it fires.
queueMicrotask(() => this.onopen?.());
|
||||
}
|
||||
|
||||
// Outgoing frames are discarded.
send(): void {}
|
||||
|
||||
// Only flips readyState; `onclose` is not invoked.
close(): void {
|
||||
this.readyState = MockWebSocket.CLOSED;
|
||||
}
|
||||
|
||||
/** Delivers `data` to `onmessage` as a JSON-encoded message event. */
simulateMessage(data: Record<string, unknown>): void {
|
||||
this.onmessage?.({ data: JSON.stringify(data) });
|
||||
}
|
||||
}
|
||||
|
||||
(global as Record<string, unknown>).WebSocket = MockWebSocket;
|
||||
|
||||
/** Returns the cached message list for conversation `cid`, or an empty array when nothing is cached. */
function msgs(qc: QueryClient, cid: string): MessageItem[] {
|
||||
return qc.getQueryData<MessageItem[]>(conversationKeys.messages(cid)) ?? [];
|
||||
}
|
||||
|
||||
/**
 * Ordering and callback-ownership tests for `RealtimeSession` driven through
 * `MockWebSocket`.
 *
 * Covered scenarios:
 *  - `agent_response` arriving before `tts_audio` for a single-segment turn;
 *  - `tts_audio` arriving before `agent_response` for a multi-segment turn;
 *  - UI callbacks attached with an owner token not being replaced by a later
 *    attach that has no owner or a different owner.
 *
 * Conventions used by every test here:
 *  - Cache state observed *while* a TTS callback runs is recorded into an array
 *    and asserted from the test body. An `expect` that throws inside a callback
 *    invoked by the session could be caught by the caller, which would let the
 *    test pass even though the condition was false.
 *  - The session is disposed in `finally`, so a failing assertion cannot leave
 *    a live session behind for the next test.
 */
describe('RealtimeSession sync TTS / agent ordering', () => {
  let qc: QueryClient;

  beforeEach(() => {
    jest.clearAllMocks();
    MockWebSocket.instances = [];
    qc = new QueryClient();
    // Every test starts from an empty message list for the conversation.
    qc.setQueryData(conversationKeys.messages('conv-x'), []);
  });

  afterEach(async () => {
    // Let one setImmediate tick pass before the next test starts
    // (presumably to drain work the session queued; confirm against RealtimeSession).
    await new Promise((r) => setImmediate(r));
  });

  it('defers assistant commit when agent_response arrives before tts_audio (single segment)', async () => {
    const aid = 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa';
    // One entry per TTS callback invocation: was the assistant message already
    // in the cache at that moment?
    const committedWhenTtsFired: boolean[] = [];
    const onTts = jest.fn(() => {
      committedWhenTtsFired.push(msgs(qc, 'conv-x').some((m) => m.id === aid));
    });
    const onStream = jest.fn();
    const session = new RealtimeSession({
      conversationId: 'conv-x',
      queryClient: qc,
      onStreamingText: onStream,
      onTtsSegment: onTts,
    });

    try {
      await session.connect();
      await new Promise((r) => setImmediate(r));

      const ws = MockWebSocket.instances[0]!;
      expect(session.sendText('hi', { ttsThisTurn: true })).toBe(true);

      ws.simulateMessage({
        type: 'agent_response',
        conversation_id: 'conv-x',
        data: {
          text: 'Hello segment',
          index: 0,
          total: 1,
          assistant_message_id: aid,
        },
        timestamp: new Date().toISOString(),
      });

      // Text alone must not produce an assistant message yet.
      const afterAgentOnly = msgs(qc, 'conv-x').filter(
        (m) => m.senderType === 'assistant',
      );
      expect(afterAgentOnly).toHaveLength(0);

      ws.simulateMessage({
        type: 'tts_audio',
        conversation_id: 'conv-x',
        data: {
          audio_url: 'https://example.com/tts-a.mp3',
          index: 0,
          total: 1,
          assistant_message_id: aid,
        },
        timestamp: new Date().toISOString(),
      });

      expect(onTts).toHaveBeenCalledTimes(1);
      // The message had to be in the cache by the time the TTS callback ran.
      expect(committedWhenTtsFired).toEqual([true]);

      const committed = msgs(qc, 'conv-x').filter(
        (m) => m.senderType === 'assistant',
      );
      expect(committed).toHaveLength(1);
      expect(committed[0]!.content).toContain('Hello segment');
    } finally {
      session.dispose();
    }
  });

  it('multi-segment sync clears pending UI without streaming footer text', async () => {
    const aid = 'bb22bb22-bbbb-bbbb-bbbb-bbbbbbbbbbbb';
    // One entry per TTS callback invocation: was the `${aid}_seg_0` message
    // already in the cache at that moment?
    const segmentPresentWhenTtsFired: boolean[] = [];
    const onTts = jest.fn(() => {
      segmentPresentWhenTtsFired.push(
        msgs(qc, 'conv-x').some((m) => m.id === `${aid}_seg_0`),
      );
    });
    const onStream = jest.fn();
    const session = new RealtimeSession({
      conversationId: 'conv-x',
      queryClient: qc,
      onStreamingText: onStream,
      onTtsSegment: onTts,
    });

    try {
      await session.connect();
      await new Promise((r) => setImmediate(r));

      const ws = MockWebSocket.instances[0]!;
      session.sendText('hi', { ttsThisTurn: true });

      // Audio for segment 0 arrives first, its text second.
      ws.simulateMessage({
        type: 'tts_audio',
        conversation_id: 'conv-x',
        data: {
          audio_url: 'https://example.com/tts-b.mp3',
          index: 0,
          total: 2,
          assistant_message_id: aid,
        },
        timestamp: new Date().toISOString(),
      });

      ws.simulateMessage({
        type: 'agent_response',
        conversation_id: 'conv-x',
        data: {
          text: 'Part A',
          index: 0,
          total: 2,
          assistant_message_id: aid,
        },
        timestamp: new Date().toISOString(),
      });

      expect(onStream).toHaveBeenCalledWith('', true);
      expect(onStream).not.toHaveBeenCalledWith('Part A', true);
      expect(onTts).toHaveBeenCalled();
      // Every TTS callback invocation must have seen the segment message.
      expect(segmentPresentWhenTtsFired).not.toContain(false);
    } finally {
      session.dispose();
    }
  });

  it('keeps active screen TTS callback when stale offscreen attach runs later', async () => {
    const aid = 'cc33cc33-cccc-cccc-cccc-cccccccccccc';
    const screenOnTts = jest.fn();
    const offscreenOnTts = jest.fn();
    const session = new RealtimeSession({
      conversationId: 'conv-x',
      queryClient: qc,
    });
    const owner = Symbol('screen-owner');

    try {
      // Owned attach first, then an attach without any owner token.
      session.attachUiCallbacks({ onTtsSegment: screenOnTts }, owner);
      session.attachUiCallbacks({ onTtsSegment: offscreenOnTts });

      await session.connect();
      await new Promise((r) => setImmediate(r));

      const ws = MockWebSocket.instances[0]!;
      ws.simulateMessage({
        type: 'tts_audio',
        conversation_id: 'conv-x',
        data: {
          // Base64 of the placeholder string "fake-mp3".
          audio_base64: 'ZmFrZS1tcDM=',
          audio_url: 'https://example.com/tts-c.mp3',
          index: 0,
          total: 1,
          assistant_message_id: aid,
          manual: true,
        },
        timestamp: new Date().toISOString(),
      });

      expect(screenOnTts).toHaveBeenCalledTimes(1);
      expect(offscreenOnTts).not.toHaveBeenCalled();
    } finally {
      session.dispose();
    }
  });

  it('keeps active screen TTS callback when a stale screen owner attaches later', async () => {
    const aid = 'dd44dd44-dddd-dddd-dddd-dddddddddddd';
    const screenOnTts = jest.fn();
    const staleScreenOnTts = jest.fn();
    const session = new RealtimeSession({
      conversationId: 'conv-x',
      queryClient: qc,
    });
    const activeOwner = Symbol('active-screen-owner');
    const staleOwner = Symbol('stale-screen-owner');

    try {
      // Two attaches with different owner tokens; the first one must stay in effect.
      session.attachUiCallbacks({ onTtsSegment: screenOnTts }, activeOwner);
      session.attachUiCallbacks({ onTtsSegment: staleScreenOnTts }, staleOwner);

      await session.connect();
      await new Promise((r) => setImmediate(r));

      const ws = MockWebSocket.instances[0]!;
      ws.simulateMessage({
        type: 'tts_audio',
        conversation_id: 'conv-x',
        data: {
          // Base64 of the placeholder string "fake-mp3".
          audio_base64: 'ZmFrZS1tcDM=',
          audio_url: 'https://example.com/tts-d.mp3',
          index: 0,
          total: 1,
          assistant_message_id: aid,
          manual: true,
        },
        timestamp: new Date().toISOString(),
      });

      expect(screenOnTts).toHaveBeenCalledTimes(1);
      expect(staleScreenOnTts).not.toHaveBeenCalled();
    } finally {
      session.dispose();
    }
  });
});
|
||||
@@ -37,6 +37,7 @@ describe('usePlayer', () => {
|
||||
});
|
||||
jest.mocked(audioFocus.acquireForPlayback).mockResolvedValue(true);
|
||||
jest.mocked(audioFocus.releaseIfOwnedBy).mockResolvedValue(undefined);
|
||||
jest.mocked(audioFocus.onOwnerChange).mockImplementation(() => jest.fn());
|
||||
});
|
||||
|
||||
test('keeps the native audio session active while app-level audio focus owns teardown', () => {
|
||||
@@ -127,4 +128,43 @@ describe('usePlayer', () => {
|
||||
expect(pause).not.toHaveBeenCalled();
|
||||
expect(result.current.status).toBe('idle');
|
||||
});
|
||||
|
||||
test('retries queued audio after acquire fails once then audio focus frees', async () => {
  // Audio focus is refused on the first request and granted on every later one.
  const acquireForPlayback = jest.mocked(audioFocus.acquireForPlayback);
  acquireForPlayback.mockResolvedValueOnce(false);
  acquireForPlayback.mockResolvedValue(true);

  // Capture the owner-change subscriber so the test can announce that focus is free.
  let notifyOwnerChange: ((owner: null | string) => void) | undefined;
  jest.mocked(audioFocus.onOwnerChange).mockImplementation((cb) => {
    notifyOwnerChange = cb as (owner: null | string) => void;
    return jest.fn();
  });

  // Native player stub: loaded, not playing, with a spy on play().
  const playSpy = jest.fn();
  mockUseAudioPlayerStatus.mockReturnValue({
    isLoaded: true,
    playing: false,
    currentTime: 0,
    duration: 10,
  });
  mockUseAudioPlayer.mockReturnValue({ pause: jest.fn(), play: playSpy });

  const hook = renderHook(() => usePlayer());

  await act(async () => {
    await hook.result.current.enqueue({
      uri: 'file:///queued.mp3',
      kind: 'tts_auto',
    });
  });

  // The refused first attempt leaves the player idle.
  expect(acquireForPlayback).toHaveBeenCalledTimes(1);
  expect(hook.result.current.status).toBe('idle');

  // Report that nobody owns audio focus any more.
  await act(async () => {
    notifyOwnerChange?.(null);
  });

  // A second acquisition happened and playback was started.
  expect(acquireForPlayback).toHaveBeenCalledTimes(2);
  expect(playSpy).toHaveBeenCalled();
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user