fix(conversation): 修复实时会话 TTS/回复被离屏 WS 抢占

- 列表预热仅预取消息缓存,避免后台 WebSocket 覆盖服务端连接
- RealtimeSession UI 回调按 owner 独占,防止 offscreen 覆盖聊天页
- 列表页聚焦时再 prewarm,会话页 TTS 入队优先 base64
- 管线下发 TTS 同时带 audio_base64 与 audio_url;协议说明同步
- 移除 TTS 排查用前后端调试日志,保留错误/告警
- 补充 WS / RealtimeSession / entry-warmup / 播放器相关单测

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Kevin
2026-05-12 10:42:44 +08:00
parent 93be60f74c
commit 3d01085442
18 changed files with 643 additions and 261 deletions

View File

@@ -1,6 +1,7 @@
import { QueryClient } from '@tanstack/react-query';
import {
prewarmConversationSession,
prefetchConversationMessages,
warmupConversationOpening,
} from '@/features/conversation/entry-warmup';
@@ -90,6 +91,20 @@ describe('conversation entry warmup', () => {
).resolves.toBeUndefined();
});
// Prewarming an existing conversation must load its messages into the query
// cache while leaving `mockSessions` empty.
test('prewarms existing conversations without opening an offscreen websocket', async () => {
  const conversationId = 'conv-1';
  const cachedAssistant = assistantMessage();
  mockLoadMessages.mockResolvedValueOnce([cachedAssistant]);

  // The prewarm call is not awaited here; yield one macrotask before asserting.
  prewarmConversationSession(queryClient, conversationId);
  await new Promise((resolve) => {
    setImmediate(resolve);
  });

  expect(mockLoadMessages).toHaveBeenCalledWith(conversationId);
  expect(mockSessions).toHaveLength(0);

  const cacheKey = conversationKeys.messages(conversationId);
  expect(queryClient.getQueryData(cacheKey)).toEqual([cachedAssistant]);
});
test('uses refreshed history and skips websocket when opening is already cached', async () => {
const existing = assistantMessage();
mockLoadMessages.mockResolvedValueOnce([existing]);

View File

@@ -0,0 +1,253 @@
import { QueryClient } from '@tanstack/react-query';
import { RealtimeSession } from '@/features/conversation/realtime-session';
import { conversationKeys } from '@/features/conversation/query-keys';
import type { MessageItem } from '@/features/conversation/types';
// Replace the token manager module with a stub whose getAccessToken()
// always resolves to the fixed string 'test-token'.
jest.mock('@/core/auth/token-manager', () => ({
tokenManager: {
getAccessToken: jest.fn().mockResolvedValue('test-token'),
},
}));
// Replace the app config module with fixed WebSocket settings for this file.
jest.mock('@/core/config', () => ({
config: {
wsBaseUrl: 'ws://localhost:8000/',
ws: {
reconnectMaxRetries: 3,
reconnectBaseDelayMs: 10,
reconnectMaxDelayMs: 100,
// NOTE(review): presumably large so that no heartbeat fires during a test — confirm.
heartbeatIntervalMs: 600000,
},
},
}));
/**
 * Minimal in-memory replacement for the `WebSocket` constructor.
 *
 * Every constructed socket is recorded in `MockWebSocket.instances`, starts
 * with `readyState === OPEN`, and invokes `onopen` (if one is set by then)
 * from a microtask queued in the constructor. Nothing is sent anywhere;
 * tests push inbound frames through `simulateMessage`.
 */
class MockWebSocket {
  static OPEN = 1;
  static CLOSED = 3;
  /** All sockets created so far, in construction order. */
  static instances: MockWebSocket[] = [];

  readyState = MockWebSocket.OPEN;
  onopen: (() => void) | null = null;
  onmessage: ((event: { data: string }) => void) | null = null;
  onclose: (() => void) | null = null;
  onerror: (() => void) | null = null;

  constructor(public url: string) {
    MockWebSocket.instances.push(this);
    // Deferred so the caller can assign `onopen` right after `new`.
    queueMicrotask(() => {
      this.onopen?.();
    });
  }

  /** Outbound frames are discarded. */
  send(): void {
    return;
  }

  /** Marks the socket closed; `onclose` is not invoked. */
  close(): void {
    this.readyState = MockWebSocket.CLOSED;
  }

  /**
   * Delivers `data`, serialized as JSON, to the current `onmessage` handler.
   * Does nothing (and does not serialize) when no handler is attached.
   */
  simulateMessage(data: Record<string, unknown>): void {
    if (this.onmessage == null) {
      return;
    }
    const frame = { data: JSON.stringify(data) };
    this.onmessage(frame);
  }
}
// Expose MockWebSocket as the global `WebSocket` for the rest of this file.
(global as Record<string, unknown>).WebSocket = MockWebSocket;
/**
 * Reads the cached message list for conversation `cid` from `qc`.
 *
 * @returns the cached messages, or an empty array when nothing is cached.
 */
function msgs(qc: QueryClient, cid: string): MessageItem[] {
  const cached = qc.getQueryData<MessageItem[]>(conversationKeys.messages(cid));
  return cached ?? [];
}
describe('RealtimeSession sync TTS / agent ordering', () => {
// Per-test QueryClient, recreated in beforeEach and seeded with an empty
// message list for 'conv-x'.
let qc: QueryClient;

beforeEach(() => {
  jest.clearAllMocks();
  MockWebSocket.instances = [];

  qc = new QueryClient();
  const seedKey = conversationKeys.messages('conv-x');
  qc.setQueryData(seedKey, []);
});
// Let one macrotask elapse after every test.
// NOTE(review): presumably so work queued during dispose() settles before the next test — confirm.
afterEach(async () => {
  await new Promise((resolve) => {
    setImmediate(resolve);
  });
});
// Single-segment turn where agent_response precedes tts_audio: no assistant
// message may appear in the cache until the tts_audio frame has been handled.
it('defers assistant commit when agent_response arrives before tts_audio (single segment)', async () => {
  const conversationId = 'conv-x';
  const aid = 'aa11aa11-aaaa-aaaa-aaaa-aaaaaaaaaaaa';
  const segmentMeta = { index: 0, total: 1, assistant_message_id: aid };
  const assistantMessages = (): MessageItem[] =>
    msgs(qc, conversationId).filter((m) => m.senderType === 'assistant');

  // When the TTS callback runs, the assistant message must already be cached.
  const onTts = jest.fn(() => {
    const alreadyCommitted = msgs(qc, conversationId).some((m) => m.id === aid);
    expect(alreadyCommitted).toBe(true);
  });
  const onStream = jest.fn();

  const session = new RealtimeSession({
    conversationId,
    queryClient: qc,
    onStreamingText: onStream,
    onTtsSegment: onTts,
  });
  await session.connect();
  await new Promise((resolve) => {
    setImmediate(resolve);
  });
  const ws = MockWebSocket.instances[0]!;

  const sent = session.sendText('hi', { ttsThisTurn: true });
  expect(sent).toBe(true);

  // Text arrives first: nothing committed yet.
  ws.simulateMessage({
    type: 'agent_response',
    conversation_id: conversationId,
    data: { text: 'Hello segment', ...segmentMeta },
    timestamp: new Date().toISOString(),
  });
  expect(assistantMessages()).toHaveLength(0);

  // Audio for the same segment arrives: callback fires and the message is committed.
  ws.simulateMessage({
    type: 'tts_audio',
    conversation_id: conversationId,
    data: { audio_url: 'https://example.com/tts-a.mp3', ...segmentMeta },
    timestamp: new Date().toISOString(),
  });
  expect(onTts).toHaveBeenCalledTimes(1);

  const committed = assistantMessages();
  expect(committed).toHaveLength(1);
  expect(committed[0]!.content).toContain('Hello segment');

  session.dispose();
});
// Two-segment turn where tts_audio for segment 0 precedes its agent_response:
// the streaming callback is called with ('', true) and never with the segment text.
it('multi-segment sync clears pending UI without streaming footer text', async () => {
  const conversationId = 'conv-x';
  const aid = 'bb22bb22-bbbb-bbbb-bbbb-bbbbbbbbbbbb';
  const segmentMeta = { index: 0, total: 2, assistant_message_id: aid };

  // When the TTS callback runs, the per-segment message must already be cached.
  const onTts = jest.fn(() => {
    const segmentId = `${aid}_seg_0`;
    const segmentCached = msgs(qc, conversationId).some((m) => m.id === segmentId);
    expect(segmentCached).toBe(true);
  });
  const onStream = jest.fn();

  const session = new RealtimeSession({
    conversationId,
    queryClient: qc,
    onStreamingText: onStream,
    onTtsSegment: onTts,
  });
  await session.connect();
  await new Promise((resolve) => {
    setImmediate(resolve);
  });
  const ws = MockWebSocket.instances[0]!;

  session.sendText('hi', { ttsThisTurn: true });

  // Audio first ...
  ws.simulateMessage({
    type: 'tts_audio',
    conversation_id: conversationId,
    data: { audio_url: 'https://example.com/tts-b.mp3', ...segmentMeta },
    timestamp: new Date().toISOString(),
  });
  // ... then the matching text.
  ws.simulateMessage({
    type: 'agent_response',
    conversation_id: conversationId,
    data: { text: 'Part A', ...segmentMeta },
    timestamp: new Date().toISOString(),
  });

  expect(onStream).toHaveBeenCalledWith('', true);
  expect(onStream).not.toHaveBeenCalledWith('Part A', true);
  expect(onTts).toHaveBeenCalled();

  session.dispose();
});
// Callbacks attached with an owner must survive a later attach that passes no
// owner (the "offscreen" attach named in the test title).
it('keeps active screen TTS callback when stale offscreen attach runs later', async () => {
  const conversationId = 'conv-x';
  const aid = 'cc33cc33-cccc-cccc-cccc-cccccccccccc';
  const screenOnTts = jest.fn();
  const offscreenOnTts = jest.fn();

  const session = new RealtimeSession({ conversationId, queryClient: qc });

  const owner = Symbol('screen-owner');
  session.attachUiCallbacks({ onTtsSegment: screenOnTts }, owner);
  session.attachUiCallbacks({ onTtsSegment: offscreenOnTts });

  await session.connect();
  await new Promise((resolve) => {
    setImmediate(resolve);
  });
  const ws = MockWebSocket.instances[0]!;

  const ttsFrame = {
    type: 'tts_audio',
    conversation_id: conversationId,
    data: {
      audio_base64: 'ZmFrZS1tcDM=',
      audio_url: 'https://example.com/tts-c.mp3',
      index: 0,
      total: 1,
      assistant_message_id: aid,
      manual: true,
    },
    timestamp: new Date().toISOString(),
  };
  ws.simulateMessage(ttsFrame);

  // Only the owned (screen) callback receives the segment.
  expect(screenOnTts).toHaveBeenCalledTimes(1);
  expect(offscreenOnTts).not.toHaveBeenCalled();

  session.dispose();
});
// Callbacks attached by the first owner must survive a later attach made
// under a different owner symbol.
it('keeps active screen TTS callback when a stale screen owner attaches later', async () => {
  const conversationId = 'conv-x';
  const aid = 'dd44dd44-dddd-dddd-dddd-dddddddddddd';
  const screenOnTts = jest.fn();
  const staleScreenOnTts = jest.fn();

  const session = new RealtimeSession({ conversationId, queryClient: qc });

  const activeOwner = Symbol('active-screen-owner');
  const staleOwner = Symbol('stale-screen-owner');
  session.attachUiCallbacks({ onTtsSegment: screenOnTts }, activeOwner);
  session.attachUiCallbacks({ onTtsSegment: staleScreenOnTts }, staleOwner);

  await session.connect();
  await new Promise((resolve) => {
    setImmediate(resolve);
  });
  const ws = MockWebSocket.instances[0]!;

  const ttsFrame = {
    type: 'tts_audio',
    conversation_id: conversationId,
    data: {
      audio_base64: 'ZmFrZS1tcDM=',
      audio_url: 'https://example.com/tts-d.mp3',
      index: 0,
      total: 1,
      assistant_message_id: aid,
      manual: true,
    },
    timestamp: new Date().toISOString(),
  };
  ws.simulateMessage(ttsFrame);

  // Only the first owner's callback receives the segment.
  expect(screenOnTts).toHaveBeenCalledTimes(1);
  expect(staleScreenOnTts).not.toHaveBeenCalled();

  session.dispose();
});
});

View File

@@ -37,6 +37,7 @@ describe('usePlayer', () => {
});
jest.mocked(audioFocus.acquireForPlayback).mockResolvedValue(true);
jest.mocked(audioFocus.releaseIfOwnedBy).mockResolvedValue(undefined);
jest.mocked(audioFocus.onOwnerChange).mockImplementation(() => jest.fn());
});
test('keeps the native audio session active while app-level audio focus owns teardown', () => {
@@ -127,4 +128,43 @@ describe('usePlayer', () => {
expect(pause).not.toHaveBeenCalled();
expect(result.current.status).toBe('idle');
});
// The first acquire resolves false, so the enqueued clip does not start.
// Once the captured owner-change listener is called with null, acquire is
// attempted a second time and play() is invoked.
test('retries queued audio after acquire fails once then audio focus frees', async () => {
  type OwnerListener = (owner: null | string) => void;

  const acquire = jest.mocked(audioFocus.acquireForPlayback);
  acquire.mockResolvedValueOnce(false).mockResolvedValue(true);

  // Capture the listener the hook registers so the test can fire it by hand.
  let ownerListener: OwnerListener | undefined;
  jest.mocked(audioFocus.onOwnerChange).mockImplementation((cb) => {
    ownerListener = cb as OwnerListener;
    return jest.fn();
  });

  mockUseAudioPlayerStatus.mockReturnValue({
    isLoaded: true,
    playing: false,
    currentTime: 0,
    duration: 10,
  });
  const play = jest.fn();
  mockUseAudioPlayer.mockReturnValue({ pause: jest.fn(), play });

  const { result } = renderHook(() => usePlayer());

  await act(async () => {
    await result.current.enqueue({
      uri: 'file:///queued.mp3',
      kind: 'tts_auto',
    });
  });
  expect(acquire).toHaveBeenCalledTimes(1);
  expect(result.current.status).toBe('idle');

  // Report that nobody owns audio focus any more.
  await act(async () => {
    ownerListener?.(null);
  });
  expect(acquire).toHaveBeenCalledTimes(2);
  expect(play).toHaveBeenCalled();
});
});