fix(conversation): 修复实时会话 TTS/回复被离屏 WS 抢占

- 列表预热仅预取消息缓存,避免后台 WebSocket 覆盖服务端连接
- RealtimeSession UI 回调按 owner 独占,防止 offscreen 覆盖聊天页
- 列表页聚焦时再 prewarm,会话页 TTS 入队优先 base64
- 管线下发 TTS 同时带 audio_base64 与 audio_url;协议说明同步
- 移除 TTS 排查用前后端调试日志,保留错误/告警
- 补充 WS / RealtimeSession / entry-warmup / 播放器相关单测

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Kevin
2026-05-12 10:42:44 +08:00
parent 93be60f74c
commit 3d01085442
18 changed files with 643 additions and 261 deletions

View File

@@ -99,41 +99,10 @@ async def _send_tts_audio(
) -> str | None:
"""Synthesize TTS, upload to COS, append Redis, send TTS_AUDIO. Returns public URL or None."""
current_epoch = _tts_epoch_value(conversation_id)
# 长期保留 INFO:TTS 决策与执行链路必须在 INFO 级别全程可见
logger.info(
"pipeline._send_tts_audio entry conversation_id={} chunk_index={} chunk_total={} "
"text_len={} language={} manual={} tts_epoch_start={} current_epoch={} "
"enable_tts={} provider={}",
conversation_id,
chunk_index,
chunk_total,
len(text or ""),
language,
manual,
tts_epoch_start,
current_epoch,
settings.enable_tts,
settings.tts_provider,
)
# enable_tts:仅禁用「助手回复自动生成 TTS」(want_tts 路径);用户点喇叭(manual=True)仍可合成。
if not manual and not settings.enable_tts:
logger.info(
"pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
"url_set=False audio_bytes_len=0 reason=enable_tts_false",
conversation_id,
chunk_index,
)
return None
if current_epoch != tts_epoch_start:
logger.info(
"pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
"url_set=False audio_bytes_len=0 reason=epoch_mismatch_pre_synth "
"tts_epoch_start={} current_epoch={}",
conversation_id,
chunk_index,
tts_epoch_start,
current_epoch,
)
return None
try:
tts = get_tts_provider()
@@ -148,50 +117,19 @@ async def _send_tts_audio(
(text or "")[:30],
settings.tts_provider,
)
logger.info(
"pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
"url_set=False audio_bytes_len=0 reason=synthesize_empty",
conversation_id,
chunk_index,
)
return None
if _tts_epoch_value(conversation_id) != tts_epoch_start:
logger.info(
"pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
"url_set=False audio_bytes_len={} reason=epoch_mismatch_post_synth",
conversation_id,
chunk_index,
len(audio_bytes),
)
return None
ext = _tts_object_ext(settings.tts_codec)
content_type = _tts_codec_to_content_type(settings.tts_codec)
storage = get_object_storage()
key = f"conversations/{conversation_id}/tts/{uuid.uuid4().hex}.{ext}"
upload_started = time.perf_counter()
logger.debug(
"pipeline._send_tts_audio uploading key={} audio_bytes_len={} content_type={}",
key,
len(audio_bytes),
content_type,
)
public_url = storage.upload(key, audio_bytes, content_type)
upload_ms = (time.perf_counter() - upload_started) * 1000
# 与 `tts_delivery.apply_presigned_tts_urls_to_messages` / 回忆录图片 presign 一致:下发可播 URL
playback_url = storage.get_url(key, expires=TTS_PRESIGNED_EXPIRES_SEC)
logger.debug(
"pipeline._send_tts_audio uploaded key={} audio_bytes_len={} upload_ms={:.2f} "
"public_url_set={} playback_url_set={}",
key,
len(audio_bytes),
upload_ms,
bool(public_url),
bool(playback_url),
)
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
payload_data: Dict[str, Any] = {
"audio_base64": audio_b64,
"format": settings.tts_codec,
"audio_base64": base64.b64encode(audio_bytes).decode("utf-8"),
"audio_url": playback_url,
"index": chunk_index,
"total": chunk_total,
@@ -200,16 +138,6 @@ async def _send_tts_audio(
payload_data["assistant_message_id"] = assistant_message_id
if manual:
payload_data["manual"] = True
logger.debug(
"pipeline._send_tts_audio sending TTS_AUDIO conversation_id={} chunk_index={} "
"chunk_total={} payload_fields={} audio_b64_len={} manual={}",
conversation_id,
chunk_index,
chunk_total,
sorted(payload_data.keys()),
len(audio_b64),
manual,
)
await manager.send_message(
conversation_id,
{
@@ -219,16 +147,6 @@ async def _send_tts_audio(
"timestamp": datetime.now(timezone.utc).isoformat(),
},
)
logger.info(
"pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=True "
"url_set={} audio_bytes_len={} upload_ms={:.2f} manual={}",
conversation_id,
chunk_index,
bool(public_url),
len(audio_bytes),
upload_ms,
manual,
)
return public_url
except Exception as e:
err_str = str(e)
@@ -239,13 +157,6 @@ async def _send_tts_audio(
)
else:
logger.error("TTS synthesize failed: {}", e)
logger.info(
"pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
"url_set=False audio_bytes_len=0 reason=exception err={}",
conversation_id,
chunk_index,
type(e).__name__,
)
return None
@@ -1035,18 +946,6 @@ async def process_user_message(
segment.id,
len(user_message or ""),
)
# 长期保留:TTS 决策入口(pipeline 层),INFO 级别可见所有控制位
logger.info(
"pipeline.process_user_message entry conversation_id={} segment_id={} "
"tts_this_turn={} force_skip_tts={} enable_tts={} provider={} user_language={}",
conversation_id,
segment.id,
tts_this_turn,
force_skip_tts,
settings.enable_tts,
settings.tts_provider,
user_language,
)
is_from_voice = bool(segment.audio_url)
voice_session_id = _voice_session_id_from_audio_url(segment.audio_url)
audio_dur = getattr(segment, "audio_duration_seconds", None)
@@ -1074,21 +973,6 @@ async def process_user_message(
skip_tts = bool(turn.skip_tts)
want_voice = bool(tts_this_turn) if tts_this_turn is not None else False
want_tts = want_voice and settings.enable_tts and not skip_tts
# 长期保留 INFO:TTS 决策最终结论;不再被 agent_summary_enabled 门控
logger.info(
"pipeline.process_user_message tts_decision conversation_id={} segment_id={} "
"tts_this_turn={} force_skip_tts={} enable_tts={} skip_tts_from_turn={} "
"want_voice={} want_tts={} response_segments={}",
conversation_id,
segment.id,
tts_this_turn,
force_skip_tts,
settings.enable_tts,
skip_tts,
want_voice,
want_tts,
len(turn.messages),
)
if agent_summary_enabled():
logger.info(
"pipeline.process_user_message duration_ms={:.2f} "