fix(conversation): 修复实时会话 TTS/回复被离屏 WS 抢占

- 列表预热仅预取消息缓存，避免后台 WebSocket 覆盖服务端连接
- RealtimeSession UI 回调按 owner 独占，防止 offscreen 覆盖聊天页
- 列表页聚焦时再 prewarm，会话页 TTS 入队优先 base64
- 管线下发 TTS 同时带 audio_base64 与 audio_url；协议说明同步
- 移除 TTS 排查用前后端调试日志，保留错误/告警
- 补充 WS / RealtimeSession / entry-warmup / 播放器相关单测

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 10:42:44 +08:00
parent 93be60f74c
commit 3d01085442
18 changed files with 643 additions and 261 deletions
--- a/api/app/features/conversation/ws/pipeline.py
+++ b/api/app/features/conversation/ws/pipeline.py
@@ -99,41 +99,10 @@ async def _send_tts_audio(
 ) -> str | None:
    """Synthesize TTS, upload to COS, append Redis, send TTS_AUDIO. Returns public URL or None."""
    current_epoch = _tts_epoch_value(conversation_id)
-    # 长期保留 INFO：TTS 决策与执行链路必须在 INFO 级别全程可见
-    logger.info(
-        "pipeline._send_tts_audio entry conversation_id={} chunk_index={} chunk_total={} "
-        "text_len={} language={} manual={} tts_epoch_start={} current_epoch={} "
-        "enable_tts={} provider={}",
-        conversation_id,
-        chunk_index,
-        chunk_total,
-        len(text or ""),
-        language,
-        manual,
-        tts_epoch_start,
-        current_epoch,
-        settings.enable_tts,
-        settings.tts_provider,
-    )
    # enable_tts：仅禁用「助手回复自动生成 TTS」（want_tts 路径）；用户点喇叭（manual=True）仍可合成。
    if not manual and not settings.enable_tts:
-        logger.info(
-            "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
-            "url_set=False audio_bytes_len=0 reason=enable_tts_false",
-            conversation_id,
-            chunk_index,
-        )
        return None
    if current_epoch != tts_epoch_start:
-        logger.info(
-            "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
-            "url_set=False audio_bytes_len=0 reason=epoch_mismatch_pre_synth "
-            "tts_epoch_start={} current_epoch={}",
-            conversation_id,
-            chunk_index,
-            tts_epoch_start,
-            current_epoch,
-        )
        return None
    try:
        tts = get_tts_provider()
@@ -148,50 +117,19 @@ async def _send_tts_audio(
                (text or "")[:30],
                settings.tts_provider,
            )
-            logger.info(
-                "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
-                "url_set=False audio_bytes_len=0 reason=synthesize_empty",
-                conversation_id,
-                chunk_index,
-            )
            return None
        if _tts_epoch_value(conversation_id) != tts_epoch_start:
-            logger.info(
-                "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
-                "url_set=False audio_bytes_len={} reason=epoch_mismatch_post_synth",
-                conversation_id,
-                chunk_index,
-                len(audio_bytes),
-            )
            return None
        ext = _tts_object_ext(settings.tts_codec)
        content_type = _tts_codec_to_content_type(settings.tts_codec)
        storage = get_object_storage()
        key = f"conversations/{conversation_id}/tts/{uuid.uuid4().hex}.{ext}"
-        upload_started = time.perf_counter()
-        logger.debug(
-            "pipeline._send_tts_audio uploading key={} audio_bytes_len={} content_type={}",
-            key,
-            len(audio_bytes),
-            content_type,
-        )
        public_url = storage.upload(key, audio_bytes, content_type)
-        upload_ms = (time.perf_counter() - upload_started) * 1000
        # 与 `tts_delivery.apply_presigned_tts_urls_to_messages` / 回忆录图片 presign 一致：下发可播 URL
        playback_url = storage.get_url(key, expires=TTS_PRESIGNED_EXPIRES_SEC)
-        logger.debug(
-            "pipeline._send_tts_audio uploaded key={} audio_bytes_len={} upload_ms={:.2f} "
-            "public_url_set={} playback_url_set={}",
-            key,
-            len(audio_bytes),
-            upload_ms,
-            bool(public_url),
-            bool(playback_url),
-        )
-        audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
        payload_data: Dict[str, Any] = {
-            "audio_base64": audio_b64,
            "format": settings.tts_codec,
+            "audio_base64": base64.b64encode(audio_bytes).decode("utf-8"),
            "audio_url": playback_url,
            "index": chunk_index,
            "total": chunk_total,
@@ -200,16 +138,6 @@ async def _send_tts_audio(
            payload_data["assistant_message_id"] = assistant_message_id
        if manual:
            payload_data["manual"] = True
-        logger.debug(
-            "pipeline._send_tts_audio sending TTS_AUDIO conversation_id={} chunk_index={} "
-            "chunk_total={} payload_fields={} audio_b64_len={} manual={}",
-            conversation_id,
-            chunk_index,
-            chunk_total,
-            sorted(payload_data.keys()),
-            len(audio_b64),
-            manual,
-        )
        await manager.send_message(
            conversation_id,
            {
@@ -219,16 +147,6 @@ async def _send_tts_audio(
                "timestamp": datetime.now(timezone.utc).isoformat(),
            },
        )
-        logger.info(
-            "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=True "
-            "url_set={} audio_bytes_len={} upload_ms={:.2f} manual={}",
-            conversation_id,
-            chunk_index,
-            bool(public_url),
-            len(audio_bytes),
-            upload_ms,
-            manual,
-        )
        return public_url
    except Exception as e:
        err_str = str(e)
@@ -239,13 +157,6 @@ async def _send_tts_audio(
            )
        else:
            logger.error("TTS synthesize failed: {}", e)
-        logger.info(
-            "pipeline._send_tts_audio result conversation_id={} chunk_index={} ok=False "
-            "url_set=False audio_bytes_len=0 reason=exception err={}",
-            conversation_id,
-            chunk_index,
-            type(e).__name__,
-        )
        return None


@@ -1035,18 +946,6 @@ async def process_user_message(
            segment.id,
            len(user_message or ""),
        )
-        # 长期保留：TTS 决策入口（pipeline 层）；INFO 级别可见所有控制位
-        logger.info(
-            "pipeline.process_user_message entry conversation_id={} segment_id={} "
-            "tts_this_turn={} force_skip_tts={} enable_tts={} provider={} user_language={}",
-            conversation_id,
-            segment.id,
-            tts_this_turn,
-            force_skip_tts,
-            settings.enable_tts,
-            settings.tts_provider,
-            user_language,
-        )
        is_from_voice = bool(segment.audio_url)
        voice_session_id = _voice_session_id_from_audio_url(segment.audio_url)
        audio_dur = getattr(segment, "audio_duration_seconds", None)
@@ -1074,21 +973,6 @@ async def process_user_message(
        skip_tts = bool(turn.skip_tts)
        want_voice = bool(tts_this_turn) if tts_this_turn is not None else False
        want_tts = want_voice and settings.enable_tts and not skip_tts
-        # 长期保留 INFO：TTS 决策最终结论；不再被 agent_summary_enabled 门控
-        logger.info(
-            "pipeline.process_user_message tts_decision conversation_id={} segment_id={} "
-            "tts_this_turn={} force_skip_tts={} enable_tts={} skip_tts_from_turn={} "
-            "want_voice={} want_tts={} response_segments={}",
-            conversation_id,
-            segment.id,
-            tts_this_turn,
-            force_skip_tts,
-            settings.enable_tts,
-            skip_tts,
-            want_voice,
-            want_tts,
-            len(turn.messages),
-        )
        if agent_summary_enabled():
            logger.info(
                "pipeline.process_user_message duration_ms={:.2f} "