fix(conversation): 修复实时会话 TTS/回复被离屏 WS 抢占

- 列表预热仅预取消息缓存，避免后台 WebSocket 覆盖服务端连接
- RealtimeSession UI 回调按 owner 独占，防止 offscreen 覆盖聊天页
- 列表页聚焦时再 prewarm，会话页 TTS 入队优先 base64
- 管线下发 TTS 同时带 audio_base64 与 audio_url；协议说明同步
- 移除 TTS 排查用前后端调试日志，保留错误/告警
- 补充 WS / RealtimeSession / entry-warmup / 播放器相关单测

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 10:42:44 +08:00
parent 93be60f74c
commit 3d01085442
18 changed files with 643 additions and 261 deletions
--- a/api/app/adapters/tts/tencent_tts.py
+++ b/api/app/adapters/tts/tencent_tts.py
@@ -137,18 +137,6 @@ class TencentTTSProvider:
            # 显式声明使用新模型；大模型音色（501xxx）若不带该字段会被旧模型拒绝并静默返回空音频。
            req.ModelType = MODEL_TYPE_LLM

-            # 长期保留 INFO：TTS 实际请求腾讯云 SDK 时的关键参数
-            logger.info(
-                "tencent_tts._synthesize_sync request voice_type={} primary_language={} "
-                "model_type={} sample_rate={} codec={} text_len={}",
-                voice_type,
-                primary_language,
-                MODEL_TYPE_LLM,
-                req.SampleRate,
-                self._codec,
-                len(text or ""),
-            )
-
            resp = client.TextToVoice(req)
            request_id = getattr(resp, "RequestId", None) if resp is not None else None
            audio_b64 = getattr(resp, "Audio", "") if resp is not None else ""
@@ -163,15 +151,6 @@ class TencentTTSProvider:
                )
                return b""
            audio_bytes = base64.b64decode(audio_b64)
-            # 长期保留 INFO：腾讯云 SDK 返回的 request_id + 音频字节数（用户排查必需）
-            logger.info(
-                "tencent_tts._synthesize_sync response request_id={} audio_bytes_len={} "
-                "voice_type={} primary_language={}",
-                request_id,
-                len(audio_bytes),
-                voice_type,
-                primary_language,
-            )
            return audio_bytes
        except TencentCloudSDKException as e:
            logger.error(
@@ -225,18 +204,6 @@ class TencentTTSProvider:
        else:
            voice_type = VOICE_MAP.get(v, default_voice)
        chunks = _chunk_text(text, max_chars=max_chars)
-        # 长期保留 INFO：adapter 入口的 language / voice_type / chunk_count（排查必需）
-        logger.info(
-            "tencent_tts.synthesize entry language={} voice_arg={} resolved_voice_type={} "
-            "primary_language={} max_chars={} text_len={} chunk_count={}",
-            language,
-            voice,
-            voice_type,
-            primary_language,
-            max_chars,
-            len(text or ""),
-            len(chunks),
-        )
        if not chunks:
            return b""