fix(conversation): 离屏不丢回复、列表预热 WS 与非阻塞进入聊天

- 后端：文本/转写后的 AI 回复生成改为独立后台任务，避免 WS 断连时取消整轮回复；另含按需 TTS 等 WS 相关改动
- 前端：RealtimeSession 重绑 UI 时恢复流式 buffer；列表 onPressIn/挂载时预热 WS，已有会话立即 push
- 同步会话相关类型、i18n、测试与 env/资源等累计改动

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-08 17:28:31 +08:00
parent 5dac3efd52
commit d0c26242db
44 changed files with 1209 additions and 212 deletions
--- a/api/app/features/conversation/service.py
+++ b/api/app/features/conversation/service.py
@@ -96,6 +96,9 @@ def _build_messages_from_history(
            tts = msg.get("ttsAudioUrls")
            if isinstance(tts, list) and tts:
                item["ttsAudioUrls"] = [x for x in tts if isinstance(x, str)]
+        dm = msg.get("durableMessageId")
+        if isinstance(dm, str) and dm:
+            item["durableMessageId"] = dm
        messages.append(item)
    return messages

--- a/api/app/features/conversation/session_history.py
+++ b/api/app/features/conversation/session_history.py
@@ -18,6 +18,7 @@ def conversation_messages_to_redis_history(
            "content": row.content,
            "messageType": row.message_type,
            "timestamp": row.created_at.isoformat() if row.created_at else None,
+            "durableMessageId": row.id,
        }
        if row.voice_session_id:
            item["voiceSessionId"] = row.voice_session_id
--- a/api/app/features/conversation/tts_delivery.py
+++ b/api/app/features/conversation/tts_delivery.py
@@ -9,9 +9,15 @@

 from __future__ import annotations

-from app.core.cos_url_keys import presign_tts_urls_for_playback
+from app.core.cos_url_keys import (
+    TTS_PRESIGNED_EXPIRES_SEC,
+    extract_cos_object_key_if_owned,
+)
+from app.core.logging import get_logger
 from app.ports.storage import ObjectStorage

+logger = get_logger(__name__)
+

 def apply_presigned_tts_urls_to_messages(
    messages: list[dict],
@@ -24,5 +30,26 @@ def apply_presigned_tts_urls_to_messages(
        tts = m.get("ttsAudioUrls")
        if not isinstance(tts, list) or not tts:
            continue
-        str_urls = [x for x in tts if isinstance(x, str)]
-        m["ttsAudioUrls"] = presign_tts_urls_for_playback(str_urls, storage)
+        out: list[str] = []
+        for x in tts:
+            if not isinstance(x, str):
+                out.append("")
+                continue
+            s = x.strip()
+            if not s:
+                out.append("")
+                continue
+            key = extract_cos_object_key_if_owned(s)
+            if key:
+                try:
+                    out.append(storage.get_url(key, expires=TTS_PRESIGNED_EXPIRES_SEC))
+                except Exception as exc:
+                    logger.warning(
+                        "presign tts url failed, keeping original url: key={} err={}",
+                        key,
+                        exc,
+                    )
+                    out.append(s)
+            else:
+                out.append(s)
+        m["ttsAudioUrls"] = out
--- a/api/app/features/conversation/ws/message_types.py
+++ b/api/app/features/conversation/ws/message_types.py
@@ -17,6 +17,7 @@ class MessageType(str, Enum):
    AGENT_RESPONSE = "agent_response"
    TTS_AUDIO = "tts_audio"
    TTS_CANCEL = "tts_cancel"
+    TTS_REQUEST = "tts_request"
    PING = "ping"
    PONG = "pong"
    END_CONVERSATION = "end_conversation"
--- a/api/app/features/conversation/ws/pipeline.py
+++ b/api/app/features/conversation/ws/pipeline.py
@@ -18,9 +18,13 @@ from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.agents.chat import ChatOrchestrator
+from app.agents.chat.reply_limits import segments_from_llm_response
 from app.core.agent_logging import agent_summary_enabled
 from app.core.config import settings
-from app.core.cos_url_keys import TTS_PRESIGNED_EXPIRES_SEC
+from app.core.cos_url_keys import (
+    TTS_PRESIGNED_EXPIRES_SEC,
+    extract_cos_object_key_if_owned,
+)
 from app.core.db import AsyncSessionLocal
 from app.core.dependencies import get_asr_provider, get_object_storage, get_tts_provider
 from app.features.conversation.chat_turn import (
@@ -33,7 +37,7 @@ from app.features.conversation.history_store import (
    ConversationHistoryStore,
 )
 from app.features.conversation.lineage_schemas import DialogueLineage
-from app.features.conversation.models import Conversation, Segment
+from app.features.conversation.models import Conversation, ConversationMessage, Segment
 from app.features.conversation.ws.connection_manager import manager
 from app.features.conversation.ws.message_types import MessageType
 from app.features.conversation.ws.profile_collector import (
@@ -84,6 +88,7 @@ async def _send_tts_audio(
    chunk_total: int,
    assistant_message_id: str | None,
    tts_epoch_start: int,
+    manual: bool = False,
 ) -> str | None:
    """Synthesize TTS, upload to COS, append Redis, send TTS_AUDIO. Returns public URL or None."""
    if not settings.enable_tts:
@@ -116,6 +121,8 @@ async def _send_tts_audio(
        }
        if assistant_message_id:
            payload_data["assistant_message_id"] = assistant_message_id
+        if manual:
+            payload_data["manual"] = True
        await manager.send_message(
            conversation_id,
            {
@@ -138,6 +145,109 @@ async def _send_tts_audio(
        return None


+async def handle_tts_request_on_demand(
+    *,
+    conversation_id: str,
+    user_id: str,
+    assistant_message_id: str,
+    segment_index: int,
+    segment_text: str | None,
+    db: AsyncSession,
+) -> tuple[bool, str]:
+    """用户点喇叭：该段已有 TTS 则预签名下发；否则合成后落库并下发。不重复合成同一段。"""
+    if not settings.enable_tts:
+        return False, "未开启语音合成"
+
+    conv = await db.get(Conversation, conversation_id)
+    if not conv or conv.user_id != user_id or conv.deleted_at is not None:
+        return False, "对话不存在或无权访问"
+
+    msg = await db.get(ConversationMessage, assistant_message_id)
+    if not msg or msg.conversation_id != conversation_id or msg.role != "ai":
+        return False, "消息不存在"
+
+    # 与客户端 splitMessageParts / segments_from_llm_response 对齐（含无 [SPLIT] 时的段落拆段）
+    parts = segments_from_llm_response(msg.content or "", max_segments=3)
+    if segment_index < 0 or segment_index >= len(parts):
+        return False, "分段序号无效"
+
+    canon = (parts[segment_index] or "").strip()
+    if not canon:
+        return False, "该段无朗读文本"
+    if segment_text and segment_text.strip() and segment_text.strip() != canon:
+        logger.debug(
+            "按需 TTS: 客户端传入 segment_text 与规范化后 canon 不完全一致，已按 segment_index 朗读 canon "
+            "(client_len={} canon_len={})",
+            len(segment_text.strip()),
+            len(canon),
+        )
+
+    urls: List[str] = []
+    for x in msg.tts_audio_urls or []:
+        if isinstance(x, str) and x.strip():
+            urls.append(x)
+        else:
+            urls.append("")
+    while len(urls) < len(parts):
+        urls.append("")
+
+    existing = urls[segment_index].strip() if segment_index < len(urls) else ""
+    chunk_total = len(parts)
+
+    if existing:
+        storage = get_object_storage()
+        key = extract_cos_object_key_if_owned(existing)
+        try:
+            playback_url = (
+                storage.get_url(key, expires=TTS_PRESIGNED_EXPIRES_SEC)
+                if key
+                else existing
+            )
+        except Exception as exc:
+            logger.warning("按需 TTS 预签名失败: {}", exc)
+            playback_url = existing
+        await manager.send_message(
+            conversation_id,
+            {
+                "type": MessageType.TTS_AUDIO,
+                "conversation_id": conversation_id,
+                "data": {
+                    "audio_url": playback_url,
+                    "format": settings.tts_codec,
+                    "index": segment_index,
+                    "total": chunk_total,
+                    "assistant_message_id": assistant_message_id,
+                    "manual": True,
+                },
+                "timestamp": datetime.now(timezone.utc).isoformat(),
+            },
+        )
+        return True, ""
+
+    tts_epoch_start = _tts_epoch_value(conversation_id)
+    url_stored = await _send_tts_audio(
+        conversation_id,
+        canon,
+        chunk_index=segment_index,
+        chunk_total=chunk_total,
+        assistant_message_id=assistant_message_id,
+        tts_epoch_start=tts_epoch_start,
+        manual=True,
+    )
+    if not url_stored:
+        return False, "语音合成失败"
+
+    while len(urls) <= segment_index:
+        urls.append("")
+    urls[segment_index] = url_stored
+    msg.tts_audio_urls = urls
+    await db.commit()
+
+    store = ConversationHistoryStore(db)
+    await store._sync_redis_best_effort(conversation_id)
+    return True, ""
+
+
 # ── Agent 实例（从 ConnectionManager 移出） ─────────────────────
 chat_orchestrator = ChatOrchestrator()
 chat_turn_service = ChatTurnService(chat_orchestrator)
@@ -153,6 +263,8 @@ class SegmentStreamState:
    """会话内分段处理状态（用于并行 ASR + 有序聚合）"""

    lock: asyncio.Lock = field(default_factory=asyncio.Lock)
+    #: 本条语音会话最近一次分段上行携带的本轮朗读开关（客户端每段一致即可）
+    tts_this_turn: bool = False
    pending_indices: Set[int] = field(default_factory=set)
    processed_indices: Set[int] = field(default_factory=set)
    buffered_transcripts: Dict[int, Tuple[str, Segment]] = field(default_factory=dict)
@@ -163,6 +275,43 @@ class SegmentStreamState:


 _segment_states: Dict[Tuple[str, str], SegmentStreamState] = {}
+_user_response_tasks: Dict[str, Set[asyncio.Task]] = {}
+_user_response_locks: Dict[str, asyncio.Lock] = {}
+
+
+def _get_user_response_lock(conversation_id: str) -> asyncio.Lock:
+    lock = _user_response_locks.get(conversation_id)
+    if lock is None:
+        lock = asyncio.Lock()
+        _user_response_locks[conversation_id] = lock
+    return lock
+
+
+def register_user_response_task(conversation_id: str, task: asyncio.Task) -> None:
+    tasks = _user_response_tasks.setdefault(conversation_id, set())
+    tasks.add(task)
+
+    def _cleanup(done_task: asyncio.Task) -> None:
+        tasks.discard(done_task)
+        if not tasks:
+            _user_response_tasks.pop(conversation_id, None)
+            _user_response_locks.pop(conversation_id, None)
+        if done_task.cancelled():
+            logger.warning(
+                "用户回复后台任务被取消 conversation_id={}",
+                conversation_id,
+            )
+            return
+        exc = done_task.exception()
+        if exc:
+            logger.error(
+                "用户回复后台任务异常 conversation_id={}: {}",
+                conversation_id,
+                exc,
+                exc_info=True,
+            )
+
+    task.add_done_callback(_cleanup)


 def get_or_create_segment_state(
@@ -432,9 +581,13 @@ async def process_audio_segment(
    audio_base64: str,
    audio_duration: int,
    is_last: bool,
+    *,
+    tts_this_turn: bool = False,
 ) -> None:
    """分段语音的异步处理：并行 ASR + 幂等落库 + 有序聚合触发 Agent。"""
    state = get_or_create_segment_state(conversation_id, voice_session_id)
+    async with state.lock:
+        state.tts_this_turn = bool(tts_this_turn)
    logger.info(
        "process_audio_segment 开始: conversation_id={} voice_session_id={} "
        "segment_index={} is_last={} duration_s={} audio_b64_len={}",
@@ -588,6 +741,7 @@ async def process_audio_segment(
                )

            ready_segments: List[Tuple[int, str, Segment]] = []
+            tts_flag_this_voice_session = False
            async with state.lock:
                state.processed_indices.add(segment_index)
                state.buffered_transcripts[segment_index] = (
@@ -602,6 +756,8 @@ async def process_audio_segment(
                    state.consumed_index = next_index
                    next_index += 1

+                tts_flag_this_voice_session = bool(state.tts_this_turn)
+
            for _, ordered_text, ordered_segment in ready_segments:
                await process_user_message(
                    conversation_id=conversation_id,
@@ -612,6 +768,7 @@ async def process_audio_segment(
                    user=user,
                    user_message_timestamp=ordered_segment.created_at
                    or user_message_timestamp,
+                    tts_this_turn=tts_flag_this_voice_session,
                )

    except Exception as e:
@@ -638,6 +795,48 @@ async def process_audio_segment(
 # ── 用户消息处理 ────────────────────────────────────────────────


+async def process_persisted_user_segment_response(
+    *,
+    conversation_id: str,
+    user_id: str,
+    segment_id: str,
+    tts_this_turn: bool = False,
+) -> None:
+    """后台继续生成已落库用户段落的助手回复；即使 WS 页面退出也要完成落库。"""
+    lock = _get_user_response_lock(conversation_id)
+    async with lock:
+        async with AsyncSessionLocal() as db:
+            conversation = await db.get(Conversation, conversation_id)
+            user = await db.get(User, user_id)
+            segment = await db.get(Segment, segment_id)
+            if (
+                not conversation
+                or conversation.deleted_at is not None
+                or conversation.user_id != user_id
+                or not user
+                or not segment
+                or segment.conversation_id != conversation_id
+            ):
+                logger.warning(
+                    "跳过用户回复后台任务: conversation_id={} segment_id={} user_id={}",
+                    conversation_id,
+                    segment_id,
+                    user_id,
+                )
+                return
+            await process_user_message(
+                conversation_id=conversation_id,
+                user_message=segment.user_input_text or "",
+                conversation=conversation,
+                segment=segment,
+                db=db,
+                user=user,
+                user_message_timestamp=segment.created_at
+                or conversation.last_message_at,
+                tts_this_turn=tts_this_turn,
+            )
+
+
 async def process_user_message(
    conversation_id: str,
    user_message: str,
@@ -648,6 +847,7 @@ async def process_user_message(
    user_message_timestamp: Optional[datetime] = None,
    *,
    force_skip_tts: bool = False,
+    tts_this_turn: Optional[bool] = None,
 ) -> None:
    """处理用户消息，生成 Agent 回应。由 ChatOrchestrator 路由到 ProfileAgent 或 InterviewAgent。"""
    store = ConversationHistoryStore(db)
@@ -682,20 +882,23 @@ async def process_user_message(
                get_filled_profile_fields_fn=get_filled_profile_fields,
            ),
        )
+        responses = turn.messages
+        skip_tts = bool(turn.skip_tts)
+        want_voice = bool(tts_this_turn) if tts_this_turn is not None else False
+        want_tts = want_voice and settings.enable_tts and not skip_tts
        if agent_summary_enabled():
            logger.info(
                "pipeline.process_user_message duration_ms={:.2f} "
                "conversation_id={} segment_id={} user_msg_len={} "
-                "response_segments={} skip_tts={}",
+                "response_segments={} skip_tts={} want_tts={}",
                (time.perf_counter() - t_pipeline) * 1000,
                conversation_id,
                segment.id,
                len(user_message or ""),
                len(turn.messages),
                turn.skip_tts,
+                want_tts,
            )
-        responses = turn.messages
-        skip_tts = bool(turn.skip_tts)

        segment.agent_response = AI_RESPONSE_SEGMENT_JOIN.join(responses)
        _mark_conversation_active(conversation)
@@ -750,6 +953,21 @@ async def process_user_message(
        tts_epoch_start = _tts_epoch_value(conversation_id)
        n = len(responses)
        for i, response_text in enumerate(responses):
+            url_for_segment: Optional[str] = None
+            if want_tts:
+                if _tts_epoch_value(conversation_id) != tts_epoch_start:
+                    break
+                url_for_segment = await _send_tts_audio(
+                    conversation_id,
+                    response_text,
+                    chunk_index=i,
+                    chunk_total=n,
+                    assistant_message_id=ai_msg_id,
+                    tts_epoch_start=tts_epoch_start,
+                )
+                if url_for_segment:
+                    tts_urls.append(url_for_segment)
+
            await manager.send_message(
                conversation_id,
                {
@@ -764,20 +982,7 @@ async def process_user_message(
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                },
            )
-            url = None
-            if not skip_tts:
-                if _tts_epoch_value(conversation_id) != tts_epoch_start:
-                    break
-                url = await _send_tts_audio(
-                    conversation_id,
-                    response_text,
-                    chunk_index=i,
-                    chunk_total=n,
-                    assistant_message_id=ai_msg_id,
-                    tts_epoch_start=tts_epoch_start,
-                )
-            if url:
-                tts_urls.append(url)
+
            if _tts_epoch_value(conversation_id) != tts_epoch_start:
                break
            if i < n - 1:
--- a/api/app/features/conversation/ws/protocol.md
+++ b/api/app/features/conversation/ws/protocol.md
@@ -1,25 +1,35 @@
 # WebSocket 消息协议

 ## 连接
- URL: /ws/conversation/{conversation_id}?token={jwt_access_token}
- 鉴权: query 参数 token，JWT access_token
+
+- URL: `/ws/conversation/{conversation_id}?token={jwt_access_token}`
+- 鉴权: query 参数 `token`，JWT `access_token`

 ## 消息类型 (client → server)
- TEXT: 文本消息
- AUDIO_SEGMENT: 语音分段
- AUDIO_MESSAGE: 完整语音消息
- TRANSCRIBE_ONLY: 仅转写不回复
- END_CONVERSATION: 结束对话
+
+- `TEXT`：文本消息。`data.text` 必填。可选 `data.tts_this_turn`（布尔）：仅当其为 `true`、服务端 `ENABLE_TTS` 已开启且本轮未被标记 `skip_tts` 时，才对该轮助手回复逐段合成 TTS；缺省或为 `false` 时不合成。**开启本轮 TTS 时，服务端对每个助手分段先推送 `tts_audio`，再推送该段 `agent_response`**，便于客户端先收到音频再展示同段文字。
+- `AUDIO_SEGMENT`：语音分段。`data` 含 `audio_base64`、`segment_index`、`voice_session_id` / `client_segment_id`、`is_last`、`duration`。可选同上 `tts_this_turn`。
+- `AUDIO_MESSAGE`：整段音频（单次 ASR + 对话）。同上可选 `tts_this_turn`。
+- `TRANSCRIBE_ONLY`：仅转写不回复
+- `TTS_CANCEL`：取消当前轮未完成的分段合成与下发
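+- `TTS_REQUEST`：用户点击某一助手气泡「朗读」且客户端尚无该段 TTS 时，由客户端发送。`data` 含 `assistant_message_id`（落库 `conversation_messages.id`）、`segment_index`（该条助手正文分段后的下标，从 0 开始；分段规则与服务端 `segments_from_llm_response` 一致：按 `[SPLIT]` 分段，无 `[SPLIT]` 时按段落拆段，最多 3 段）、可选 `segment_text`（该分段正文，仅用于比对；与服务端分段结果不一致时只记录日志，仍按 `segment_index` 朗读）。服务端若该段已有 URL 则只做预签名后推送 `tts_audio`（`data.manual=true`），**不重复合成**；否则合成并落库后再推送。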
+- `END_CONVERSATION`：结束对话
+- `PING` / `PONG`：心跳（客户端也可用 JSON `{"type":"ping"}`）

 ## 消息类型 (server → client)
- TRANSCRIPT: ASR 转写结果
- AGENT_RESPONSE: AI 回复文本
- TTS_AUDIO: 语音合成音频 (base64)
- MEMOIR_UPDATE: 回忆录更新通知
- ERROR: 错误信息
+
+- `TRANSCRIPT`: ASR 转写结果
+- `AGENT_RESPONSE`: AI 回复文本分段
+- `TTS_AUDIO`: 语音合成结果（可与 `COS` 签名 URL、`base64` 并存）。由 `TTS_REQUEST` 触发的按需朗读结果带 `data.manual=true`，提示客户端应播放（即使用户未开「本轮 Speak」）。
+- `MEMOIR_UPDATE`: 回忆录更新通知
+- `ERROR`: 错误信息

 ## 状态流转
-CONNECT → (TEXT|AUDIO_*) ↔ (TRANSCRIPT|AGENT_RESPONSE|TTS_AUDIO) → END_CONVERSATION
+
+`CONNECT → (TEXT|AUDIO_*) ↔ (TRANSCRIPT|AGENT_RESPONSE|[TTS_AUDIO]) → END_CONVERSATION`
+
+同一连接内消息顺序稳定；本轮朗读模式（`tts_this_turn=true`）下，每一助手分段的 `tts_audio` 先于对应的 `agent_response`。

 ## 重连
-客户端断连后可用相同 conversation_id 重连，历史消息从 Redis 恢复。
+
+客户端断连后可用相同 `conversation_id` 重连，历史消息从 Redis / HTTP 缓存恢复。
--- a/api/app/features/conversation/ws/router.py
+++ b/api/app/features/conversation/ws/router.py
@@ -28,11 +28,13 @@ from app.features.conversation.ws.pipeline import (
    chat_orchestrator,
    cleanup_segment_states,
    get_or_create_segment_state,
+    handle_tts_request_on_demand,
    memoir_ingest_scheduler,
    process_audio_segment,
    process_conversation_segments,
-    process_user_message,
+    process_persisted_user_segment_response,
    register_segment_task,
+    register_user_response_task,
 )
 from app.features.conversation.ws.profile_collector import get_missing_profile_fields
 from app.features.conversation.ws.quota_guard import check_ws_quota
@@ -276,7 +278,9 @@ async def websocket_endpoint(
                        )

                    if msg_type == MessageType.TEXT:
-                        text_message = message.get("data", {}).get("text", "")
+                        data = message.get("data") or {}
+                        text_message = data.get("text", "")
+                        tts_this_turn = bool(data.get("tts_this_turn"))

                        if text_message:
                            can_send, quota_msg = await check_ws_quota(
@@ -303,23 +307,21 @@ async def websocket_endpoint(
                                user_id,
                                text_message,
                            )
-                            user_message_timestamp = conversation.last_message_at
                            await memoir_ingest_scheduler.queue_segment(
                                conversation.user_id,
                                segment.id,
                                text_char_count=len(text_message.strip()),
                            )

-                            await process_user_message(
-                                conversation_id=conversation_id,
-                                user_message=text_message,
-                                conversation=conversation,
-                                segment=segment,
-                                db=db,
-                                user=user,
-                                user_message_timestamp=segment.created_at
-                                or user_message_timestamp,
+                            task = asyncio.create_task(
+                                process_persisted_user_segment_response(
+                                    conversation_id=conversation_id,
+                                    user_id=user_id,
+                                    segment_id=segment.id,
+                                    tts_this_turn=tts_this_turn,
+                                )
                            )
+                            register_user_response_task(conversation_id, task)

                    elif msg_type == MessageType.RECORDING_STARTED:
                        data = message.get("data", {})
@@ -486,6 +488,7 @@ async def websocket_endpoint(
                                audio_base64=audio_base64,
                                audio_duration=audio_duration,
                                is_last=is_last,
+                                tts_this_turn=bool(data.get("tts_this_turn")),
                            )
                        )
                        register_segment_task(conversation_id, voice_session_id, task)
@@ -494,6 +497,7 @@ async def websocket_endpoint(
                        data = message.get("data", {})
                        audio_base64 = data.get("audio_base64", "")
                        audio_duration = data.get("duration", 0)
+                        tts_this_turn = bool(data.get("tts_this_turn"))

                        if audio_base64:
                            can_send, quota_msg = await check_ws_quota(
@@ -564,7 +568,6 @@ async def websocket_endpoint(
                                        audio_duration_seconds=ads if ads > 0 else None,
                                    )
                                )
-                                user_message_timestamp = conversation.last_message_at
                                await memoir_ingest_scheduler.queue_segment(
                                    conversation.user_id,
                                    segment.id,
@@ -572,16 +575,15 @@ async def websocket_endpoint(
                                )

                                if asr_text and not asr_text.startswith("转写失败"):
-                                    await process_user_message(
-                                        conversation_id=conversation_id,
-                                        user_message=asr_text,
-                                        conversation=conversation,
-                                        segment=segment,
-                                        db=db,
-                                        user=user,
-                                        user_message_timestamp=segment.created_at
-                                        or user_message_timestamp,
+                                    task = asyncio.create_task(
+                                        process_persisted_user_segment_response(
+                                            conversation_id=conversation_id,
+                                            user_id=user_id,
+                                            segment_id=segment.id,
+                                            tts_this_turn=tts_this_turn,
+                                        )
                                    )
+                                    register_user_response_task(conversation_id, task)
                                else:
                                    await manager.send_message(
                                        conversation_id,
@@ -651,6 +653,51 @@ async def websocket_endpoint(
                    elif msg_type == MessageType.TTS_CANCEL:
                        bump_tts_cancel_epoch(conversation_id)

+                    elif msg_type == MessageType.TTS_REQUEST:
+                        data = message.get("data") or {}
+                        aid = data.get("assistant_message_id") or data.get(
+                            "assistantMessageId"
+                        )
+                        if not aid or not str(aid).strip():
+                            await manager.send_message(
+                                conversation_id,
+                                {
+                                    "type": MessageType.ERROR,
+                                    "data": {"message": "缺少助手消息 id"},
+                                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                                },
+                            )
+                            continue
+                        try:
+                            seg_idx = int(
+                                data.get("segment_index", data.get("segmentIndex", 0))
+                            )
+                        except (TypeError, ValueError):
+                            seg_idx = 0
+                        st = data.get("segment_text") or data.get("segmentText")
+                        st_val: str | None
+                        if st is None:
+                            st_val = None
+                        else:
+                            st_val = str(st).strip() or None
+                        ok, err_msg = await handle_tts_request_on_demand(
+                            conversation_id=conversation_id,
+                            user_id=user_id,
+                            assistant_message_id=str(aid).strip(),
+                            segment_index=seg_idx,
+                            segment_text=st_val,
+                            db=db,
+                        )
+                        if not ok:
+                            await manager.send_message(
+                                conversation_id,
+                                {
+                                    "type": MessageType.ERROR,
+                                    "data": {"message": err_msg or "朗读请求失败"},
+                                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                                },
+                            )
+
                    elif msg_type == MessageType.END_CONVERSATION:
                        await conversation_service.end(conversation_id, user_id)

--- a/api/app/features/user/router.py
+++ b/api/app/features/user/router.py
@@ -66,6 +66,11 @@ async def update_user_profile(
    current_user: User = Depends(get_current_user),
    service: UserService = Depends(get_user_service),
 ):
+    logger.info(
+        "更新用户档案 user_id={} fields={}",
+        current_user.id,
+        sorted(body.model_fields_set),
+    )
    return await service.update_profile(current_user.id, body)


--- a/api/app/features/user/service.py
+++ b/api/app/features/user/service.py
@@ -46,14 +46,9 @@ class UserService:
        user = await repo.get_user_by_id(user_id, self._db)
        if not user:
            raise ValueError("用户不存在")
-        if body.birth_year is not None:
-            user.birth_year = body.birth_year
-        if body.birth_place is not None:
-            user.birth_place = body.birth_place
-        if body.grew_up_place is not None:
-            user.grew_up_place = body.grew_up_place
-        if body.occupation is not None:
-            user.occupation = body.occupation
+        for field in ("birth_year", "birth_place", "grew_up_place", "occupation"):
+            if field in body.model_fields_set:
+                setattr(user, field, getattr(body, field))
        await self._db.commit()
        await self._db.refresh(user)
        return _user_to_profile(user)