修复：CI 部署环境与 ref 错配、迁移碎片化、图片意图 source_span、章节物化脏版式、会话历史与本地语音不一致

新增：TTS 上传 COS 与分片、章节 reading_segments 物化与快照、markdown 清洗、会话消息 repository、语音 store 重构与相关测试
2026-03-20 16:36:42 +08:00
parent 7317bf10cd
commit 8af37e5e8e
65 changed files with 1704 additions and 504 deletions
--- a/api/app/features/conversation/ws/pipeline.py
+++ b/api/app/features/conversation/ws/pipeline.py
@@ -12,7 +12,7 @@ from app.core.logging import get_logger
 if TYPE_CHECKING:
    from app.features.quota.service import QuotaService

-from sqlalchemy import select
+from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.agents import ConversationAgent, MemoryAgent
@@ -20,7 +20,8 @@ from app.agents.chat import ChatOrchestrator
 from app.agents.memoir import BackgroundTaskRunner
 from app.core.config import settings
 from app.core.db import AsyncSessionLocal
-from app.core.dependencies import get_asr_provider, get_tts_provider
+from app.core.dependencies import get_asr_provider, get_object_storage, get_tts_provider
+from app.core.redis import redis_service
 from app.features.conversation.models import Conversation, Segment
 from app.features.conversation.ws.connection_manager import manager
 from app.features.conversation.ws.message_types import (
@@ -37,10 +38,32 @@ from app.features.user.models import User
 logger = get_logger(__name__)


-async def _send_tts_audio(conversation_id: str, text: str) -> None:
-    """Synthesize text to speech and send TTS_AUDIO if successful."""
+def _tts_object_ext(codec: str) -> str:
+    c = (codec or "mp3").lower().lstrip(".")
+    if c in ("wave",):
+        return "wav"
+    return c if c else "mp3"
+
+
+def _tts_codec_to_content_type(codec: str) -> str:
+    c = (codec or "mp3").lower().lstrip(".")
+    if c == "mp3":
+        return "audio/mpeg"
+    if c in ("wav", "wave"):
+        return "audio/wav"
+    return "application/octet-stream"
+
+
+async def _send_tts_audio(
+    conversation_id: str,
+    text: str,
+    *,
+    chunk_index: int,
+    chunk_total: int,
+) -> str | None:
+    """Synthesize TTS, upload to COS, append Redis, send TTS_AUDIO. Returns public URL or None."""
    if not settings.enable_tts:
-        return
+        return None
    try:
        tts = get_tts_provider()
        audio_bytes = await tts.synthesize(text)
@@ -48,7 +71,15 @@ async def _send_tts_audio(conversation_id: str, text: str) -> None:
            logger.warning(
                "TTS skipped: synthesize returned empty. Check TTS config in .env"
            )
-            return
+            return None
+        ext = _tts_object_ext(settings.tts_codec)
+        content_type = _tts_codec_to_content_type(settings.tts_codec)
+        storage = get_object_storage()
+        key = f"conversations/{conversation_id}/tts/{uuid.uuid4().hex}.{ext}"
+        public_url = storage.upload(key, audio_bytes, content_type)
+        await redis_service.append_tts_audio_url_to_last_ai_message(
+            conversation_id, public_url
+        )
        await manager.send_message(
            conversation_id,
            {
@@ -57,10 +88,14 @@ async def _send_tts_audio(conversation_id: str, text: str) -> None:
                "data": {
                    "audio_base64": base64.b64encode(audio_bytes).decode("utf-8"),
                    "format": settings.tts_codec,
+                    "audio_url": public_url,
+                    "index": chunk_index,
+                    "total": chunk_total,
                },
                "timestamp": datetime.now(timezone.utc).isoformat(),
            },
        )
+        return public_url
    except Exception as e:
        err_str = str(e)
        if "PkgExhausted" in err_str:
@@ -70,6 +105,7 @@ async def _send_tts_audio(conversation_id: str, text: str) -> None:
            )
        else:
            logger.error("TTS synthesize failed: %s", e)
+        return None


 # ── Agent 实例（从 ConnectionManager 移出） ─────────────────────
@@ -427,6 +463,9 @@ async def process_audio_segment(
                    conversation_id=conversation_id,
                    transcript_text=transcript_text or "",
                    audio_url=_build_segment_audio_url(voice_session_id, segment_index),
+                    audio_duration_seconds=audio_duration
+                    if audio_duration > 0
+                    else None,
                    processed=False,
                )
                db.add(segment)
@@ -499,6 +538,7 @@ async def process_user_message(
    try:
        is_from_voice = bool(segment.audio_url)
        voice_session_id = _voice_session_id_from_audio_url(segment.audio_url)
+        audio_dur = getattr(segment, "audio_duration_seconds", None)
        responses = await chat_orchestrator.process_user_message(
            conversation_id=conversation_id,
            user_message=user_message,
@@ -511,12 +551,15 @@ async def process_user_message(
            get_missing_profile_fields_fn=get_missing_profile_fields,
            get_filled_profile_fields_fn=get_filled_profile_fields,
            user_message_timestamp=user_message_timestamp,
+            audio_duration_seconds=audio_dur,
        )

        segment.agent_response = "\n\n".join(responses)
        _mark_conversation_active(conversation)
        await db.commit()

+        tts_urls: list[str] = []
+        n = len(responses)
        for i, response_text in enumerate(responses):
            await manager.send_message(
                conversation_id,
@@ -526,15 +569,29 @@ async def process_user_message(
                    "data": {
                        "text": response_text,
                        "index": i,
-                        "total": len(responses),
+                        "total": n,
                    },
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                },
            )
-            await _send_tts_audio(conversation_id, response_text)
-            if i < len(responses) - 1:
+            url = await _send_tts_audio(
+                conversation_id,
+                response_text,
+                chunk_index=i,
+                chunk_total=n,
+            )
+            if url:
+                tts_urls.append(url)
+            if i < n - 1:
                await asyncio.sleep(0.5)

+        await db.execute(
+            update(Segment)
+            .where(Segment.id == segment.id)
+            .values(tts_audio_urls=tts_urls if tts_urls else None)
+        )
+        await db.commit()
+
    except Exception as e:
        logger.error(f"处理用户消息失败: {e}", exc_info=True)
        if conversation_id in manager.active_connections:
--- a/api/app/features/conversation/ws/router.py
+++ b/api/app/features/conversation/ws/router.py
@@ -462,11 +462,16 @@ async def websocket_endpoint(
                                    },
                                )

+                                try:
+                                    ads = int(audio_duration)
+                                except (TypeError, ValueError):
+                                    ads = 0
                                segment = Segment(
                                    id=str(uuid.uuid4()),
                                    conversation_id=conversation_id,
                                    transcript_text=transcript_text,
                                    audio_url=f"audio:{audio_duration}s",
+                                    audio_duration_seconds=ads if ads > 0 else None,
                                    processed=False,
                                )
                                db.add(segment)