修复:CI 部署环境与 ref 错配、迁移碎片化、图片意图 source_span、章节物化脏版式、会话历史与本地语音不一致

新增:TTS 上传 COS 与分片、章节 reading_segments 物化与快照、markdown 清洗、会话消息 repository、语音 store 重构与相关测试
This commit is contained in:
Kevin
2026-03-20 16:36:42 +08:00
parent 7317bf10cd
commit 8af37e5e8e
65 changed files with 1704 additions and 504 deletions

View File

@@ -12,7 +12,7 @@ from app.core.logging import get_logger
if TYPE_CHECKING:
from app.features.quota.service import QuotaService
from sqlalchemy import select
from sqlalchemy import select, update
from sqlalchemy.ext.asyncio import AsyncSession
from app.agents import ConversationAgent, MemoryAgent
@@ -20,7 +20,8 @@ from app.agents.chat import ChatOrchestrator
from app.agents.memoir import BackgroundTaskRunner
from app.core.config import settings
from app.core.db import AsyncSessionLocal
from app.core.dependencies import get_asr_provider, get_tts_provider
from app.core.dependencies import get_asr_provider, get_object_storage, get_tts_provider
from app.core.redis import redis_service
from app.features.conversation.models import Conversation, Segment
from app.features.conversation.ws.connection_manager import manager
from app.features.conversation.ws.message_types import (
@@ -37,10 +38,32 @@ from app.features.user.models import User
logger = get_logger(__name__)
async def _send_tts_audio(conversation_id: str, text: str) -> None:
"""Synthesize text to speech and send TTS_AUDIO if successful."""
def _tts_object_ext(codec: str) -> str:
c = (codec or "mp3").lower().lstrip(".")
if c in ("wave",):
return "wav"
return c if c else "mp3"
def _tts_codec_to_content_type(codec: str) -> str:
c = (codec or "mp3").lower().lstrip(".")
if c == "mp3":
return "audio/mpeg"
if c in ("wav", "wave"):
return "audio/wav"
return "application/octet-stream"
async def _send_tts_audio(
conversation_id: str,
text: str,
*,
chunk_index: int,
chunk_total: int,
) -> str | None:
"""Synthesize TTS, upload to COS, append Redis, send TTS_AUDIO. Returns public URL or None."""
if not settings.enable_tts:
return
return None
try:
tts = get_tts_provider()
audio_bytes = await tts.synthesize(text)
@@ -48,7 +71,15 @@ async def _send_tts_audio(conversation_id: str, text: str) -> None:
logger.warning(
"TTS skipped: synthesize returned empty. Check TTS config in .env"
)
return
return None
ext = _tts_object_ext(settings.tts_codec)
content_type = _tts_codec_to_content_type(settings.tts_codec)
storage = get_object_storage()
key = f"conversations/{conversation_id}/tts/{uuid.uuid4().hex}.{ext}"
public_url = storage.upload(key, audio_bytes, content_type)
await redis_service.append_tts_audio_url_to_last_ai_message(
conversation_id, public_url
)
await manager.send_message(
conversation_id,
{
@@ -57,10 +88,14 @@ async def _send_tts_audio(conversation_id: str, text: str) -> None:
"data": {
"audio_base64": base64.b64encode(audio_bytes).decode("utf-8"),
"format": settings.tts_codec,
"audio_url": public_url,
"index": chunk_index,
"total": chunk_total,
},
"timestamp": datetime.now(timezone.utc).isoformat(),
},
)
return public_url
except Exception as e:
err_str = str(e)
if "PkgExhausted" in err_str:
@@ -70,6 +105,7 @@ async def _send_tts_audio(conversation_id: str, text: str) -> None:
)
else:
logger.error("TTS synthesize failed: %s", e)
return None
# ── Agent 实例(从 ConnectionManager 移出) ─────────────────────
@@ -427,6 +463,9 @@ async def process_audio_segment(
conversation_id=conversation_id,
transcript_text=transcript_text or "",
audio_url=_build_segment_audio_url(voice_session_id, segment_index),
audio_duration_seconds=audio_duration
if audio_duration > 0
else None,
processed=False,
)
db.add(segment)
@@ -499,6 +538,7 @@ async def process_user_message(
try:
is_from_voice = bool(segment.audio_url)
voice_session_id = _voice_session_id_from_audio_url(segment.audio_url)
audio_dur = getattr(segment, "audio_duration_seconds", None)
responses = await chat_orchestrator.process_user_message(
conversation_id=conversation_id,
user_message=user_message,
@@ -511,12 +551,15 @@ async def process_user_message(
get_missing_profile_fields_fn=get_missing_profile_fields,
get_filled_profile_fields_fn=get_filled_profile_fields,
user_message_timestamp=user_message_timestamp,
audio_duration_seconds=audio_dur,
)
segment.agent_response = "\n\n".join(responses)
_mark_conversation_active(conversation)
await db.commit()
tts_urls: list[str] = []
n = len(responses)
for i, response_text in enumerate(responses):
await manager.send_message(
conversation_id,
@@ -526,15 +569,29 @@ async def process_user_message(
"data": {
"text": response_text,
"index": i,
"total": len(responses),
"total": n,
},
"timestamp": datetime.now(timezone.utc).isoformat(),
},
)
await _send_tts_audio(conversation_id, response_text)
if i < len(responses) - 1:
url = await _send_tts_audio(
conversation_id,
response_text,
chunk_index=i,
chunk_total=n,
)
if url:
tts_urls.append(url)
if i < n - 1:
await asyncio.sleep(0.5)
await db.execute(
update(Segment)
.where(Segment.id == segment.id)
.values(tts_audio_urls=tts_urls if tts_urls else None)
)
await db.commit()
except Exception as e:
logger.error(f"处理用户消息失败: {e}", exc_info=True)
if conversation_id in manager.active_connections:

View File

@@ -462,11 +462,16 @@ async def websocket_endpoint(
},
)
try:
ads = int(audio_duration)
except (TypeError, ValueError):
ads = 0
segment = Segment(
id=str(uuid.uuid4()),
conversation_id=conversation_id,
transcript_text=transcript_text,
audio_url=f"audio:{audio_duration}s",
audio_duration_seconds=ads if ads > 0 else None,
processed=False,
)
db.add(segment)