refactor(api): TOML 配置 SSOT、统一错误契约、Auth/事务加固与可观测性 (#33)

配置 SSOT（TOML + .env）。统一错误契约。Auth 与事务边界。Redis / Celery 可靠性：业务 Redis（DB/0）与 Celery broker/backend（DB/1）显式拆分；连接池、sync client。可观测性（OpenTelemetry + LGTM）。
2026-05-22 13:44:50 +08:00
parent f09ae248f9
commit 53e0065e3e
298 changed files with 15247 additions and 4344 deletions
--- a/api/app/features/memory/ingest_service.py
+++ b/api/app/features/memory/ingest_service.py
@@ -5,6 +5,8 @@ from __future__ import annotations
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.core.config import settings
+from app.core.db import transactional
+from app.core.errors import BadRequestError
 from app.core.logging import get_logger
 from app.features.conversation.lineage_schemas import (
    primary_user_message_id_from_lineage,
@@ -22,8 +24,10 @@ from app.features.memory.enrichment_scheduler import (
 from app.features.memory.repo import (
    create_chunk,
    create_source,
+    get_transcript_source_by_segment_id,
 )
 from app.ports.embedding import EmbeddingProvider
+from app.features.memory.constants import memory

 logger = get_logger(__name__)

@@ -53,34 +57,32 @@ class MemoryIngestService:
        lineage_json: dict | None = None,
    ) -> str:
        if not transcript or not transcript.strip():
-            raise ValueError("transcript cannot be empty")
+            raise BadRequestError("transcript cannot be empty")

        primary_mid = (
            primary_user_message_id_from_lineage(lineage_json) if lineage_json else None
        )
-        source = await create_source(
-            self._db,
-            user_id=user_id,
-            source_type="transcript",
-            raw_text=transcript.strip(),
-            conversation_id=conversation_id,
-            lineage_json=lineage_json,
-            primary_user_message_id=primary_mid,
-        )
-
-        chunk_records: list[tuple[str, str]] = []
-        for i, content in enumerate(chunk_transcript(transcript.strip())):
-            chunk = await create_chunk(
+        async with transactional(self._db):
+            source = await create_source(
                self._db,
-                source_id=source.id,
                user_id=user_id,
-                content=content,
-                chunk_index=i,
+                source_type="transcript",
+                raw_text=transcript.strip(),
+                conversation_id=conversation_id,
+                lineage_json=lineage_json,
+                primary_user_message_id=primary_mid,
            )
-            chunk_records.append((chunk.id, content))

-        await self._db.flush()
-        await self._db.commit()
+            chunk_records: list[tuple[str, str]] = []
+            for i, content in enumerate(chunk_transcript(transcript.strip())):
+                chunk = await create_chunk(
+                    self._db,
+                    source_id=source.id,
+                    user_id=user_id,
+                    content=content,
+                    chunk_index=i,
+                )
+                chunk_records.append((chunk.id, content))

        embedding_result = await MemoryEmbeddingService(
            self._db,
@@ -108,7 +110,7 @@ class MemoryIngestService:
            embedding_result.get("status"),
            emb_ok,
            embedding_task_id,
-            settings.memory_enrichment_enabled,
+            memory.enrichment_enabled,
            enrichment_task_id,
        )
        return source.id
@@ -116,50 +118,63 @@ class MemoryIngestService:
    async def ingest_transcripts_batch(
        self,
        user_id: str,
-        items: list[tuple[str, str, dict | None]],
+        items: list[tuple[str, str, dict | None, str | None]],
        *,
        memoir_correlation_id: str | None = None,
    ) -> list[str]:
        """
        Batch ingest transcript items through the async memory path.

-        items: (conversation_id, transcript, lineage_json). Empty transcripts are skipped.
+        items: (conversation_id, transcript, lineage_json, segment_id).
+        Empty transcripts are skipped. When segment_id is set and a transcript
+        source already exists for the user, returns the existing source id.
        """
        source_ids: list[str] = []
        chunk_records: list[tuple[str, str]] = []
+        new_source_ids: list[str] = []

-        for conversation_id, transcript, lineage_json in items:
-            text = (transcript or "").strip()
-            if not text:
-                continue
-            primary_mid = (
-                primary_user_message_id_from_lineage(lineage_json)
-                if lineage_json
-                else None
-            )
-            source = await create_source(
-                self._db,
-                user_id=user_id,
-                source_type="transcript",
-                raw_text=text,
-                conversation_id=conversation_id or None,
-                lineage_json=lineage_json,
-                primary_user_message_id=primary_mid,
-            )
-            source_ids.append(source.id)
-
-            for i, content in enumerate(chunk_transcript(text)):
-                chunk = await create_chunk(
-                    self._db,
-                    source_id=source.id,
-                    user_id=user_id,
-                    content=content,
-                    chunk_index=i,
+        async with transactional(self._db):
+            for conversation_id, transcript, lineage_json, segment_id in items:
+                text = (transcript or "").strip()
+                if not text:
+                    continue
+                sid = (segment_id or "").strip() or None
+                if sid:
+                    existing = await get_transcript_source_by_segment_id(
+                        self._db,
+                        user_id=user_id,
+                        segment_id=sid,
+                    )
+                    if existing is not None:
+                        source_ids.append(existing.id)
+                        continue
+                primary_mid = (
+                    primary_user_message_id_from_lineage(lineage_json)
+                    if lineage_json
+                    else None
                )
-                chunk_records.append((chunk.id, content))
+                source = await create_source(
+                    self._db,
+                    user_id=user_id,
+                    source_type="transcript",
+                    raw_text=text,
+                    conversation_id=conversation_id or None,
+                    segment_id=sid,
+                    lineage_json=lineage_json,
+                    primary_user_message_id=primary_mid,
+                )
+                source_ids.append(source.id)
+                new_source_ids.append(source.id)

-        await self._db.flush()
-        await self._db.commit()
+                for i, content in enumerate(chunk_transcript(text)):
+                    chunk = await create_chunk(
+                        self._db,
+                        source_id=source.id,
+                        user_id=user_id,
+                        content=content,
+                        chunk_index=i,
+                    )
+                    chunk_records.append((chunk.id, content))

        vectors_written = 0
        embedding_retry_task_ids: list[str] = []
@@ -168,7 +183,7 @@ class MemoryIngestService:
            self._db,
            embedding_provider=self._embedding,
        )
-        for source_id in source_ids:
+        for source_id in new_source_ids:
            result = await embedding_service.embed_source(user_id, source_id)
            vectors_written += int(result.get("vectors_written") or 0)
            status = str(result.get("status") or "unknown")
@@ -185,7 +200,7 @@ class MemoryIngestService:
        emb_ok = self._embedding.is_available() if self._embedding else False
        task_ids = self._enrichment_scheduler.schedule_many(
            user_id,
-            source_ids,
+            new_source_ids,
            memoir_correlation_id=memoir_correlation_id,
        )

@@ -200,7 +215,7 @@ class MemoryIngestService:
            emb_ok,
            embedding_statuses,
            len(embedding_retry_task_ids),
-            settings.memory_enrichment_enabled,
+            memory.enrichment_enabled,
            len(task_ids),
        )
        return source_ids