refactor(api): TOML 配置 SSOT、统一错误契约、Auth/事务加固与可观测性 (#33)

配置 SSOT(TOML + .env)
统一错误契约
Auth 与事务边界
Redis / Celery 可靠性:业务 Redis(DB/0)与 Celery broker/backend(DB/1)显式拆分;连接池、sync client
可观测性(OpenTelemetry + LGTM)
This commit is contained in:
Sully
2026-05-22 13:44:50 +08:00
committed by GitHub
parent f09ae248f9
commit 53e0065e3e
298 changed files with 15247 additions and 4344 deletions

View File

@@ -14,21 +14,22 @@ from celery import shared_task
from celery.exceptions import Retry
from celery.result import AsyncResult
from sqlalchemy import func, select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from app.agents.chat.background_voice import infer_background_voice
from app.agents.chat.prompts_profile import format_user_profile_context
from app.agents.memoir import MemoirOrchestrator
from app.agents.stage_constants import normalize_chapter_category
from app.core.business_telemetry import business_span
from app.core.chapter_pipeline_lock import (
acquire_chapter_pipeline_lock as _acquire_chapter_lock,
)
from app.core.chapter_pipeline_lock import (
release_chapter_pipeline_lock as _release_chapter_lock,
)
from app.core.business_telemetry import business_span
from app.core.config import settings
from app.core.db import AsyncSessionLocal, get_sync_db
from app.core.db import AsyncSessionLocal, get_sync_db, transactional_sync
from app.core.dependencies import get_embedding_provider
from app.core.llm_gateway import LlmGateway, LlmUseCase
from app.core.logging import get_logger
@@ -69,11 +70,19 @@ from app.features.memoir.story_pipeline_sync import (
run_story_pipeline_for_category_batch,
)
from app.features.memory.service import MemoryService
from app.features.memory.repo import get_transcript_source_by_segment_id_sync
from app.features.user.models import User
from app.tasks.celery_app import celery_app
from app.core.redis_sync import get_sync_redis
from app.core.runtime_constants import llm_defaults, redis_defaults
from app.features.memoir.constants import memoir
from app.features.story.constants import story
logger = get_logger(__name__)
_REDIS_CLIENTS: dict[bool, redis.Redis] = {}
def _get_redis_client(*, decode_responses: bool = False) -> redis.Redis:
    """Return the synchronous Redis client obtained from ``get_sync_redis``.

    Args:
        decode_responses: Forwarded unchanged to ``get_sync_redis``.

    Returns:
        Whatever client object ``get_sync_redis`` hands back for this flag.
    """
    # Thin module-local alias: client creation is delegated entirely to
    # ``get_sync_redis``; nothing is constructed or cached here.
    client = get_sync_redis(decode_responses=decode_responses)
    return client
def _run_post_pipeline_commit(
@@ -146,7 +155,7 @@ def _get_llm_fast():
async def _memory_ingest_transcripts_batch(
user_id: str,
items: list[tuple[str, str, dict | None]],
items: list[tuple[str, str, dict | None, str | None]],
*,
memoir_correlation_id: str,
) -> list[str]:
@@ -159,6 +168,47 @@ async def _memory_ingest_transcripts_batch(
)
def _phase1_memory_ingest_batch_sync(
    db: Session,
    user_id: str,
    ingest_items: list[tuple[str, str, dict | None, str | None]],
    *,
    memoir_correlation_id: str,
) -> list[str]:
    """Run the phase-1 batch memory ingest and return the resulting source ids.

    The async batch ingest is driven to completion with ``asyncio.run``. If it
    raises ``IntegrityError`` the batch is assumed to have raced with another
    ingest of the same segments, and every item is resolved to its already
    persisted transcript source instead.

    Args:
        db: Sync session used only for the post-race lookups.
        user_id: Owner of the transcripts.
        ingest_items: ``(conversation_id, text, lineage, segment_id)`` tuples.
        memoir_correlation_id: Forwarded to the async batch ingest.

    Returns:
        Source ids. On the recovery path the list holds exactly one id per
        entry of ``ingest_items``, in the same order; the normal path returns
        whatever the async ingest returns (presumably also one id per item --
        the caller pairs the result with its segments via a strict ``zip``).

    Raises:
        IntegrityError: Re-raised when any item cannot be resolved to an
            existing source (including items without a usable segment id),
            so the task fails instead of returning a partial result.
    """
    if not ingest_items:
        return []
    try:
        return asyncio.run(
            _memory_ingest_transcripts_batch(
                user_id,
                ingest_items,
                memoir_correlation_id=memoir_correlation_id,
            )
        )
    except IntegrityError:
        logger.warning(
            "event=memoir_phase1_memory_ingest_race user_id={} item_count={} "
            "msg=Concurrent segment ingest; resolving existing sources",
            user_id,
            len(ingest_items),
        )
        resolved: list[str] = []
        for _conv_id, _text, _lineage, segment_id in ingest_items:
            sid = (segment_id or "").strip()
            # A blank segment id cannot be looked up. Skipping it would make
            # the result shorter than ``ingest_items`` and break positional
            # pairing in the caller, so it is treated as unresolvable.
            existing = (
                get_transcript_source_by_segment_id_sync(
                    db,
                    user_id=user_id,
                    segment_id=sid,
                )
                if sid
                else None
            )
            if existing is None:
                # Not explained by a concurrent ingest of this segment:
                # surface the original IntegrityError.
                raise
            resolved.append(existing.id)
        return resolved
async def _memory_retrieve_evidence(
user_id: str,
query: str,
@@ -171,21 +221,8 @@ async def _memory_retrieve_evidence(
return bundle.model_dump()
def _get_redis_client(*, decode_responses: bool = False) -> redis.Redis:
from app.core.config import settings
client = _REDIS_CLIENTS.get(decode_responses)
if client is None:
client = redis.from_url(
settings.redis_url,
decode_responses=decode_responses,
)
_REDIS_CLIENTS[decode_responses] = client
return client
def _chapter_lock_ttl() -> int:
return int(settings.chapter_pipeline_lock_ttl_seconds)
return int(story.chapter_pipeline_lock_ttl_seconds)
def _update_task_status_sync(
@@ -210,7 +247,7 @@ def _update_task_status_sync(
task_info["result"] = result
r.hset(key, task_id, json.dumps(task_info))
r.expire(key, 3600) # 1小时过期
r.expire(key, redis_defaults.task_tracker_ttl_seconds)
logger.debug("任务状态已更新: task_id={} status={}", task_id, status)
except Exception as e:
@@ -309,7 +346,7 @@ def _should_trigger_phase2(
chapter_category: str,
current_segment_chars: int,
) -> bool:
if current_segment_chars >= int(settings.memoir_narrative_immediate_char_threshold):
if current_segment_chars >= int(memoir.narrative_immediate_char_threshold):
return True
user_convs = select(Conversation.id).where(
Conversation.user_id == user_id,
@@ -326,9 +363,9 @@ def _should_trigger_phase2(
)
row = db.execute(stmt).one()
count, total_chars = int(row[0] or 0), int(row[1] or 0)
if count >= int(settings.memoir_narrative_batch_min_segments):
if count >= int(memoir.narrative_batch_min_segments):
return True
if total_chars >= int(settings.memoir_narrative_batch_min_chars):
if total_chars >= int(memoir.narrative_batch_min_chars):
return True
return False
@@ -385,8 +422,8 @@ def _persist_phase2_route_defer(
返回 Celery 任务的 result dict(``status=deferred``)。
"""
now_ts = datetime.now(timezone.utc)
max_attempts = int(settings.memoir_route_defer_max_attempts)
defer_seconds = float(settings.memoir_route_defer_seconds)
max_attempts = int(memoir.route_defer_max_attempts)
defer_seconds = float(memoir.route_defer_seconds)
deferred_until_ts = now_ts + timedelta(seconds=max(defer_seconds, 1.0))
rows: list[Segment] = []
@@ -396,19 +433,18 @@ def _persist_phase2_route_defer(
saturated_segments = 0
new_max_attempts_reached = False
for seg in rows:
prev_count = int(seg.narrative_defer_count or 0)
seg.narrative_defer_count = prev_count + 1
seg.narrative_defer_reason = defer_reason
seg.narrative_last_attempt_at = now_ts
if seg.narrative_defer_count >= max_attempts:
seg.narrative_deferred_until = None
saturated_segments += 1
new_max_attempts_reached = True
else:
seg.narrative_deferred_until = deferred_until_ts
db.commit()
with transactional_sync(db):
for seg in rows:
prev_count = int(seg.narrative_defer_count or 0)
seg.narrative_defer_count = prev_count + 1
seg.narrative_defer_reason = defer_reason
seg.narrative_last_attempt_at = now_ts
if seg.narrative_defer_count >= max_attempts:
seg.narrative_deferred_until = None
saturated_segments += 1
new_max_attempts_reached = True
else:
seg.narrative_deferred_until = deferred_until_ts
next_task_id: str | None = None
if rows and not new_max_attempts_reached:
@@ -469,7 +505,7 @@ def _schedule_phase2_timeout(
) -> str | None:
"""Reset countdown for Phase 2 narrative for one category。返回 Celery task_id。"""
_revoke_phase2_timeout(user_id, chapter_category)
countdown = float(max(1.0, settings.memoir_narrative_batch_max_wait_seconds))
countdown = float(max(1.0, memoir.narrative_batch_max_wait_seconds))
p2_kwargs: dict = {}
if memoir_correlation_id:
p2_kwargs["memoir_correlation_id"] = memoir_correlation_id
@@ -504,7 +540,7 @@ def _dispatch_phase2_immediate(
"kwargs": p2_kwargs,
}
fixed_tid: str | None = None
if settings.memoir_phase2_singleflight_immediate:
if memoir.phase2_singleflight_immediate:
fixed_tid = _phase2_immediate_task_id(user_id, chapter_category)
send_kw["task_id"] = fixed_tid
ar = celery_app.send_task("app.tasks.memoir_tasks.process_memoir_phase2", **send_kw)
@@ -515,7 +551,7 @@ def _dispatch_phase2_immediate(
user_id,
chapter_category,
memoir_correlation_id or "",
"singleflight" if settings.memoir_phase2_singleflight_immediate else "unique",
"singleflight" if memoir.phase2_singleflight_immediate else "unique",
out_tid or "",
)
return out_tid
@@ -580,7 +616,7 @@ def dispatch_pending_memoir_phase2_for_user(user_id: str) -> None:
)
@shared_task(bind=True, max_retries=3, default_retry_delay=30)
@shared_task(bind=True, max_retries=3, default_retry_delay=30, ignore_result=True)
def process_memoir_phase2(
self,
user_id: str,
@@ -734,9 +770,9 @@ def process_memoir_phase2(
segment_texts = [seg.user_input_text or "" for seg in category_segments]
combined_text = "\n\n".join(segment_texts)
n_units = len(category_segments)
evidence_top_k = int(settings.evidence_top_k_default)
if n_units > int(settings.evidence_large_batch_threshold):
evidence_top_k = int(settings.evidence_top_k_large_batch)
evidence_top_k = int(story.evidence_top_k_default)
if n_units > int(story.evidence_large_batch_threshold):
evidence_top_k = int(story.evidence_top_k_large_batch)
try:
memory_evidence = asyncio.run(
_memory_retrieve_evidence(
@@ -813,34 +849,33 @@ def process_memoir_phase2(
image_settings.enabled and chapter_needs_cover_enqueue(chapter)
)
stmt_book = (
select(Book)
.where(Book.user_id == user_id)
.order_by(Book.updated_at.desc())
)
result_book = db.execute(stmt_book)
book = result_book.scalar_one_or_none()
if not book:
book = Book(
id=str(uuid.uuid4()),
user_id=user_id,
title="我的回忆录",
total_pages=0,
total_words=0,
cover_image_url=None,
with transactional_sync(db):
stmt_book = (
select(Book)
.where(Book.user_id == user_id)
.order_by(Book.updated_at.desc())
)
db.add(book)
book.has_update = True
book.last_update_chapter_id = chapter.id
result_book = db.execute(stmt_book)
book = result_book.scalar_one_or_none()
if not book:
book = Book(
id=str(uuid.uuid4()),
user_id=user_id,
title="我的回忆录",
total_pages=0,
total_words=0,
cover_image_url=None,
)
db.add(book)
book.has_update = True
book.last_update_chapter_id = chapter.id
if needs_cover_enqueue:
chapters_to_enqueue.add(chapter.id)
if needs_cover_enqueue:
chapters_to_enqueue.add(chapter.id)
for seg in category_segments:
seg.narrated = True
seg.processed = True
db.commit()
for seg in category_segments:
seg.narrated = True
seg.processed = True
_run_post_pipeline_commit(
user_id=user_id,
@@ -925,7 +960,7 @@ def process_memoir_phase2(
raise self.retry(exc=e) from e
@shared_task(bind=True, max_retries=3, default_retry_delay=60)
@shared_task(bind=True, max_retries=3, default_retry_delay=60, ignore_result=True)
def process_memoir_phase1(self, user_id: str, segment_ids: List[str]):
"""
Phase 1记忆 ingest + 抽取/分类;持久化 topic_category / skip_narrative
@@ -991,6 +1026,30 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]):
)
return {"status": "no_segments"}
for seg in segments:
db.refresh(seg)
lineage_missing = [
str(seg.id)
for seg in segments
if (seg.agent_response or "").strip()
and not isinstance(getattr(seg, "lineage_json", None), dict)
]
if lineage_missing:
logger.warning(
"event=memoir_phase1_lineage_pending user_id={} task_id={} "
"segment_ids={} msg=Agent response persisted without lineage; retrying",
user_id,
task_id,
lineage_missing,
)
raise self.retry(
countdown=15,
exc=RuntimeError(
f"memoir_phase1_lineage_pending: {len(lineage_missing)} segments"
),
)
merge_pipeline_run(
memoir_correlation_id,
{
@@ -1002,8 +1061,9 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]):
)
ingest_t0 = time.perf_counter()
with business_span("memoir.phase1.ingest"):
ingest_items: list[tuple[str, str, dict | None]] = []
ingest_items: list[tuple[str, str, dict | None, str | None]] = []
non_empty_segments: list = []
ingested_source_ids: list[str] = []
for seg in segments:
text = (seg.user_input_text or "").strip()
if not text:
@@ -1011,37 +1071,46 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]):
conv_id = getattr(seg, "conversation_id", None) or ""
ln = getattr(seg, "lineage_json", None)
lineage_payload = ln if isinstance(ln, dict) else None
ingest_items.append((conv_id, text, lineage_payload))
if lineage_payload is None and not (seg.agent_response or "").strip():
logger.debug(
"event=memoir_phase1_skip_memory_ingest segment_id={} "
"msg=No lineage and no agent response yet",
seg.id,
)
continue
existing = get_transcript_source_by_segment_id_sync(
db,
user_id=user_id,
segment_id=str(seg.id),
)
if existing is not None:
ingested_source_ids.append(existing.id)
continue
ingest_items.append(
(conv_id, text, lineage_payload, str(seg.id)),
)
non_empty_segments.append(seg)
ingested_source_ids: list[str] = []
if ingest_items:
try:
ingested_source_ids = asyncio.run(
_memory_ingest_transcripts_batch(
user_id,
ingest_items,
memoir_correlation_id=memoir_correlation_id,
)
)
for seg, sid in zip(
non_empty_segments, ingested_source_ids, strict=True
):
logger.info(
"event=memory_transcript_ingested user_id={} task_id={} "
"source_id={} conversation_id={} segment_id={} transcript_chars={}",
user_id,
task_id,
sid,
getattr(seg, "conversation_id", None) or "",
seg.id,
len((seg.user_input_text or "").strip()),
)
except Exception as e:
logger.warning(
"Memory batch ingest 失败: {} exc_type={}",
e,
type(e).__name__,
new_source_ids = _phase1_memory_ingest_batch_sync(
db,
user_id,
ingest_items,
memoir_correlation_id=memoir_correlation_id,
)
ingested_source_ids.extend(new_source_ids)
for seg, sid in zip(
non_empty_segments, new_source_ids, strict=True
):
logger.info(
"event=memory_transcript_ingested user_id={} task_id={} "
"source_id={} conversation_id={} segment_id={} transcript_chars={}",
user_id,
task_id,
sid,
getattr(seg, "conversation_id", None) or "",
seg.id,
len((seg.user_input_text or "").strip()),
)
ingest_elapsed = time.perf_counter() - ingest_t0
merge_pipeline_run(
@@ -1059,10 +1128,10 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]):
llm = _get_llm()
llm_fast = _get_llm_fast() or llm
if (settings.llm_fast_model or "").strip():
if (llm_defaults.fast_model or "").strip():
logger.info(
"event=llm_fast_tier_used pipeline=memoir_prepare_batches model={}",
settings.llm_fast_model,
llm_defaults.fast_model,
)
prep_t0 = time.perf_counter()
@@ -1118,42 +1187,43 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]):
f"memoir_phase1_missing_category: {len(missing_cat)} segments"
)
for seg in segments:
cat = prepared.segment_chapter_category[str(seg.id)]
seg.topic_category = cat
is_skip = str(seg.id) in skip_ids
seg.skip_narrative = is_skip
seg.narrated = False
if is_skip:
seg.processed = True
db.flush()
categories_for_phase2: Set[str] = set()
phase2_immediate: list[str] = []
phase2_timeout: list[str] = []
woke_up_by_category: dict[str, int] = {}
for chapter_category, cat_segments in prepared.category_to_segments.items():
batch_non_skip = [
s
for s in cat_segments
if str(s.id) not in prepared.segment_skip_story_ids
]
if not batch_non_skip:
continue
woke = _wake_deferred_segments_for_category(
db, user_id, chapter_category
)
if woke:
woke_up_by_category[chapter_category] = woke
max_chars = max(
len((s.user_input_text or "").strip()) for s in batch_non_skip
)
categories_for_phase2.add(chapter_category)
if _should_trigger_phase2(db, user_id, chapter_category, max_chars):
phase2_immediate.append(chapter_category)
else:
phase2_timeout.append(chapter_category)
with transactional_sync(db):
for seg in segments:
cat = prepared.segment_chapter_category[str(seg.id)]
seg.topic_category = cat
is_skip = str(seg.id) in skip_ids
seg.skip_narrative = is_skip
seg.narrated = False
if is_skip:
seg.processed = True
db.flush()
for chapter_category, cat_segments in prepared.category_to_segments.items():
batch_non_skip = [
s
for s in cat_segments
if str(s.id) not in prepared.segment_skip_story_ids
]
if not batch_non_skip:
continue
woke = _wake_deferred_segments_for_category(
db, user_id, chapter_category
)
if woke:
woke_up_by_category[chapter_category] = woke
max_chars = max(
len((s.user_input_text or "").strip()) for s in batch_non_skip
)
categories_for_phase2.add(chapter_category)
if _should_trigger_phase2(db, user_id, chapter_category, max_chars):
phase2_immediate.append(chapter_category)
else:
phase2_timeout.append(chapter_category)
if woke_up_by_category:
logger.info(
@@ -1163,8 +1233,6 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]):
woke_up_by_category,
)
db.commit()
merge_pipeline_run(
memoir_correlation_id,
{
@@ -1278,7 +1346,7 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]):
_update_task_status_sync(user_id, task_id, "failure", {"error": str(e)})
raise self.retry(exc=e) from e
@shared_task(bind=True, max_retries=3, default_retry_delay=30)
@shared_task(bind=True, max_retries=3, default_retry_delay=30, ignore_result=True)
def generate_chapter_content(self, user_id: str, stage: str, new_content: str):
"""
单独生成章节内容的任务(用于实时更新)
@@ -1360,7 +1428,8 @@ def generate_chapter_content(self, user_id: str, stage: str, new_content: str):
exc=RuntimeError("story_pipeline returned no chapter"),
countdown=30,
)
db.commit()
with transactional_sync(db):
pass # commit pending pipeline writes
db.refresh(chapter)
ch_ids: set[str] = {str(chapter.id)}