"""Surgery archive persistence: in-memory archive, backoff retry, durable fallback.

Design goals:

- After ``CameraSessionManager`` stops recording it hands the "details waiting
  to be persisted" to this module and no longer keeps retry state itself.
- When the first database write fails:

  1. The archive is stored in the in-memory ``_archive`` so it can be retried.
  2. If the durable fallback is enabled, a JSON file is written synchronously
     to disk so the archive can be recovered after a process restart.

- A background loop retries writing the in-memory archives to the database
  using exponential backoff and a maximum number of attempts. When the limit
  is reached and the write still fails, an alert is logged and the durable
  file is kept for manual intervention.
"""

from __future__ import annotations

import asyncio
import json
import os
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING

from loguru import logger
from sqlalchemy.ext.asyncio import async_sessionmaker

from app.baked import pipeline as bp
from app.domain.consumption import SurgeryConsumptionStored

if TYPE_CHECKING:
    from app.repositories.surgery_results import SurgeryResultRepository


@dataclass
class _ArchiveEntry:
    """In-memory archive entry; tracks the attempt count that drives backoff."""

    # Details that still have to be written to the database.
    details: list[SurgeryConsumptionStored]
    # Number of failed retry attempts recorded by ``try_persist_archive``.
    attempts: int = 0
    # Event-loop clock value before which the retry loop skips this entry.
    next_attempt_monotonic: float = 0.0
    # Durable JSON copy of this entry on disk, if one is known.
    durable_path: Path | None = None


def _serialize_details(details: list[SurgeryConsumptionStored]) -> list[dict]:
    """Convert stored consumption details into JSON-serialisable dicts."""
    return [
        {
            "item_id": d.item_id,
            "item_name": d.item_name,
            "qty": d.qty,
            "doctor_id": d.doctor_id,
            "timestamp": d.timestamp.isoformat(),
            "source": d.source,
            "pending_confirmation_id": d.pending_confirmation_id,
        }
        for d in details
    ]


def _deserialize_details(rows: list[dict]) -> list[SurgeryConsumptionStored]:
    """Rebuild consumption details from the dicts of a durable archive file.

    An unparsable timestamp is replaced by the current UTC time. When a row has
    no ``pending_confirmation_id`` but its ``item_id`` starts with ``pending:``,
    the id is derived from the part after that prefix.

    Raises:
        KeyError: if a row lacks one of the mandatory keys.
    """
    out: list[SurgeryConsumptionStored] = []
    for row in rows:
        ts_raw = row["timestamp"]
        try:
            ts = datetime.fromisoformat(ts_raw)
        except (TypeError, ValueError):
            # TypeError covers a non-string value (e.g. JSON null); treat it
            # like a malformed string instead of discarding the whole file.
            ts = datetime.now(timezone.utc)
        item_id = str(row["item_id"])
        pending_id = row.get("pending_confirmation_id")
        if pending_id is None and item_id.startswith("pending:"):
            pending_id = item_id.removeprefix("pending:")
        out.append(
            SurgeryConsumptionStored(
                item_id=item_id,
                item_name=str(row["item_name"]),
                qty=int(row["qty"]),
                doctor_id=str(row["doctor_id"]),
                timestamp=ts,
                source=str(row.get("source", "vision")),
                pending_confirmation_id=pending_id,
            )
        )
    return out


class ArchivePersister:
    """Write end-of-surgery details to the DB; on failure retry with backoff.

    A failed write keeps the details in an in-memory archive and, when enabled,
    in a durable JSON fallback file as well.
    """

    def __init__(
        self,
        *,
        repository: "SurgeryResultRepository | None",
        session_factory: async_sessionmaker,
    ) -> None:
        self._repo = repository
        self._session_factory = session_factory
        # surgery_id -> entry still waiting to be persisted.
        self._archive: dict[str, _ArchiveEntry] = {}
        self._lock = asyncio.Lock()
        self._retry_task: asyncio.Task[None] | None = None
        self._retry_stop = asyncio.Event()

    @property
    def repository(self) -> "SurgeryResultRepository | None":
        """Repository used for writes, or ``None`` when none is configured."""
        return self._repo

    @property
    def has_pending(self) -> bool:
        """Whether any archive is still held in memory."""
        return bool(self._archive)

    def archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """Return a copy of the in-memory archive for API fallback queries.

        Does not touch the database. Returns ``None`` when nothing is archived
        for ``surgery_id``.
        """
        entry = self._archive.get(surgery_id)
        if entry is None:
            return None
        return list(entry.details)

    async def take_archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """Pop an archive (forced persist / hand-over before a surgery restarts).

        Only the in-memory entry is removed; a durable file, if any, is left on
        disk. Returns ``None`` when nothing is archived for ``surgery_id``.
        """
        async with self._lock:
            entry = self._archive.pop(surgery_id, None)
        if entry is None:
            return None
        return list(entry.details)

    async def restore(
        self, surgery_id: str, details: list[SurgeryConsumptionStored]
    ) -> None:
        """Put back a previously popped archive (e.g. a forced persist failed).

        The restored entry starts with a fresh attempt counter. If a durable
        file for this surgery is still on disk it is re-attached, so that a
        later successful retry removes it instead of leaving it to be
        recovered and written again after a restart.
        """
        entry = _ArchiveEntry(details=list(details))
        if bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            candidate = self._durable_path_for(surgery_id)
            try:
                if candidate.is_file():
                    entry.durable_path = candidate
            except OSError as exc:
                logger.debug("stat durable archive {} failed: {}", candidate, exc)
        async with self._lock:
            self._archive[surgery_id] = entry

    async def persist_or_archive(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        """Try to write immediately; on failure archive in memory.

        When the durable fallback is enabled, a failed write also stores a JSON
        copy on disk.

        Returns:
            ``True`` when ``_write_to_db`` reported success, ``False`` when the
            details were archived for a background retry.
        """
        if await self._write_to_db(surgery_id, details):
            return True

        entry = _ArchiveEntry(details=list(details))
        if bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            entry.durable_path = self._write_durable(surgery_id, details)
        async with self._lock:
            self._archive[surgery_id] = entry
        logger.error(
            "Surgery {} final result kept in memory archive (durable={}); "
            "background retry will attempt persist",
            surgery_id,
            bool(entry.durable_path),
        )
        return False

    async def try_persist_archive(self, surgery_id: str) -> bool:
        """Try to write one archived entry to the database.

        On success the in-memory entry and its durable file are removed.

        Returns:
            ``True`` when there is nothing archived for ``surgery_id`` or the
            write succeeded; ``False`` when no repository is configured or the
            write failed (only a failed write increments ``attempts``).
        """
        async with self._lock:
            entry = self._archive.get(surgery_id)
        if entry is None:
            return True
        if self._repo is None:
            return False

        if not await self._write_to_db(surgery_id, entry.details):
            entry.attempts += 1
            return False

        async with self._lock:
            removed = self._archive.pop(surgery_id, None)
        if removed is not None and removed.durable_path is not None:
            self._safe_remove(removed.durable_path)
        logger.info("Archive persisted after retry surgery_id={}", surgery_id)
        return True

    async def start_retry_loop(self) -> None:
        """Start the background retry task unless one is already running."""
        if self._retry_task is not None and not self._retry_task.done():
            return
        self._retry_stop.clear()
        self._retry_task = asyncio.create_task(
            self._retry_loop(),
            name="archive_persist_retry",
        )

    async def shutdown(self) -> None:
        """Signal the retry loop to stop, cancel its task and wait for it."""
        self._retry_stop.set()
        if self._retry_task is None:
            return
        self._retry_task.cancel()
        try:
            await self._retry_task
        except asyncio.CancelledError:
            pass
        except Exception as exc:
            logger.debug("archive retry shutdown: {}", exc)
        self._retry_task = None

    async def recover_from_durable_fallback(self) -> int:
        """Load not-yet-persisted archives from the durable directory.

        Meant to be called at process start-up. Unreadable files are skipped
        with a warning; a surgery already present in memory is not overwritten.

        Returns:
            The number of archives loaded into memory.
        """
        if not bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            return 0
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        if not directory.exists():
            return 0

        loaded = 0
        for path in sorted(directory.glob("*.json")):
            try:
                raw = json.loads(path.read_text(encoding="utf-8"))
                surgery_id = str(raw["surgery_id"])
                details = _deserialize_details(list(raw.get("details") or []))
            except Exception as exc:
                logger.warning("Skip unreadable durable archive {}: {}", path, exc)
                continue
            async with self._lock:
                if surgery_id in self._archive:
                    continue
                self._archive[surgery_id] = _ArchiveEntry(
                    details=details,
                    durable_path=path,
                )
            loaded += 1

        if loaded:
            logger.warning(
                "Recovered {} durable archive(s) from {}; retry loop will attempt persist",
                loaded,
                directory,
            )
        return loaded

    async def _write_to_db(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        """Write ``details`` inside one transaction; never raises.

        Returns ``True`` on success and also when no repository is configured
        (nothing is written in that case); ``False`` when the write raised.
        """
        if self._repo is None:
            return True
        try:
            async with self._session_factory() as session:
                async with session.begin():
                    await self._repo.save_final_result(
                        session,
                        surgery_id=surgery_id,
                        details=list(details),
                    )
        except Exception as exc:
            logger.warning(
                "Persist surgery {} failed (will archive/retry): {}", surgery_id, exc
            )
            return False
        return True

    def _durable_path_for(self, surgery_id: str) -> Path:
        """Return the durable JSON path used for ``surgery_id``."""
        return Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR) / f"{surgery_id}.json"

    def _write_durable(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> Path | None:
        """Write the durable JSON copy of an archive.

        The payload is written to a temporary file first and then moved over
        the final path with ``os.replace``.

        Returns:
            The final path, or ``None`` when the directory could not be created
            or the file could not be written.
        """
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        try:
            directory.mkdir(parents=True, exist_ok=True)
        except Exception as exc:
            logger.warning("mkdir durable archive dir {} failed: {}", directory, exc)
            return None

        path = self._durable_path_for(surgery_id)
        payload = {
            "surgery_id": surgery_id,
            "saved_at": datetime.now(timezone.utc).isoformat(),
            "details": _serialize_details(details),
        }
        tmp: Path | None = None
        try:
            tmp = path.with_suffix(".json.tmp")
            tmp.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            os.replace(tmp, path)
            return path
        except Exception as exc:
            logger.warning("write durable archive {} failed: {}", path, exc)
            if tmp is not None:
                # Do not leave a partially written temp file behind.
                self._safe_remove(tmp)
            return None

    def _safe_remove(self, path: Path) -> None:
        """Delete ``path`` if present; failures are logged, never raised."""
        try:
            path.unlink(missing_ok=True)
        except Exception as exc:
            logger.debug("remove durable archive {} failed: {}", path, exc)

    def _next_backoff_seconds(self, attempts: int) -> float:
        """Return the delay before the next retry after ``attempts`` failures."""
        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
        cap = float(bp.ARCHIVE_PERSIST_BACKOFF_CAP_SECONDS)
        # Exponential backoff: base * 2^(attempts-1); the first interval is base.
        exp = max(0, attempts - 1)
        return min(cap, base * (2**exp))

    async def _retry_loop(self) -> None:
        """Background loop that retries archived entries until stopped.

        Each pass waits up to the base interval for the stop event, then tries
        every entry that is due and has not used up its attempts.
        """
        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
        max_attempts = int(bp.ARCHIVE_PERSIST_MAX_RETRIES)
        loop = asyncio.get_running_loop()

        while not self._retry_stop.is_set():
            try:
                await asyncio.wait_for(self._retry_stop.wait(), timeout=base)
                break
            except asyncio.TimeoutError:
                # Before Python 3.11 ``wait_for`` raises ``asyncio.TimeoutError``,
                # which is not the builtin ``TimeoutError``; this name is valid
                # on every version (it is an alias of the builtin from 3.11 on).
                pass

            now = loop.time()
            # Snapshot the current entries; the attempts below may mutate
            # ``_archive``.
            async with self._lock:
                entries = list(self._archive.items())

            for surgery_id, entry in entries:
                if self._retry_stop.is_set():
                    break
                if entry.attempts >= max_attempts:
                    # Limit reached: stop automatic retries and wait for a
                    # process restart or manual intervention.
                    continue
                if entry.next_attempt_monotonic > now:
                    continue
                if await self.try_persist_archive(surgery_id):
                    continue

                # Failure: schedule the next attempt.
                async with self._lock:
                    current = self._archive.get(surgery_id)
                    if current is None:
                        continue
                    # Use a fresh clock reading so that time spent in the failed
                    # write does not eat into the backoff window.
                    current.next_attempt_monotonic = (
                        loop.time() + self._next_backoff_seconds(current.attempts)
                    )
                    if current.attempts >= max_attempts:
                        logger.error(
                            "Archive persist exhausted retries surgery_id={} "
                            "attempts={}; durable={} kept for manual recovery",
                            surgery_id,
                            current.attempts,
                            bool(current.durable_path),
                        )