"""手术归档持久化:写库失败后的内存归档 + 指数退避重试 + durable fallback。
|
||
|
||
设计目标:
|
||
- ``CameraSessionManager`` 停录后把「待落库明细」交给本模块,不再自行持有重试状态。
|
||
- 首次写库失败时:
|
||
1. 将归档放入内存 ``_archive`` 以便下次重试。
|
||
2. 若开启 durable fallback,同步写一个 JSON 文件到磁盘,进程重启后可从中恢复。
|
||
- 后台循环以指数退避 + 最大重试次数的方式尝试把内存中的归档写库成功。达到上限仍失败时记
|
||
告警并保留 durable 文件,等待人工介入。
|
||
"""
|
||
|
||
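
# Typical end-of-surgery call path (a sketch; ``repo`` and ``session_factory``
# are assumed to be provided by the host application):
#
#     persister = ArchivePersister(repository=repo, session_factory=session_factory)
#     await persister.start_retry_loop()
#     ok = await persister.persist_or_archive(surgery_id, details)
#     # ok=False means the details were archived and will be retried in the background.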

from __future__ import annotations

import asyncio
import json
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING

from loguru import logger
from sqlalchemy.ext.asyncio import async_sessionmaker

from app.baked import pipeline as bp
from app.domain.consumption import SurgeryConsumptionStored

if TYPE_CHECKING:
    from app.repositories.surgery_results import SurgeryResultRepository


@dataclass
class _ArchiveEntry:
    """In-memory archive entry; tracks the attempt count that drives exponential backoff."""

    details: list[SurgeryConsumptionStored]
    attempts: int = 0
    next_attempt_monotonic: float = 0.0
    durable_path: Path | None = None


def _serialize_details(details: list[SurgeryConsumptionStored]) -> list[dict]:
    return [
        {
            "item_id": d.item_id,
            "item_name": d.item_name,
            "qty": d.qty,
            "doctor_id": d.doctor_id,
            "timestamp": d.timestamp.isoformat(),
            "source": d.source,
            "pending_confirmation_id": d.pending_confirmation_id,
        }
        for d in details
    ]
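
# Shape of one serialized row (illustrative values only), as stored in the
# durable JSON file and read back by _deserialize_details below. Note that a
# missing "pending_confirmation_id" is recovered from a "pending:<id>" item_id:
#
#     {
#         "item_id": "pending:42",
#         "item_name": "gauze pack",
#         "qty": 2,
#         "doctor_id": "d-001",
#         "timestamp": "2025-01-01T08:30:00+00:00",
#         "source": "vision",
#         "pending_confirmation_id": null
#     }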


def _deserialize_details(rows: list[dict]) -> list[SurgeryConsumptionStored]:
    out: list[SurgeryConsumptionStored] = []
    for r in rows:
        ts_raw = r["timestamp"]
        try:
            ts = datetime.fromisoformat(ts_raw)
        except (TypeError, ValueError):
            # TypeError covers a corrupted file where "timestamp" is not a string.
            ts = datetime.now(timezone.utc)
        iid = str(r["item_id"])
        pend = r.get("pending_confirmation_id")
        if pend is None and iid.startswith("pending:"):
            pend = iid.removeprefix("pending:")
        out.append(
            SurgeryConsumptionStored(
                item_id=iid,
                item_name=str(r["item_name"]),
                qty=int(r["qty"]),
                doctor_id=str(r["doctor_id"]),
                timestamp=ts,
                source=str(r.get("source", "vision")),
                pending_confirmation_id=pend,
            )
        )
    return out


class ArchivePersister:
    """Persist end-of-surgery details to the DB; on failure, fall back to
    backoff retries plus an optional durable fallback.
    """

    def __init__(
        self,
        *,
        repository: "SurgeryResultRepository | None",
        session_factory: async_sessionmaker,
    ) -> None:
        self._repo = repository
        self._session_factory = session_factory
        self._archive: dict[str, _ArchiveEntry] = {}
        self._lock = asyncio.Lock()
        self._retry_task: asyncio.Task[None] | None = None
        self._retry_stop = asyncio.Event()

    @property
    def repository(self) -> "SurgeryResultRepository | None":
        return self._repo

    @property
    def has_pending(self) -> bool:
        return bool(self._archive)

    def archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """Fallback lookup for the API: read the in-memory archive without touching the DB."""
        entry = self._archive.get(surgery_id)
        if entry is None:
            return None
        return list(entry.details)

    async def take_archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """Pop an archive (for a forced persist / hand-off before the same surgery id restarts)."""
        async with self._lock:
            entry = self._archive.pop(surgery_id, None)
        if entry is None:
            return None
        return list(entry.details)

    async def restore(self, surgery_id: str, details: list[SurgeryConsumptionStored]) -> None:
        """Put a previously popped archive back (e.g. when a forced persist fails again)."""
        async with self._lock:
            self._archive[surgery_id] = _ArchiveEntry(details=list(details))
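
    # Sketch of the pop/restore hand-off the two methods above enable; the
    # caller-side write ``do_external_write`` is hypothetical:
    #
    #     details = await persister.take_archived_details(surgery_id)
    #     if details is not None:
    #         try:
    #             await do_external_write(details)
    #         except Exception:
    #             await persister.restore(surgery_id, details)  # put it back for retry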

    async def persist_or_archive(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        """Try an immediate DB write; on failure, archive in memory and, if
        configured, write the durable fallback.
        """
        if await self._write_to_db(surgery_id, details):
            return True
        entry = _ArchiveEntry(details=list(details))
        if bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            entry.durable_path = self._write_durable(surgery_id, details)
        async with self._lock:
            self._archive[surgery_id] = entry
        logger.error(
            "Surgery {} final result kept in memory archive (durable={}); "
            "background retry will attempt persist",
            surgery_id,
            bool(entry.durable_path),
        )
        return False

    async def try_persist_archive(self, surgery_id: str) -> bool:
        """Try to persist one in-memory archive to the DB; on success, clean up
        both the memory entry and the durable file.
        """
        async with self._lock:
            entry = self._archive.get(surgery_id)
            if entry is None:
                return True
            if self._repo is None:
                return False
        ok = await self._write_to_db(surgery_id, entry.details)
        if not ok:
            entry.attempts += 1
            return False
        async with self._lock:
            removed = self._archive.pop(surgery_id, None)
        if removed is not None and removed.durable_path is not None:
            self._safe_remove(removed.durable_path)
        logger.info("Archive persisted after retry surgery_id={}", surgery_id)
        return True

    async def start_retry_loop(self) -> None:
        if self._retry_task is not None and not self._retry_task.done():
            return
        self._retry_stop.clear()
        self._retry_task = asyncio.create_task(
            self._retry_loop(),
            name="archive_persist_retry",
        )

    async def shutdown(self) -> None:
        self._retry_stop.set()
        if self._retry_task is not None:
            self._retry_task.cancel()
            try:
                await self._retry_task
            except asyncio.CancelledError:
                pass
            except Exception as exc:
                logger.debug("archive retry shutdown: {}", exc)
            self._retry_task = None

    async def recover_from_durable_fallback(self) -> int:
        """Call at process startup: load archives that never reached the DB from
        the durable directory back into memory.
        """
        if not bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            return 0
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        if not directory.exists():
            return 0
        loaded = 0
        for path in sorted(directory.glob("*.json")):
            try:
                raw = json.loads(path.read_text(encoding="utf-8"))
                surgery_id = str(raw["surgery_id"])
                details = _deserialize_details(list(raw.get("details") or []))
            except Exception as exc:
                logger.warning("Skip unreadable durable archive {}: {}", path, exc)
                continue
            async with self._lock:
                if surgery_id in self._archive:
                    continue
                self._archive[surgery_id] = _ArchiveEntry(
                    details=details,
                    durable_path=path,
                )
                loaded += 1
        if loaded:
            logger.warning(
                "Recovered {} durable archive(s) from {}; retry loop will attempt persist",
                loaded,
                directory,
            )
        return loaded
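
    # Startup wiring sketch; the ``lifespan`` hook and ``persister`` handle are
    # assumed host-application details, not part of this module:
    #
    #     async def lifespan(app):
    #         await persister.recover_from_durable_fallback()
    #         await persister.start_retry_loop()
    #         yield
    #         await persister.shutdown()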

    async def _write_to_db(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        if self._repo is None:
            return True
        try:
            async with self._session_factory() as session:
                async with session.begin():
                    await self._repo.save_final_result(
                        session,
                        surgery_id=surgery_id,
                        details=list(details),
                    )
        except Exception as exc:
            logger.warning(
                "Persist surgery {} failed (will archive/retry): {}", surgery_id, exc
            )
            return False
        return True

    def _write_durable(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> Path | None:
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        try:
            directory.mkdir(parents=True, exist_ok=True)
        except Exception as exc:
            logger.warning("mkdir durable archive dir {} failed: {}", directory, exc)
            return None
        path = directory / f"{surgery_id}.json"
        payload = {
            "surgery_id": surgery_id,
            "saved_at": datetime.now(timezone.utc).isoformat(),
            "details": _serialize_details(details),
        }
        try:
            # Write to a temp file first, then promote atomically via os.replace,
            # so a crash never leaves a partially written archive behind.
            tmp = path.with_suffix(".json.tmp")
            tmp.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            os.replace(tmp, path)
            return path
        except Exception as exc:
            logger.warning("write durable archive {} failed: {}", path, exc)
            return None
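
    # Example (with an assumed config value): if
    # bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR == "var/archive_fallback", surgery
    # "S-007" is staged at var/archive_fallback/S-007.json.tmp and then renamed
    # to var/archive_fallback/S-007.json, which recover_from_durable_fallback
    # later picks up via directory.glob("*.json").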

    def _safe_remove(self, path: Path) -> None:
        try:
            path.unlink(missing_ok=True)
        except Exception as exc:
            logger.debug("remove durable archive {} failed: {}", path, exc)

    def _next_backoff_seconds(self, attempts: int) -> float:
        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
        cap = float(bp.ARCHIVE_PERSIST_BACKOFF_CAP_SECONDS)
        # Exponential backoff: base * 2^(attempts - 1); the first interval is base.
        exp = max(0, attempts - 1)
        return min(cap, base * (2**exp))
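
    # Worked example, assuming base = 5s and cap = 60s (illustrative values for
    # ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS / ARCHIVE_PERSIST_BACKOFF_CAP_SECONDS):
    #     attempts=1 -> 5s, 2 -> 10s, 3 -> 20s, 4 -> 40s, 5 and beyond -> 60s (capped)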

    async def _retry_loop(self) -> None:
        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
        max_attempts = int(bp.ARCHIVE_PERSIST_MAX_RETRIES)
        while not self._retry_stop.is_set():
            try:
                await asyncio.wait_for(self._retry_stop.wait(), timeout=base)
                break
            except asyncio.TimeoutError:
                # asyncio.TimeoutError also works on Python < 3.11, where it is
                # not yet an alias of the builtin TimeoutError.
                pass

            loop = asyncio.get_running_loop()
            now = loop.time()
            # Snapshot the current archive entries; the attempts below may
            # mutate the internal state of _archive.
            async with self._lock:
                entries = list(self._archive.items())

            for surgery_id, entry in entries:
                if self._retry_stop.is_set():
                    break
                if entry.attempts >= max_attempts:
                    # Limit reached: stop automatic retries and wait for a
                    # process restart or manual intervention.
                    continue
                if entry.next_attempt_monotonic > now:
                    continue
                ok = await self.try_persist_archive(surgery_id)
                if not ok:
                    # Failure: push the next attempt out by the backoff interval.
                    async with self._lock:
                        current = self._archive.get(surgery_id)
                        if current is not None:
                            current.next_attempt_monotonic = now + self._next_backoff_seconds(
                                current.attempts
                            )
                            if current.attempts >= max_attempts:
                                logger.error(
                                    "Archive persist exhausted retries surgery_id={} "
                                    "attempts={}; durable={} kept for manual recovery",
                                    surgery_id,
                                    current.attempts,
                                    bool(current.durable_path),
                                )