Files
operating-room-monitor-server/app/services/video/archive_persister.py

329 lines
12 KiB
Python
Raw Normal View History

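"""Surgery archive persistence: in-memory archive after a failed DB write + exponential-backoff retry + durable fallback.

Design goals:
- After recording stops, ``CameraSessionManager`` hands the details awaiting persistence to this
  module and no longer keeps any retry state itself.
- When the first DB write fails:
  1. The archive is stored in the in-memory ``_archive`` so it can be retried later.
  2. If durable fallback is enabled, a JSON file is synchronously written to disk so the archive
     can be recovered after a process restart.
- A background loop retries writing the in-memory archives to the DB using exponential backoff
  with a maximum retry count; once the limit is reached without success, an alert is logged and
  the durable file is kept for manual intervention.
"""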
from __future__ import annotations
import asyncio
import json
import os
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING
from loguru import logger
from sqlalchemy.ext.asyncio import async_sessionmaker
from app.baked import pipeline as bp
from app.domain.consumption import SurgeryConsumptionStored
if TYPE_CHECKING:
from app.repositories.surgery_results import SurgeryResultRepository
@dataclass
class _ArchiveEntry:
    """In-memory archive record for one surgery whose DB write has not succeeded yet."""

    # Consumption rows that still have to be written to the database.
    details: list[SurgeryConsumptionStored]
    # Number of failed retry attempts so far; drives the exponential backoff.
    attempts: int = field(default=0)
    # Event-loop clock value before which the retry loop skips this entry.
    next_attempt_monotonic: float = field(default=0.0)
    # Location of the on-disk JSON copy, or None when no durable copy exists.
    durable_path: Path | None = field(default=None)
def _serialize_details(details: list[SurgeryConsumptionStored]) -> list[dict]:
    """Turn consumption records into plain dicts suitable for ``json.dumps``.

    Each record contributes one dict with the keys ``item_id``, ``item_name``,
    ``qty``, ``doctor_id``, ``timestamp``, ``source`` and
    ``pending_confirmation_id``. The timestamp is rendered with
    ``isoformat()``; every other field is copied through unchanged.
    """
    rows: list[dict] = []
    for record in details:
        row = {
            "item_id": record.item_id,
            "item_name": record.item_name,
            "qty": record.qty,
            "doctor_id": record.doctor_id,
            "timestamp": record.timestamp.isoformat(),
            "source": record.source,
            "pending_confirmation_id": record.pending_confirmation_id,
        }
        rows.append(row)
    return rows
def _deserialize_details(rows: list[dict]) -> list[SurgeryConsumptionStored]:
    """Rebuild consumption records from the dicts stored in a durable archive file.

    Args:
        rows: Decoded JSON objects carrying the keys ``item_id``,
            ``item_name``, ``qty``, ``doctor_id`` and ``timestamp``;
            ``source`` and ``pending_confirmation_id`` are optional.

    Returns:
        One ``SurgeryConsumptionStored`` per input row, in input order.

    Raises:
        KeyError: A required key is missing from a row.
        ValueError: ``qty`` cannot be converted to ``int``.
    """
    out: list[SurgeryConsumptionStored] = []
    for row in rows:
        ts_raw = row["timestamp"]
        try:
            ts = datetime.fromisoformat(ts_raw)
        except (TypeError, ValueError):
            # An unparseable, null or non-string timestamp must not discard the
            # whole archive; fall back to the current UTC time for this row.
            ts = datetime.now(timezone.utc)
        item_id = str(row["item_id"])
        pending_id = row.get("pending_confirmation_id")
        if pending_id is None and item_id.startswith("pending:"):
            # No explicit pending id stored: derive it from the
            # "pending:<id>" form of the item id.
            pending_id = item_id.removeprefix("pending:")
        out.append(
            SurgeryConsumptionStored(
                item_id=item_id,
                item_name=str(row["item_name"]),
                qty=int(row["qty"]),
                doctor_id=str(row["doctor_id"]),
                timestamp=ts,
                source=str(row.get("source", "vision")),
                pending_confirmation_id=pending_id,
            )
        )
    return out
class ArchivePersister:
    """Write end-of-surgery details to the DB; on failure, archive and retry.

    A failed write parks the details in an in-memory archive keyed by surgery
    id and, when ``bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED`` is set, also
    writes them to a JSON file under ``bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR``.
    The task started by :meth:`start_retry_loop` retries archived entries with
    exponential backoff up to ``bp.ARCHIVE_PERSIST_MAX_RETRIES`` attempts.
    """

    def __init__(
        self,
        *,
        repository: "SurgeryResultRepository | None",
        session_factory: async_sessionmaker,
    ) -> None:
        """Store the collaborators and create empty retry state.

        Args:
            repository: Object whose ``save_final_result`` performs the write;
                ``None`` turns DB writes into no-ops that report success.
            session_factory: Called without arguments to open a DB session.
        """
        self._repo = repository
        self._session_factory = session_factory
        self._archive: dict[str, _ArchiveEntry] = {}
        self._lock = asyncio.Lock()
        self._retry_task: asyncio.Task[None] | None = None
        self._retry_stop = asyncio.Event()

    @property
    def repository(self) -> "SurgeryResultRepository | None":
        """The repository passed at construction time (may be ``None``)."""
        return self._repo

    @property
    def has_pending(self) -> bool:
        """Whether at least one archive is still waiting to be persisted."""
        return bool(self._archive)

    def archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """Return a copy of the archived details for *surgery_id*, or ``None``.

        Reads the in-memory archive only; the database is not accessed.
        """
        entry = self._archive.get(surgery_id)
        if entry is None:
            return None
        return list(entry.details)

    async def take_archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """Remove the archive for *surgery_id* and return its details.

        Returns ``None`` when nothing is archived under that id. Any durable
        file stays on disk; :meth:`restore` re-attaches it if the details are
        handed back.
        """
        async with self._lock:
            entry = self._archive.pop(surgery_id, None)
        if entry is None:
            return None
        return list(entry.details)

    async def restore(self, surgery_id: str, details: list[SurgeryConsumptionStored]) -> None:
        """Put previously taken details back into the in-memory archive.

        The new entry starts with a fresh attempt counter. If durable fallback
        is enabled and a durable file for this surgery already exists, it is
        re-attached to the entry so a later successful persist deletes it
        instead of leaving it to be replayed after a restart.
        """
        entry = _ArchiveEntry(details=list(details))
        entry.durable_path = self._existing_durable_path(surgery_id)
        async with self._lock:
            self._archive[surgery_id] = entry

    async def persist_or_archive(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        """Try to write *details* now; archive them if the write fails.

        Returns:
            ``True`` when the DB write succeeded (or no repository is
            configured), ``False`` when the details were archived for retry.
        """
        if await self._write_to_db(surgery_id, details):
            return True
        entry = _ArchiveEntry(details=list(details))
        if bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            entry.durable_path = self._write_durable(surgery_id, details)
        async with self._lock:
            self._archive[surgery_id] = entry
        logger.error(
            "Surgery {} final result kept in memory archive (durable={}); "
            "background retry will attempt persist",
            surgery_id,
            bool(entry.durable_path),
        )
        return False

    async def try_persist_archive(self, surgery_id: str) -> bool:
        """Try to write one archived entry to the DB.

        Returns:
            ``True`` when nothing is archived under *surgery_id* or the write
            succeeded; ``False`` when no repository is configured or the write
            failed (a failed write increments the entry's attempt counter).
        """
        async with self._lock:
            entry = self._archive.get(surgery_id)
        if entry is None:
            return True
        if self._repo is None:
            return False
        if not await self._write_to_db(surgery_id, entry.details):
            entry.attempts += 1
            return False
        async with self._lock:
            # The write above awaited, so the entry may have been taken or
            # replaced meanwhile. Only drop the entry (and its durable file)
            # if it is still the one that was just persisted.
            still_current = self._archive.get(surgery_id) is entry
            if still_current:
                del self._archive[surgery_id]
        if still_current and entry.durable_path is not None:
            self._safe_remove(entry.durable_path)
        logger.info("Archive persisted after retry surgery_id={}", surgery_id)
        return True

    async def start_retry_loop(self) -> None:
        """Start the background retry task unless one is already running."""
        if self._retry_task is not None and not self._retry_task.done():
            return
        self._retry_stop.clear()
        self._retry_task = asyncio.create_task(
            self._retry_loop(),
            name="archive_persist_retry",
        )

    async def shutdown(self) -> None:
        """Signal the retry loop to stop, cancel its task and wait for it."""
        self._retry_stop.set()
        if self._retry_task is not None:
            self._retry_task.cancel()
            try:
                await self._retry_task
            except asyncio.CancelledError:
                pass
            except Exception as exc:
                logger.debug("archive retry shutdown: {}", exc)
            self._retry_task = None

    async def recover_from_durable_fallback(self) -> int:
        """Load unpersisted archives from the durable directory into memory.

        Intended to be called at process start. Unreadable files are skipped
        with a warning, and a surgery id that is already archived in memory is
        left untouched.

        Returns:
            The number of archives loaded.
        """
        if not bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            return 0
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        if not directory.exists():
            return 0
        loaded = 0
        for path in sorted(directory.glob("*.json")):
            try:
                raw = json.loads(path.read_text(encoding="utf-8"))
                surgery_id = str(raw["surgery_id"])
                details = _deserialize_details(list(raw.get("details") or []))
            except Exception as exc:
                logger.warning("Skip unreadable durable archive {}: {}", path, exc)
                continue
            async with self._lock:
                if surgery_id in self._archive:
                    continue
                self._archive[surgery_id] = _ArchiveEntry(
                    details=details,
                    durable_path=path,
                )
                loaded += 1
        if loaded:
            logger.warning(
                "Recovered {} durable archive(s) from {}; retry loop will attempt persist",
                loaded,
                directory,
            )
        return loaded

    async def _write_to_db(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        """Write *details* through the repository inside one transaction.

        Returns ``True`` on success or when no repository is configured, and
        ``False`` (after logging a warning) when the write raised.
        """
        if self._repo is None:
            return True
        try:
            async with self._session_factory() as session:
                async with session.begin():
                    await self._repo.save_final_result(
                        session,
                        surgery_id=surgery_id,
                        details=list(details),
                    )
        except Exception as exc:
            logger.warning(
                "Persist surgery {} failed (will archive/retry): {}", surgery_id, exc
            )
            return False
        return True

    def _durable_path_for(self, surgery_id: str) -> Path:
        """Return the durable JSON file location used for *surgery_id*."""
        return Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR) / f"{surgery_id}.json"

    def _existing_durable_path(self, surgery_id: str) -> Path | None:
        """Return the durable file for *surgery_id* if enabled and present."""
        if not bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            return None
        try:
            candidate = self._durable_path_for(surgery_id)
            return candidate if candidate.is_file() else None
        except (OSError, ValueError):
            # Best effort only: an unusable path simply means "no durable copy".
            return None

    def _write_durable(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> Path | None:
        """Write *details* to ``<dir>/<surgery_id>.json`` via a temp file.

        Best effort: every failure is logged and reported as ``None`` so the
        caller can still keep the details in the in-memory archive.

        Returns:
            The path of the written file, or ``None`` on failure.
        """
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        try:
            directory.mkdir(parents=True, exist_ok=True)
        except Exception as exc:
            logger.warning("mkdir durable archive dir {} failed: {}", directory, exc)
            return None
        path = directory / f"{surgery_id}.json"
        tmp: Path | None = None
        try:
            # Built inside the try so a serialization error cannot escape and
            # prevent the caller from archiving the details in memory.
            payload = {
                "surgery_id": surgery_id,
                "saved_at": datetime.now(timezone.utc).isoformat(),
                "details": _serialize_details(details),
            }
            tmp = path.with_suffix(".json.tmp")
            tmp.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            # Rename over the target so readers never see a half-written file.
            os.replace(tmp, path)
            return path
        except Exception as exc:
            logger.warning("write durable archive {} failed: {}", path, exc)
            if tmp is not None:
                # Do not leave a partial temp file behind.
                self._safe_remove(tmp)
            return None

    def _safe_remove(self, path: Path) -> None:
        """Delete *path*, logging (not raising) on failure."""
        try:
            path.unlink(missing_ok=True)
        except Exception as exc:
            logger.debug("remove durable archive {} failed: {}", path, exc)

    def _next_backoff_seconds(self, attempts: int) -> float:
        """Return the delay before the next retry after *attempts* failures.

        Exponential backoff ``base * 2**(attempts - 1)`` capped at
        ``bp.ARCHIVE_PERSIST_BACKOFF_CAP_SECONDS``; the first delay equals
        ``bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS``.
        """
        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
        cap = float(bp.ARCHIVE_PERSIST_BACKOFF_CAP_SECONDS)
        exp = max(0, attempts - 1)
        return min(cap, base * (2**exp))

    async def _retry_loop(self) -> None:
        """Retry archived entries every base interval until stopped.

        Entries that reached the maximum attempt count, or whose backoff
        deadline has not passed yet, are skipped on a given tick.
        """
        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
        max_attempts = int(bp.ARCHIVE_PERSIST_MAX_RETRIES)
        loop = asyncio.get_running_loop()
        while not self._retry_stop.is_set():
            try:
                await asyncio.wait_for(self._retry_stop.wait(), timeout=base)
                break
            except asyncio.TimeoutError:
                # asyncio.TimeoutError is only an alias of the builtin
                # TimeoutError from Python 3.11 on; catching the asyncio name
                # works on every version.
                pass
            now = loop.time()
            # Snapshot the entries; the attempts below may mutate _archive.
            async with self._lock:
                entries = list(self._archive.items())
            for surgery_id, entry in entries:
                if self._retry_stop.is_set():
                    break
                if entry.attempts >= max_attempts:
                    # Limit reached: stop retrying automatically and wait for
                    # a process restart or manual intervention.
                    continue
                if entry.next_attempt_monotonic > now:
                    continue
                if await self.try_persist_archive(surgery_id):
                    continue
                # Failure: schedule the next attempt.
                async with self._lock:
                    current = self._archive.get(surgery_id)
                    if current is None:
                        continue
                    # Measure from the end of the failed attempt so a slow
                    # failure does not produce a deadline already in the past.
                    current.next_attempt_monotonic = (
                        loop.time() + self._next_backoff_seconds(current.attempts)
                    )
                    if current.attempts >= max_attempts:
                        logger.error(
                            "Archive persist exhausted retries surgery_id={} "
                            "attempts={}; durable={} kept for manual recovery",
                            surgery_id,
                            current.attempts,
                            bool(current.durable_path),
                        )