2026-04-23 20:42:21 +08:00
|
|
|
|
"""手术归档持久化:写库失败后的内存归档 + 指数退避重试 + durable fallback。
|
|
|
|
|
|
|
|
|
|
|
|
设计目标:
|
|
|
|
|
|
- ``CameraSessionManager`` 停录后把「待落库明细」交给本模块,不再自行持有重试状态。
|
|
|
|
|
|
- 首次写库失败时:
|
|
|
|
|
|
1. 将归档放入内存 ``_archive`` 以便下次重试。
|
|
|
|
|
|
2. 若开启 durable fallback,同步写一个 JSON 文件到磁盘,进程重启后可从中恢复。
|
|
|
|
|
|
- 后台循环以指数退避 + 最大重试次数的方式尝试把内存中的归档写库成功。达到上限仍失败时记
|
|
|
|
|
|
告警并保留 durable 文件,等待人工介入。
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
import asyncio
|
|
|
|
|
|
import json
|
|
|
|
|
|
import os
|
|
|
|
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
|
|
|
|
|
|
|
|
from loguru import logger
|
|
|
|
|
|
from sqlalchemy.ext.asyncio import async_sessionmaker
|
|
|
|
|
|
|
2026-04-24 15:33:22 +08:00
|
|
|
|
from app.baked import pipeline as bp
|
2026-04-23 20:42:21 +08:00
|
|
|
|
from app.domain.consumption import SurgeryConsumptionStored
|
|
|
|
|
|
|
|
|
|
|
|
if TYPE_CHECKING:
|
|
|
|
|
|
from app.repositories.surgery_results import SurgeryResultRepository
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class _ArchiveEntry:
    """In-memory archive entry; tracks the attempt count to drive exponential backoff."""

    # Consumption details still waiting for a successful DB write.
    details: list[SurgeryConsumptionStored]
    # Number of failed persist attempts so far (drives the backoff exponent).
    attempts: int = 0
    # Monotonic-clock deadline before which the retry loop skips this entry
    # (0.0 means "retry at the next loop tick").
    next_attempt_monotonic: float = 0.0
    # Path of the durable JSON fallback file on disk, if one was written.
    durable_path: Path | None = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _serialize_details(details: list[SurgeryConsumptionStored]) -> list[dict]:
|
|
|
|
|
|
return [
|
|
|
|
|
|
{
|
|
|
|
|
|
"item_id": d.item_id,
|
|
|
|
|
|
"item_name": d.item_name,
|
|
|
|
|
|
"qty": d.qty,
|
|
|
|
|
|
"doctor_id": d.doctor_id,
|
|
|
|
|
|
"timestamp": d.timestamp.isoformat(),
|
|
|
|
|
|
"source": d.source,
|
|
|
|
|
|
"pending_confirmation_id": d.pending_confirmation_id,
|
|
|
|
|
|
}
|
|
|
|
|
|
for d in details
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _deserialize_details(rows: list[dict]) -> list[SurgeryConsumptionStored]:
    """Rebuild ``SurgeryConsumptionStored`` objects from durable-fallback JSON rows.

    Best effort: a malformed ``timestamp`` falls back to "now" (UTC) instead of
    discarding the row, since losing a consumption record is worse than an
    imprecise time. A missing ``pending_confirmation_id`` is recovered from the
    legacy ``pending:<id>`` item-id encoding when present.
    """
    out: list[SurgeryConsumptionStored] = []
    for r in rows:
        ts_raw = r["timestamp"]
        try:
            ts = datetime.fromisoformat(ts_raw)
        except (TypeError, ValueError):
            # ValueError: string that is not valid ISO-8601.
            # TypeError: non-string value (e.g. null/number in the JSON) —
            # previously this escaped the fallback and aborted recovery.
            ts = datetime.now(timezone.utc)
        iid = str(r["item_id"])
        pend = r.get("pending_confirmation_id")
        if pend is None and iid.startswith("pending:"):
            # Legacy rows encoded the pending-confirmation id in the item id.
            pend = iid.removeprefix("pending:")
        out.append(
            SurgeryConsumptionStored(
                item_id=iid,
                item_name=str(r["item_name"]),
                qty=int(r["qty"]),
                doctor_id=str(r["doctor_id"]),
                timestamp=ts,
                source=str(r.get("source", "vision")),
                pending_confirmation_id=pend,
            )
        )
    return out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ArchivePersister:
|
|
|
|
|
|
"""把手术结束明细写入 DB;失败时进入退避重试 + 可选 durable fallback。"""
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
|
self,
|
|
|
|
|
|
*,
|
|
|
|
|
|
repository: "SurgeryResultRepository | None",
|
|
|
|
|
|
session_factory: async_sessionmaker,
|
|
|
|
|
|
) -> None:
|
|
|
|
|
|
self._repo = repository
|
|
|
|
|
|
self._session_factory = session_factory
|
|
|
|
|
|
self._archive: dict[str, _ArchiveEntry] = {}
|
|
|
|
|
|
self._lock = asyncio.Lock()
|
|
|
|
|
|
self._retry_task: asyncio.Task[None] | None = None
|
|
|
|
|
|
self._retry_stop = asyncio.Event()
|
|
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
|
def repository(self) -> "SurgeryResultRepository | None":
|
|
|
|
|
|
return self._repo
|
|
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
|
def has_pending(self) -> bool:
|
|
|
|
|
|
return bool(self._archive)
|
|
|
|
|
|
|
|
|
|
|
|
def archived_details(
|
|
|
|
|
|
self, surgery_id: str
|
|
|
|
|
|
) -> list[SurgeryConsumptionStored] | None:
|
|
|
|
|
|
"""供 API 回退查询:读取内存归档,不访问 DB。"""
|
|
|
|
|
|
entry = self._archive.get(surgery_id)
|
|
|
|
|
|
if entry is None:
|
|
|
|
|
|
return None
|
|
|
|
|
|
return list(entry.details)
|
|
|
|
|
|
|
|
|
|
|
|
async def take_archived_details(
|
|
|
|
|
|
self, surgery_id: str
|
|
|
|
|
|
) -> list[SurgeryConsumptionStored] | None:
|
|
|
|
|
|
"""弹出归档(用于同一手术号重新开始前的强制落库 / 移交)。"""
|
|
|
|
|
|
async with self._lock:
|
|
|
|
|
|
entry = self._archive.pop(surgery_id, None)
|
|
|
|
|
|
if entry is None:
|
|
|
|
|
|
return None
|
|
|
|
|
|
return list(entry.details)
|
|
|
|
|
|
|
|
|
|
|
|
async def restore(self, surgery_id: str, details: list[SurgeryConsumptionStored]) -> None:
|
|
|
|
|
|
"""把此前弹出的归档重新放回(比如「强制落库」再次失败时回退)。"""
|
|
|
|
|
|
async with self._lock:
|
|
|
|
|
|
self._archive[surgery_id] = _ArchiveEntry(details=list(details))
|
|
|
|
|
|
|
|
|
|
|
|
    async def persist_or_archive(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        """Try to write to the DB immediately; on failure archive in memory and,
        if configured, write a durable fallback file.

        Returns:
            True when the details were persisted (or persistence is disabled),
            False when they were archived for background retry.
        """
        if await self._write_to_db(surgery_id, details):
            return True
        # DB write failed: keep a private copy so later caller mutations of
        # ``details`` cannot affect the pending archive.
        entry = _ArchiveEntry(details=list(details))
        if bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            # Best effort: durable_path stays None when the file write fails.
            entry.durable_path = self._write_durable(surgery_id, details)
        async with self._lock:
            self._archive[surgery_id] = entry
        logger.error(
            "Surgery {} final result kept in memory archive (durable={}); "
            "background retry will attempt persist",
            surgery_id,
            bool(entry.durable_path),
        )
        return False
|
|
|
|
|
|
|
|
|
|
|
|
    async def try_persist_archive(self, surgery_id: str) -> bool:
        """Try to persist one in-memory archive entry; on success remove it from
        memory and delete its durable fallback file.

        Returns:
            True when there is nothing left to do (entry absent, or write
            succeeded), False when the entry remains archived for later retry.
        """
        async with self._lock:
            entry = self._archive.get(surgery_id)
        if entry is None:
            # Already persisted or taken over elsewhere — nothing to do.
            return True
        if self._repo is None:
            return False
        # NOTE(review): the lock is released during the DB write, so a concurrent
        # take_archived_details() could pop this entry while the write is in
        # flight; the pop below tolerates that, but a double persist is then
        # possible — confirm the repository write is idempotent per surgery_id.
        ok = await self._write_to_db(surgery_id, entry.details)
        if not ok:
            # NOTE(review): incremented without holding the lock; attempts only
            # drives backoff, so a lost update merely delays the next retry.
            entry.attempts += 1
            return False
        async with self._lock:
            removed = self._archive.pop(surgery_id, None)
        if removed is not None and removed.durable_path is not None:
            # Clean up the on-disk fallback now that the DB holds the data.
            self._safe_remove(removed.durable_path)
        logger.info("Archive persisted after retry surgery_id={}", surgery_id)
        return True
|
|
|
|
|
|
|
|
|
|
|
|
async def start_retry_loop(self) -> None:
|
|
|
|
|
|
if self._retry_task is not None and not self._retry_task.done():
|
|
|
|
|
|
return
|
|
|
|
|
|
self._retry_stop.clear()
|
|
|
|
|
|
self._retry_task = asyncio.create_task(
|
|
|
|
|
|
self._retry_loop(),
|
|
|
|
|
|
name="archive_persist_retry",
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
async def shutdown(self) -> None:
|
|
|
|
|
|
self._retry_stop.set()
|
|
|
|
|
|
if self._retry_task is not None:
|
|
|
|
|
|
self._retry_task.cancel()
|
|
|
|
|
|
try:
|
|
|
|
|
|
await self._retry_task
|
|
|
|
|
|
except asyncio.CancelledError:
|
|
|
|
|
|
pass
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
logger.debug("archive retry shutdown: {}", exc)
|
|
|
|
|
|
self._retry_task = None
|
|
|
|
|
|
|
|
|
|
|
|
    async def recover_from_durable_fallback(self) -> int:
        """Called at process startup: load not-yet-persisted archives from the
        durable directory back into memory.

        Returns:
            Number of archive files loaded into the in-memory archive.
        """
        if not bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            return 0
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        if not directory.exists():
            return 0
        loaded = 0
        # sorted() makes recovery order deterministic across runs.
        for path in sorted(directory.glob("*.json")):
            try:
                raw = json.loads(path.read_text(encoding="utf-8"))
                surgery_id = str(raw["surgery_id"])
                details = _deserialize_details(list(raw.get("details") or []))
            except Exception as exc:
                # One corrupt file must not abort recovery of the others.
                logger.warning("Skip unreadable durable archive {}: {}", path, exc)
                continue
            async with self._lock:
                if surgery_id in self._archive:
                    # In-memory state wins over the on-disk copy.
                    continue
                self._archive[surgery_id] = _ArchiveEntry(
                    details=details,
                    # Keep the path so the file can be removed once persisted.
                    durable_path=path,
                )
                loaded += 1
        if loaded:
            logger.warning(
                "Recovered {} durable archive(s) from {}; retry loop will attempt persist",
                loaded,
                directory,
            )
        return loaded
|
|
|
|
|
|
|
|
|
|
|
|
async def _write_to_db(
|
|
|
|
|
|
self,
|
|
|
|
|
|
surgery_id: str,
|
|
|
|
|
|
details: list[SurgeryConsumptionStored],
|
|
|
|
|
|
) -> bool:
|
|
|
|
|
|
if self._repo is None:
|
|
|
|
|
|
return True
|
|
|
|
|
|
try:
|
|
|
|
|
|
async with self._session_factory() as session:
|
|
|
|
|
|
async with session.begin():
|
|
|
|
|
|
await self._repo.save_final_result(
|
|
|
|
|
|
session,
|
|
|
|
|
|
surgery_id=surgery_id,
|
|
|
|
|
|
details=list(details),
|
|
|
|
|
|
)
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
logger.warning(
|
|
|
|
|
|
"Persist surgery {} failed (will archive/retry): {}", surgery_id, exc
|
|
|
|
|
|
)
|
|
|
|
|
|
return False
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
    def _write_durable(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> Path | None:
        """Write the archive as a JSON file so it survives a process restart.

        Best effort: returns the file path on success, ``None`` on any failure
        (callers then rely on the in-memory archive alone).
        """
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        try:
            directory.mkdir(parents=True, exist_ok=True)
        except Exception as exc:
            logger.warning("mkdir durable archive dir {} failed: {}", directory, exc)
            return None
        # One file per surgery; a rewrite for the same id overwrites in place.
        path = directory / f"{surgery_id}.json"
        payload = {
            "surgery_id": surgery_id,
            "saved_at": datetime.now(timezone.utc).isoformat(),
            "details": _serialize_details(details),
        }
        try:
            # Write to a temp file then rename: os.replace is atomic (POSIX),
            # so a reader never observes a half-written JSON file.
            tmp = path.with_suffix(".json.tmp")
            tmp.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            os.replace(tmp, path)
            return path
        except Exception as exc:
            logger.warning("write durable archive {} failed: {}", path, exc)
            return None
|
|
|
|
|
|
|
|
|
|
|
|
def _safe_remove(self, path: Path) -> None:
|
|
|
|
|
|
try:
|
|
|
|
|
|
path.unlink(missing_ok=True)
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
logger.debug("remove durable archive {} failed: {}", path, exc)
|
|
|
|
|
|
|
|
|
|
|
|
def _next_backoff_seconds(self, attempts: int) -> float:
|
2026-04-24 15:33:22 +08:00
|
|
|
|
base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
|
|
|
|
|
|
cap = float(bp.ARCHIVE_PERSIST_BACKOFF_CAP_SECONDS)
|
2026-04-23 20:42:21 +08:00
|
|
|
|
# 指数退避:base * 2^(attempts-1),首个间隔即 base。
|
|
|
|
|
|
exp = max(0, attempts - 1)
|
|
|
|
|
|
return min(cap, base * (2**exp))
|
|
|
|
|
|
|
|
|
|
|
|
    async def _retry_loop(self) -> None:
        """Background loop: periodically retry persisting archived entries.

        Wakes every ``base`` seconds (or immediately when stop is signalled),
        then attempts every due entry; per-entry exponential backoff is tracked
        via ``next_attempt_monotonic`` on the entry itself.
        """
        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
        max_attempts = int(bp.ARCHIVE_PERSIST_MAX_RETRIES)
        while not self._retry_stop.is_set():
            try:
                await asyncio.wait_for(self._retry_stop.wait(), timeout=base)
                # wait() returned (no timeout) -> stop was requested.
                break
            except TimeoutError:
                # NOTE(review): on Python < 3.11 asyncio.wait_for raises
                # asyncio.TimeoutError, which is NOT the builtin TimeoutError;
                # confirm the runtime is 3.11+ or this clause never fires.
                pass

            loop = asyncio.get_running_loop()
            now = loop.time()
            # Snapshot current entries; subsequent attempts may mutate _archive.
            async with self._lock:
                entries = [(sid, ent) for sid, ent in self._archive.items()]

            for surgery_id, entry in entries:
                if self._retry_stop.is_set():
                    break
                if entry.attempts >= max_attempts:
                    # Retry budget exhausted: stop auto-retrying, wait for a
                    # process restart or manual intervention.
                    continue
                if entry.next_attempt_monotonic > now:
                    # Still inside this entry's backoff window.
                    continue
                ok = await self.try_persist_archive(surgery_id)
                if not ok:
                    # Failure: push the next attempt out by the backoff interval.
                    async with self._lock:
                        current = self._archive.get(surgery_id)
                        if current is not None:
                            current.next_attempt_monotonic = now + self._next_backoff_seconds(
                                current.attempts
                            )
                            if current.attempts >= max_attempts:
                                logger.error(
                                    "Archive persist exhausted retries surgery_id={} "
                                    "attempts={}; durable={} kept for manual recovery",
                                    surgery_id,
                                    current.attempts,
                                    bool(current.durable_path),
                                )
|