feat: 手术视频消耗、待确认与持久化改造
- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路
- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理
- 消耗日志:TSV/Markdown 含 top2/top3;item_id 优先产品编码;待确认记「待确认」行,语音确认后落正式行并更新汇总
- 待确认时内存/DB 明细为占位行,确认后替换;拒绝时移除占位
- 分类 probs 先 detach/cpu 再转 NumPy,修复 MPS/CUDA 上推理被静默跳过
- 补充集成测试、归档与设备张量等单测

Made-with: Cursor
This commit is contained in:
330
app/services/video/archive_persister.py
Normal file
330
app/services/video/archive_persister.py
Normal file
@@ -0,0 +1,330 @@
|
||||
"""手术归档持久化:写库失败后的内存归档 + 指数退避重试 + durable fallback。
|
||||
|
||||
设计目标:
|
||||
- ``CameraSessionManager`` 停录后把「待落库明细」交给本模块,不再自行持有重试状态。
|
||||
- 首次写库失败时:
|
||||
1. 将归档放入内存 ``_archive`` 以便下次重试。
|
||||
2. 若开启 durable fallback,同步写一个 JSON 文件到磁盘,进程重启后可从中恢复。
|
||||
- 后台循环以指数退避 + 最大重试次数的方式尝试把内存中的归档写库成功。达到上限仍失败时记
|
||||
告警并保留 durable 文件,等待人工介入。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from loguru import logger
|
||||
from sqlalchemy.ext.asyncio import async_sessionmaker
|
||||
|
||||
from app.config import Settings
|
||||
from app.domain.consumption import SurgeryConsumptionStored
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from app.repositories.surgery_results import SurgeryResultRepository
|
||||
|
||||
|
||||
@dataclass
class _ArchiveEntry:
    """In-memory archive entry; tracks the attempt count that drives exponential backoff."""

    # Detail rows still waiting for a successful DB write.
    details: list[SurgeryConsumptionStored]
    # Number of failed persist attempts so far (drives the backoff interval).
    attempts: int = 0
    # Earliest event-loop time (loop.time()) at which the next retry may run; 0.0 = retry ASAP.
    next_attempt_monotonic: float = 0.0
    # Path of the durable JSON fallback file, if one was written for this entry.
    durable_path: Path | None = None
|
||||
|
||||
|
||||
def _serialize_details(details: list[SurgeryConsumptionStored]) -> list[dict]:
|
||||
return [
|
||||
{
|
||||
"item_id": d.item_id,
|
||||
"item_name": d.item_name,
|
||||
"qty": d.qty,
|
||||
"doctor_id": d.doctor_id,
|
||||
"timestamp": d.timestamp.isoformat(),
|
||||
"source": d.source,
|
||||
"pending_confirmation_id": d.pending_confirmation_id,
|
||||
}
|
||||
for d in details
|
||||
]
|
||||
|
||||
|
||||
def _deserialize_details(rows: list[dict]) -> list[SurgeryConsumptionStored]:
    """Rebuild detail records from durable-fallback JSON rows.

    Tolerant of bad data: an unparsable timestamp falls back to "now" (UTC),
    and a missing ``pending_confirmation_id`` is recovered from a
    ``pending:``-prefixed ``item_id`` written by placeholder rows.
    """
    out: list[SurgeryConsumptionStored] = []
    for r in rows:
        ts_raw = r["timestamp"]
        try:
            ts = datetime.fromisoformat(ts_raw)
        except (TypeError, ValueError):
            # TypeError covers a non-string value in a corrupted or hand-edited
            # file; previously only ValueError was caught, so a bad type made the
            # caller skip the whole file instead of salvaging the row.
            ts = datetime.now(timezone.utc)
        iid = str(r["item_id"])
        pend = r.get("pending_confirmation_id")
        if pend is None and iid.startswith("pending:"):
            # Older payloads encoded the pending id inside item_id.
            pend = iid.removeprefix("pending:")
        out.append(
            SurgeryConsumptionStored(
                item_id=iid,
                item_name=str(r["item_name"]),
                qty=int(r["qty"]),
                doctor_id=str(r["doctor_id"]),
                timestamp=ts,
                source=str(r.get("source", "vision")),
                pending_confirmation_id=pend,
            )
        )
    return out
|
||||
|
||||
|
||||
class ArchivePersister:
    """Persist end-of-surgery detail rows to the DB, with retry on failure.

    On the first failed write, the details are kept in an in-memory archive and,
    if enabled, mirrored to a durable JSON file on disk so they survive a
    process restart.  A background loop retries the DB write with exponential
    backoff up to a configured maximum; once the budget is exhausted the
    durable file is kept for manual recovery.
    """

    def __init__(
        self,
        *,
        settings: Settings,
        repository: "SurgeryResultRepository | None",
        session_factory: async_sessionmaker,
    ) -> None:
        # Settings supply the retry interval, backoff cap, max retries and the
        # durable-fallback switch/directory.
        self._s = settings
        # May be None (DB persistence disabled); writes are then no-op successes.
        self._repo = repository
        self._session_factory = session_factory
        # surgery_id -> entry awaiting a successful DB write.
        self._archive: dict[str, _ArchiveEntry] = {}
        self._lock = asyncio.Lock()
        self._retry_task: asyncio.Task[None] | None = None
        self._retry_stop = asyncio.Event()

    @property
    def repository(self) -> "SurgeryResultRepository | None":
        return self._repo

    @property
    def has_pending(self) -> bool:
        """True when at least one archived entry still awaits persistence."""
        return bool(self._archive)

    def archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """Fallback read for the API layer: in-memory archive only, no DB access."""
        entry = self._archive.get(surgery_id)
        if entry is None:
            return None
        # Return a copy so callers cannot mutate the archived state.
        return list(entry.details)

    async def take_archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """Pop an archive entry (forced persist / hand-off before a surgery id is reused)."""
        async with self._lock:
            entry = self._archive.pop(surgery_id, None)
        if entry is None:
            return None
        return list(entry.details)

    async def restore(self, surgery_id: str, details: list[SurgeryConsumptionStored]) -> None:
        """Put previously popped details back (e.g. the forced persist failed again)."""
        async with self._lock:
            self._archive[surgery_id] = _ArchiveEntry(details=list(details))

    async def persist_or_archive(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        """Try an immediate DB write; on failure archive in memory (+ durable file).

        Returns True when persisted, False when archived for background retry.
        """
        if await self._write_to_db(surgery_id, details):
            return True
        entry = _ArchiveEntry(details=list(details))
        if self._s.archive_persist_durable_fallback_enabled:
            entry.durable_path = self._write_durable(surgery_id, details)
        async with self._lock:
            self._archive[surgery_id] = entry
        logger.error(
            "Surgery {} final result kept in memory archive (durable={}); "
            "background retry will attempt persist",
            surgery_id,
            bool(entry.durable_path),
        )
        return False

    async def try_persist_archive(self, surgery_id: str) -> bool:
        """Try to flush one archived entry; clean up memory + durable file on success."""
        async with self._lock:
            entry = self._archive.get(surgery_id)
        if entry is None:
            # Nothing pending for this surgery -> treated as success.
            return True
        if self._repo is None:
            # No repository configured: the entry must stay archived.
            return False
        ok = await self._write_to_db(surgery_id, entry.details)
        if not ok:
            entry.attempts += 1
            return False
        async with self._lock:
            removed = self._archive.pop(surgery_id, None)
        if removed is not None and removed.durable_path is not None:
            self._safe_remove(removed.durable_path)
        logger.info("Archive persisted after retry surgery_id={}", surgery_id)
        return True

    async def start_retry_loop(self) -> None:
        """Start the background retry task (no-op while one is already running)."""
        if self._retry_task is not None and not self._retry_task.done():
            return
        self._retry_stop.clear()
        self._retry_task = asyncio.create_task(
            self._retry_loop(),
            name="archive_persist_retry",
        )

    async def shutdown(self) -> None:
        """Stop and await the retry task; safe to call when it was never started."""
        self._retry_stop.set()
        if self._retry_task is not None:
            self._retry_task.cancel()
            try:
                await self._retry_task
            except asyncio.CancelledError:
                pass
            except Exception as exc:
                # Best-effort shutdown: never let a retry failure escape here.
                logger.debug("archive retry shutdown: {}", exc)
        self._retry_task = None

    async def recover_from_durable_fallback(self) -> int:
        """At process start: load not-yet-persisted archives back from the durable dir.

        Returns the number of entries recovered into memory.
        """
        if not self._s.archive_persist_durable_fallback_enabled:
            return 0
        directory = Path(self._s.archive_persist_durable_fallback_dir)
        if not directory.exists():
            return 0
        loaded = 0
        for path in sorted(directory.glob("*.json")):
            try:
                raw = json.loads(path.read_text(encoding="utf-8"))
                surgery_id = str(raw["surgery_id"])
                details = _deserialize_details(list(raw.get("details") or []))
            except Exception as exc:
                logger.warning("Skip unreadable durable archive {}: {}", path, exc)
                continue
            async with self._lock:
                if surgery_id in self._archive:
                    # A live in-memory entry wins over the on-disk file.
                    continue
                self._archive[surgery_id] = _ArchiveEntry(
                    details=details,
                    durable_path=path,
                )
                loaded += 1
        if loaded:
            logger.warning(
                "Recovered {} durable archive(s) from {}; retry loop will attempt persist",
                loaded,
                directory,
            )
        return loaded

    async def _write_to_db(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        """Write all details in one transaction; True on success (or when no repo is configured)."""
        if self._repo is None:
            # Persistence disabled: report success so nothing gets archived.
            return True
        try:
            async with self._session_factory() as session:
                async with session.begin():
                    await self._repo.save_final_result(
                        session,
                        surgery_id=surgery_id,
                        details=list(details),
                    )
        except Exception as exc:
            logger.warning(
                "Persist surgery {} failed (will archive/retry): {}", surgery_id, exc
            )
            return False
        return True

    def _write_durable(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> Path | None:
        """Atomically write the durable JSON fallback; return its path, or None on failure."""
        directory = Path(self._s.archive_persist_durable_fallback_dir)
        try:
            directory.mkdir(parents=True, exist_ok=True)
        except Exception as exc:
            logger.warning("mkdir durable archive dir {} failed: {}", directory, exc)
            return None
        path = directory / f"{surgery_id}.json"
        payload = {
            "surgery_id": surgery_id,
            "saved_at": datetime.now(timezone.utc).isoformat(),
            "details": _serialize_details(details),
        }
        try:
            # Write to a temp file then rename: readers never observe a partial file.
            tmp = path.with_suffix(".json.tmp")
            tmp.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            os.replace(tmp, path)
            return path
        except Exception as exc:
            logger.warning("write durable archive {} failed: {}", path, exc)
            return None

    def _safe_remove(self, path: Path) -> None:
        """Delete a durable file, ignoring (but debug-logging) any error."""
        try:
            path.unlink(missing_ok=True)
        except Exception as exc:
            logger.debug("remove durable archive {} failed: {}", path, exc)

    def _next_backoff_seconds(self, attempts: int) -> float:
        """Exponential backoff: base * 2^(attempts-1), capped; first interval equals base."""
        base = float(self._s.archive_persist_retry_interval_seconds)
        cap = float(self._s.archive_persist_backoff_cap_seconds)
        exp = max(0, attempts - 1)
        return min(cap, base * (2**exp))

    async def _retry_loop(self) -> None:
        """Background loop: wake every ``base`` seconds and retry due entries until stopped."""
        base = float(self._s.archive_persist_retry_interval_seconds)
        max_attempts = int(self._s.archive_persist_max_retries)
        while not self._retry_stop.is_set():
            try:
                await asyncio.wait_for(self._retry_stop.wait(), timeout=base)
                break  # stop event set -> leave the loop promptly
            except asyncio.TimeoutError:
                # BUGFIX: asyncio.wait_for raises asyncio.TimeoutError, which is
                # only an alias of the builtin TimeoutError on Python 3.11+.
                # Catching the builtin name (as before) let the timeout escape on
                # 3.8-3.10 and killed the retry task; the asyncio name is correct
                # on every supported version.
                pass

            loop = asyncio.get_running_loop()
            now = loop.time()
            # Snapshot current entries; subsequent attempts may mutate _archive.
            async with self._lock:
                entries = list(self._archive.items())

            for surgery_id, entry in entries:
                if self._retry_stop.is_set():
                    break
                if entry.attempts >= max_attempts:
                    # Retry budget exhausted: wait for restart or manual recovery.
                    continue
                if entry.next_attempt_monotonic > now:
                    continue
                ok = await self.try_persist_archive(surgery_id)
                if not ok:
                    # Failure: push the next attempt out by the backoff interval.
                    async with self._lock:
                        current = self._archive.get(surgery_id)
                        if current is not None:
                            current.next_attempt_monotonic = now + self._next_backoff_seconds(
                                current.attempts
                            )
                            if current.attempts >= max_attempts:
                                logger.error(
                                    "Archive persist exhausted retries surgery_id={} "
                                    "attempts={}; durable={} kept for manual recovery",
                                    surgery_id,
                                    current.attempts,
                                    bool(current.durable_path),
                                )
|
||||
Reference in New Issue
Block a user