"""手术归档持久化:写库失败后的内存归档 + 指数退避重试 + durable fallback。
|
||
|
||
设计目标:
|
||
- ``CameraSessionManager`` 停录后把「待落库明细」交给本模块,不再自行持有重试状态。
|
||
- 首次写库失败时:
|
||
1. 将归档放入内存 ``_archive`` 以便下次重试。
|
||
2. 若开启 durable fallback,同步写一个 JSON 文件到磁盘,进程重启后可从中恢复。
|
||
- 后台循环以指数退避 + 最大重试次数的方式尝试把内存中的归档写库成功。达到上限仍失败时记
|
||
告警并保留 durable 文件,等待人工介入。
|
||
"""
|
||
|
||
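
# Typical end-of-surgery call path (a sketch; ``repo`` and ``session_factory``
# are assumed to be provided by the host application):
#
#     persister = ArchivePersister(repository=repo, session_factory=session_factory)
#     await persister.start_retry_loop()
#     ok = await persister.persist_or_archive(surgery_id, details)
#     # ok=False means the details were archived and will be retried in the background.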

from __future__ import annotations

import asyncio
import json
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING

from loguru import logger
from sqlalchemy.ext.asyncio import async_sessionmaker

from app.baked import pipeline as bp
from app.domain.consumption import SurgeryConsumptionStored

if TYPE_CHECKING:
    from app.repositories.surgery_results import SurgeryResultRepository


@dataclass
class _ArchiveEntry:
    """In-memory archive entry; tracks the attempt count that drives exponential backoff."""

    details: list[SurgeryConsumptionStored]
    attempts: int = 0
    next_attempt_monotonic: float = 0.0
    durable_path: Path | None = None


def _serialize_details(details: list[SurgeryConsumptionStored]) -> list[dict]:
    return [
        {
            "item_id": d.item_id,
            "item_name": d.item_name,
            "qty": d.qty,
            "doctor_id": d.doctor_id,
            "timestamp": d.timestamp.isoformat(),
            "source": d.source,
            "pending_confirmation_id": d.pending_confirmation_id,
        }
        for d in details
    ]
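
# Shape of one serialized row (illustrative values only), as stored in the
# durable JSON file and read back by _deserialize_details below. Note that a
# missing "pending_confirmation_id" is recovered from a "pending:<id>" item_id:
#
#     {
#         "item_id": "pending:42",
#         "item_name": "gauze pack",
#         "qty": 2,
#         "doctor_id": "d-001",
#         "timestamp": "2025-01-01T08:30:00+00:00",
#         "source": "vision",
#         "pending_confirmation_id": null
#     }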


def _deserialize_details(rows: list[dict]) -> list[SurgeryConsumptionStored]:
    out: list[SurgeryConsumptionStored] = []
    for r in rows:
        ts_raw = r["timestamp"]
        try:
            ts = datetime.fromisoformat(ts_raw)
        except (TypeError, ValueError):
            # TypeError covers a corrupted file where "timestamp" is not a string.
            ts = datetime.now(timezone.utc)
        iid = str(r["item_id"])
        pend = r.get("pending_confirmation_id")
        if pend is None and iid.startswith("pending:"):
            pend = iid.removeprefix("pending:")
        out.append(
            SurgeryConsumptionStored(
                item_id=iid,
                item_name=str(r["item_name"]),
                qty=int(r["qty"]),
                doctor_id=str(r["doctor_id"]),
                timestamp=ts,
                source=str(r.get("source", "vision")),
                pending_confirmation_id=pend,
            )
        )
    return out


class ArchivePersister:
    """Persist end-of-surgery details to the DB; on failure, fall back to
    backoff retries plus an optional durable fallback.
    """

    def __init__(
        self,
        *,
        repository: "SurgeryResultRepository | None",
        session_factory: async_sessionmaker,
    ) -> None:
        self._repo = repository
        self._session_factory = session_factory
        self._archive: dict[str, _ArchiveEntry] = {}
        self._lock = asyncio.Lock()
        self._retry_task: asyncio.Task[None] | None = None
        self._retry_stop = asyncio.Event()

    @property
    def repository(self) -> "SurgeryResultRepository | None":
        return self._repo

    @property
    def has_pending(self) -> bool:
        return bool(self._archive)

    def archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """Fallback lookup for the API: read the in-memory archive without touching the DB."""
        entry = self._archive.get(surgery_id)
        if entry is None:
            return None
        return list(entry.details)

    async def take_archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """Pop an archive (for a forced persist / hand-off before the same surgery id restarts)."""
        async with self._lock:
            entry = self._archive.pop(surgery_id, None)
        if entry is None:
            return None
        return list(entry.details)

    async def restore(self, surgery_id: str, details: list[SurgeryConsumptionStored]) -> None:
        """Put a previously popped archive back (e.g. when a forced persist fails again)."""
        async with self._lock:
            self._archive[surgery_id] = _ArchiveEntry(details=list(details))
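
    # Sketch of the pop/restore hand-off the two methods above enable; the
    # caller-side write ``do_external_write`` is hypothetical:
    #
    #     details = await persister.take_archived_details(surgery_id)
    #     if details is not None:
    #         try:
    #             await do_external_write(details)
    #         except Exception:
    #             await persister.restore(surgery_id, details)  # put it back for retry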

    async def persist_or_archive(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        """Try an immediate DB write; on failure, archive in memory and, if
        configured, write the durable fallback.
        """
        if await self._write_to_db(surgery_id, details):
            return True
        entry = _ArchiveEntry(details=list(details))
        if bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            entry.durable_path = self._write_durable(surgery_id, details)
        async with self._lock:
            self._archive[surgery_id] = entry
        logger.error(
            "Surgery {} final result kept in memory archive (durable={}); "
            "background retry will attempt persist",
            surgery_id,
            bool(entry.durable_path),
        )
        return False

    async def try_persist_archive(self, surgery_id: str) -> bool:
        """Try to persist one in-memory archive to the DB; on success, clean up
        both the memory entry and the durable file.
        """
        async with self._lock:
            entry = self._archive.get(surgery_id)
            if entry is None:
                return True
            if self._repo is None:
                return False
        ok = await self._write_to_db(surgery_id, entry.details)
        if not ok:
            entry.attempts += 1
            return False
        async with self._lock:
            removed = self._archive.pop(surgery_id, None)
        if removed is not None and removed.durable_path is not None:
            self._safe_remove(removed.durable_path)
        logger.info("Archive persisted after retry surgery_id={}", surgery_id)
        return True

    async def start_retry_loop(self) -> None:
        if self._retry_task is not None and not self._retry_task.done():
            return
        self._retry_stop.clear()
        self._retry_task = asyncio.create_task(
            self._retry_loop(),
            name="archive_persist_retry",
        )

    async def shutdown(self) -> None:
        self._retry_stop.set()
        if self._retry_task is not None:
            self._retry_task.cancel()
            try:
                await self._retry_task
            except asyncio.CancelledError:
                pass
            except Exception as exc:
                logger.debug("archive retry shutdown: {}", exc)
            self._retry_task = None

    async def recover_from_durable_fallback(self) -> int:
        """Call at process startup: load archives that never reached the DB from
        the durable directory back into memory.
        """
        if not bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            return 0
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        if not directory.exists():
            return 0
        loaded = 0
        for path in sorted(directory.glob("*.json")):
            try:
                raw = json.loads(path.read_text(encoding="utf-8"))
                surgery_id = str(raw["surgery_id"])
                details = _deserialize_details(list(raw.get("details") or []))
            except Exception as exc:
                logger.warning("Skip unreadable durable archive {}: {}", path, exc)
                continue
            async with self._lock:
                if surgery_id in self._archive:
                    continue
                self._archive[surgery_id] = _ArchiveEntry(
                    details=details,
                    durable_path=path,
                )
                loaded += 1
        if loaded:
            logger.warning(
                "Recovered {} durable archive(s) from {}; retry loop will attempt persist",
                loaded,
                directory,
            )
        return loaded
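
    # Startup wiring sketch; the ``lifespan`` hook and ``persister`` handle are
    # assumed host-application details, not part of this module:
    #
    #     async def lifespan(app):
    #         await persister.recover_from_durable_fallback()
    #         await persister.start_retry_loop()
    #         yield
    #         await persister.shutdown()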

    async def _write_to_db(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        if self._repo is None:
            return True
        try:
            async with self._session_factory() as session:
                async with session.begin():
                    await self._repo.save_final_result(
                        session,
                        surgery_id=surgery_id,
                        details=list(details),
                    )
        except Exception as exc:
            logger.warning(
                "Persist surgery {} failed (will archive/retry): {}", surgery_id, exc
            )
            return False
        return True

    def _write_durable(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> Path | None:
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        try:
            directory.mkdir(parents=True, exist_ok=True)
        except Exception as exc:
            logger.warning("mkdir durable archive dir {} failed: {}", directory, exc)
            return None
        path = directory / f"{surgery_id}.json"
        payload = {
            "surgery_id": surgery_id,
            "saved_at": datetime.now(timezone.utc).isoformat(),
            "details": _serialize_details(details),
        }
        try:
            # Write to a temp file first, then promote atomically via os.replace,
            # so a crash never leaves a partially written archive behind.
            tmp = path.with_suffix(".json.tmp")
            tmp.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            os.replace(tmp, path)
            return path
        except Exception as exc:
            logger.warning("write durable archive {} failed: {}", path, exc)
            return None
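
    # Example (with an assumed config value): if
    # bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR == "var/archive_fallback", surgery
    # "S-007" is staged at var/archive_fallback/S-007.json.tmp and then renamed
    # to var/archive_fallback/S-007.json, which recover_from_durable_fallback
    # later picks up via directory.glob("*.json").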

    def _safe_remove(self, path: Path) -> None:
        try:
            path.unlink(missing_ok=True)
        except Exception as exc:
            logger.debug("remove durable archive {} failed: {}", path, exc)

    def _next_backoff_seconds(self, attempts: int) -> float:
        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
        cap = float(bp.ARCHIVE_PERSIST_BACKOFF_CAP_SECONDS)
        # Exponential backoff: base * 2^(attempts - 1); the first interval is base.
        exp = max(0, attempts - 1)
        return min(cap, base * (2**exp))
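
    # Worked example, assuming base = 5s and cap = 60s (illustrative values for
    # ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS / ARCHIVE_PERSIST_BACKOFF_CAP_SECONDS):
    #     attempts=1 -> 5s, 2 -> 10s, 3 -> 20s, 4 -> 40s, 5 and beyond -> 60s (capped)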

    async def _retry_loop(self) -> None:
        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
        max_attempts = int(bp.ARCHIVE_PERSIST_MAX_RETRIES)
        while not self._retry_stop.is_set():
            try:
                await asyncio.wait_for(self._retry_stop.wait(), timeout=base)
                break
            except asyncio.TimeoutError:
                # asyncio.TimeoutError also works on Python < 3.11, where it is
                # not yet an alias of the builtin TimeoutError.
                pass

            loop = asyncio.get_running_loop()
            now = loop.time()
            # Snapshot the current archive entries; the attempts below may
            # mutate the internal state of _archive.
            async with self._lock:
                entries = list(self._archive.items())

            for surgery_id, entry in entries:
                if self._retry_stop.is_set():
                    break
                if entry.attempts >= max_attempts:
                    # Limit reached: stop automatic retries and wait for a
                    # process restart or manual intervention.
                    continue
                if entry.next_attempt_monotonic > now:
                    continue
                ok = await self.try_persist_archive(surgery_id)
                if not ok:
                    # Failure: push the next attempt out by the backoff interval.
                    async with self._lock:
                        current = self._archive.get(surgery_id)
                        if current is not None:
                            current.next_attempt_monotonic = now + self._next_backoff_seconds(
                                current.attempts
                            )
                            if current.attempts >= max_attempts:
                                logger.error(
                                    "Archive persist exhausted retries surgery_id={} "
                                    "attempts={}; durable={} kept for manual recovery",
                                    surgery_id,
                                    current.attempts,
                                    bool(current.durable_path),
                                )