app/services/video/archive_persister.py

"""手术归档持久化：写库失败后的内存归档 + 指数退避重试 + durable fallback。

设计目标：
- ``CameraSessionManager`` 停录后把「待落库明细」交给本模块，不再自行持有重试状态。
- 首次写库失败时：
  1. 将归档放入内存 ``_archive`` 以便下次重试。
  2. 若开启 durable fallback，同步写一个 JSON 文件到磁盘，进程重启后可从中恢复。
- 后台循环以指数退避 + 最大重试次数的方式尝试把内存中的归档写库成功。达到上限仍失败时记
  告警并保留 durable 文件，等待人工介入。
"""

from __future__ import annotations

import asyncio
import json
import os
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING

from loguru import logger
from sqlalchemy.ext.asyncio import async_sessionmaker

from app.baked import pipeline as bp
from app.domain.consumption import SurgeryConsumptionStored

if TYPE_CHECKING:
    from app.repositories.surgery_results import SurgeryResultRepository


@dataclass
class _ArchiveEntry:
    """内存归档条目，记录尝试次数以驱动指数退避。"""

    details: list[SurgeryConsumptionStored]
    attempts: int = 0
    next_attempt_monotonic: float = 0.0
    durable_path: Path | None = None


def _serialize_details(details: list[SurgeryConsumptionStored]) -> list[dict]:
    return [
        {
            "item_id": d.item_id,
            "item_name": d.item_name,
            "qty": d.qty,
            "doctor_id": d.doctor_id,
            "timestamp": d.timestamp.isoformat(),
            "source": d.source,
            "pending_confirmation_id": d.pending_confirmation_id,
        }
        for d in details
    ]


def _deserialize_details(rows: list[dict]) -> list[SurgeryConsumptionStored]:
    out: list[SurgeryConsumptionStored] = []
    for r in rows:
        ts_raw = r["timestamp"]
        try:
            ts = datetime.fromisoformat(ts_raw)
        except ValueError:
            ts = datetime.now(timezone.utc)
        iid = str(r["item_id"])
        pend = r.get("pending_confirmation_id")
        if pend is None and iid.startswith("pending:"):
            pend = iid.removeprefix("pending:")
        out.append(
            SurgeryConsumptionStored(
                item_id=iid,
                item_name=str(r["item_name"]),
                qty=int(r["qty"]),
                doctor_id=str(r["doctor_id"]),
                timestamp=ts,
                source=str(r.get("source", "vision")),
                pending_confirmation_id=pend,
            )
        )
    return out


class ArchivePersister:
    """把手术结束明细写入 DB；失败时进入退避重试 + 可选 durable fallback。"""

    def __init__(
        self,
        *,
        repository: "SurgeryResultRepository | None",
        session_factory: async_sessionmaker,
    ) -> None:
        self._repo = repository
        self._session_factory = session_factory
        self._archive: dict[str, _ArchiveEntry] = {}
        self._lock = asyncio.Lock()
        self._retry_task: asyncio.Task[None] | None = None
        self._retry_stop = asyncio.Event()

    @property
    def repository(self) -> "SurgeryResultRepository | None":
        return self._repo

    @property
    def has_pending(self) -> bool:
        return bool(self._archive)

    def archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """供 API 回退查询：读取内存归档，不访问 DB。"""
        entry = self._archive.get(surgery_id)
        if entry is None:
            return None
        return list(entry.details)

    async def take_archived_details(
        self, surgery_id: str
    ) -> list[SurgeryConsumptionStored] | None:
        """弹出归档（用于同一手术号重新开始前的强制落库 / 移交）。"""
        async with self._lock:
            entry = self._archive.pop(surgery_id, None)
        if entry is None:
            return None
        return list(entry.details)

    async def restore(self, surgery_id: str, details: list[SurgeryConsumptionStored]) -> None:
        """把此前弹出的归档重新放回（比如「强制落库」再次失败时回退）。"""
        async with self._lock:
            self._archive[surgery_id] = _ArchiveEntry(details=list(details))

    async def persist_or_archive(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        """尝试立即写库；失败则放入内存归档，并按配置写入 durable fallback。"""
        if await self._write_to_db(surgery_id, details):
            return True
        entry = _ArchiveEntry(details=list(details))
        if bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            entry.durable_path = self._write_durable(surgery_id, details)
        async with self._lock:
            self._archive[surgery_id] = entry
        logger.error(
            "Surgery {} final result kept in memory archive (durable={}); "
            "background retry will attempt persist",
            surgery_id,
            bool(entry.durable_path),
        )
        return False

    async def try_persist_archive(self, surgery_id: str) -> bool:
        """尝试把一条内存归档写入数据库；成功则清理内存及 durable 文件。"""
        async with self._lock:
            entry = self._archive.get(surgery_id)
        if entry is None:
            return True
        if self._repo is None:
            return False
        ok = await self._write_to_db(surgery_id, entry.details)
        if not ok:
            entry.attempts += 1
            return False
        async with self._lock:
            removed = self._archive.pop(surgery_id, None)
        if removed is not None and removed.durable_path is not None:
            self._safe_remove(removed.durable_path)
        logger.info("Archive persisted after retry surgery_id={}", surgery_id)
        return True

    async def start_retry_loop(self) -> None:
        if self._retry_task is not None and not self._retry_task.done():
            return
        self._retry_stop.clear()
        self._retry_task = asyncio.create_task(
            self._retry_loop(),
            name="archive_persist_retry",
        )

    async def shutdown(self) -> None:
        self._retry_stop.set()
        if self._retry_task is not None:
            self._retry_task.cancel()
            try:
                await self._retry_task
            except asyncio.CancelledError:
                pass
            except Exception as exc:
                logger.debug("archive retry shutdown: {}", exc)
            self._retry_task = None

    async def recover_from_durable_fallback(self) -> int:
        """进程启动时调用：从 durable 目录把未写库的归档读回内存。"""
        if not bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
            return 0
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        if not directory.exists():
            return 0
        loaded = 0
        for path in sorted(directory.glob("*.json")):
            try:
                raw = json.loads(path.read_text(encoding="utf-8"))
                surgery_id = str(raw["surgery_id"])
                details = _deserialize_details(list(raw.get("details") or []))
            except Exception as exc:
                logger.warning("Skip unreadable durable archive {}: {}", path, exc)
                continue
            async with self._lock:
                if surgery_id in self._archive:
                    continue
                self._archive[surgery_id] = _ArchiveEntry(
                    details=details,
                    durable_path=path,
                )
            loaded += 1
        if loaded:
            logger.warning(
                "Recovered {} durable archive(s) from {}; retry loop will attempt persist",
                loaded,
                directory,
            )
        return loaded

    async def _write_to_db(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> bool:
        if self._repo is None:
            return True
        try:
            async with self._session_factory() as session:
                async with session.begin():
                    await self._repo.save_final_result(
                        session,
                        surgery_id=surgery_id,
                        details=list(details),
                    )
        except Exception as exc:
            logger.warning(
                "Persist surgery {} failed (will archive/retry): {}", surgery_id, exc
            )
            return False
        return True

    def _write_durable(
        self,
        surgery_id: str,
        details: list[SurgeryConsumptionStored],
    ) -> Path | None:
        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
        try:
            directory.mkdir(parents=True, exist_ok=True)
        except Exception as exc:
            logger.warning("mkdir durable archive dir {} failed: {}", directory, exc)
            return None
        path = directory / f"{surgery_id}.json"
        payload = {
            "surgery_id": surgery_id,
            "saved_at": datetime.now(timezone.utc).isoformat(),
            "details": _serialize_details(details),
        }
        try:
            tmp = path.with_suffix(".json.tmp")
            tmp.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            os.replace(tmp, path)
            return path
        except Exception as exc:
            logger.warning("write durable archive {} failed: {}", path, exc)
            return None

    def _safe_remove(self, path: Path) -> None:
        try:
            path.unlink(missing_ok=True)
        except Exception as exc:
            logger.debug("remove durable archive {} failed: {}", path, exc)

    def _next_backoff_seconds(self, attempts: int) -> float:
        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
        cap = float(bp.ARCHIVE_PERSIST_BACKOFF_CAP_SECONDS)
        # 指数退避：base * 2^(attempts-1)，首个间隔即 base。
        exp = max(0, attempts - 1)
        return min(cap, base * (2**exp))

    async def _retry_loop(self) -> None:
        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
        max_attempts = int(bp.ARCHIVE_PERSIST_MAX_RETRIES)
        while not self._retry_stop.is_set():
            try:
                await asyncio.wait_for(self._retry_stop.wait(), timeout=base)
                break
            except TimeoutError:
                pass

            loop = asyncio.get_running_loop()
            now = loop.time()
            # 快照当前归档条目；后续尝试可能改变 _archive 内部状态。
            async with self._lock:
                entries = [(sid, ent) for sid, ent in self._archive.items()]

            for surgery_id, entry in entries:
                if self._retry_stop.is_set():
                    break
                if entry.attempts >= max_attempts:
                    # 达到上限，放弃自动重试，等待进程重启或人工介入。
                    continue
                if entry.next_attempt_monotonic > now:
                    continue
                ok = await self.try_persist_archive(surgery_id)
                if not ok:
                    # 失败：更新退避时间
                    async with self._lock:
                        current = self._archive.get(surgery_id)
                        if current is not None:
                            current.next_attempt_monotonic = now + self._next_backoff_seconds(
                                current.attempts
                            )
                            if current.attempts >= max_attempts:
                                logger.error(
                                    "Archive persist exhausted retries surgery_id={} "
                                    "attempts={}; durable={} kept for manual recovery",
                                    surgery_id,
                                    current.attempts,
                                    bool(current.durable_path),
                                )
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路\n- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理\n- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总\n- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位\n- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过\n- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								"""手术归档持久化：写库失败后的内存归档 + 指数退避重试 + durable fallback。
 								设计目标：
 								- ``CameraSessionManager`` 停录后把「待落库明细」交给本模块，不再自行持有重试状态。
 								- 首次写库失败时：
 . 将归档放入内存 ``_archive`` 以便下次重试。
 . 若开启 durable fallback，同步写一个 JSON 文件到磁盘，进程重启后可从中恢复。
 								- 后台循环以指数退避 + 最大重试次数的方式尝试把内存中的归档写库成功。达到上限仍失败时记
 								  告警并保留 durable 文件，等待人工介入。
 								"""
 								from __future__ import annotations
 								import asyncio
 								import json
 								import os
 								from dataclasses import dataclass, field
 								from datetime import datetime, timezone
 								from pathlib import Path
 								from typing import TYPE_CHECKING
 								from loguru import logger
 								from sqlalchemy.ext.asyncio import async_sessionmaker
-												feat: 配置写死与 baked 模块，Alembic 建表，百度仅 BAIDU_*

- 新增 app/baked/algorithm|pipeline，非部署参数不再走 env；Settings 保留 DB/HTTP/RTSP/海康/百度/MinIO/Demo
- 移除 init_db_schema 与 reload 配置；main 仅 check_database；start*.sh 在 uvicorn 前执行 alembic upgrade head
- 依赖 psycopg[binary] 供 Alembic 同步 URL；alembic/env 注释与预发清单更新
- 撕段门控消费管线、各视频/语音/归档调用改为 baked
- 百度环境变量仅 BAIDU_APP_ID、BAIDU_API_KEY、BAIDU_SECRET_KEY 与 BAIDU_* 超时/ASR；人脸脚本与 baidu_speech 文案同步
- 全量单测与 .env.example 更新；.gitignore 忽略 refs/（本地权重/视频不入库）

Made-with: Cursor

											
										
										
											2026-04-24 15:33:22 +08:00
+								from app.baked import pipeline as bp
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路\n- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理\n- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总\n- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位\n- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过\n- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								from app.domain.consumption import SurgeryConsumptionStored
 								if TYPE_CHECKING:
 								    from app.repositories.surgery_results import SurgeryResultRepository
 								@dataclass
 								class _ArchiveEntry:
 								    """内存归档条目，记录尝试次数以驱动指数退避。"""
 								    details: list[SurgeryConsumptionStored]
 								    attempts: int = 0
 								    next_attempt_monotonic: float = 0.0
 								    durable_path: Path | None = None
 								def _serialize_details(details: list[SurgeryConsumptionStored]) -> list[dict]:
 								    return [
 								        {
 								            "item_id": d.item_id,
 								            "item_name": d.item_name,
 								            "qty": d.qty,
 								            "doctor_id": d.doctor_id,
 								            "timestamp": d.timestamp.isoformat(),
 								            "source": d.source,
 								            "pending_confirmation_id": d.pending_confirmation_id,
 								        }
 								        for d in details
 								    ]
 								def _deserialize_details(rows: list[dict]) -> list[SurgeryConsumptionStored]:
 								    out: list[SurgeryConsumptionStored] = []
 								    for r in rows:
 								        ts_raw = r["timestamp"]
 								        try:
 								            ts = datetime.fromisoformat(ts_raw)
 								        except ValueError:
 								            ts = datetime.now(timezone.utc)
 								        iid = str(r["item_id"])
 								        pend = r.get("pending_confirmation_id")
 								        if pend is None and iid.startswith("pending:"):
 								            pend = iid.removeprefix("pending:")
 								        out.append(
 								            SurgeryConsumptionStored(
 								                item_id=iid,
 								                item_name=str(r["item_name"]),
 								                qty=int(r["qty"]),
 								                doctor_id=str(r["doctor_id"]),
 								                timestamp=ts,
 								                source=str(r.get("source", "vision")),
 								                pending_confirmation_id=pend,
 								            )
 								        )
 								    return out
 								class ArchivePersister:
 								    """把手术结束明细写入 DB；失败时进入退避重试 + 可选 durable fallback。"""
 								    def __init__(
 								        self,
 								        *,
 								        repository: "SurgeryResultRepository | None",
 								        session_factory: async_sessionmaker,
 								    ) -> None:
 								        self._repo = repository
 								        self._session_factory = session_factory
 								        self._archive: dict[str, _ArchiveEntry] = {}
 								        self._lock = asyncio.Lock()
 								        self._retry_task: asyncio.Task[None] | None = None
 								        self._retry_stop = asyncio.Event()
 								    @property
 								    def repository(self) -> "SurgeryResultRepository | None":
 								        return self._repo
 								    @property
 								    def has_pending(self) -> bool:
 								        return bool(self._archive)
 								    def archived_details(
 								        self, surgery_id: str
 								    ) -> list[SurgeryConsumptionStored] | None:
 								        """供 API 回退查询：读取内存归档，不访问 DB。"""
 								        entry = self._archive.get(surgery_id)
 								        if entry is None:
 								            return None
 								        return list(entry.details)
 								    async def take_archived_details(
 								        self, surgery_id: str
 								    ) -> list[SurgeryConsumptionStored] | None:
 								        """弹出归档（用于同一手术号重新开始前的强制落库 / 移交）。"""
 								        async with self._lock:
 								            entry = self._archive.pop(surgery_id, None)
 								        if entry is None:
 								            return None
 								        return list(entry.details)
 								    async def restore(self, surgery_id: str, details: list[SurgeryConsumptionStored]) -> None:
 								        """把此前弹出的归档重新放回（比如「强制落库」再次失败时回退）。"""
 								        async with self._lock:
 								            self._archive[surgery_id] = _ArchiveEntry(details=list(details))
 								    async def persist_or_archive(
 								        self,
 								        surgery_id: str,
 								        details: list[SurgeryConsumptionStored],
 								    ) -> bool:
 								        """尝试立即写库；失败则放入内存归档，并按配置写入 durable fallback。"""
 								        if await self._write_to_db(surgery_id, details):
 								            return True
 								        entry = _ArchiveEntry(details=list(details))
-												feat: 配置写死与 baked 模块，Alembic 建表，百度仅 BAIDU_*

- 新增 app/baked/algorithm|pipeline，非部署参数不再走 env；Settings 保留 DB/HTTP/RTSP/海康/百度/MinIO/Demo
- 移除 init_db_schema 与 reload 配置；main 仅 check_database；start*.sh 在 uvicorn 前执行 alembic upgrade head
- 依赖 psycopg[binary] 供 Alembic 同步 URL；alembic/env 注释与预发清单更新
- 撕段门控消费管线、各视频/语音/归档调用改为 baked
- 百度环境变量仅 BAIDU_APP_ID、BAIDU_API_KEY、BAIDU_SECRET_KEY 与 BAIDU_* 超时/ASR；人脸脚本与 baidu_speech 文案同步
- 全量单测与 .env.example 更新；.gitignore 忽略 refs/（本地权重/视频不入库）

Made-with: Cursor

											
										
										
											2026-04-24 15:33:22 +08:00
+								        if bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路\n- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理\n- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总\n- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位\n- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过\n- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								            entry.durable_path = self._write_durable(surgery_id, details)
 								        async with self._lock:
 								            self._archive[surgery_id] = entry
 								        logger.error(
 								            "Surgery {} final result kept in memory archive (durable={}); "
 								            "background retry will attempt persist",
 								            surgery_id,
 								            bool(entry.durable_path),
 								        )
 								        return False
 								    async def try_persist_archive(self, surgery_id: str) -> bool:
 								        """尝试把一条内存归档写入数据库；成功则清理内存及 durable 文件。"""
 								        async with self._lock:
 								            entry = self._archive.get(surgery_id)
 								        if entry is None:
 								            return True
 								        if self._repo is None:
 								            return False
 								        ok = await self._write_to_db(surgery_id, entry.details)
 								        if not ok:
 								            entry.attempts += 1
 								            return False
 								        async with self._lock:
 								            removed = self._archive.pop(surgery_id, None)
 								        if removed is not None and removed.durable_path is not None:
 								            self._safe_remove(removed.durable_path)
 								        logger.info("Archive persisted after retry surgery_id={}", surgery_id)
 								        return True
 								    async def start_retry_loop(self) -> None:
 								        if self._retry_task is not None and not self._retry_task.done():
 								            return
 								        self._retry_stop.clear()
 								        self._retry_task = asyncio.create_task(
 								            self._retry_loop(),
 								            name="archive_persist_retry",
 								        )
 								    async def shutdown(self) -> None:
 								        self._retry_stop.set()
 								        if self._retry_task is not None:
 								            self._retry_task.cancel()
 								            try:
 								                await self._retry_task
 								            except asyncio.CancelledError:
 								                pass
 								            except Exception as exc:
 								                logger.debug("archive retry shutdown: {}", exc)
 								            self._retry_task = None
 								    async def recover_from_durable_fallback(self) -> int:
 								        """进程启动时调用：从 durable 目录把未写库的归档读回内存。"""
-												feat: 配置写死与 baked 模块，Alembic 建表，百度仅 BAIDU_*

- 新增 app/baked/algorithm|pipeline，非部署参数不再走 env；Settings 保留 DB/HTTP/RTSP/海康/百度/MinIO/Demo
- 移除 init_db_schema 与 reload 配置；main 仅 check_database；start*.sh 在 uvicorn 前执行 alembic upgrade head
- 依赖 psycopg[binary] 供 Alembic 同步 URL；alembic/env 注释与预发清单更新
- 撕段门控消费管线、各视频/语音/归档调用改为 baked
- 百度环境变量仅 BAIDU_APP_ID、BAIDU_API_KEY、BAIDU_SECRET_KEY 与 BAIDU_* 超时/ASR；人脸脚本与 baidu_speech 文案同步
- 全量单测与 .env.example 更新；.gitignore 忽略 refs/（本地权重/视频不入库）

Made-with: Cursor

											
										
										
											2026-04-24 15:33:22 +08:00
+								        if not bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_ENABLED:
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路\n- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理\n- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总\n- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位\n- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过\n- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								            return 0
-												feat: 配置写死与 baked 模块，Alembic 建表，百度仅 BAIDU_*

- 新增 app/baked/algorithm|pipeline，非部署参数不再走 env；Settings 保留 DB/HTTP/RTSP/海康/百度/MinIO/Demo
- 移除 init_db_schema 与 reload 配置；main 仅 check_database；start*.sh 在 uvicorn 前执行 alembic upgrade head
- 依赖 psycopg[binary] 供 Alembic 同步 URL；alembic/env 注释与预发清单更新
- 撕段门控消费管线、各视频/语音/归档调用改为 baked
- 百度环境变量仅 BAIDU_APP_ID、BAIDU_API_KEY、BAIDU_SECRET_KEY 与 BAIDU_* 超时/ASR；人脸脚本与 baidu_speech 文案同步
- 全量单测与 .env.example 更新；.gitignore 忽略 refs/（本地权重/视频不入库）

Made-with: Cursor

											
										
										
											2026-04-24 15:33:22 +08:00
+								        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路\n- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理\n- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总\n- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位\n- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过\n- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								        if not directory.exists():
 								            return 0
 								        loaded = 0
 								        for path in sorted(directory.glob("*.json")):
 								            try:
 								                raw = json.loads(path.read_text(encoding="utf-8"))
 								                surgery_id = str(raw["surgery_id"])
 								                details = _deserialize_details(list(raw.get("details") or []))
 								            except Exception as exc:
 								                logger.warning("Skip unreadable durable archive {}: {}", path, exc)
 								                continue
 								            async with self._lock:
 								                if surgery_id in self._archive:
 								                    continue
 								                self._archive[surgery_id] = _ArchiveEntry(
 								                    details=details,
 								                    durable_path=path,
 								                )
 								            loaded += 1
 								        if loaded:
 								            logger.warning(
 								                "Recovered {} durable archive(s) from {}; retry loop will attempt persist",
 								                loaded,
 								                directory,
 								            )
 								        return loaded
 								    async def _write_to_db(
 								        self,
 								        surgery_id: str,
 								        details: list[SurgeryConsumptionStored],
 								    ) -> bool:
 								        if self._repo is None:
 								            return True
 								        try:
 								            async with self._session_factory() as session:
 								                async with session.begin():
 								                    await self._repo.save_final_result(
 								                        session,
 								                        surgery_id=surgery_id,
 								                        details=list(details),
 								                    )
 								        except Exception as exc:
 								            logger.warning(
 								                "Persist surgery {} failed (will archive/retry): {}", surgery_id, exc
 								            )
 								            return False
 								        return True
 								    def _write_durable(
 								        self,
 								        surgery_id: str,
 								        details: list[SurgeryConsumptionStored],
 								    ) -> Path | None:
-												feat: 配置写死与 baked 模块，Alembic 建表，百度仅 BAIDU_*

- 新增 app/baked/algorithm|pipeline，非部署参数不再走 env；Settings 保留 DB/HTTP/RTSP/海康/百度/MinIO/Demo
- 移除 init_db_schema 与 reload 配置；main 仅 check_database；start*.sh 在 uvicorn 前执行 alembic upgrade head
- 依赖 psycopg[binary] 供 Alembic 同步 URL；alembic/env 注释与预发清单更新
- 撕段门控消费管线、各视频/语音/归档调用改为 baked
- 百度环境变量仅 BAIDU_APP_ID、BAIDU_API_KEY、BAIDU_SECRET_KEY 与 BAIDU_* 超时/ASR；人脸脚本与 baidu_speech 文案同步
- 全量单测与 .env.example 更新；.gitignore 忽略 refs/（本地权重/视频不入库）

Made-with: Cursor

											
										
										
											2026-04-24 15:33:22 +08:00
+								        directory = Path(bp.ARCHIVE_PERSIST_DURABLE_FALLBACK_DIR)
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路\n- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理\n- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总\n- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位\n- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过\n- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								        try:
 								            directory.mkdir(parents=True, exist_ok=True)
 								        except Exception as exc:
 								            logger.warning("mkdir durable archive dir {} failed: {}", directory, exc)
 								            return None
 								        path = directory / f"{surgery_id}.json"
 								        payload = {
 								            "surgery_id": surgery_id,
 								            "saved_at": datetime.now(timezone.utc).isoformat(),
 								            "details": _serialize_details(details),
 								        }
 								        try:
 								            tmp = path.with_suffix(".json.tmp")
 								            tmp.write_text(
 								                json.dumps(payload, ensure_ascii=False, indent=2),
 								                encoding="utf-8",
 								            )
 								            os.replace(tmp, path)
 								            return path
 								        except Exception as exc:
 								            logger.warning("write durable archive {} failed: {}", path, exc)
 								            return None
 								    def _safe_remove(self, path: Path) -> None:
 								        try:
 								            path.unlink(missing_ok=True)
 								        except Exception as exc:
 								            logger.debug("remove durable archive {} failed: {}", path, exc)
 								    def _next_backoff_seconds(self, attempts: int) -> float:
-												feat: 配置写死与 baked 模块，Alembic 建表，百度仅 BAIDU_*

- 新增 app/baked/algorithm|pipeline，非部署参数不再走 env；Settings 保留 DB/HTTP/RTSP/海康/百度/MinIO/Demo
- 移除 init_db_schema 与 reload 配置；main 仅 check_database；start*.sh 在 uvicorn 前执行 alembic upgrade head
- 依赖 psycopg[binary] 供 Alembic 同步 URL；alembic/env 注释与预发清单更新
- 撕段门控消费管线、各视频/语音/归档调用改为 baked
- 百度环境变量仅 BAIDU_APP_ID、BAIDU_API_KEY、BAIDU_SECRET_KEY 与 BAIDU_* 超时/ASR；人脸脚本与 baidu_speech 文案同步
- 全量单测与 .env.example 更新；.gitignore 忽略 refs/（本地权重/视频不入库）

Made-with: Cursor

											
										
										
											2026-04-24 15:33:22 +08:00
+								        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
 								        cap = float(bp.ARCHIVE_PERSIST_BACKOFF_CAP_SECONDS)
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路\n- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理\n- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总\n- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位\n- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过\n- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								        # 指数退避：base * 2^(attempts-1)，首个间隔即 base。
 								        exp = max(0, attempts - 1)
 								        return min(cap, base * (2**exp))
 								    async def _retry_loop(self) -> None:
-												feat: 配置写死与 baked 模块，Alembic 建表，百度仅 BAIDU_*

- 新增 app/baked/algorithm|pipeline，非部署参数不再走 env；Settings 保留 DB/HTTP/RTSP/海康/百度/MinIO/Demo
- 移除 init_db_schema 与 reload 配置；main 仅 check_database；start*.sh 在 uvicorn 前执行 alembic upgrade head
- 依赖 psycopg[binary] 供 Alembic 同步 URL；alembic/env 注释与预发清单更新
- 撕段门控消费管线、各视频/语音/归档调用改为 baked
- 百度环境变量仅 BAIDU_APP_ID、BAIDU_API_KEY、BAIDU_SECRET_KEY 与 BAIDU_* 超时/ASR；人脸脚本与 baidu_speech 文案同步
- 全量单测与 .env.example 更新；.gitignore 忽略 refs/（本地权重/视频不入库）

Made-with: Cursor

											
										
										
											2026-04-24 15:33:22 +08:00
+								        base = float(bp.ARCHIVE_PERSIST_RETRY_INTERVAL_SECONDS)
 								        max_attempts = int(bp.ARCHIVE_PERSIST_MAX_RETRIES)
-												feat: 手术视频消耗、待确认与持久化改造

- 新增 Alembic 初始迁移、领域明细模型及归档持久化与重试链路\n- 拆分视频会话注册表、分类处理、推理时间窗聚合与流处理\n- 消耗日志：TSV/Markdown 含 top2/top3；item_id 优先产品编码；待确认记「待确认」行，语音确认后落正式行并更新汇总\n- 待确认时内存/DB 明细为占位行，确认后替换；拒绝时移除占位\n- 分类 probs 先 detach/cpu 再转 NumPy，修复 MPS/CUDA 上推理被静默跳过\n- 补充集成测试、归档与设备张量等单测

Made-with: Cursor

											
										
										
											2026-04-23 20:42:21 +08:00
+								        while not self._retry_stop.is_set():
 								            try:
 								                await asyncio.wait_for(self._retry_stop.wait(), timeout=base)
 								                break
 								            except TimeoutError:
 								                pass
 								            loop = asyncio.get_running_loop()
 								            now = loop.time()
 								            # 快照当前归档条目；后续尝试可能改变 _archive 内部状态。
 								            async with self._lock:
 								                entries = [(sid, ent) for sid, ent in self._archive.items()]
 								            for surgery_id, entry in entries:
 								                if self._retry_stop.is_set():
 								                    break
 								                if entry.attempts >= max_attempts:
 								                    # 达到上限，放弃自动重试，等待进程重启或人工介入。
 								                    continue
 								                if entry.next_attempt_monotonic > now:
 								                    continue
 								                ok = await self.try_persist_archive(surgery_id)
 								                if not ok:
 								                    # 失败：更新退避时间
 								                    async with self._lock:
 								                        current = self._archive.get(surgery_id)
 								                        if current is not None:
 								                            current.next_attempt_monotonic = now + self._next_backoff_seconds(
 								                                current.attempts
 								                            )
 								                            if current.attempts >= max_attempts:
 								                                logger.error(
 								                                    "Archive persist exhausted retries surgery_id={} "
 								                                    "attempts={}; durable={} kept for manual recovery",
 								                                    surgery_id,
 								                                    current.attempts,
 								                                    bool(current.durable_path),
 								                                )