diff --git a/backend/.dockerignore b/backend/.dockerignore index 547b169..e78e20a 100755 --- a/backend/.dockerignore +++ b/backend/.dockerignore @@ -24,3 +24,6 @@ algorithm_subprocesses/**/*.xlsx algorithm_subprocesses/**/*.xls *.md .dockerignore +# Offline bake inputs: keep *.pth / *.task; ignore partials and accidental duplicates. +weights/**/*.part +weights/**/*.pth.* diff --git a/backend/.env.example b/backend/.env.example index 726209c..ca57351 100755 --- a/backend/.env.example +++ b/backend/.env.example @@ -4,11 +4,11 @@ # # 5-6 ActionFormer 实时算法(默认开启): # - app/resources/actionformer_epoch_045.pth.tar 必须存在(离线下发,~110MB,未入 git)。 -# - VideoSwin Swin3D-T 权重在 Docker 构建时预下载到 /app/.cache/torch(见 scripts/bake_torch_hub_checkpoint.py); -# 运行时不再访问 pytorch.org。「首次运行」指 torch 缓存为空时才会联网下载;现已改为构建时烘焙进镜像。 -# 国内 PyPI 镜像(南大/清华/阿里)不同步 /models/*.pth,构建默认先试 uv.agentsmirror.com 再回退官方源。 -# 离线/弱网:先 wget 权重到 backend/weights/swin3d_t-7615ae03.pth,再 docker compose build api。 -# 或:docker compose build --build-arg PYTORCH_MODELS_URL=https://your-mirror/.../swin3d_t-7615ae03.pth api +# - torchvision 预训练 hub 权重在 Docker 构建时烘焙到 /app/.cache/torch(scripts/bake_pretrained_weights.py): +# swin3d_t-7615ae03.pth(VideoSwin)、resnet50-0676ba61.pth(医生 ReID);运行时不再访问 pytorch.org。 +# 国内 PyPI 镜像不同步 /models/*.pth;构建默认 uv.agentsmirror.com 再回退官方源。 +# 离线/弱网:将上述文件放入 backend/weights/ 后 docker compose build api(本地有则跳过下载)。 +# 或:docker compose build --build-arg PYTORCH_MODELS_MIRROR=https://your-mirror/download.pytorch.org api # - Linux GPU 机:镜像内 torch / torchvision / torchaudio 为 cu130 wheel; # 宿主机需 NVIDIA 驱动 + NVIDIA Container Toolkit;`api` 服务已配置 `gpus: all`。 # 启动后可验证:docker compose exec api python -c "import torch; print(torch.cuda.is_available())" diff --git a/backend/Dockerfile b/backend/Dockerfile index fd1147c..9cf5cad 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -56,11 +56,20 @@ COPY algorithm_subprocesses ./algorithm_subprocesses/ # Bake runtime patches/assets so non-root api never writes 
the read-only bundle tree. COPY app/algorithm_runner/actionformer_release/libs/utils/nms.py \ algorithm_subprocesses/5.15/code/actionformer_release/libs/utils/nms.py -RUN mkdir -p algorithm_subprocesses/5.15/doctor_identity_package/.mediapipe_models && \ - curl -fsSL --retry 3 \ - -o algorithm_subprocesses/5.15/doctor_identity_package/.mediapipe_models/pose_landmarker_lite.task \ - "https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_lite/float16/1/pose_landmarker_lite.task" && \ - test -s algorithm_subprocesses/5.15/doctor_identity_package/.mediapipe_models/pose_landmarker_lite.task +# Optional offline assets (torch hub + MediaPipe); see backend/weights/.gitkeep +COPY weights ./weights/ +RUN set -eux; \ + MP_DEST=algorithm_subprocesses/5.15/doctor_identity_package/.mediapipe_models/pose_landmarker_lite.task; \ + mkdir -p "$(dirname "$MP_DEST")"; \ + if [ -s weights/pose_landmarker_lite.task ]; then \ + cp weights/pose_landmarker_lite.task "$MP_DEST"; \ + echo "mediapipe pose model from local weights/"; \ + else \ + curl -fsSL --retry 3 -o "$MP_DEST" \ + "https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_lite/float16/1/pose_landmarker_lite.task"; \ + echo "mediapipe pose model downloaded"; \ + fi; \ + test -s "$MP_DEST" # uv.lock pins uv.agentsmirror.com artifact URLs. Rewrite to mainland mirrors (same /packages/... paths). # PyPI: Tsinghua | PyTorch wheel index: 南大 (syncs download.pytorch.org / download-r2) @@ -73,29 +82,28 @@ RUN sed -i \ ENV UV_DEFAULT_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple -# VideoSwin (Swin3D-T) hub weights: bake at build time so RTSP batch jobs never hit pytorch.org at runtime. -# Domestic PyPI mirrors (NJU/Tsinghua/Aliyun) only sync pip wheels, not /models/*.pth; default tries -# uv.agentsmirror.com (same ecosystem as uv.lock) then download.pytorch.org. 
Optional offline bake: -# backend/weights/swin3d_t-7615ae03.pth (see weights/.gitkeep) -# Override: --build-arg PYTORCH_MODELS_URL=... or PYTORCH_MODELS_MIRROR=... +# Torchvision hub weights (Swin3D-T + ResNet50): bake at build so runtime never hits pytorch.org. +# Domestic PyPI mirrors do not sync /models/*.pth. Optional offline: backend/weights/*.pth (see .gitkeep). +# Override: --build-arg PYTORCH_MODELS_MIRROR=... or PYTORCH_MODELS_URL=... (swin only). ARG PYTORCH_MODELS_MIRROR= ARG PYTORCH_MODELS_URL= ENV PYTORCH_MODELS_MIRROR=${PYTORCH_MODELS_MIRROR} \ PYTORCH_MODELS_URL=${PYTORCH_MODELS_URL} -# Optional offline weight (only .gitkeep by default; add swin3d_t-7615ae03.pth before build if needed). -COPY weights ./weights/ +# Do not cache-mount /app/.cache/torch: BuildKit cache mounts are not exported into the image. RUN --mount=type=cache,target=/root/.cache/uv \ uv sync --frozen --no-dev --no-compile --refresh-package numpy --refresh-package mediapipe && \ .venv/bin/python -c "import alembic" && \ .venv/bin/python -c "import numpy; import numpy.lib._index_tricks_impl" && \ .venv/bin/python -c "import mediapipe as mp; print('mediapipe', mp.__version__)" && \ - mkdir -p /app/.cache/ultralytics && \ - PYTORCH_MODELS_LOCAL_PATH=/app/weights/swin3d_t-7615ae03.pth \ - .venv/bin/python scripts/bake_torch_hub_checkpoint.py && \ + mkdir -p /app/.cache/ultralytics /app/.cache/torch/hub/checkpoints && \ + PYTORCH_MODELS_LOCAL_DIR=/app/weights \ + .venv/bin/python scripts/bake_pretrained_weights.py && \ + test -s /app/.cache/torch/hub/checkpoints/swin3d_t-7615ae03.pth && \ + test -s /app/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth && \ rm -rf /app/weights && \ - TORCH_HOME=/app/.cache/torch .venv/bin/python -c "from torchvision.models.video import Swin3D_T_Weights, swin3d_t; swin3d_t(weights=Swin3D_T_Weights.KINETICS400_V1); print('swin3d_t cached ok')" && \ - chmod -R a+rX /app/.venv /app/algorithm_subprocesses /app/.cache/torch /app/.cache/ultralytics + chmod -R 
a+rX /app/.venv /app/algorithm_subprocesses && \
+    chmod -R a+rwX /app/.cache/torch /app/.cache/ultralytics
 
 ENV PATH="/app/.venv/bin:$PATH"
 
diff --git a/backend/scripts/bake_pretrained_weights.py b/backend/scripts/bake_pretrained_weights.py
new file mode 100644
index 0000000..ce2259b
--- /dev/null
+++ b/backend/scripts/bake_pretrained_weights.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""Bake torchvision / torch.hub checkpoints into TORCH_HOME during Docker image build.
+
+Runtime batch jobs and doctor identity must not download from pytorch.org when the
+api container runs as a non-root compose user (read-only /app/.cache/torch).
+
+Place optional offline copies under backend/weights/ before build:
+  - swin3d_t-7615ae03.pth (VideoSwin / Swin3D-T, ~110MB)
+  - resnet50-0676ba61.pth (doctor ReID backbone, ~98MB)
+
+Environment variables:
+  - TORCH_HOME: destination root (default /app/.cache/torch).
+  - PYTORCH_MODELS_LOCAL_DIR: directory searched for an offline copy of each file.
+  - PYTORCH_MODELS_LOCAL_PATH: legacy single-file copy (applies only to swin3d_t).
+  - PYTORCH_MODELS_MIRROR: mirror root; "/models/<filename>" is appended.
+  - PYTORCH_MODELS_URL: legacy full URL (applies only to swin3d_t; per-file URLs
+    for other checkpoints are not supported).
+"""
+
+from __future__ import annotations
+
+import os
+import shutil
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+# Both prefixes already end in "/models"; only "/<filename>" is appended to them.
+OFFICIAL_PREFIX = "https://download.pytorch.org/models"
+AGENTSMIRROR_PREFIX = "https://uv.agentsmirror.com/download.pytorch.org/models"
+
+
+@dataclass(frozen=True)
+class HubCheckpoint:
+    """A hub checkpoint file to place under TORCH_HOME/hub/checkpoints."""
+
+    filename: str  # checkpoint file name, also the last URL path segment
+    min_bytes: int  # smaller local/downloaded files are rejected as incomplete
+    label: str  # human-readable name used in log output
+
+
+# All torchvision hub files downloaded at runtime by production paths (as of 5.15 bundle).
+HUB_CHECKPOINTS: tuple[HubCheckpoint, ...] = (
+    HubCheckpoint(
+        filename="swin3d_t-7615ae03.pth",
+        min_bytes=100_000_000,
+        label="VideoSwin Swin3D-T (Kinetics-400)",
+    ),
+    HubCheckpoint(
+        filename="resnet50-0676ba61.pth",
+        min_bytes=90_000_000,
+        label="doctor ReID ResNet50 (ImageNet-1K)",
+    ),
+)
+
+
+def _candidate_urls(filename: str) -> list[str]:
+    """Return the download URLs for ``filename`` in the order they are tried.
+
+    PYTORCH_MODELS_URL replaces the whole list for the Swin3D-T file only.
+    Otherwise the order is PYTORCH_MODELS_MIRROR (when set), the agentsmirror
+    default, then download.pytorch.org; duplicates keep their first position.
+    """
+    explicit = (os.environ.get("PYTORCH_MODELS_URL") or "").strip()
+    # Legacy single-URL override applied only to VideoSwin (other files use mirror list).
+    if explicit and filename == "swin3d_t-7615ae03.pth":
+        return [explicit]
+
+    urls: list[str] = []
+    mirror = (os.environ.get("PYTORCH_MODELS_MIRROR") or "").strip().rstrip("/")
+    if mirror:
+        # The mirror is a root such as ".../download.pytorch.org"; tolerate a value
+        # that already ends in "/models" rather than emitting "/models/models/".
+        root = mirror[: -len("/models")] if mirror.endswith("/models") else mirror
+        urls.append(f"{root}/models/{filename}")
+    # AGENTSMIRROR_PREFIX already contains "/models"; appending "/models/" to it
+    # (as the mirror root needs) would build a doubled ".../models/models/" path.
+    urls.append(f"{AGENTSMIRROR_PREFIX}/{filename}")
+    urls.append(f"{OFFICIAL_PREFIX}/{filename}")
+
+    # dict keys keep insertion order: de-duplicate without losing priority.
+    return list(dict.fromkeys(urls))
+
+
+def _local_source(filename: str) -> Path | None:
+    """Return an existing offline copy of ``filename``, or None when there is none.
+
+    PYTORCH_MODELS_LOCAL_PATH (Swin3D-T only) is checked before
+    PYTORCH_MODELS_LOCAL_DIR/<filename>.
+    """
+    legacy = (os.environ.get("PYTORCH_MODELS_LOCAL_PATH") or "").strip()
+    if legacy and filename == "swin3d_t-7615ae03.pth":
+        path = Path(legacy)
+        if path.is_file():
+            return path
+
+    local_dir = (os.environ.get("PYTORCH_MODELS_LOCAL_DIR") or "").strip()
+    if local_dir:
+        path = Path(local_dir) / filename
+        if path.is_file():
+            return path
+    return None
+
+
+def _copy_local(src: Path, dest: Path, *, min_bytes: int) -> None:
+    """Copy ``src`` to ``dest``; raise OSError if ``src`` is smaller than ``min_bytes``."""
+    size = src.stat().st_size
+    if size < min_bytes:
+        raise OSError(f"local file too small ({size} bytes): {src}")
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(src, dest)
+
+
+def _download_with_curl(url: str, dest: Path, *, min_bytes: int) -> None:
+    """Download ``url`` to ``dest`` with curl via a ``.part`` file next to it.
+
+    Raises RuntimeError when curl is missing or exits non-zero, and OSError when
+    the downloaded file is smaller than ``min_bytes``; ``dest`` is only replaced
+    after the size check passes.
+    """
+    curl = shutil.which("curl")
+    if curl is None:
+        raise RuntimeError("curl not found")
+    tmp = dest.with_suffix(dest.suffix + ".part")
+    # curl -o does not create missing directories; match the other writers here.
+    tmp.parent.mkdir(parents=True, exist_ok=True)
+    tmp.unlink(missing_ok=True)
+    proc = subprocess.run(
+        [
+            curl,
+            "-fL",
+            "--retry",
+            "5",
+            "--retry-all-errors",
+            "--retry-delay",
+            "3",
+            "--connect-timeout",
+            "30",
+            "--max-time",
+            "1800",
+            "-o",
+            str(tmp),
+            url,
+        ],
+        check=False,
+        capture_output=True,
+        text=True,
+    )
+    if proc.returncode != 0:
+        tmp.unlink(missing_ok=True)
+        detail = (proc.stderr or proc.stdout or "").strip() or f"exit={proc.returncode}"
+        raise RuntimeError(detail)
+    size = tmp.stat().st_size
+    if size < min_bytes:
+        tmp.unlink(missing_ok=True)
+        raise OSError(f"download too small ({size} bytes)")
+    tmp.replace(dest)
+
+
+def _download_with_torch(url: str, dest: Path, *, min_bytes: int) -> None:
+    """Download ``url`` to ``dest`` with torch.hub; reject files under ``min_bytes``."""
+    import torch
+
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    torch.hub.download_url_to_file(url, str(dest), progress=True)
+    size = dest.stat().st_size
+    if size < min_bytes:
+        dest.unlink(missing_ok=True)
+        raise OSError(f"download too small ({size} bytes)")
+
+
+def bake_hub_checkpoint(
+    spec: HubCheckpoint,
+    *,
+    torch_home: Path,
+) -> None:
+    """Ensure ``spec`` exists under ``torch_home``/hub/checkpoints.
+
+    Sources are tried in order: an already valid destination file, a local
+    offline copy, each candidate URL via curl, then the official URL via
+    torch.hub. Exits the process with status 1 when every source fails.
+    """
+    dest = torch_home / "hub" / "checkpoints" / spec.filename
+    if dest.is_file() and dest.stat().st_size >= spec.min_bytes:
+        print(f"already baked [{spec.label}]: {dest} ({dest.stat().st_size} bytes)")
+        return
+
+    local = _local_source(spec.filename)
+    if local is not None:
+        try:
+            _copy_local(local, dest, min_bytes=spec.min_bytes)
+            print(f"baked [{spec.label}] {dest} ({dest.stat().st_size} bytes) from local {local}")
+            return
+        except OSError as exc:
+            print(f"local copy failed for {spec.filename}: {exc}", file=sys.stderr)
+
+    errors: list[str] = []
+    for url in _candidate_urls(spec.filename):
+        try:
+            print(f"downloading [{spec.label}] {url}")
+            _download_with_curl(url, dest, min_bytes=spec.min_bytes)
+            print(f"baked [{spec.label}] {dest} ({dest.stat().st_size} bytes) from {url}")
+            return
+        except (OSError, RuntimeError) as exc:
+            errors.append(f"curl {url}: {exc}")
+            dest.unlink(missing_ok=True)
+
+    official = f"{OFFICIAL_PREFIX}/{spec.filename}"
+    try:
+        print(f"torch.hub fallback [{spec.label}]: {official}")
+        _download_with_torch(official, dest, min_bytes=spec.min_bytes)
+        print(f"baked [{spec.label}] {dest} ({dest.stat().st_size} bytes) from {official}")
+        return
+    except (OSError, RuntimeError) as exc:
+        errors.append(f"torch {official}: {exc}")
+        dest.unlink(missing_ok=True)
+
+    print(f"failed to bake {spec.label} ({spec.filename}):", file=sys.stderr)
+    for line in errors:
+        print(f"  - {line}", file=sys.stderr)
+    raise SystemExit(1)
+
+
+def warm_torchvision_hub_models() -> None:
+    """Load models so torchvision verifies hub checkpoints (no network if baked)."""
+    from torchvision.models import ResNet50_Weights, resnet50
+    from torchvision.models.video import Swin3D_T_Weights, swin3d_t
+
+    resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
+    print("resnet50 IMAGENET1K_V1 ok")
+    swin3d_t(weights=Swin3D_T_Weights.KINETICS400_V1)
+    print("swin3d_t KINETICS400_V1 ok")
+
+
+def main() -> int:
+    """Bake every checkpoint in HUB_CHECKPOINTS, then load the models once."""
+    torch_home = Path(os.environ.get("TORCH_HOME", "/app/.cache/torch"))
+    # torch.hub resolves its cache from TORCH_HOME; export the directory just
+    # baked so the warm-up loads from it even when the variable was unset.
+    os.environ["TORCH_HOME"] = str(torch_home)
+    for spec in HUB_CHECKPOINTS:
+        bake_hub_checkpoint(spec, torch_home=torch_home)
+    warm_torchvision_hub_models()
+    print("all pretrained hub weights baked")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/backend/scripts/bake_torch_hub_checkpoint.py b/backend/scripts/bake_torch_hub_checkpoint.py
index 7cd35b6..2d88602 100644
--- a/backend/scripts/bake_torch_hub_checkpoint.py
+++ b/backend/scripts/bake_torch_hub_checkpoint.py
@@ -1,150 +1,11 @@
 #!/usr/bin/env python3
-"""Bake torchvision hub checkpoints into TORCH_HOME during Docker image build."""
+"""Backward-compatible entrypoint; bakes all hub weights (see bake_pretrained_weights.py)."""
 
 from __future__ import annotations
 
-import os
-import shutil
-import subprocess
-import sys
+import runpy
 from pathlib import Path
 
-CHECKPOINT = "swin3d_t-7615ae03.pth"
-OFFICIAL_URL = f"https://download.pytorch.org/models/{CHECKPOINT}"
-AGENTSMIRROR_URL = f"https://uv.agentsmirror.com/download.pytorch.org/models/{CHECKPOINT}"
-MIN_BYTES = 100_000_000
-
-
-def 
_candidate_urls() -> list[str]: - explicit = (os.environ.get("PYTORCH_MODELS_URL") or "").strip() - if explicit: - return [explicit] - - raw = (os.environ.get("PYTORCH_MODELS_MIRROR") or "").strip().rstrip("/") - prefixes = [p for p in (raw, "https://uv.agentsmirror.com/download.pytorch.org") if p] - urls: list[str] = [] - for prefix in prefixes: - urls.append(f"{prefix}/models/{CHECKPOINT}") - urls.extend([AGENTSMIRROR_URL, OFFICIAL_URL]) - - seen: set[str] = set() - ordered: list[str] = [] - for url in urls: - if url in seen: - continue - seen.add(url) - ordered.append(url) - return ordered - - -def _copy_local(src: Path, dest: Path) -> None: - size = src.stat().st_size - if size < MIN_BYTES: - raise OSError(f"local file too small ({size} bytes): {src}") - dest.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(src, dest) - - -def _download_with_curl(url: str, dest: Path) -> None: - curl = shutil.which("curl") - if curl is None: - raise RuntimeError("curl not found") - tmp = dest.with_suffix(dest.suffix + ".part") - tmp.unlink(missing_ok=True) - proc = subprocess.run( - [ - curl, - "-fL", - "--retry", - "5", - "--retry-all-errors", - "--retry-delay", - "3", - "--connect-timeout", - "30", - "--max-time", - "1800", - "-o", - str(tmp), - url, - ], - check=False, - capture_output=True, - text=True, - ) - if proc.returncode != 0: - tmp.unlink(missing_ok=True) - detail = (proc.stderr or proc.stdout or "").strip() or f"exit={proc.returncode}" - raise RuntimeError(detail) - size = tmp.stat().st_size - if size < MIN_BYTES: - tmp.unlink(missing_ok=True) - raise OSError(f"download too small ({size} bytes)") - tmp.replace(dest) - - -def _download_with_torch(url: str, dest: Path) -> None: - import torch - - dest.parent.mkdir(parents=True, exist_ok=True) - torch.hub.download_url_to_file(url, str(dest), progress=True) - size = dest.stat().st_size - if size < MIN_BYTES: - dest.unlink(missing_ok=True) - raise OSError(f"download too small ({size} bytes)") - - -def main() -> 
int: - torch_home = Path(os.environ.get("TORCH_HOME", "/app/.cache/torch")) - dest = torch_home / "hub" / "checkpoints" / CHECKPOINT - if dest.is_file() and dest.stat().st_size >= MIN_BYTES: - print(f"already baked: {dest} ({dest.stat().st_size} bytes)") - return 0 - - local_raw = (os.environ.get("PYTORCH_MODELS_LOCAL_PATH") or "").strip() - if local_raw: - local = Path(local_raw) - if local.is_file(): - try: - _copy_local(local, dest) - print(f"baked {dest} ({dest.stat().st_size} bytes) from local {local}") - return 0 - except OSError as exc: - print(f"local copy failed: {exc}", file=sys.stderr) - - errors: list[str] = [] - for url in _candidate_urls(): - try: - print(f"downloading {url}") - _download_with_curl(url, dest) - print(f"baked {dest} ({dest.stat().st_size} bytes) from {url}") - return 0 - except (OSError, RuntimeError) as exc: - errors.append(f"curl {url}: {exc}") - dest.unlink(missing_ok=True) - - for url in (OFFICIAL_URL,): - try: - print(f"torch.hub fallback: {url}") - _download_with_torch(url, dest) - print(f"baked {dest} ({dest.stat().st_size} bytes) from {url}") - return 0 - except (OSError, RuntimeError) as exc: - errors.append(f"torch {url}: {exc}") - dest.unlink(missing_ok=True) - - print("failed to bake VideoSwin checkpoint:", file=sys.stderr) - for line in errors: - print(f" - {line}", file=sys.stderr) - print( - "hint: domestic PyPI mirrors (NJU/Tsinghua/Aliyun) do not sync /models/*.pth; " - "pre-download once and set PYTORCH_MODELS_LOCAL_PATH, or pass " - "PYTORCH_MODELS_URL / --build-arg PYTORCH_MODELS_MIRROR to a mirror that hosts " - f"/models/{CHECKPOINT}", - file=sys.stderr, - ) - return 1 - - if __name__ == "__main__": - raise SystemExit(main()) + target = Path(__file__).resolve().parent / "bake_pretrained_weights.py" + runpy.run_path(str(target), run_name="__main__") diff --git a/backend/scripts/rebuild-api-image.sh b/backend/scripts/rebuild-api-image.sh index e91048f..fb53c15 100755 --- a/backend/scripts/rebuild-api-image.sh +++ 
b/backend/scripts/rebuild-api-image.sh
@@ -1,5 +1,12 @@
 #!/usr/bin/env bash
-# Clear BuildKit cache and rebuild the API image (fixes corrupted layer / unpigz errors).
+# Clear BuildKit cache and rebuild the API image.
+# Fixes export errors such as:
+#   archive/tar: invalid tar header
+#   unpigz: corrupted -- invalid deflate data
+# Optional env toggles:
+#   AGGRESSIVE_PRUNE=1  also run `docker system prune -af` (removes ALL unused images,
+#                       stopped containers, unused networks and build cache)
+#   RESTART_DOCKER=1    restart the Docker daemon (via sudo systemctl) before building
 set -euo pipefail
 
 cd "$(dirname "$0")/.."
@@ -8,12 +15,20 @@ docker builder prune -af
 docker buildx prune -af 2>/dev/null || true
 docker rmi -f backend-api:latest 2>/dev/null || true
 
+if [[ "${AGGRESSIVE_PRUNE:-0}" == "1" ]]; then
+  echo "Aggressive prune (all unused images, stopped containers, networks + build cache)..."
+  docker system prune -af 2>/dev/null || true
+fi
+
 if [[ "${RESTART_DOCKER:-0}" == "1" ]]; then
   echo "Restarting Docker..."
   sudo systemctl restart docker
+  sleep 2
 fi
 
-echo "Building api image (--no-cache)..."
-docker compose build api --no-cache
+echo "Building api image (--no-cache, no attestations)..."
+export DOCKER_BUILDKIT=1
+export COMPOSE_BAKE=false
+docker compose build api --no-cache --provenance=false --sbom=false
 
 echo "Done. 
Recreate container: docker compose up -d --force-recreate api" diff --git a/backend/tests/test_bake_pretrained_weights.py b/backend/tests/test_bake_pretrained_weights.py new file mode 100644 index 0000000..4a378e5 --- /dev/null +++ b/backend/tests/test_bake_pretrained_weights.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import importlib.util +from pathlib import Path + +import pytest + + +def _load_module(): + import sys + + path = Path(__file__).resolve().parents[1] / "scripts" / "bake_pretrained_weights.py" + spec = importlib.util.spec_from_file_location("bake_pretrained_weights", path) + mod = importlib.util.module_from_spec(spec) + assert spec.loader is not None + sys.modules[spec.name] = mod + spec.loader.exec_module(mod) + return mod + + +def test_candidate_urls_default(monkeypatch: pytest.MonkeyPatch) -> None: + mod = _load_module() + monkeypatch.delenv("PYTORCH_MODELS_URL", raising=False) + monkeypatch.delenv("PYTORCH_MODELS_MIRROR", raising=False) + urls = mod._candidate_urls("resnet50-0676ba61.pth") + assert urls[0].endswith("/models/resnet50-0676ba61.pth") + assert urls[-1] == f"{mod.OFFICIAL_PREFIX}/resnet50-0676ba61.pth" + + +def test_candidate_urls_explicit_override_applies_to_swin_only( + monkeypatch: pytest.MonkeyPatch, +) -> None: + mod = _load_module() + monkeypatch.setenv("PYTORCH_MODELS_URL", "https://example.com/swin3d_t-7615ae03.pth") + assert mod._candidate_urls("swin3d_t-7615ae03.pth") == [ + "https://example.com/swin3d_t-7615ae03.pth" + ] + assert "example.com" not in mod._candidate_urls("resnet50-0676ba61.pth")[0] + + +def test_local_source_prefers_legacy_swin_path(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + mod = _load_module() + swin = tmp_path / "swin3d_t-7615ae03.pth" + swin.write_bytes(b"x" * mod.HUB_CHECKPOINTS[0].min_bytes) + monkeypatch.setenv("PYTORCH_MODELS_LOCAL_PATH", str(swin)) + assert mod._local_source("swin3d_t-7615ae03.pth") == swin + assert mod._local_source("resnet50-0676ba61.pth") is None + 
+ +def test_local_source_uses_local_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + mod = _load_module() + weights = tmp_path / "weights" + weights.mkdir() + resnet = weights / "resnet50-0676ba61.pth" + resnet.write_bytes(b"x") + monkeypatch.setenv("PYTORCH_MODELS_LOCAL_DIR", str(weights)) + assert mod._local_source("resnet50-0676ba61.pth") == resnet + + +def test_bake_skips_when_dest_already_valid(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + mod = _load_module() + monkeypatch.setenv("TORCH_HOME", str(tmp_path / "torch")) + spec = mod.HUB_CHECKPOINTS[1] + dest = tmp_path / "torch" / "hub" / "checkpoints" / spec.filename + dest.parent.mkdir(parents=True) + dest.write_bytes(b"x" * spec.min_bytes) + mod.bake_hub_checkpoint(spec, torch_home=tmp_path / "torch") diff --git a/backend/tests/test_bake_torch_hub_checkpoint.py b/backend/tests/test_bake_torch_hub_checkpoint.py index e20d716..adba5f3 100644 --- a/backend/tests/test_bake_torch_hub_checkpoint.py +++ b/backend/tests/test_bake_torch_hub_checkpoint.py @@ -1,28 +1,13 @@ +"""Legacy entrypoint must delegate to bake_pretrained_weights.""" + from __future__ import annotations -import importlib.util from pathlib import Path -def _load_module(): - path = Path(__file__).resolve().parents[1] / "scripts" / "bake_torch_hub_checkpoint.py" - spec = importlib.util.spec_from_file_location("bake_torch_hub_checkpoint", path) - mod = importlib.util.module_from_spec(spec) - assert spec.loader is not None - spec.loader.exec_module(mod) - return mod - - -def test_candidate_urls_default(monkeypatch): - mod = _load_module() - monkeypatch.delenv("PYTORCH_MODELS_URL", raising=False) - monkeypatch.delenv("PYTORCH_MODELS_MIRROR", raising=False) - urls = mod._candidate_urls() - assert urls[0].startswith("https://uv.agentsmirror.com/download.pytorch.org/models/") - assert urls[-1] == mod.OFFICIAL_URL - - -def test_candidate_urls_explicit_override(monkeypatch): - mod = _load_module() - 
monkeypatch.setenv("PYTORCH_MODELS_URL", "https://example.com/swin3d_t-7615ae03.pth") - assert mod._candidate_urls() == ["https://example.com/swin3d_t-7615ae03.pth"] +def test_bake_torch_hub_checkpoint_delegates() -> None: + text = ( + Path(__file__).resolve().parents[1] / "scripts" / "bake_torch_hub_checkpoint.py" + ).read_text(encoding="utf-8") + assert "bake_pretrained_weights.py" in text + assert "runpy.run_path" in text diff --git a/backend/weights/.gitkeep b/backend/weights/.gitkeep index e69de29..1a5bdd5 100644 --- a/backend/weights/.gitkeep +++ b/backend/weights/.gitkeep @@ -0,0 +1,18 @@ +# Optional offline assets for Docker build (see scripts/bake_pretrained_weights.py). +# If a file exists here, image build copies it instead of downloading. +# +# Torchvision hub (→ /app/.cache/torch/hub/checkpoints/): +# swin3d_t-7615ae03.pth (~110MB) VideoSwin / batch Phase1 +# resnet50-0676ba61.pth (~98MB) doctor ReID backbone (ImageNet) +# +# MediaPipe (→ algorithm_subprocesses/5.15/doctor_identity_package/.mediapipe_models/): +# pose_landmarker_lite.task (~6MB) +# +# wget https://download.pytorch.org/models/swin3d_t-7615ae03.pth +# wget https://download.pytorch.org/models/resnet50-0676ba61.pth +# curl -fsSL -o pose_landmarker_lite.task \ +# "https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_lite/float16/1/pose_landmarker_lite.task" +# +# Bundled in image via COPY (not downloaded at build): +# algorithm_subprocesses/5.15/weights/*.pt, actionformer_epoch_045.pth.tar +# doctor_identity_package/doctor_info.pth diff --git a/backend/weights/pose_landmarker_lite.task b/backend/weights/pose_landmarker_lite.task new file mode 100644 index 0000000..09576a9 Binary files /dev/null and b/backend/weights/pose_landmarker_lite.task differ diff --git a/backend/weights/resnet50-0676ba61.pth b/backend/weights/resnet50-0676ba61.pth new file mode 100644 index 0000000..f251834 Binary files /dev/null and b/backend/weights/resnet50-0676ba61.pth differ diff --git 
a/backend/weights/swin3d_t-7615ae03.pth.1 b/backend/weights/swin3d_t-7615ae03.pth.1 new file mode 100644 index 0000000..ea9ba20 Binary files /dev/null and b/backend/weights/swin3d_t-7615ae03.pth.1 differ diff --git a/docs/Docker部署.md b/docs/Docker部署.md index 852816f..bd545c6 100644 --- a/docs/Docker部署.md +++ b/docs/Docker部署.md @@ -28,7 +28,12 @@ operation-room-monitor/ - 算法子进程包:`backend/algorithm_subprocesses/5.15/`(含 `main.py` 与 `weights/`;镜像构建时会 `COPY` 进容器,勿在 `.dockerignore` 中整目录排除) - 标注视频中文字体:镜像内已安装 `fonts-noto-cjk`、`fonts-wqy-microhei`(供 `visualize_result_video.py` 绘制耗材标签) - 医生识别(MediaPipe Pose):镜像内已安装 `libgles2`、`libegl1`、`libegl-mesa0`、`libglx-mesa0`、`libgl1-mesa-dri` 等 Mesa/GLVND 库;构建阶段会 `import mediapipe` 校验 `libGLESv2.so.2` 可用。子进程强制 CPU delegate。若仍见该错误,请 **`docker compose build --no-cache api`** 后重启(勿沿用旧 tarball 镜像) -- 可选备用权重:`backend/app/resources/actionformer_epoch_045.pth.tar` +- **构建时预下载的预训练资源**(`scripts/bake_pretrained_weights.py`,本地有则跳过,见 `backend/weights/.gitkeep`): + - `swin3d_t-7615ae03.pth`(VideoSwin / batch Phase1) + - `resnet50-0676ba61.pth`(医生 ReID 骨干) + - `pose_landmarker_lite.task`(MediaPipe Pose) +- 业务权重随 `algorithm_subprocesses/5.15/weights/` 打进镜像(YOLO / ActionFormer 等,非 hub 下载) +- 可选备用:`backend/app/resources/actionformer_epoch_045.pth.tar` --- @@ -58,31 +63,6 @@ docker compose down docker compose down -v # 删除 PostgreSQL / MinIO 卷 ``` -### 构建 API 镜像失败:`invalid tar header` / `unpigz: corrupted` - -`uv sync` 已成功,但在 **exporting / unpacking** 阶段报错时,通常是 **Docker 本地层缓存或存储损坏**,与 Dockerfile 无关。 - -按顺序处理: - -```bash -cd backend -chmod +x scripts/rebuild-api-image.sh - -# 清缓存并重建(推荐) -./scripts/rebuild-api-image.sh - -# 仍失败时:重启 Docker 后再跑 -RESTART_DOCKER=1 ./scripts/rebuild-api-image.sh - -# 再失败:改用旧版构建器(无 BuildKit) -COMPOSE_DOCKER_CLI_BUILD=0 DOCKER_BUILDKIT=0 docker compose build api --no-cache -docker compose up -d --force-recreate api -``` - -手动等价步骤:`docker builder prune -af` → `docker rmi -f backend-api:latest` → `docker compose build api 
--no-cache`。 - -确认根分区剩余空间充足(建议 ≥ 20GB);空间不足时大层导出也容易损坏。 - ### RTSP 切片在宿主机无法用 VLC 打开 默认情况下 API 容器以 **root** 写入 `./logs`,切片属主为 `root:root`。普通用户虽可用 `cat` 读取,但 **Snap 版 VLC** 等沙箱应用常会报 Permission denied。