fix docker build
This commit is contained in:
@@ -24,3 +24,6 @@ algorithm_subprocesses/**/*.xlsx
|
|||||||
algorithm_subprocesses/**/*.xls
|
algorithm_subprocesses/**/*.xls
|
||||||
*.md
|
*.md
|
||||||
.dockerignore
|
.dockerignore
|
||||||
|
# Offline bake inputs: keep *.pth / *.task; ignore partials and accidental duplicates.
|
||||||
|
weights/**/*.part
|
||||||
|
weights/**/*.pth.*
|
||||||
|
|||||||
@@ -4,11 +4,11 @@
|
|||||||
#
|
#
|
||||||
# 5-6 ActionFormer 实时算法(默认开启):
|
# 5-6 ActionFormer 实时算法(默认开启):
|
||||||
# - app/resources/actionformer_epoch_045.pth.tar 必须存在(离线下发,~110MB,未入 git)。
|
# - app/resources/actionformer_epoch_045.pth.tar 必须存在(离线下发,~110MB,未入 git)。
|
||||||
# - VideoSwin Swin3D-T 权重在 Docker 构建时预下载到 /app/.cache/torch(见 scripts/bake_torch_hub_checkpoint.py);
|
# - torchvision 预训练 hub 权重在 Docker 构建时烘焙到 /app/.cache/torch(scripts/bake_pretrained_weights.py):
|
||||||
# 运行时不再访问 pytorch.org。「首次运行」指 torch 缓存为空时才会联网下载;现已改为构建时烘焙进镜像。
|
# swin3d_t-7615ae03.pth(VideoSwin)、resnet50-0676ba61.pth(医生 ReID);运行时不再访问 pytorch.org。
|
||||||
# 国内 PyPI 镜像(南大/清华/阿里)不同步 /models/*.pth,构建默认先试 uv.agentsmirror.com 再回退官方源。
|
# 国内 PyPI 镜像不同步 /models/*.pth;构建默认 uv.agentsmirror.com 再回退官方源。
|
||||||
# 离线/弱网:先 wget 权重到 backend/weights/swin3d_t-7615ae03.pth,再 docker compose build api。
|
# 离线/弱网:将上述文件放入 backend/weights/ 后 docker compose build api(本地有则跳过下载)。
|
||||||
# 或:docker compose build --build-arg PYTORCH_MODELS_URL=https://your-mirror/.../swin3d_t-7615ae03.pth api
|
# 或:docker compose build --build-arg PYTORCH_MODELS_MIRROR=https://your-mirror/download.pytorch.org api
|
||||||
# - Linux GPU 机:镜像内 torch / torchvision / torchaudio 为 cu130 wheel;
|
# - Linux GPU 机:镜像内 torch / torchvision / torchaudio 为 cu130 wheel;
|
||||||
# 宿主机需 NVIDIA 驱动 + NVIDIA Container Toolkit;`api` 服务已配置 `gpus: all`。
|
# 宿主机需 NVIDIA 驱动 + NVIDIA Container Toolkit;`api` 服务已配置 `gpus: all`。
|
||||||
# 启动后可验证:docker compose exec api python -c "import torch; print(torch.cuda.is_available())"
|
# 启动后可验证:docker compose exec api python -c "import torch; print(torch.cuda.is_available())"
|
||||||
|
|||||||
@@ -56,11 +56,20 @@ COPY algorithm_subprocesses ./algorithm_subprocesses/
|
|||||||
# Bake runtime patches/assets so non-root api never writes the read-only bundle tree.
|
# Bake runtime patches/assets so non-root api never writes the read-only bundle tree.
|
||||||
COPY app/algorithm_runner/actionformer_release/libs/utils/nms.py \
|
COPY app/algorithm_runner/actionformer_release/libs/utils/nms.py \
|
||||||
algorithm_subprocesses/5.15/code/actionformer_release/libs/utils/nms.py
|
algorithm_subprocesses/5.15/code/actionformer_release/libs/utils/nms.py
|
||||||
RUN mkdir -p algorithm_subprocesses/5.15/doctor_identity_package/.mediapipe_models && \
|
# Optional offline assets (torch hub + MediaPipe); see backend/weights/.gitkeep
|
||||||
curl -fsSL --retry 3 \
|
COPY weights ./weights/
|
||||||
-o algorithm_subprocesses/5.15/doctor_identity_package/.mediapipe_models/pose_landmarker_lite.task \
|
RUN set -eux; \
|
||||||
"https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_lite/float16/1/pose_landmarker_lite.task" && \
|
MP_DEST=algorithm_subprocesses/5.15/doctor_identity_package/.mediapipe_models/pose_landmarker_lite.task; \
|
||||||
test -s algorithm_subprocesses/5.15/doctor_identity_package/.mediapipe_models/pose_landmarker_lite.task
|
mkdir -p "$(dirname "$MP_DEST")"; \
|
||||||
|
if [ -s weights/pose_landmarker_lite.task ]; then \
|
||||||
|
cp weights/pose_landmarker_lite.task "$MP_DEST"; \
|
||||||
|
echo "mediapipe pose model from local weights/"; \
|
||||||
|
else \
|
||||||
|
curl -fsSL --retry 3 -o "$MP_DEST" \
|
||||||
|
"https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_lite/float16/1/pose_landmarker_lite.task"; \
|
||||||
|
echo "mediapipe pose model downloaded"; \
|
||||||
|
fi; \
|
||||||
|
test -s "$MP_DEST"
|
||||||
|
|
||||||
# uv.lock pins uv.agentsmirror.com artifact URLs. Rewrite to mainland mirrors (same /packages/... paths).
|
# uv.lock pins uv.agentsmirror.com artifact URLs. Rewrite to mainland mirrors (same /packages/... paths).
|
||||||
# PyPI: Tsinghua | PyTorch wheel index: 南大 (syncs download.pytorch.org / download-r2)
|
# PyPI: Tsinghua | PyTorch wheel index: 南大 (syncs download.pytorch.org / download-r2)
|
||||||
@@ -73,29 +82,28 @@ RUN sed -i \
|
|||||||
|
|
||||||
ENV UV_DEFAULT_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple
|
ENV UV_DEFAULT_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple
|
||||||
|
|
||||||
# VideoSwin (Swin3D-T) hub weights: bake at build time so RTSP batch jobs never hit pytorch.org at runtime.
|
# Torchvision hub weights (Swin3D-T + ResNet50): bake at build so runtime never hits pytorch.org.
|
||||||
# Domestic PyPI mirrors (NJU/Tsinghua/Aliyun) only sync pip wheels, not /models/*.pth; default tries
|
# Domestic PyPI mirrors do not sync /models/*.pth. Optional offline: backend/weights/*.pth (see .gitkeep).
|
||||||
# uv.agentsmirror.com (same ecosystem as uv.lock) then download.pytorch.org. Optional offline bake:
|
# Override: --build-arg PYTORCH_MODELS_MIRROR=... or PYTORCH_MODELS_URL=... (swin only).
|
||||||
# backend/weights/swin3d_t-7615ae03.pth (see weights/.gitkeep)
|
|
||||||
# Override: --build-arg PYTORCH_MODELS_URL=... or PYTORCH_MODELS_MIRROR=...
|
|
||||||
ARG PYTORCH_MODELS_MIRROR=
|
ARG PYTORCH_MODELS_MIRROR=
|
||||||
ARG PYTORCH_MODELS_URL=
|
ARG PYTORCH_MODELS_URL=
|
||||||
ENV PYTORCH_MODELS_MIRROR=${PYTORCH_MODELS_MIRROR} \
|
ENV PYTORCH_MODELS_MIRROR=${PYTORCH_MODELS_MIRROR} \
|
||||||
PYTORCH_MODELS_URL=${PYTORCH_MODELS_URL}
|
PYTORCH_MODELS_URL=${PYTORCH_MODELS_URL}
|
||||||
# Optional offline weight (only .gitkeep by default; add swin3d_t-7615ae03.pth before build if needed).
|
|
||||||
COPY weights ./weights/
|
|
||||||
|
|
||||||
|
# Do not cache-mount /app/.cache/torch: BuildKit cache mounts are not exported into the image.
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv sync --frozen --no-dev --no-compile --refresh-package numpy --refresh-package mediapipe && \
|
uv sync --frozen --no-dev --no-compile --refresh-package numpy --refresh-package mediapipe && \
|
||||||
.venv/bin/python -c "import alembic" && \
|
.venv/bin/python -c "import alembic" && \
|
||||||
.venv/bin/python -c "import numpy; import numpy.lib._index_tricks_impl" && \
|
.venv/bin/python -c "import numpy; import numpy.lib._index_tricks_impl" && \
|
||||||
.venv/bin/python -c "import mediapipe as mp; print('mediapipe', mp.__version__)" && \
|
.venv/bin/python -c "import mediapipe as mp; print('mediapipe', mp.__version__)" && \
|
||||||
mkdir -p /app/.cache/ultralytics && \
|
mkdir -p /app/.cache/ultralytics /app/.cache/torch/hub/checkpoints && \
|
||||||
PYTORCH_MODELS_LOCAL_PATH=/app/weights/swin3d_t-7615ae03.pth \
|
PYTORCH_MODELS_LOCAL_DIR=/app/weights \
|
||||||
.venv/bin/python scripts/bake_torch_hub_checkpoint.py && \
|
.venv/bin/python scripts/bake_pretrained_weights.py && \
|
||||||
|
test -s /app/.cache/torch/hub/checkpoints/swin3d_t-7615ae03.pth && \
|
||||||
|
test -s /app/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth && \
|
||||||
rm -rf /app/weights && \
|
rm -rf /app/weights && \
|
||||||
TORCH_HOME=/app/.cache/torch .venv/bin/python -c "from torchvision.models.video import Swin3D_T_Weights, swin3d_t; swin3d_t(weights=Swin3D_T_Weights.KINETICS400_V1); print('swin3d_t cached ok')" && \
|
chmod -R a+rX /app/.venv /app/algorithm_subprocesses && \
|
||||||
chmod -R a+rX /app/.venv /app/algorithm_subprocesses /app/.cache/torch /app/.cache/ultralytics
|
chmod -R a+rwX /app/.cache/torch /app/.cache/ultralytics
|
||||||
|
|
||||||
ENV PATH="/app/.venv/bin:$PATH"
|
ENV PATH="/app/.venv/bin:$PATH"
|
||||||
|
|
||||||
|
|||||||
212
backend/scripts/bake_pretrained_weights.py
Normal file
212
backend/scripts/bake_pretrained_weights.py
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Bake torchvision / torch.hub checkpoints into TORCH_HOME during Docker image build.
|
||||||
|
|
||||||
|
Runtime batch jobs and doctor identity must not download from pytorch.org when the
|
||||||
|
api container runs as a non-root compose user (read-only /app/.cache/torch).
|
||||||
|
|
||||||
|
Place optional offline copies under backend/weights/ before build:
|
||||||
|
- swin3d_t-7615ae03.pth (VideoSwin / Swin3D-T, ~110MB)
|
||||||
|
- resnet50-0676ba61.pth (doctor ReID backbone, ~98MB)
|
||||||
|
|
||||||
|
Overrides: PYTORCH_MODELS_MIRROR (mirror prefix; "/models/<file>" is appended to it) and PYTORCH_MODELS_URL (one full URL, honoured for swin3d_t only; per-file URLs are not supported).
|
||||||
|
Local sources: PYTORCH_MODELS_LOCAL_DIR (directory searched by checkpoint filename); the legacy single-file PYTORCH_MODELS_LOCAL_PATH applies only to swin3d_t.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Upstream torchvision checkpoint location. The value already ends in "/models".
OFFICIAL_PREFIX = "https://download.pytorch.org/models"
|
||||||
|
# uv.agentsmirror.com location (presumably a mirror of the same tree -- confirm).
# This value also already ends in "/models".
AGENTSMIRROR_PREFIX = "https://uv.agentsmirror.com/download.pytorch.org/models"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class HubCheckpoint:
    """Immutable description of one hub checkpoint file handled by this script.

    Instances are frozen, so attributes cannot be reassigned after construction.
    """

    # File name of the checkpoint, e.g. "resnet50-0676ba61.pth".
    filename: str
    # Smallest size in bytes that is accepted as a complete file.
    min_bytes: int
    # Human-readable name used in log messages.
    label: str
|
||||||
|
|
||||||
|
|
||||||
|
# Every torchvision hub file that production code paths download at runtime
# (as of the 5.15 bundle). Arguments are (filename, min_bytes, label).
HUB_CHECKPOINTS: tuple[HubCheckpoint, ...] = (
    HubCheckpoint("swin3d_t-7615ae03.pth", 100_000_000, "VideoSwin Swin3D-T (Kinetics-400)"),
    HubCheckpoint("resnet50-0676ba61.pth", 90_000_000, "doctor ReID ResNet50 (ImageNet-1K)"),
)
|
||||||
|
|
||||||
|
|
||||||
|
def _candidate_urls(filename: str) -> list[str]:
    """Return the ordered, de-duplicated download URLs to try for ``filename``.

    Order of preference:

    1. ``PYTORCH_MODELS_URL`` -- legacy single-URL override. It is honoured only
       for the VideoSwin checkpoint and, when set, is the sole candidate for it.
    2. ``PYTORCH_MODELS_MIRROR`` -- optional user-supplied mirror prefix.
    3. ``AGENTSMIRROR_PREFIX`` -- the built-in mirror.
    4. ``OFFICIAL_PREFIX`` -- the upstream location, always last.

    A mirror prefix may be written with or without a trailing ``/models``
    segment; both spellings produce ``<prefix>/models/<filename>``.
    """
    explicit = (os.environ.get("PYTORCH_MODELS_URL") or "").strip()
    # Legacy single-URL override applies only to VideoSwin; other files use the mirror list.
    if explicit and filename == "swin3d_t-7615ae03.pth":
        return [explicit]

    mirror = (os.environ.get("PYTORCH_MODELS_MIRROR") or "").strip()
    urls: list[str] = []
    for prefix in (mirror, AGENTSMIRROR_PREFIX):
        base = prefix.rstrip("/")
        if not base:
            continue
        # Drop an existing "/models" suffix so it is never emitted twice
        # ("<mirror>/models/models/<file>" does not exist on any mirror).
        if base.endswith("/models"):
            base = base[: -len("/models")]
        urls.append(f"{base}/models/{filename}")
    urls.append(f"{OFFICIAL_PREFIX}/{filename}")

    # dict preserves insertion order, so this de-duplicates without reordering.
    return list(dict.fromkeys(urls))
|
||||||
|
|
||||||
|
|
||||||
|
def _local_source(filename: str) -> Path | None:
|
||||||
|
legacy = (os.environ.get("PYTORCH_MODELS_LOCAL_PATH") or "").strip()
|
||||||
|
if legacy and filename == "swin3d_t-7615ae03.pth":
|
||||||
|
path = Path(legacy)
|
||||||
|
if path.is_file():
|
||||||
|
return path
|
||||||
|
|
||||||
|
local_dir = (os.environ.get("PYTORCH_MODELS_LOCAL_DIR") or "").strip()
|
||||||
|
if local_dir:
|
||||||
|
path = Path(local_dir) / filename
|
||||||
|
if path.is_file():
|
||||||
|
return path
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _copy_local(src: Path, dest: Path, *, min_bytes: int) -> None:
|
||||||
|
size = src.stat().st_size
|
||||||
|
if size < min_bytes:
|
||||||
|
raise OSError(f"local file too small ({size} bytes): {src}")
|
||||||
|
dest.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
shutil.copy2(src, dest)
|
||||||
|
|
||||||
|
|
||||||
|
def _download_with_curl(url: str, dest: Path, *, min_bytes: int) -> None:
|
||||||
|
curl = shutil.which("curl")
|
||||||
|
if curl is None:
|
||||||
|
raise RuntimeError("curl not found")
|
||||||
|
tmp = dest.with_suffix(dest.suffix + ".part")
|
||||||
|
tmp.unlink(missing_ok=True)
|
||||||
|
proc = subprocess.run(
|
||||||
|
[
|
||||||
|
curl,
|
||||||
|
"-fL",
|
||||||
|
"--retry",
|
||||||
|
"5",
|
||||||
|
"--retry-all-errors",
|
||||||
|
"--retry-delay",
|
||||||
|
"3",
|
||||||
|
"--connect-timeout",
|
||||||
|
"30",
|
||||||
|
"--max-time",
|
||||||
|
"1800",
|
||||||
|
"-o",
|
||||||
|
str(tmp),
|
||||||
|
url,
|
||||||
|
],
|
||||||
|
check=False,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
)
|
||||||
|
if proc.returncode != 0:
|
||||||
|
tmp.unlink(missing_ok=True)
|
||||||
|
detail = (proc.stderr or proc.stdout or "").strip() or f"exit={proc.returncode}"
|
||||||
|
raise RuntimeError(detail)
|
||||||
|
size = tmp.stat().st_size
|
||||||
|
if size < min_bytes:
|
||||||
|
tmp.unlink(missing_ok=True)
|
||||||
|
raise OSError(f"download too small ({size} bytes)")
|
||||||
|
tmp.replace(dest)
|
||||||
|
|
||||||
|
|
||||||
|
def _download_with_torch(url: str, dest: Path, *, min_bytes: int) -> None:
|
||||||
|
import torch
|
||||||
|
|
||||||
|
dest.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
torch.hub.download_url_to_file(url, str(dest), progress=True)
|
||||||
|
size = dest.stat().st_size
|
||||||
|
if size < min_bytes:
|
||||||
|
dest.unlink(missing_ok=True)
|
||||||
|
raise OSError(f"download too small ({size} bytes)")
|
||||||
|
|
||||||
|
|
||||||
|
def bake_hub_checkpoint(
    spec: HubCheckpoint,
    *,
    torch_home: Path,
) -> None:
    """Ensure ``spec`` exists under ``<torch_home>/hub/checkpoints``.

    Sources are tried in this order, stopping at the first success:

    1. A file already at the destination that is at least ``spec.min_bytes``.
    2. A local copy found by ``_local_source``.
    3. Each URL from ``_candidate_urls``, fetched with curl.
    4. The official URL, fetched through torch.hub.

    Raises:
        SystemExit: with code 1 when every source failed; the collected
            per-source errors are printed to stderr first.
    """
    dest = torch_home / "hub" / "checkpoints" / spec.filename
    if dest.is_file() and dest.stat().st_size >= spec.min_bytes:
        print(f"already baked [{spec.label}]: {dest} ({dest.stat().st_size} bytes)")
        return

    local = _local_source(spec.filename)
    if local is not None:
        try:
            _copy_local(local, dest, min_bytes=spec.min_bytes)
            print(f"baked [{spec.label}] {dest} ({dest.stat().st_size} bytes) from local {local}")
            return
        except OSError as exc:
            # A bad local copy is not fatal; fall through to the network sources.
            print(f"local copy failed for {spec.filename}: {exc}", file=sys.stderr)

    official = f"{OFFICIAL_PREFIX}/{spec.filename}"
    # (error tag, banner printed before the attempt, URL, downloader)
    attempts = [
        ("curl", f"downloading [{spec.label}] {url}", url, _download_with_curl)
        for url in _candidate_urls(spec.filename)
    ]
    attempts.append(
        ("torch", f"torch.hub fallback [{spec.label}]: {official}", official, _download_with_torch)
    )

    errors: list[str] = []
    for tag, banner, url, fetch in attempts:
        try:
            print(banner)
            fetch(url, dest, min_bytes=spec.min_bytes)
            print(f"baked [{spec.label}] {dest} ({dest.stat().st_size} bytes) from {url}")
            return
        except (OSError, RuntimeError) as exc:
            errors.append(f"{tag} {url}: {exc}")
            # Never leave a partial or undersized file at the destination.
            dest.unlink(missing_ok=True)

    print(f"failed to bake {spec.label} ({spec.filename}):", file=sys.stderr)
    for line in errors:
        print(f" - {line}", file=sys.stderr)
    raise SystemExit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def warm_torchvision_hub_models() -> None:
    """Instantiate both models with their pretrained weights.

    ResNet50 (IMAGENET1K_V1) is built first, then Swin3D-T (KINETICS400_V1);
    a confirmation line is printed after each one.
    """
    from torchvision.models import ResNet50_Weights, resnet50
    from torchvision.models.video import Swin3D_T_Weights, swin3d_t

    checks = (
        (resnet50, ResNet50_Weights.IMAGENET1K_V1, "resnet50 IMAGENET1K_V1 ok"),
        (swin3d_t, Swin3D_T_Weights.KINETICS400_V1, "swin3d_t KINETICS400_V1 ok"),
    )
    for build, weights, confirmation in checks:
        build(weights=weights)
        print(confirmation)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Bake every entry of ``HUB_CHECKPOINTS``, then load the models once.

    The cache root is taken from ``TORCH_HOME`` and defaults to
    ``/app/.cache/torch``.

    Returns:
        0 when this function runs to completion.
    """
    cache_root = Path(os.getenv("TORCH_HOME", "/app/.cache/torch"))
    for checkpoint in HUB_CHECKPOINTS:
        bake_hub_checkpoint(checkpoint, torch_home=cache_root)
    warm_torchvision_hub_models()
    print("all pretrained hub weights baked")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||||
@@ -1,150 +1,11 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Bake torchvision hub checkpoints into TORCH_HOME during Docker image build."""
|
"""Backward-compatible entrypoint; bakes all hub weights (see bake_pretrained_weights.py)."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import runpy
|
||||||
import shutil
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
CHECKPOINT = "swin3d_t-7615ae03.pth"
|
|
||||||
OFFICIAL_URL = f"https://download.pytorch.org/models/{CHECKPOINT}"
|
|
||||||
AGENTSMIRROR_URL = f"https://uv.agentsmirror.com/download.pytorch.org/models/{CHECKPOINT}"
|
|
||||||
MIN_BYTES = 100_000_000
|
|
||||||
|
|
||||||
|
|
||||||
def _candidate_urls() -> list[str]:
|
|
||||||
explicit = (os.environ.get("PYTORCH_MODELS_URL") or "").strip()
|
|
||||||
if explicit:
|
|
||||||
return [explicit]
|
|
||||||
|
|
||||||
raw = (os.environ.get("PYTORCH_MODELS_MIRROR") or "").strip().rstrip("/")
|
|
||||||
prefixes = [p for p in (raw, "https://uv.agentsmirror.com/download.pytorch.org") if p]
|
|
||||||
urls: list[str] = []
|
|
||||||
for prefix in prefixes:
|
|
||||||
urls.append(f"{prefix}/models/{CHECKPOINT}")
|
|
||||||
urls.extend([AGENTSMIRROR_URL, OFFICIAL_URL])
|
|
||||||
|
|
||||||
seen: set[str] = set()
|
|
||||||
ordered: list[str] = []
|
|
||||||
for url in urls:
|
|
||||||
if url in seen:
|
|
||||||
continue
|
|
||||||
seen.add(url)
|
|
||||||
ordered.append(url)
|
|
||||||
return ordered
|
|
||||||
|
|
||||||
|
|
||||||
def _copy_local(src: Path, dest: Path) -> None:
|
|
||||||
size = src.stat().st_size
|
|
||||||
if size < MIN_BYTES:
|
|
||||||
raise OSError(f"local file too small ({size} bytes): {src}")
|
|
||||||
dest.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
shutil.copy2(src, dest)
|
|
||||||
|
|
||||||
|
|
||||||
def _download_with_curl(url: str, dest: Path) -> None:
|
|
||||||
curl = shutil.which("curl")
|
|
||||||
if curl is None:
|
|
||||||
raise RuntimeError("curl not found")
|
|
||||||
tmp = dest.with_suffix(dest.suffix + ".part")
|
|
||||||
tmp.unlink(missing_ok=True)
|
|
||||||
proc = subprocess.run(
|
|
||||||
[
|
|
||||||
curl,
|
|
||||||
"-fL",
|
|
||||||
"--retry",
|
|
||||||
"5",
|
|
||||||
"--retry-all-errors",
|
|
||||||
"--retry-delay",
|
|
||||||
"3",
|
|
||||||
"--connect-timeout",
|
|
||||||
"30",
|
|
||||||
"--max-time",
|
|
||||||
"1800",
|
|
||||||
"-o",
|
|
||||||
str(tmp),
|
|
||||||
url,
|
|
||||||
],
|
|
||||||
check=False,
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
)
|
|
||||||
if proc.returncode != 0:
|
|
||||||
tmp.unlink(missing_ok=True)
|
|
||||||
detail = (proc.stderr or proc.stdout or "").strip() or f"exit={proc.returncode}"
|
|
||||||
raise RuntimeError(detail)
|
|
||||||
size = tmp.stat().st_size
|
|
||||||
if size < MIN_BYTES:
|
|
||||||
tmp.unlink(missing_ok=True)
|
|
||||||
raise OSError(f"download too small ({size} bytes)")
|
|
||||||
tmp.replace(dest)
|
|
||||||
|
|
||||||
|
|
||||||
def _download_with_torch(url: str, dest: Path) -> None:
|
|
||||||
import torch
|
|
||||||
|
|
||||||
dest.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
torch.hub.download_url_to_file(url, str(dest), progress=True)
|
|
||||||
size = dest.stat().st_size
|
|
||||||
if size < MIN_BYTES:
|
|
||||||
dest.unlink(missing_ok=True)
|
|
||||||
raise OSError(f"download too small ({size} bytes)")
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
|
|
||||||
torch_home = Path(os.environ.get("TORCH_HOME", "/app/.cache/torch"))
|
|
||||||
dest = torch_home / "hub" / "checkpoints" / CHECKPOINT
|
|
||||||
if dest.is_file() and dest.stat().st_size >= MIN_BYTES:
|
|
||||||
print(f"already baked: {dest} ({dest.stat().st_size} bytes)")
|
|
||||||
return 0
|
|
||||||
|
|
||||||
local_raw = (os.environ.get("PYTORCH_MODELS_LOCAL_PATH") or "").strip()
|
|
||||||
if local_raw:
|
|
||||||
local = Path(local_raw)
|
|
||||||
if local.is_file():
|
|
||||||
try:
|
|
||||||
_copy_local(local, dest)
|
|
||||||
print(f"baked {dest} ({dest.stat().st_size} bytes) from local {local}")
|
|
||||||
return 0
|
|
||||||
except OSError as exc:
|
|
||||||
print(f"local copy failed: {exc}", file=sys.stderr)
|
|
||||||
|
|
||||||
errors: list[str] = []
|
|
||||||
for url in _candidate_urls():
|
|
||||||
try:
|
|
||||||
print(f"downloading {url}")
|
|
||||||
_download_with_curl(url, dest)
|
|
||||||
print(f"baked {dest} ({dest.stat().st_size} bytes) from {url}")
|
|
||||||
return 0
|
|
||||||
except (OSError, RuntimeError) as exc:
|
|
||||||
errors.append(f"curl {url}: {exc}")
|
|
||||||
dest.unlink(missing_ok=True)
|
|
||||||
|
|
||||||
for url in (OFFICIAL_URL,):
|
|
||||||
try:
|
|
||||||
print(f"torch.hub fallback: {url}")
|
|
||||||
_download_with_torch(url, dest)
|
|
||||||
print(f"baked {dest} ({dest.stat().st_size} bytes) from {url}")
|
|
||||||
return 0
|
|
||||||
except (OSError, RuntimeError) as exc:
|
|
||||||
errors.append(f"torch {url}: {exc}")
|
|
||||||
dest.unlink(missing_ok=True)
|
|
||||||
|
|
||||||
print("failed to bake VideoSwin checkpoint:", file=sys.stderr)
|
|
||||||
for line in errors:
|
|
||||||
print(f" - {line}", file=sys.stderr)
|
|
||||||
print(
|
|
||||||
"hint: domestic PyPI mirrors (NJU/Tsinghua/Aliyun) do not sync /models/*.pth; "
|
|
||||||
"pre-download once and set PYTORCH_MODELS_LOCAL_PATH, or pass "
|
|
||||||
"PYTORCH_MODELS_URL / --build-arg PYTORCH_MODELS_MIRROR to a mirror that hosts "
|
|
||||||
f"/models/{CHECKPOINT}",
|
|
||||||
file=sys.stderr,
|
|
||||||
)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
raise SystemExit(main())
|
target = Path(__file__).resolve().parent / "bake_pretrained_weights.py"
|
||||||
|
runpy.run_path(str(target), run_name="__main__")
|
||||||
|
|||||||
@@ -1,5 +1,8 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
# Clear BuildKit cache and rebuild the API image (fixes corrupted layer / unpigz errors).
|
# Clear BuildKit cache and rebuild the API image.
|
||||||
|
# Fixes export errors such as:
|
||||||
|
# archive/tar: invalid tar header
|
||||||
|
# unpigz: corrupted -- invalid deflate data
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
cd "$(dirname "$0")/.."
|
cd "$(dirname "$0")/.."
|
||||||
|
|
||||||
@@ -8,12 +11,20 @@ docker builder prune -af
|
|||||||
docker buildx prune -af 2>/dev/null || true
|
docker buildx prune -af 2>/dev/null || true
|
||||||
docker rmi -f backend-api:latest 2>/dev/null || true
|
docker rmi -f backend-api:latest 2>/dev/null || true
|
||||||
|
|
||||||
|
if [[ "${AGGRESSIVE_PRUNE:-0}" == "1" ]]; then
|
||||||
|
echo "Aggressive prune (dangling images + build cache)..."
|
||||||
|
docker system prune -af 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ "${RESTART_DOCKER:-0}" == "1" ]]; then
|
if [[ "${RESTART_DOCKER:-0}" == "1" ]]; then
|
||||||
echo "Restarting Docker..."
|
echo "Restarting Docker..."
|
||||||
sudo systemctl restart docker
|
sudo systemctl restart docker
|
||||||
|
sleep 2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Building api image (--no-cache)..."
|
echo "Building api image (--no-cache, no attestations)..."
|
||||||
docker compose build api --no-cache
|
export DOCKER_BUILDKIT=1
|
||||||
|
export COMPOSE_BAKE=false
|
||||||
|
docker compose build api --no-cache --provenance=false --sbom=false
|
||||||
|
|
||||||
echo "Done. Recreate container: docker compose up -d --force-recreate api"
|
echo "Done. Recreate container: docker compose up -d --force-recreate api"
|
||||||
|
|||||||
67
backend/tests/test_bake_pretrained_weights.py
Normal file
67
backend/tests/test_bake_pretrained_weights.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import importlib.util
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def _load_module():
    """Import ``scripts/bake_pretrained_weights.py`` from its file path and return it.

    The module is registered in ``sys.modules`` under the name
    ``bake_pretrained_weights`` before its code is executed.
    """
    import sys

    script = Path(__file__).resolve().parents[1] / "scripts" / "bake_pretrained_weights.py"
    module_spec = importlib.util.spec_from_file_location("bake_pretrained_weights", script)
    module = importlib.util.module_from_spec(module_spec)
    assert module_spec.loader is not None
    sys.modules[module_spec.name] = module
    module_spec.loader.exec_module(module)
    return module
|
||||||
|
|
||||||
|
|
||||||
|
def test_candidate_urls_default(monkeypatch: pytest.MonkeyPatch) -> None:
    """Without overrides, the first URL ends in /models/<file> and the last is upstream."""
    bake = _load_module()
    for name in ("PYTORCH_MODELS_URL", "PYTORCH_MODELS_MIRROR"):
        monkeypatch.delenv(name, raising=False)

    candidates = bake._candidate_urls("resnet50-0676ba61.pth")

    assert candidates[0].endswith("/models/resnet50-0676ba61.pth")
    assert candidates[-1] == f"{bake.OFFICIAL_PREFIX}/resnet50-0676ba61.pth"
|
||||||
|
|
||||||
|
|
||||||
|
def test_candidate_urls_explicit_override_applies_to_swin_only(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """PYTORCH_MODELS_URL replaces the candidate list for swin3d_t and nothing else."""
    mod = _load_module()
    monkeypatch.setenv("PYTORCH_MODELS_URL", "https://example.com/swin3d_t-7615ae03.pth")
    # An ambient mirror setting decides the first resnet candidate; clear it so the
    # last assertion depends only on the override under test.
    monkeypatch.delenv("PYTORCH_MODELS_MIRROR", raising=False)
    assert mod._candidate_urls("swin3d_t-7615ae03.pth") == [
        "https://example.com/swin3d_t-7615ae03.pth"
    ]
    assert "example.com" not in mod._candidate_urls("resnet50-0676ba61.pth")[0]
|
||||||
|
|
||||||
|
|
||||||
|
def test_local_source_prefers_legacy_swin_path(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """PYTORCH_MODELS_LOCAL_PATH resolves for swin3d_t and is ignored for other files."""
    mod = _load_module()
    swin = tmp_path / "swin3d_t-7615ae03.pth"
    # _local_source only checks that the path is a regular file, so a one-byte
    # stand-in is enough; there is no need to write a checkpoint-sized file.
    swin.write_bytes(b"x")
    monkeypatch.setenv("PYTORCH_MODELS_LOCAL_PATH", str(swin))
    # An ambient PYTORCH_MODELS_LOCAL_DIR could supply a resnet file and break the
    # "is None" expectation below.
    monkeypatch.delenv("PYTORCH_MODELS_LOCAL_DIR", raising=False)
    assert mod._local_source("swin3d_t-7615ae03.pth") == swin
    assert mod._local_source("resnet50-0676ba61.pth") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_local_source_uses_local_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A file inside PYTORCH_MODELS_LOCAL_DIR is found by its checkpoint filename."""
    bake = _load_module()
    weights_dir = tmp_path / "weights"
    weights_dir.mkdir()
    expected = weights_dir / "resnet50-0676ba61.pth"
    expected.write_bytes(b"x")
    monkeypatch.setenv("PYTORCH_MODELS_LOCAL_DIR", str(weights_dir))

    found = bake._local_source("resnet50-0676ba61.pth")

    assert found == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_bake_skips_when_dest_already_valid(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
    """A destination file that already meets min_bytes is left alone: no copy, no download."""
    mod = _load_module()
    monkeypatch.setenv("TORCH_HOME", str(tmp_path / "torch"))

    # Same filename/label as the real resnet entry, but with a tiny size threshold
    # so the test does not have to write a checkpoint-sized file.
    payload = b"x" * 16
    real = mod.HUB_CHECKPOINTS[1]
    spec = mod.HubCheckpoint(filename=real.filename, min_bytes=len(payload), label=real.label)

    dest = tmp_path / "torch" / "hub" / "checkpoints" / spec.filename
    dest.parent.mkdir(parents=True)
    dest.write_bytes(payload)

    def _forbidden(*args, **kwargs):
        raise AssertionError("bake_hub_checkpoint must not fetch when dest is already valid")

    # If the skip path regresses, fail loudly instead of starting a real transfer.
    for name in ("_copy_local", "_download_with_curl", "_download_with_torch"):
        monkeypatch.setattr(mod, name, _forbidden)

    mod.bake_hub_checkpoint(spec, torch_home=tmp_path / "torch")

    assert dest.read_bytes() == payload
|
||||||
@@ -1,28 +1,13 @@
|
|||||||
|
"""Legacy entrypoint must delegate to bake_pretrained_weights."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import importlib.util
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def _load_module():
|
def test_bake_torch_hub_checkpoint_delegates() -> None:
|
||||||
path = Path(__file__).resolve().parents[1] / "scripts" / "bake_torch_hub_checkpoint.py"
|
text = (
|
||||||
spec = importlib.util.spec_from_file_location("bake_torch_hub_checkpoint", path)
|
Path(__file__).resolve().parents[1] / "scripts" / "bake_torch_hub_checkpoint.py"
|
||||||
mod = importlib.util.module_from_spec(spec)
|
).read_text(encoding="utf-8")
|
||||||
assert spec.loader is not None
|
assert "bake_pretrained_weights.py" in text
|
||||||
spec.loader.exec_module(mod)
|
assert "runpy.run_path" in text
|
||||||
return mod
|
|
||||||
|
|
||||||
|
|
||||||
def test_candidate_urls_default(monkeypatch):
|
|
||||||
mod = _load_module()
|
|
||||||
monkeypatch.delenv("PYTORCH_MODELS_URL", raising=False)
|
|
||||||
monkeypatch.delenv("PYTORCH_MODELS_MIRROR", raising=False)
|
|
||||||
urls = mod._candidate_urls()
|
|
||||||
assert urls[0].startswith("https://uv.agentsmirror.com/download.pytorch.org/models/")
|
|
||||||
assert urls[-1] == mod.OFFICIAL_URL
|
|
||||||
|
|
||||||
|
|
||||||
def test_candidate_urls_explicit_override(monkeypatch):
|
|
||||||
mod = _load_module()
|
|
||||||
monkeypatch.setenv("PYTORCH_MODELS_URL", "https://example.com/swin3d_t-7615ae03.pth")
|
|
||||||
assert mod._candidate_urls() == ["https://example.com/swin3d_t-7615ae03.pth"]
|
|
||||||
|
|||||||
@@ -0,0 +1,18 @@
|
|||||||
|
# Optional offline assets for Docker build (see scripts/bake_pretrained_weights.py).
|
||||||
|
# If a file exists here, image build copies it instead of downloading.
|
||||||
|
#
|
||||||
|
# Torchvision hub (→ /app/.cache/torch/hub/checkpoints/):
|
||||||
|
# swin3d_t-7615ae03.pth (~110MB) VideoSwin / batch Phase1
|
||||||
|
# resnet50-0676ba61.pth (~98MB) doctor ReID backbone (ImageNet)
|
||||||
|
#
|
||||||
|
# MediaPipe (→ algorithm_subprocesses/5.15/doctor_identity_package/.mediapipe_models/):
|
||||||
|
# pose_landmarker_lite.task (~6MB)
|
||||||
|
#
|
||||||
|
# wget https://download.pytorch.org/models/swin3d_t-7615ae03.pth
|
||||||
|
# wget https://download.pytorch.org/models/resnet50-0676ba61.pth
|
||||||
|
# curl -fsSL -o pose_landmarker_lite.task \
|
||||||
|
# "https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_lite/float16/1/pose_landmarker_lite.task"
|
||||||
|
#
|
||||||
|
# Bundled in image via COPY (not downloaded at build):
|
||||||
|
# algorithm_subprocesses/5.15/weights/*.pt, actionformer_epoch_045.pth.tar
|
||||||
|
# doctor_identity_package/doctor_info.pth
|
||||||
|
|||||||
BIN
backend/weights/pose_landmarker_lite.task
Normal file
BIN
backend/weights/pose_landmarker_lite.task
Normal file
Binary file not shown.
BIN
backend/weights/resnet50-0676ba61.pth
Normal file
BIN
backend/weights/resnet50-0676ba61.pth
Normal file
Binary file not shown.
BIN
backend/weights/swin3d_t-7615ae03.pth.1
Normal file
BIN
backend/weights/swin3d_t-7615ae03.pth.1
Normal file
Binary file not shown.
@@ -28,7 +28,12 @@ operation-room-monitor/
|
|||||||
- 算法子进程包:`backend/algorithm_subprocesses/5.15/`(含 `main.py` 与 `weights/`;镜像构建时会 `COPY` 进容器,勿在 `.dockerignore` 中整目录排除)
|
- 算法子进程包:`backend/algorithm_subprocesses/5.15/`(含 `main.py` 与 `weights/`;镜像构建时会 `COPY` 进容器,勿在 `.dockerignore` 中整目录排除)
|
||||||
- 标注视频中文字体:镜像内已安装 `fonts-noto-cjk`、`fonts-wqy-microhei`(供 `visualize_result_video.py` 绘制耗材标签)
|
- 标注视频中文字体:镜像内已安装 `fonts-noto-cjk`、`fonts-wqy-microhei`(供 `visualize_result_video.py` 绘制耗材标签)
|
||||||
- 医生识别(MediaPipe Pose):镜像内已安装 `libgles2`、`libegl1`、`libegl-mesa0`、`libglx-mesa0`、`libgl1-mesa-dri` 等 Mesa/GLVND 库;构建阶段会 `import mediapipe` 校验 `libGLESv2.so.2` 可用。子进程强制 CPU delegate。若仍见该错误,请 **`docker compose build --no-cache api`** 后重启(勿沿用旧 tarball 镜像)
|
- 医生识别(MediaPipe Pose):镜像内已安装 `libgles2`、`libegl1`、`libegl-mesa0`、`libglx-mesa0`、`libgl1-mesa-dri` 等 Mesa/GLVND 库;构建阶段会 `import mediapipe` 校验 `libGLESv2.so.2` 可用。子进程强制 CPU delegate。若仍见该错误,请 **`docker compose build --no-cache api`** 后重启(勿沿用旧 tarball 镜像)
|
||||||
- 可选备用权重:`backend/app/resources/actionformer_epoch_045.pth.tar`
|
- **构建时预下载的预训练资源**(`scripts/bake_pretrained_weights.py`,本地有则跳过,见 `backend/weights/.gitkeep`):
|
||||||
|
- `swin3d_t-7615ae03.pth`(VideoSwin / batch Phase1)
|
||||||
|
- `resnet50-0676ba61.pth`(医生 ReID 骨干)
|
||||||
|
- `pose_landmarker_lite.task`(MediaPipe Pose)
|
||||||
|
- 业务权重随 `algorithm_subprocesses/5.15/weights/` 打进镜像(YOLO / ActionFormer 等,非 hub 下载)
|
||||||
|
- 可选备用:`backend/app/resources/actionformer_epoch_045.pth.tar`
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -58,31 +63,6 @@ docker compose down
|
|||||||
docker compose down -v # 删除 PostgreSQL / MinIO 卷
|
docker compose down -v # 删除 PostgreSQL / MinIO 卷
|
||||||
```
|
```
|
||||||
|
|
||||||
### 构建 API 镜像失败:`invalid tar header` / `unpigz: corrupted`
|
|
||||||
|
|
||||||
`uv sync` 已成功,但在 **exporting / unpacking** 阶段报错时,通常是 **Docker 本地层缓存或存储损坏**,与 Dockerfile 无关。
|
|
||||||
|
|
||||||
按顺序处理:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd backend
|
|
||||||
chmod +x scripts/rebuild-api-image.sh
|
|
||||||
|
|
||||||
# 清缓存并重建(推荐)
|
|
||||||
./scripts/rebuild-api-image.sh
|
|
||||||
|
|
||||||
# 仍失败时:重启 Docker 后再跑
|
|
||||||
RESTART_DOCKER=1 ./scripts/rebuild-api-image.sh
|
|
||||||
|
|
||||||
# 再失败:改用旧版构建器(无 BuildKit)
|
|
||||||
COMPOSE_DOCKER_CLI_BUILD=0 DOCKER_BUILDKIT=0 docker compose build api --no-cache
|
|
||||||
docker compose up -d --force-recreate api
|
|
||||||
```
|
|
||||||
|
|
||||||
手动等价步骤:`docker builder prune -af` → `docker rmi -f backend-api:latest` → `docker compose build api --no-cache`。
|
|
||||||
|
|
||||||
确认根分区剩余空间充足(建议 ≥ 20GB);空间不足时大层导出也容易损坏。
|
|
||||||
|
|
||||||
### RTSP 切片在宿主机无法用 VLC 打开
|
### RTSP 切片在宿主机无法用 VLC 打开
|
||||||
|
|
||||||
默认情况下 API 容器以 **root** 写入 `./logs`,切片属主为 `root:root`。普通用户虽可用 `cat` 读取,但 **Snap 版 VLC** 等沙箱应用常会报 Permission denied。
|
默认情况下 API 容器以 **root** 写入 `./logs`,切片属主为 `root:root`。普通用户虽可用 `cat` 读取,但 **Snap 版 VLC** 等沙箱应用常会报 Permission denied。
|
||||||
|
|||||||
Reference in New Issue
Block a user