Align API container UID with host and harden RTSP slice readiness.
Run compose api as HOST_UID/GID with cache under /tmp, poll slice files for ready_event when ffmpeg stderr is silent, invoke batch via venv python, exclude logs from build context, and document Docker cache/VLC troubleshooting. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -85,3 +85,5 @@ clients/voice-confirmation/dist/
|
||||
logs/
|
||||
Ultralytics/
|
||||
scripts/
|
||||
!backend/scripts/
|
||||
!backend/scripts/rebuild-api-image.sh
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
.git
|
||||
.gitignore
|
||||
.venv
|
||||
logs
|
||||
logs/
|
||||
__pycache__
|
||||
*.py[cod]
|
||||
.pytest_cache
|
||||
|
||||
@@ -10,6 +10,11 @@ POSTGRES_PORT=45432
|
||||
DOCKER_POSTGRES_PUBLISH_PORT=45432
|
||||
# DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:45432/operation_room
|
||||
|
||||
# --- API 容器用户(与宿主机一致,logs/rtsp_segments 切片可在宿主机直接用 VLC 打开)---
|
||||
HOST_UID=1000
|
||||
HOST_GID=1000
|
||||
DOCKER_GID=999
|
||||
|
||||
# --- 手术室站点配置(严格 JSON:仅含 video_rtsp_urls + voice_or_room_bindings)---
|
||||
# 示例:app/resources/or_site_config.sample.json。一键联调会向该文件合并更新 video_rtsp_urls,须可写。
|
||||
OR_SITE_CONFIG_JSON_FILE=app/resources/or_site_config.sample.json
|
||||
|
||||
@@ -23,6 +23,15 @@ POSTGRES_PORT=45432
|
||||
# 可选:映射 PostgreSQL 到宿主机端口(默认 45432),仅 DBA/调试用途
|
||||
# DOCKER_POSTGRES_PUBLISH_PORT=45432
|
||||
|
||||
# --- API 容器用户(与宿主机一致,便于直接打开 logs/rtsp_segments 下的 slice MP4)---
|
||||
# 填写 id -u / id -g / getent group docker | cut -d: -f3 的输出;改后需 docker compose up -d --force-recreate api
|
||||
HOST_UID=1000
|
||||
HOST_GID=1000
|
||||
DOCKER_GID=999
|
||||
# 非 root 运行时 uv/torch 缓存目录(compose 内已设为 /tmp/*,一般无需改)
|
||||
# UV_CACHE_DIR=/tmp/uv-cache
|
||||
# TORCH_HOME=/tmp/torch-cache
|
||||
|
||||
# --- HTTP(API 对外端口)---
|
||||
# 局域网语音确认终端 / Demo 客户端访问 API 时,填写
|
||||
# http://<GPU服务器局域网IP>:38080 作为服务端 Base URL。
|
||||
|
||||
@@ -26,10 +26,10 @@ def build_reference_env() -> dict[str, str]:
|
||||
|
||||
|
||||
def build_batch_main_command(*, bundle_dir: Path, config_path: Path) -> list[str]:
|
||||
# Use the image venv interpreter directly. ``uv run`` would try to update /app/uv.lock,
|
||||
# which is root-owned in the image and fails under compose ``user: HOST_UID``.
|
||||
return [
|
||||
"uv",
|
||||
"run",
|
||||
"python",
|
||||
sys.executable,
|
||||
"-X",
|
||||
"faulthandler",
|
||||
str(bundle_dir / "main.py"),
|
||||
|
||||
@@ -13,6 +13,10 @@ from loguru import logger
|
||||
|
||||
from app.services.video.rtsp_ffmpeg_opts import ffmpeg_bin, rtsp_ffmpeg_input_opts
|
||||
|
||||
# First-slice readiness: output file must reach this size (start_surgery waits on ready_event).
|
||||
_READY_MIN_OUTPUT_BYTES = 4096
|
||||
_READY_POLL_INTERVAL_SEC = 0.25
|
||||
|
||||
SegmentCallback = Callable[
|
||||
["SegmentCompleteEvent"],
|
||||
Awaitable[None],
|
||||
@@ -126,7 +130,7 @@ class RtspSegmentRecorder:
|
||||
while not stop_event.is_set():
|
||||
output_path = self._output_dir / f"slice_{self._slice_index:04d}.mp4"
|
||||
duration = self._segment_duration_sec
|
||||
proc, stderr_task = await self._start_ffmpeg(output_path, duration)
|
||||
proc, stderr_task, ready_task = await self._start_ffmpeg(output_path, duration)
|
||||
try:
|
||||
await self._wait_ffmpeg(
|
||||
proc,
|
||||
@@ -142,10 +146,11 @@ class RtspSegmentRecorder:
|
||||
except asyncio.TimeoutError:
|
||||
proc.kill()
|
||||
await proc.wait()
|
||||
if not stderr_task.done():
|
||||
stderr_task.cancel()
|
||||
for task in (stderr_task, ready_task):
|
||||
if task is not None and not task.done():
|
||||
task.cancel()
|
||||
try:
|
||||
await stderr_task
|
||||
await task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
@@ -172,7 +177,7 @@ class RtspSegmentRecorder:
|
||||
self,
|
||||
output_path: Path,
|
||||
duration_sec: float,
|
||||
) -> tuple[asyncio.subprocess.Process, asyncio.Task[None]]:
|
||||
) -> tuple[asyncio.subprocess.Process, asyncio.Task[None], asyncio.Task[None] | None]:
|
||||
cmd = _build_ffmpeg_cmd(
|
||||
rtsp_url=self._rtsp_url,
|
||||
output_path=output_path,
|
||||
@@ -193,13 +198,39 @@ class RtspSegmentRecorder:
|
||||
stdout=asyncio.subprocess.DEVNULL,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
stderr_task = asyncio.create_task(self._watch_stderr(proc, output_path))
|
||||
return proc, stderr_task
|
||||
stderr_task = asyncio.create_task(self._watch_stderr(proc))
|
||||
ready_task: asyncio.Task[None] | None = None
|
||||
if self._ready_event is not None and not self._ready_event.is_set():
|
||||
ready_task = asyncio.create_task(self._poll_ready(output_path, proc))
|
||||
return proc, stderr_task, ready_task
|
||||
|
||||
@staticmethod
def _output_ready(path: Path) -> bool:
    """Tell whether the slice at *path* is large enough to count as ready.

    Returns ``True`` only when ``path.is_file()`` holds and the file size is at
    least ``_READY_MIN_OUTPUT_BYTES``. Any ``OSError`` raised while inspecting
    the file is reported as "not ready".
    """
    try:
        if not path.is_file():
            return False
        size_bytes = path.stat().st_size
    except OSError:
        return False
    return size_bytes >= _READY_MIN_OUTPUT_BYTES
|
||||
|
||||
async def _poll_ready(
    self,
    output_path: Path,
    proc: asyncio.subprocess.Process,
) -> None:
    """Set the ready event once the slice file is big enough, by polling the file.

    ffmpeg stderr is not consulted here (with ``-loglevel warning`` it is silent),
    so readiness is derived from the output file alone. Polling ends when the
    process has a return code, when there is no ready event or it is already
    set, or right after this coroutine sets it. Cancellation is absorbed so
    awaiting the task after ``cancel()`` returns normally.
    """
    try:
        while True:
            if proc.returncode is not None:
                # Process finished; nothing more will be written by it.
                break
            ready = self._ready_event
            if ready is None or ready.is_set():
                break
            if self._output_ready(output_path):
                ready.set()
                break
            await asyncio.sleep(_READY_POLL_INTERVAL_SEC)
    except asyncio.CancelledError:
        return
|
||||
|
||||
async def _watch_stderr(
|
||||
self,
|
||||
proc: asyncio.subprocess.Process,
|
||||
output_path: Path,
|
||||
) -> None:
|
||||
if proc.stderr is None:
|
||||
return
|
||||
@@ -219,9 +250,6 @@ class RtspSegmentRecorder:
|
||||
self._slice_index,
|
||||
text,
|
||||
)
|
||||
if self._ready_event is not None and not self._ready_event.is_set():
|
||||
if output_path.is_file() and output_path.stat().st_size >= 4096:
|
||||
self._ready_event.set()
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
|
||||
@@ -72,10 +72,21 @@ services:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
network: host
|
||||
# Match host uid/gid so bind-mounted ./logs (RTSP slices) is readable on the host (VLC, ffplay).
|
||||
user: "${HOST_UID:-1000}:${HOST_GID:-1000}"
|
||||
group_add:
|
||||
- "${DOCKER_GID:-999}"
|
||||
gpus: all
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
environment:
|
||||
HOST_UID: ${HOST_UID:-1000}
|
||||
HOST_GID: ${HOST_GID:-1000}
|
||||
DOCKER_GID: ${DOCKER_GID:-999}
|
||||
HOME: /tmp
|
||||
XDG_CACHE_HOME: /tmp
|
||||
UV_CACHE_DIR: /tmp/uv-cache
|
||||
TORCH_HOME: /tmp/torch-cache
|
||||
POSTGRES_USER: ${POSTGRES_USER:-postgres}
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres}
|
||||
POSTGRES_DB: ${POSTGRES_DB:-operation_room}
|
||||
@@ -127,7 +138,8 @@ services:
|
||||
DEMO_HLS_PREVIEW_CONTAINER_NAME: ${DEMO_HLS_PREVIEW_CONTAINER_NAME:-orm-mediamtx-hls}
|
||||
MEDIAMTX_DOCKER_IMAGE: ${MEDIAMTX_DOCKER_IMAGE:-m.daocloud.io/docker.io/bluenviron/mediamtx:latest}
|
||||
command: >
|
||||
sh -c "uv run --no-sync alembic upgrade head &&
|
||||
sh -c "mkdir -p /tmp/uv-cache /tmp/torch-cache &&
|
||||
uv run --no-sync alembic upgrade head &&
|
||||
uv run --no-sync uvicorn main:app --host 0.0.0.0 --port 8000"
|
||||
ports:
|
||||
- "${API_PORT:-38080}:8000"
|
||||
|
||||
19
backend/scripts/rebuild-api-image.sh
Executable file
19
backend/scripts/rebuild-api-image.sh
Executable file
@@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env bash
# Rebuild the API image from a clean slate: drop the BuildKit cache and the old
# image first (works around corrupted layer / unpigz errors).
# Set RESTART_DOCKER=1 to also restart the Docker daemon before building.
set -euo pipefail

# Work from the directory one level above this script.
script_dir="$(dirname "$0")"
cd "${script_dir}/.."

echo "Pruning BuildKit cache..."
docker builder prune -af
# Best effort: failures of these two commands are ignored.
docker buildx prune -af 2>/dev/null || true
docker rmi -f backend-api:latest 2>/dev/null || true

case "${RESTART_DOCKER:-0}" in
  1)
    echo "Restarting Docker..."
    sudo systemctl restart docker
    ;;
esac

echo "Building api image (--no-cache)..."
docker compose build api --no-cache

echo "Done. Recreate container: docker compose up -d --force-recreate api"
|
||||
@@ -332,10 +332,10 @@ def test_build_batch_main_command_uses_5_15_main_py(tmp_path: Path) -> None:
|
||||
config_path=tmp_path / "config.yaml",
|
||||
)
|
||||
|
||||
assert cmd[:3] == ["uv", "run", "python"]
|
||||
assert cmd[3:5] == ["-X", "faulthandler"]
|
||||
assert cmd[5].endswith("algorithm_subprocesses/5.15/main.py")
|
||||
assert cmd[6:] == ["--config", str(tmp_path / "config.yaml")]
|
||||
assert cmd[0].endswith("python") or cmd[0].endswith("python3")
|
||||
assert cmd[1:3] == ["-X", "faulthandler"]
|
||||
assert cmd[3].endswith("algorithm_subprocesses/5.15/main.py")
|
||||
assert cmd[4:] == ["--config", str(tmp_path / "config.yaml")]
|
||||
|
||||
|
||||
def test_batch_service_respects_reference_bundle_relative_env(
|
||||
|
||||
@@ -2,10 +2,15 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from app.services.video.rtsp_ffmpeg_opts import parse_rtsp_transport, rtsp_ffmpeg_input_opts
|
||||
from app.services.video.rtsp_segment_recorder import (
|
||||
RtspSegmentRecorder,
|
||||
_READY_MIN_OUTPUT_BYTES,
|
||||
_build_ffmpeg_cmd,
|
||||
rtsp_record_ffmpeg_scale_filter,
|
||||
)
|
||||
@@ -77,3 +82,35 @@ def test_build_ffmpeg_cmd_omits_timeout_when_zero(monkeypatch) -> None:
|
||||
)
|
||||
assert "-timeout" not in cmd
|
||||
assert "-stimeout" not in cmd
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_poll_ready_sets_event_without_stderr(tmp_path: Path) -> None:
    """Readiness is detected by polling the output file, not by reading stderr.

    With ``-loglevel warning`` ffmpeg produces no stderr, so ``_poll_ready`` must
    set the ready event once the slice file reaches ``_READY_MIN_OUTPUT_BYTES``.
    """
    output_path = tmp_path / "slice_0000.mp4"
    ready = asyncio.Event()
    recorder = RtspSegmentRecorder(
        surgery_id="100222",
        camera_id="or-cam-03",
        rtsp_url="rtsp://example/stream",
        output_dir=tmp_path,
        segment_duration_sec=120.0,
        segment_min_sec=1.0,
        on_segment_complete=lambda _e: asyncio.sleep(0),
        ready_event=ready,
    )
    # Stand-in for ffmpeg: a long-lived process whose stderr is discarded.
    proc = await asyncio.create_subprocess_exec(
        "sleep",
        "30",
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.DEVNULL,
    )
    poll = asyncio.create_task(recorder._poll_ready(output_path, proc))
    try:
        # Let the poller run once while the output file does not exist yet.
        await asyncio.sleep(0.05)
        assert not ready.is_set()
        output_path.write_bytes(b"x" * _READY_MIN_OUTPUT_BYTES)
        await asyncio.wait_for(ready.wait(), timeout=2.0)
    finally:
        # Always reap the poll task and the helper process, even when an
        # assertion or the wait_for timeout fails; otherwise ``sleep 30`` and a
        # pending task would outlive the test.
        poll.cancel()
        await asyncio.gather(poll, return_exceptions=True)
        if proc.returncode is None:
            proc.terminate()
        await proc.wait()
|
||||
|
||||
@@ -58,6 +58,48 @@ docker compose down
|
||||
docker compose down -v # 删除 PostgreSQL / MinIO 卷
|
||||
```
|
||||
|
||||
### 构建 API 镜像失败:`invalid tar header` / `unpigz: corrupted`
|
||||
|
||||
`uv sync` 已成功,但在 **exporting / unpacking** 阶段报错时,通常是 **Docker 本地层缓存或存储损坏**,与 Dockerfile 无关。
|
||||
|
||||
按顺序处理:
|
||||
|
||||
```bash
|
||||
cd backend
|
||||
chmod +x scripts/rebuild-api-image.sh
|
||||
|
||||
# 清缓存并重建(推荐)
|
||||
./scripts/rebuild-api-image.sh
|
||||
|
||||
# 仍失败时:重启 Docker 后再跑
|
||||
RESTART_DOCKER=1 ./scripts/rebuild-api-image.sh
|
||||
|
||||
# 再失败:改用旧版构建器(无 BuildKit)
|
||||
COMPOSE_DOCKER_CLI_BUILD=0 DOCKER_BUILDKIT=0 docker compose build api --no-cache
|
||||
docker compose up -d --force-recreate api
|
||||
```
|
||||
|
||||
手动等价步骤:`docker builder prune -af` → `docker rmi -f backend-api:latest` → `docker compose build api --no-cache`。
|
||||
|
||||
确认根分区剩余空间充足(建议 ≥ 20GB);空间不足时大层导出也容易损坏。
|
||||
|
||||
### RTSP 切片在宿主机无法用 VLC 打开
|
||||
|
||||
若 API 容器以 **root** 运行(旧版 Compose 配置未指定 `user` 时的情况),写入 `./logs` 的切片属主为 `root:root`。普通用户虽可用 `cat` 读取,但 **Snap 版 VLC** 等沙箱应用常会报 Permission denied。
|
||||
|
||||
在 `backend/.env` 中设置与宿主机一致的 UID/GID(见 `.env.example` 的 `HOST_UID` / `HOST_GID` / `DOCKER_GID`),然后重建 API 容器:
|
||||
|
||||
```bash
|
||||
cd backend
|
||||
docker compose up -d --force-recreate api
|
||||
```
|
||||
|
||||
**已有** root 属主的切片需一次性修正(可选):
|
||||
|
||||
```bash
|
||||
sudo chown -R "$(id -u):$(id -g)" backend/logs/rtsp_segments
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 三、手动启动客户端
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
- 落盘切片默认 **24 小时**后自动删除(`RTSP_SEGMENT_TTL_HOURS`;进程启动与后台定时 sweep)。
|
||||
- 设置 `RTSP_RECORD_ALL_CAMERAS=true` 可对请求中所有可解析 RTSP 的机位分别录像+跑 batch(多机位代码已预留)。
|
||||
- **同一机位同时只允许一场手术录制**(默认主摄 `RTSP_PRIMARY_CAMERA_ID`):另一场次开录同一 camera 时返回 `409` / `CAMERA_ALREADY_RECORDING`;注册表在拉起 ffmpeg 前即占用机位,避免双路 RTSP 抢流导致录像周期性丢帧。
|
||||
- **宿主机读切片**:Compose 中 `api` 使用 `HOST_UID`/`HOST_GID` 与宿主机对齐落盘权限;详见 [Docker部署.md](Docker部署.md)「RTSP 切片在宿主机无法用 VLC 打开」。
|
||||
|
||||
## Docker 与 RTSP 地址
|
||||
|
||||
|
||||
Reference in New Issue
Block a user