Files
FishServer/FishAction/dataset/split_videos_into_minutes.py
2026-04-08 19:32:23 +08:00

293 lines
9.0 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Split videos in a folder into <= N-minute chunks.
Behavior:
- For each video whose duration > max_minutes * 60, create segments:
<stem>_1.<ext>, <stem>_2.<ext>, ...
- Outputs are written to the SAME folder as the input video.
- Original file is left untouched.
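By default, files that look like previously generated segments (e.g. *_12.mp4 when the
unsuffixed original or another numbered sibling exists in the same folder) are skipped
to avoid re-splitting them; pass --include_already_split to process them anyway.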
Requires:
- ffmpeg
- ffprobe
Example:
python dataset/split_videos_into_minutes.py \
--folder /home/ubuntu/data/fish/fish_action_videos \
--max_minutes 1 \
--recursive
"""
from __future__ import annotations
import argparse
import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Sequence
# Segments written by this script are named <stem>_<idx>.<ext>, with idx starting at 1.
# The pattern is applied to a file stem (name without extension): "base" captures
# everything before the final "_<digits>" and "idx" captures those trailing digits.
SPLIT_SUFFIX_RE = re.compile(r"^(?P<base>.+)_(?P<idx>\d+)$")
@dataclass
class SplitResult:
    """Outcome of examining (and possibly splitting) a single input video.

    Attributes:
        path: The input video that was examined.
        duration_s: Duration of the input in seconds, as reported by ffprobe.
        did_split: True when the video exceeded the limit and a split was
            performed (or would have been, in dry-run mode).
        created: Segment files found next to the input after splitting; empty
            when nothing was split or when running in dry-run mode.
    """

    path: Path
    duration_s: float
    did_split: bool
    created: List[Path]
def _which_or_die(bin_name: str) -> str:
    """Locate an executable on PATH or fail loudly.

    Args:
        bin_name: Name of the executable to look up.

    Returns:
        The path reported by ``shutil.which`` for ``bin_name``.

    Raises:
        RuntimeError: If ``shutil.which`` cannot find the executable.
    """
    resolved = shutil.which(bin_name)
    if resolved:
        return resolved
    raise RuntimeError(
        f"Required binary not found in PATH: {bin_name}\n"
        f"Install ffmpeg (which includes ffprobe), then re-run."
    )
def _run(cmd: Sequence[str]) -> None:
    """Execute ``cmd`` as a child process and fail if it exits non-zero.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list.

    Raises:
        subprocess.CalledProcessError: If the process returns a non-zero code.
    """
    # stdout/stderr are deliberately not captured: the child inherits them so
    # ffmpeg's progress and error output reach the user's terminal directly.
    completed = subprocess.run(cmd)
    completed.check_returncode()
def probe_duration_seconds(ffprobe: str, video_path: Path) -> float:
    """Ask ffprobe for the container-level duration of a video.

    Args:
        ffprobe: Path to the ffprobe executable.
        video_path: Video file to inspect.

    Returns:
        The duration in seconds, parsed from ffprobe's output.

    Raises:
        RuntimeError: If ffprobe's output cannot be parsed as a float.
        subprocess.CalledProcessError: If ffprobe exits with a non-zero code.
    """
    # Print only the bare value of format.duration: no section wrappers, no key.
    probe_args = [
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
    ]
    raw = subprocess.check_output([ffprobe, *probe_args, str(video_path)])
    text = raw.decode("utf-8", errors="replace").strip()
    try:
        seconds = float(text)
    except ValueError as e:
        raise RuntimeError(f"Could not parse duration from ffprobe output for {video_path}: {text!r}") from e
    return seconds
def looks_already_split(p: Path) -> bool:
    """Return True when ``p`` looks like a segment generated by this script.

    Some original camera files naturally end with ``_<digits>``, so a numeric
    suffix alone is not enough. A file is treated as "already split" only when
    there is evidence of a segment family in the same directory:

    - the unsuffixed base file exists (``base.ext``), OR
    - another sibling segment exists (``base_2.ext``, ``base_3.ext``, ...).

    Args:
        p: Candidate video path.

    Returns:
        True if ``p`` should be skipped as a previously generated segment.
    """
    m = SPLIT_SUFFIX_RE.match(p.stem)
    if not m:
        return False
    base = m.group("base")
    if p.with_name(f"{base}{p.suffix}").exists():
        return True

    # Match siblings by exact name instead of a glob: the base may contain glob
    # metacharacters ("[", "]", "*", "?"), and a pattern like "base_[0-9]*.ext"
    # would also accept unrelated names such as "base_1_backup.ext".
    sibling_re = re.compile(rf"{re.escape(base)}_[0-9]+{re.escape(p.suffix)}")
    try:
        entries = list(p.parent.iterdir())
    except OSError:
        # Unreadable or missing directory: no evidence of a segment family.
        return False
    return any(
        entry.name != p.name and sibling_re.fullmatch(entry.name) is not None
        for entry in entries
    )
def iter_videos(folder: Path, recursive: bool, exts: Iterable[str]) -> List[Path]:
    """Collect the video files under ``folder`` in sorted order.

    Args:
        folder: Directory to scan.
        recursive: When True, descend into subdirectories as well.
        exts: Accepted extensions; compared case-insensitively, with or
            without a leading dot.

    Returns:
        Sorted list of regular files whose extension is in ``exts``.
    """
    wanted = {ext.lower().lstrip(".") for ext in exts}
    candidates = folder.rglob("*") if recursive else folder.glob("*")
    matches = [
        entry
        for entry in candidates
        if entry.is_file() and entry.suffix.lower().lstrip(".") in wanted
    ]
    matches.sort()
    return matches
def split_video(
    ffmpeg: str,
    ffprobe: str,
    video_path: Path,
    max_seconds: int,
    *,
    reencode: bool,
    overwrite: bool,
    dry_run: bool,
) -> SplitResult:
    """Split one video into segments of at most ``max_seconds`` seconds.

    Segments are written next to the input as ``<stem>_1<ext>``,
    ``<stem>_2<ext>``, ... using ffmpeg's segment muxer. The input file itself
    is not modified.

    Args:
        ffmpeg: Path to the ffmpeg executable.
        ffprobe: Path to the ffprobe executable.
        video_path: Video to split.
        max_seconds: Maximum segment length in seconds.
        reencode: Re-encode (libx264/aac) with forced keyframes instead of
            stream-copying; slower and lossy, but cuts land on the boundaries.
        overwrite: Pass ``-y`` to ffmpeg when True, ``-n`` otherwise.
        dry_run: Print the ffmpeg command instead of running it.

    Returns:
        A ``SplitResult``. ``did_split`` is False when the video is not longer
        than ``max_seconds`` (1 ms tolerance). In dry-run mode ``did_split`` is
        True and ``created`` is empty. Otherwise ``created`` lists every
        non-empty ``<stem>_<digits><ext>`` file present next to the input after
        ffmpeg finished, ordered by segment number.

    Raises:
        RuntimeError: If the duration cannot be parsed from ffprobe's output.
        subprocess.CalledProcessError: If ffprobe or ffmpeg exits non-zero.
    """
    import shlex  # function-scope import: only the dry-run display needs it

    duration_s = probe_duration_seconds(ffprobe, video_path)
    if duration_s <= float(max_seconds) + 1e-3:
        return SplitResult(path=video_path, duration_s=duration_s, did_split=False, created=[])

    stem = video_path.stem
    suffix = video_path.suffix

    # The segment muxer expands printf-style "%d" in the output name, so every
    # literal "%" in the directory, stem or suffix must be doubled; otherwise a
    # file such as "50%.mp4" yields an invalid pattern.
    out_pattern = (
        str(video_path.with_name(f"{stem}_")).replace("%", "%%")
        + "%d"
        + suffix.replace("%", "%%")
    )

    cmd: List[str] = [
        ffmpeg,
        "-hide_banner",
        "-loglevel",
        "error",
        "-stats",
    ]
    # NOTE(review): confirm that ffmpeg applies -n/-y to files produced by the
    # segment muxer; if it does not, existing segments may be overwritten even
    # with overwrite=False.
    cmd += ["-y"] if overwrite else ["-n"]

    # fflags/genpts helps with some files that have broken or missing timestamps.
    cmd += ["-fflags", "+genpts", "-i", str(video_path)]

    # Keep the first video and first audio stream; "?" makes each optional.
    cmd += ["-map", "0:v:0?", "-map", "0:a:0?"]

    if reencode:
        # More reliable segment boundaries (not limited to existing keyframes),
        # but slower and lossy. Audio, if present, is re-encoded too.
        # Keyframes are forced every max_seconds so cuts land on the boundary.
        gop = str(int(max_seconds * 30))  # rough GOP cap; okay if fps differs
        cmd += [
            "-c:v",
            "libx264",
            "-preset",
            "veryfast",
            "-crf",
            "23",
            "-g",
            gop,
            "-keyint_min",
            gop,
            "-sc_threshold",
            "0",
            "-force_key_frames",
            f"expr:gte(t,n_forced*{int(max_seconds)})",
            "-c:a",
            "aac",
            "-b:a",
            "128k",
        ]
    else:
        # Fastest, but can split only on keyframes depending on container/codecs.
        cmd += ["-c", "copy", "-avoid_negative_ts", "make_zero"]

    cmd += [
        "-f",
        "segment",
        "-segment_time",
        str(int(max_seconds)),
        "-reset_timestamps",
        "1",
        "-segment_start_number",
        "1",
        out_pattern,
    ]

    if dry_run:
        # shlex.join quotes arguments so the printed line can be pasted into a
        # shell even when paths contain spaces or other special characters.
        print("[DRY RUN]", shlex.join(cmd))
        return SplitResult(path=video_path, duration_s=duration_s, did_split=True, created=[])

    # Run ffmpeg; it will create <stem>_1.ext, <stem>_2.ext, ...
    _run(cmd)

    # Collect segments by exact name match rather than a glob: the stem may
    # contain glob metacharacters, and "stem_[0-9]*" would also pick up
    # unrelated files such as "stem_1_backup.ext".
    segment_re = re.compile(rf"{re.escape(stem)}_([0-9]+){re.escape(suffix)}")
    indexed = []
    for entry in video_path.parent.iterdir():
        hit = segment_re.fullmatch(entry.name)
        if hit:
            indexed.append((int(hit.group(1)), entry.name, entry))
    # Numeric order, so that _2 comes before _10; the name breaks ties.
    indexed.sort(key=lambda item: item[:2])

    # Filter/delete empty outputs (can happen with stream-copy + weird timestamps).
    created: List[Path] = []
    for _, _, segment in indexed:
        try:
            if segment.stat().st_size <= 0:
                segment.unlink(missing_ok=True)
                continue
        except FileNotFoundError:
            continue
        created.append(segment)

    return SplitResult(path=video_path, duration_s=duration_s, did_split=True, created=created)
def main() -> None:
    """Command-line entry point: split every over-long video in a folder.

    Parses the arguments, locates ffmpeg/ffprobe, gathers the candidate videos
    and calls ``split_video`` on each one. A failure on one video is reported
    and does not stop the run; the number of failures is included in the final
    summary.

    Raises:
        RuntimeError: If ffmpeg or ffprobe is not found on PATH.
        FileNotFoundError: If ``--folder`` is not an existing directory.
        ValueError: If ``--max_minutes`` rounds to zero seconds or less.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--folder", type=str, required=True, help="Folder containing videos to split")
    parser.add_argument("--max_minutes", type=float, default=1.0, help="Max segment length in minutes (default: 1)")
    parser.add_argument(
        "--exts",
        type=str,
        default="mp4,mov,mkv,avi,webm",
        help="Comma-separated list of video extensions to include (default: mp4,mov,mkv,avi,webm)",
    )
    parser.add_argument("--recursive", action="store_true", help="Recurse into subfolders")
    parser.add_argument(
        "--include_already_split",
        action="store_true",
        help="Also process files that already look like <name>_12.ext (default: skip them).",
    )
    parser.add_argument(
        "--reencode",
        action="store_true",
        help="Re-encode instead of stream copy (slower, but more reliable split boundaries).",
    )
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing output segments")
    parser.add_argument("--dry_run", action="store_true", help="Print commands without executing")
    args = parser.parse_args()

    ffmpeg = _which_or_die("ffmpeg")
    ffprobe = _which_or_die("ffprobe")

    folder = Path(args.folder).expanduser().resolve()
    if not folder.is_dir():
        raise FileNotFoundError(f"Folder not found: {folder}")

    exts = [e.strip() for e in str(args.exts).split(",") if e.strip()]
    # ffmpeg's -segment_time is given in whole seconds here, so round first.
    max_seconds = int(round(float(args.max_minutes) * 60.0))
    if max_seconds <= 0:
        raise ValueError("--max_minutes must be > 0")

    videos = iter_videos(folder, recursive=bool(args.recursive), exts=exts)
    if not args.include_already_split:
        videos = [p for p in videos if not looks_already_split(p)]

    print(f"Found {len(videos)} video(s) in {folder} (recursive={bool(args.recursive)})")
    print(f"Splitting any video longer than {max_seconds}s into {max_seconds}s segments")
    print(f"Mode: {'re-encode' if args.reencode else 'stream-copy'} | overwrite={bool(args.overwrite)} | dry_run={bool(args.dry_run)}")

    n_split = 0
    n_skipped = 0
    n_failed = 0
    total_created = 0
    for p in videos:
        try:
            res = split_video(
                ffmpeg,
                ffprobe,
                p,
                max_seconds=max_seconds,
                reencode=bool(args.reencode),
                overwrite=bool(args.overwrite),
                dry_run=bool(args.dry_run),
            )
        except subprocess.CalledProcessError as e:
            print(f"[ERROR] ffmpeg failed for {p}: {e}")
            n_failed += 1
            continue
        except Exception as e:
            # Best-effort batch run: report the problem and move on to the next file.
            print(f"[ERROR] {p}: {e}")
            n_failed += 1
            continue

        if res.did_split:
            n_split += 1
            total_created += len(res.created)
            if not args.dry_run:
                print(f"[SPLIT] {p.name} ({res.duration_s:.1f}s) -> {len(res.created)} segment(s)")
        else:
            n_skipped += 1

    print(f"Done. Split {n_split} video(s), skipped {n_skipped} short video(s), created {total_created} segment file(s).")
    if n_failed:
        # Failed videos were previously counted nowhere in the summary.
        print(f"[WARN] {n_failed} video(s) failed; see the [ERROR] lines above.")


if __name__ == "__main__":
    main()