293 lines
9.0 KiB
Python
Executable File
293 lines
9.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Split videos in a folder into <= N-minute chunks.
|
|
|
|
Behavior:
|
|
- For each video whose duration > max_minutes * 60, create segments:
|
|
<stem>_1.<ext>, <stem>_2.<ext>, ...
|
|
- Outputs are written to the SAME folder as the input video.
|
|
- Original file is left untouched.
|
|
|
|
By default we skip files that look already-split (e.g. *_12.mp4) to avoid re-splitting.
|
|
|
|
Requires:
|
|
- ffmpeg
|
|
- ffprobe
|
|
|
|
Example:
|
|
python dataset/split_videos_into_minutes.py \
|
|
--folder /home/ubuntu/data/fish/fish_action_videos \
|
|
--max_minutes 1 \
|
|
--recursive
|
|
"""
|
|
|
|
from __future__ import annotations

import argparse
import re
import shlex
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Sequence
|
|
|
|
|
|
# Generated segments are named <stem>_<idx>.<ext>, with idx counting up from 1.
# This pattern is applied to a file *stem* (no extension): "base" is everything
# before the final underscore and "idx" is the trailing run of digits.
SPLIT_SUFFIX_RE = re.compile(r"^(?P<base>.+)_(?P<idx>\d+)$")
|
|
|
|
|
|
@dataclass
class SplitResult:
    """Outcome of examining (and possibly splitting) one input video."""

    # The input video that was examined.
    path: Path
    # Duration of the input in seconds, as reported by ffprobe.
    duration_s: float
    # True when the input exceeded the length limit and a split was attempted
    # (also set for dry runs); False when the input was left alone.
    did_split: bool
    # Segment files found on disk after splitting; empty for dry runs and
    # for inputs that were not split.
    created: List[Path]
|
|
|
|
|
|
def _which_or_die(bin_name: str) -> str:
    """Return the path of ``bin_name`` as resolved by ``shutil.which``.

    Raises:
        RuntimeError: if the executable cannot be found on PATH.
    """
    resolved = shutil.which(bin_name)
    if resolved:
        return resolved

    message = (
        f"Required binary not found in PATH: {bin_name}\n"
        "Install ffmpeg (which includes ffprobe), then re-run."
    )
    raise RuntimeError(message)
|
|
|
|
|
|
def _run(cmd: Sequence[str]) -> None:
    """Execute ``cmd`` and wait for it to finish.

    The child inherits this process's stdout/stderr, so ffmpeg progress and
    error output reach the user directly.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero status.
    """
    subprocess.check_call(cmd)
|
|
|
|
|
|
def probe_duration_seconds(ffprobe: str, video_path: Path) -> float:
    """Ask ffprobe for the container-level duration of ``video_path``, in seconds.

    Args:
        ffprobe: Path to (or name of) the ffprobe executable.
        video_path: The media file to inspect.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
        RuntimeError: if ffprobe's output cannot be parsed as a float.
    """
    # Request only format.duration, printed as a bare value (no section
    # wrapper, no "duration=" key) so the output can be fed straight to float().
    probe_options = (
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
    )
    completed = subprocess.run(
        [ffprobe, *probe_options, str(video_path)],
        stdout=subprocess.PIPE,
        check=True,
    )
    raw = completed.stdout.decode("utf-8", errors="replace").strip()

    try:
        duration = float(raw)
    except ValueError as e:
        raise RuntimeError(f"Could not parse duration from ffprobe output for {video_path}: {raw!r}") from e
    return duration
|
|
|
|
|
|
def looks_already_split(p: Path) -> bool:
    """Return True when ``p`` looks like a segment generated by this tool.

    Important: some original camera files may naturally end with _<digits>.
    A file named <base>_<idx><ext> is only treated as "already split" when
    there is evidence of a family:
      - either the unsuffixed base file exists (<base><ext>), OR
      - at least two files named <base>_<digits><ext> exist in the same
        folder (``p`` itself counts as one of them when it exists).

    Known limitation: numbered originals that share a prefix (for example
    CAM_0001.mp4 next to CAM_0002.mp4) satisfy the second rule and are
    therefore reported as already split.

    Args:
        p: Candidate video file.

    Returns:
        True if ``p`` should be skipped as an existing segment, else False.
    """
    m = SPLIT_SUFFIX_RE.match(p.stem)
    if not m:
        return False

    base = m.group("base")
    if p.with_name(f"{base}{p.suffix}").exists():
        return True

    # Scan the folder and match each stem against the same regex instead of
    # building a glob from the file name. A glob would (a) misread names that
    # contain glob metacharacters such as "[" or "*", and (b) with a pattern
    # like "<base>_[0-9]*<ext>" also match unrelated names such as
    # "<base>_1_backup<ext>" or segments of a different base ("<base>_1_2<ext>").
    family_size = 0
    try:
        for entry in p.parent.iterdir():
            if entry.suffix != p.suffix:
                continue
            sibling = SPLIT_SUFFIX_RE.match(entry.stem)
            if sibling and sibling.group("base") == base:
                family_size += 1
                if family_size >= 2:
                    return True
    except OSError:
        # Parent folder missing or unreadable: no evidence of a family.
        return False
    return False
|
|
|
|
|
|
def iter_videos(folder: Path, recursive: bool, exts: Iterable[str]) -> List[Path]:
    """Collect the regular files under ``folder`` whose extension is in ``exts``.

    Extensions are compared case-insensitively and with leading dots ignored,
    so "mp4", ".mp4" and "MP4" are equivalent.

    Args:
        folder: Directory to scan.
        recursive: When true, descend into subfolders as well.
        exts: Accepted file extensions.

    Returns:
        The matching paths, sorted.
    """
    wanted = {ext.lower().lstrip(".") for ext in exts}
    candidates = folder.rglob("*") if recursive else folder.glob("*")
    return sorted(
        entry
        for entry in candidates
        if entry.is_file() and entry.suffix.lower().lstrip(".") in wanted
    )
|
|
|
|
|
|
def _find_segments(video_path: Path) -> List[Path]:
    """List existing ``<stem>_<idx><ext>`` files next to ``video_path``, ordered by idx."""
    # Match names exactly rather than globbing: a glob built from the stem
    # would misread metacharacters in the name and "_[0-9]*" would also pick up
    # unrelated files such as "<stem>_1_backup<ext>".
    name_re = re.compile(re.escape(video_path.stem) + r"_([0-9]+)" + re.escape(video_path.suffix))
    indexed = []
    for entry in video_path.parent.iterdir():
        m = name_re.fullmatch(entry.name)
        if m and entry.is_file():
            indexed.append((int(m.group(1)), entry.name, entry))
    # Numeric order, so that _10 follows _9 instead of sorting before _2.
    indexed.sort(key=lambda item: item[:2])
    return [entry for _, _, entry in indexed]


def split_video(
    ffmpeg: str,
    ffprobe: str,
    video_path: Path,
    max_seconds: int,
    *,
    reencode: bool,
    overwrite: bool,
    dry_run: bool,
) -> SplitResult:
    """Split ``video_path`` into segments of at most ``max_seconds`` seconds.

    Segments are written next to the input as <stem>_1<ext>, <stem>_2<ext>, ...
    The input file itself is not modified. Videos whose duration does not
    exceed ``max_seconds`` (with a 1 ms tolerance) are left alone.

    Args:
        ffmpeg: Path to the ffmpeg executable.
        ffprobe: Path to the ffprobe executable.
        video_path: Input video.
        max_seconds: Maximum segment length in seconds.
        reencode: Re-encode (libx264 + AAC) with keyframes forced at segment
            boundaries instead of stream-copying.
        overwrite: Allow existing segment files to be replaced.
        dry_run: Print the ffmpeg command instead of running it.

    Returns:
        A SplitResult. ``created`` lists the non-empty segment files present
        after the run in index order; it is empty for dry runs and for videos
        that were not split. Segment files left over from an earlier run are
        not removed and therefore also appear in ``created``.

    Raises:
        FileExistsError: if ``overwrite`` is false and segment files for this
            video already exist.
        subprocess.CalledProcessError: if ffprobe or ffmpeg exits non-zero.
        RuntimeError: if the duration cannot be determined.
    """
    duration_s = probe_duration_seconds(ffprobe, video_path)
    if duration_s <= float(max_seconds) + 1e-3:
        return SplitResult(path=video_path, duration_s=duration_s, did_split=False, created=[])

    # The output name is a printf-style template for the segment muxer, so a
    # literal '%' anywhere in the path has to be doubled to survive expansion.
    prefix = str(video_path.with_name(f"{video_path.stem}_")).replace("%", "%%")
    out_pattern = f"{prefix}%d{video_path.suffix.replace('%', '%%')}"

    cmd: List[str] = [ffmpeg, "-hide_banner", "-loglevel", "error", "-stats"]
    cmd += ["-y"] if overwrite else ["-n"]

    # fflags/genpts helps with some files that have broken or missing timestamps.
    cmd += ["-fflags", "+genpts", "-i", str(video_path)]
    # Keep the first video and first audio stream; "?" makes each optional.
    cmd += ["-map", "0:v:0?", "-map", "0:a:0?"]

    if reencode:
        # More reliable segment boundaries (not limited to existing keyframes),
        # but slower and lossy. Audio, if present, is re-encoded too.
        gop = str(int(max_seconds * 30))  # rough GOP cap; okay if fps differs
        cmd += [
            "-c:v",
            "libx264",
            "-preset",
            "veryfast",
            "-crf",
            "23",
            "-g",
            gop,
            "-keyint_min",
            gop,
            "-sc_threshold",
            "0",
            # Force a keyframe every max_seconds so cuts land on the boundary.
            "-force_key_frames",
            f"expr:gte(t,n_forced*{int(max_seconds)})",
            "-c:a",
            "aac",
            "-b:a",
            "128k",
        ]
    else:
        # Fastest, but can split only on keyframes depending on container/codecs.
        cmd += ["-c", "copy", "-avoid_negative_ts", "make_zero"]

    cmd += [
        "-f",
        "segment",
        "-segment_time",
        str(int(max_seconds)),
        "-reset_timestamps",
        "1",
        "-segment_start_number",
        "1",
        out_pattern,
    ]

    if dry_run:
        # shlex.join quotes each argument so the printed command can be pasted
        # into a shell even when paths contain spaces or parentheses.
        print("[DRY RUN]", shlex.join(cmd))
        return SplitResult(path=video_path, duration_s=duration_s, did_split=True, created=[])

    if not overwrite:
        # Do not rely on ffmpeg's -n here: its existence check appears to be
        # made against the literal template name (containing "%d"), not the
        # individual segment files, so existing segments could still be
        # replaced. Refuse up front instead.
        existing = _find_segments(video_path)
        if existing:
            raise FileExistsError(
                f"{len(existing)} segment file(s) already exist for {video_path.name} "
                f"(e.g. {existing[0].name}); re-run with --overwrite to replace them"
            )

    # Run ffmpeg; it will create <stem>_1.ext, <stem>_2.ext, ...
    _run(cmd)

    # Drop empty outputs (can happen with stream-copy + weird timestamps).
    created: List[Path] = []
    for segment in _find_segments(video_path):
        try:
            if segment.stat().st_size <= 0:
                segment.unlink(missing_ok=True)
                continue
        except FileNotFoundError:
            # Vanished between listing and stat; nothing to report.
            continue
        created.append(segment)

    return SplitResult(path=video_path, duration_s=duration_s, did_split=True, created=created)
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: split every over-long video in a folder.

    Parses the command line, locates ffmpeg/ffprobe, gathers the candidate
    videos and splits each one that is longer than ``--max_minutes``. A
    failure on one video is reported and does not stop the remaining videos
    from being processed.

    Raises:
        RuntimeError: if ffmpeg or ffprobe is not on PATH.
        FileNotFoundError: if ``--folder`` is not an existing directory.
        ValueError: if ``--max_minutes`` rounds to zero seconds or less.
        SystemExit: with status 1 after the summary when at least one video
            could not be processed, so callers and shell pipelines can detect
            partial failure.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--folder", type=str, required=True, help="Folder containing videos to split")
    parser.add_argument("--max_minutes", type=float, default=1.0, help="Max segment length in minutes (default: 1)")
    parser.add_argument(
        "--exts",
        type=str,
        default="mp4,mov,mkv,avi,webm",
        help="Comma-separated list of video extensions to include (default: mp4,mov,mkv,avi,webm)",
    )
    parser.add_argument("--recursive", action="store_true", help="Recurse into subfolders")
    parser.add_argument(
        "--include_already_split",
        action="store_true",
        help="Also process files that already look like <name>_12.ext (default: skip them).",
    )
    parser.add_argument(
        "--reencode",
        action="store_true",
        help="Re-encode instead of stream copy (slower, but more reliable split boundaries).",
    )
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing output segments")
    parser.add_argument("--dry_run", action="store_true", help="Print commands without executing")
    args = parser.parse_args()

    ffmpeg = _which_or_die("ffmpeg")
    ffprobe = _which_or_die("ffprobe")

    folder = Path(args.folder).expanduser().resolve()
    if not folder.is_dir():
        raise FileNotFoundError(f"Folder not found: {folder}")

    exts = [e.strip() for e in str(args.exts).split(",") if e.strip()]
    # ffmpeg is given whole seconds, so round the requested limit once here.
    max_seconds = int(round(float(args.max_minutes) * 60.0))
    if max_seconds <= 0:
        raise ValueError("--max_minutes must be > 0")

    # The list is built before any splitting, so segments created during this
    # run are never picked up as inputs.
    videos = iter_videos(folder, recursive=bool(args.recursive), exts=exts)
    if not args.include_already_split:
        videos = [p for p in videos if not looks_already_split(p)]

    print(f"Found {len(videos)} video(s) in {folder} (recursive={bool(args.recursive)})")
    print(f"Splitting any video longer than {max_seconds}s into {max_seconds}s segments")
    print(f"Mode: {'re-encode' if args.reencode else 'stream-copy'} | overwrite={bool(args.overwrite)} | dry_run={bool(args.dry_run)}")

    n_split = 0
    n_skipped = 0
    n_failed = 0
    total_created = 0
    for p in videos:
        try:
            res = split_video(
                ffmpeg,
                ffprobe,
                p,
                max_seconds=max_seconds,
                reencode=bool(args.reencode),
                overwrite=bool(args.overwrite),
                dry_run=bool(args.dry_run),
            )
        except subprocess.CalledProcessError as e:
            # Either tool may have failed; the exception text names the command.
            print(f"[ERROR] ffmpeg/ffprobe failed for {p}: {e}")
            n_failed += 1
            continue
        except Exception as e:
            # Best effort: keep going so one bad file does not block the rest.
            print(f"[ERROR] {p}: {e}")
            n_failed += 1
            continue

        if res.did_split:
            n_split += 1
            total_created += len(res.created)
            if not args.dry_run:
                print(f"[SPLIT] {p.name} ({res.duration_s:.1f}s) -> {len(res.created)} segment(s)")
        else:
            n_skipped += 1

    print(f"Done. Split {n_split} video(s), skipped {n_skipped} short video(s), created {total_created} segment file(s).")
    if n_failed:
        print(f"[ERROR] {n_failed} video(s) could not be processed; see the messages above.")
        raise SystemExit(1)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|
|