#!/usr/bin/env python3
"""
Split videos in a folder into <= N-minute chunks.

Behavior:
- For each video whose duration > max_minutes * 60, create segments:
    <stem>_1.<ext>, <stem>_2.<ext>, ...
- Outputs are written to the SAME folder as the input video.
- Original file is left untouched. By default we skip files that look
  already-split (e.g. *_12.mp4) to avoid re-splitting.

Requires:
- ffmpeg
- ffprobe

Example:
  python dataset/split_videos_into_minutes.py \
    --folder /home/ubuntu/data/fish/fish_action_videos \
    --max_minutes 1 \
    --recursive
"""

from __future__ import annotations

import argparse
import re
import shlex
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Sequence

# Our output naming is <stem>_<idx>.<ext>, starting at idx=1.
# The pattern is applied to a file *stem* (name without the extension).
SPLIT_SUFFIX_RE = re.compile(r"^(?P<base>.+)_(?P<idx>\d+)$")


@dataclass
class SplitResult:
    """Outcome of examining (and possibly splitting) one input video."""

    # Input video that was examined.
    path: Path
    # Duration reported by ffprobe, in seconds.
    duration_s: float
    # True when the video exceeded the limit (also True in dry-run mode).
    did_split: bool
    # Non-empty segment files found next to the input after splitting.
    created: List[Path]


def _which_or_die(bin_name: str) -> str:
    """Return the full path of ``bin_name`` from PATH.

    Raises:
        RuntimeError: if the binary cannot be found.
    """
    p = shutil.which(bin_name)
    if not p:
        raise RuntimeError(
            f"Required binary not found in PATH: {bin_name}\n"
            f"Install ffmpeg (which includes ffprobe), then re-run."
        )
    return p


def _run(cmd: Sequence[str]) -> None:
    """Run ``cmd``; raise ``subprocess.CalledProcessError`` on non-zero exit."""
    # Keep stdout/stderr inherited so user sees ffmpeg progress/errors.
    subprocess.run(cmd, check=True)


def probe_duration_seconds(ffprobe: str, video_path: Path) -> float:
    """Return the container duration of ``video_path`` in seconds via ffprobe.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
        RuntimeError: if ffprobe's output is not a number (e.g. ``N/A``).
    """
    cmd = [
        ffprobe,
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        str(video_path),
    ]
    out = subprocess.check_output(cmd).decode("utf-8", errors="replace").strip()
    try:
        return float(out)
    except ValueError as e:
        raise RuntimeError(f"Could not parse duration from ffprobe output for {video_path}: {out!r}") from e


def _find_segments(parent: Path, stem: str, suffix: str) -> List[Path]:
    """Return files in ``parent`` named exactly ``<stem>_<digits><suffix>``.

    The name is matched with an escaped regex rather than a glob, so glob
    metacharacters in ``stem`` (``[``, ``*``, ``?``) are taken literally and
    names such as ``<stem>_1_backup<suffix>`` are not picked up. Results are
    ordered by numeric index (``_2`` before ``_10``). An unreadable or missing
    directory yields an empty list.
    """
    name_re = re.compile(rf"{re.escape(stem)}_(\d+){re.escape(suffix)}")
    try:
        entries = list(parent.iterdir())
    except OSError:
        return []

    indexed = []
    for entry in entries:
        m = name_re.fullmatch(entry.name)
        if m and entry.is_file():
            indexed.append((int(m.group(1)), entry.name, entry))
    # Secondary key on the name keeps ordering deterministic for e.g. _1 vs _01.
    indexed.sort(key=lambda item: (item[0], item[1]))
    return [entry for _, _, entry in indexed]


def looks_already_split(p: Path) -> bool:
    """
    Avoid re-splitting files that look like our generated segments.

    Important: some original camera files may naturally end with _<digits>.
    We only treat as "already split" when we can see evidence of a family:
    - either the unsuffixed base file exists (base.ext), OR
    - another sibling segment exists (base_2.ext, base_3.ext, ...)
    """
    m = SPLIT_SUFFIX_RE.match(p.stem)
    if not m:
        return False

    base = m.group("base")
    if p.with_name(f"{base}{p.suffix}").exists():
        return True

    # If there are multiple segments with the same base (p itself counts as
    # one of them), treat as already-split.
    return len(_find_segments(p.parent, base, p.suffix)) >= 2


def iter_videos(folder: Path, recursive: bool, exts: Iterable[str]) -> List[Path]:
    """Return the sorted list of files under ``folder`` with a wanted extension.

    Extensions are compared case-insensitively and with or without a leading
    dot. When ``recursive`` is true, subfolders are searched as well.
    """
    wanted = {e.lower().lstrip(".") for e in exts}
    candidates = folder.rglob("*") if recursive else folder.glob("*")
    return sorted(
        p
        for p in candidates
        if p.is_file() and p.suffix.lower().lstrip(".") in wanted
    )


def split_video(
    ffmpeg: str,
    ffprobe: str,
    video_path: Path,
    max_seconds: int,
    *,
    reencode: bool,
    overwrite: bool,
    dry_run: bool,
) -> SplitResult:
    """Split ``video_path`` into segments of at most ``max_seconds`` seconds.

    Videos no longer than ``max_seconds`` are left alone (``did_split=False``).
    Otherwise ffmpeg's segment muxer writes ``<stem>_1<ext>``, ``<stem>_2<ext>``,
    ... next to the input. With ``dry_run`` the command is only printed and
    ``created`` is empty.

    Raises:
        subprocess.CalledProcessError: if ffprobe or ffmpeg exits non-zero.
        RuntimeError: if the duration cannot be parsed.
    """
    duration_s = probe_duration_seconds(ffprobe, video_path)
    # Small tolerance so a video that is "exactly" max_seconds is not split.
    if duration_s <= float(max_seconds) + 1e-3:
        return SplitResult(path=video_path, duration_s=duration_s, did_split=False, created=[])

    out_pattern = str(video_path.with_name(f"{video_path.stem}_%d{video_path.suffix}"))

    cmd: List[str] = [
        ffmpeg,
        "-hide_banner",
        "-loglevel",
        "error",
        "-stats",
    ]
    cmd += ["-y"] if overwrite else ["-n"]

    # fflags/genpts helps with some files that have broken or missing timestamps.
    cmd += ["-fflags", "+genpts", "-i", str(video_path)]

    # Keep video and audio if present.
    cmd += ["-map", "0:v:0?", "-map", "0:a:0?"]

    if reencode:
        # More reliable segment boundaries (not limited to keyframes), but slower and lossy.
        # Works fine for typical "video-only" MP4s; if audio exists it will be re-encoded too.
        # Force regular keyframes so segment cuts are accurate at max_seconds boundaries.
        cmd += [
            "-c:v",
            "libx264",
            "-preset",
            "veryfast",
            "-crf",
            "23",
            "-g",
            str(int(max_seconds * 30)),  # rough GOP cap; okay if fps differs
            "-keyint_min",
            str(int(max_seconds * 30)),
            "-sc_threshold",
            "0",
            "-force_key_frames",
            f"expr:gte(t,n_forced*{int(max_seconds)})",
            "-c:a",
            "aac",
            "-b:a",
            "128k",
        ]
    else:
        # Fastest, but can split only on keyframes depending on container/codecs.
        cmd += ["-c", "copy", "-avoid_negative_ts", "make_zero"]

    cmd += [
        "-f",
        "segment",
        "-segment_time",
        str(int(max_seconds)),
        "-reset_timestamps",
        "1",
        "-segment_start_number",
        "1",
        out_pattern,
    ]

    if dry_run:
        # shlex.join quotes each argument so the printed command can be pasted
        # into a shell even when paths contain spaces or arguments contain
        # characters such as '?' or parentheses.
        print("[DRY RUN]", shlex.join(cmd))
        return SplitResult(path=video_path, duration_s=duration_s, did_split=True, created=[])

    # Run ffmpeg; it will create <stem>_1<ext>, <stem>_2<ext>, ...
    _run(cmd)

    segments = _find_segments(video_path.parent, video_path.stem, video_path.suffix)

    # Filter/delete empty outputs (can happen with stream-copy + weird timestamps).
    created: List[Path] = []
    for seg in segments:
        try:
            if seg.stat().st_size <= 0:
                seg.unlink(missing_ok=True)
                continue
        except FileNotFoundError:
            continue
        created.append(seg)

    return SplitResult(path=video_path, duration_s=duration_s, did_split=True, created=created)


def main() -> None:
    """Command-line entry point: parse arguments and split every matching video.

    A failure on one video is reported and processing continues with the next.

    Raises:
        RuntimeError: if ffmpeg or ffprobe is not on PATH.
        FileNotFoundError: if ``--folder`` is not a directory.
        ValueError: if ``--max_minutes`` rounds to zero seconds or less.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--folder", type=str, required=True, help="Folder containing videos to split")
    parser.add_argument("--max_minutes", type=float, default=1.0, help="Max segment length in minutes (default: 1)")
    parser.add_argument(
        "--exts",
        type=str,
        default="mp4,mov,mkv,avi,webm",
        help="Comma-separated list of video extensions to include (default: mp4,mov,mkv,avi,webm)",
    )
    parser.add_argument("--recursive", action="store_true", help="Recurse into subfolders")
    parser.add_argument(
        "--include_already_split",
        action="store_true",
        help="Also process files that already look like _12.ext (default: skip them).",
    )
    parser.add_argument(
        "--reencode",
        action="store_true",
        help="Re-encode instead of stream copy (slower, but more reliable split boundaries).",
    )
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing output segments")
    parser.add_argument("--dry_run", action="store_true", help="Print commands without executing")
    args = parser.parse_args()

    ffmpeg = _which_or_die("ffmpeg")
    ffprobe = _which_or_die("ffprobe")

    folder = Path(args.folder).expanduser().resolve()
    if not folder.is_dir():
        raise FileNotFoundError(f"Folder not found: {folder}")

    exts = [e.strip() for e in str(args.exts).split(",") if e.strip()]
    max_seconds = int(round(float(args.max_minutes) * 60.0))
    if max_seconds <= 0:
        raise ValueError("--max_minutes must be > 0")

    videos = iter_videos(folder, recursive=bool(args.recursive), exts=exts)
    if not args.include_already_split:
        videos = [p for p in videos if not looks_already_split(p)]

    print(f"Found {len(videos)} video(s) in {folder} (recursive={bool(args.recursive)})")
    print(f"Splitting any video longer than {max_seconds}s into {max_seconds}s segments")
    print(f"Mode: {'re-encode' if args.reencode else 'stream-copy'} | overwrite={bool(args.overwrite)} | dry_run={bool(args.dry_run)}")

    n_split = 0
    n_skipped = 0
    n_failed = 0
    total_created = 0

    for p in videos:
        try:
            res = split_video(
                ffmpeg,
                ffprobe,
                p,
                max_seconds=max_seconds,
                reencode=bool(args.reencode),
                overwrite=bool(args.overwrite),
                dry_run=bool(args.dry_run),
            )
        except subprocess.CalledProcessError as e:
            # Both ffprobe and ffmpeg can raise this; name the one that failed.
            if isinstance(e.cmd, (list, tuple)) and e.cmd:
                tool = Path(str(e.cmd[0])).name
            else:
                tool = "command"
            print(f"[ERROR] {tool} failed for {p}: {e}")
            n_failed += 1
            continue
        except Exception as e:
            # Best-effort batch: report and move on to the next video.
            print(f"[ERROR] {p}: {e}")
            n_failed += 1
            continue

        if res.did_split:
            n_split += 1
            total_created += len(res.created)
            if not args.dry_run:
                print(f"[SPLIT] {p.name} ({res.duration_s:.1f}s) -> {len(res.created)} segment(s)")
        else:
            n_skipped += 1

    print(f"Done. Split {n_split} video(s), skipped {n_skipped} short video(s), created {total_created} segment file(s).")
    if n_failed:
        print(f"{n_failed} video(s) failed; see [ERROR] lines above.")


if __name__ == "__main__":
    main()