# FishAction/test_pytorchvideo_x3d_3class.py

#!/usr/bin/env python3
"""
Evaluate a trained PyTorchVideo X3D checkpoint as a 3-class classifier.

This script loads a `checkpoint_best.pt` produced by `train_pytorchvideo_x3d.py` and runs
inference on a CSV split (val.csv or test.csv), then reports 3-class accuracy and a
3x3 confusion matrix.

CSV format (whitespace-separated):
    relative/path/to/video.mp4 <label_int>

Example:
  python test_pytorchvideo_x3d_3class.py \
    --checkpoint /home/ubuntu/projects/FishAction/checkpoints/ptv_x3d_m/checkpoint_best.pt \
    --csv_dir /home/ubuntu/data/fish/fish_action_videos \
    --path_prefix /home/ubuntu/data/fish/fish_action_videos \
    --split val.csv \
    --batch_size 4 \
    --num_workers 4
"""

from __future__ import annotations

import argparse
import os
import time
import warnings
from typing import List, Tuple

import torch
from torch.utils.data import DataLoader

from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler

from train_pytorchvideo_x3d import (
    VideoTransform,
    _parse_path_label_line,
    build_pretrained_x3d,
    replace_last_linear,
)


# Display names of the three classes, in the same order as the class list that main() prints
# next to the confusion matrix. Not referenced by the code visible in this script.
# NOTE(review): presumably index i names integer label i — confirm against the training label map.
THREE_CLASS_NAMES = ["feeding", "normal", "scared"]


def read_csv_list(csv_path: str, path_prefix: str) -> List[Tuple[str, dict]]:
    """
    Read a split file into (video_path, info_dict) pairs.

    Every line is handed to `_parse_path_label_line`; lines for which that helper
    returns None are skipped. For the remaining lines the parsed path is joined onto
    `path_prefix` and the parsed label is converted to `int`.

    Args:
        csv_path: Path of the split file to read.
        path_prefix: Directory joined in front of every path found in the file.

    Returns:
        A list of `(full_path, {"label": <int>})` tuples, in file order.
    """
    # The builtin `dict` is used in the annotations because `typing.Dict` is not
    # imported by this module (only List and Tuple are).
    items: List[Tuple[str, dict]] = []
    with open(csv_path, "r") as f:
        for line in f:
            parsed = _parse_path_label_line(line)
            if parsed is None:
                # Which lines are rejected is decided entirely by the helper.
                continue
            rel_path, label = parsed
            # os.path.join ignores path_prefix when rel_path is itself absolute.
            full = os.path.join(path_prefix, rel_path)
            items.append((full, {"label": int(label)}))
    return items


def confusion_matrix_3(preds: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """
    Count (true, predicted) class pairs into a 3x3 confusion matrix.

    preds/labels: integer tensors holding the same number of elements, values in [0,2]
    returns: (3,3) int64 CPU tensor where rows=true, cols=pred

    Raises:
        TypeError: if either tensor has a floating-point dtype.
        ValueError: if preds and labels hold a different number of elements.
        IndexError: if any value lies outside [0,2]. Negative values are rejected
            explicitly; plain indexing would wrap them around and silently count
            them as a valid class.
    """
    num_classes = 3

    if preds.is_floating_point() or labels.is_floating_point():
        raise TypeError("preds and labels must be integer class indices")

    # Flatten and bring both tensors to CPU/int64 so the result always lives on the CPU.
    true_idx = labels.detach().reshape(-1).to(device="cpu", dtype=torch.long)
    pred_idx = preds.detach().reshape(-1).to(device="cpu", dtype=torch.long)

    if true_idx.numel() != pred_idx.numel():
        raise ValueError(
            f"preds and labels must have the same number of elements, "
            f"got {pred_idx.numel()} and {true_idx.numel()}"
        )
    if true_idx.numel() == 0:
        # min()/max() are undefined on empty tensors; nothing to count anyway.
        return torch.zeros((num_classes, num_classes), dtype=torch.long)

    lowest = min(int(true_idx.min()), int(pred_idx.min()))
    highest = max(int(true_idx.max()), int(pred_idx.max()))
    if lowest < 0 or highest >= num_classes:
        raise IndexError(
            f"class indices must be in [0, {num_classes - 1}], "
            f"got values in [{lowest}, {highest}]"
        )

    # Encode each (true, pred) pair as one cell index in row-major order and count them at once.
    flat = true_idx * num_classes + pred_idx
    counts = torch.bincount(flat, minlength=num_classes * num_classes)
    return counts.reshape(num_classes, num_classes)


def main() -> None:
    """
    Parse the command line, evaluate a checkpoint on one split, and print the metrics.

    Steps:
      1. Load the checkpoint onto the CPU and read its `cfg` entry, falling back to
         defaults for any missing key.
      2. Build a uniformly sampled clip dataset from the split CSV.
      3. Rebuild the model, resize its head to `num_classes`, and load the weights strictly.
      4. Run inference without gradients, accumulating accuracy and a 3x3 confusion
         matrix over every label yielded by the loader (the progress log calls these clips).

    Raises:
        FileNotFoundError: If the split CSV does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint", type=str, required=True, help="Path to checkpoint_best.pt")
    parser.add_argument("--csv_dir", type=str, required=True, help="Folder containing split CSVs (and label_map.txt if available)")
    parser.add_argument("--path_prefix", type=str, required=True, help="Absolute path prefix for videos")
    parser.add_argument("--split", type=str, default="val.csv", help="CSV filename inside csv_dir (e.g. val.csv or test.csv)")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument(
        "--log_interval",
        type=int,
        default=20,
        help="Print progress every N batches (default: 20). Set 0 to disable.",
    )
    parser.add_argument(
        "--max_batches",
        type=int,
        default=0,
        help="If >0, stop after this many batches (useful for quick sanity checks).",
    )
    parser.add_argument("--device", type=str, default="", help="cuda|cpu (auto if empty)")
    args = parser.parse_args()

    device = args.device.strip() or ("cuda" if torch.cuda.is_available() else "cpu")
    # Prefix match so that indexed devices such as "cuda:0" count as CUDA too; an exact
    # comparison with "cuda" would leave pinned memory off for them.
    on_cuda = device.startswith("cuda")
    csv_dir = os.path.abspath(os.path.expanduser(args.csv_dir))
    path_prefix = os.path.abspath(os.path.expanduser(args.path_prefix))
    split_path = os.path.join(csv_dir, args.split)

    # Fail before the (potentially slow) checkpoint load if the split is missing.
    if not os.path.isfile(split_path):
        raise FileNotFoundError(f"Split CSV not found: {split_path}")

    ckpt_path = os.path.abspath(os.path.expanduser(args.checkpoint))
    # Torch 2.5+ warns about pickle by default. This is your own checkpoint, so we silence it.
    # NOTE: weights_only=False unpickles arbitrary objects; only use with trusted checkpoints.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="You are using `torch.load` with `weights_only=False`.*")
        ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
    cfg = ckpt.get("cfg", {})
    model_name = cfg.get("model", "x3d_m")
    num_frames = int(cfg.get("num_frames", 16))
    sampling_rate = int(cfg.get("sampling_rate", 5))
    target_fps = int(cfg.get("target_fps", 30))
    num_classes = int(cfg.get("num_classes", 3))
    # Clip length in seconds (it is printed with an "s" suffix below).
    clip_duration = num_frames * sampling_rate / float(target_fps)

    # Build dataset
    items = read_csv_list(split_path, path_prefix)
    print(f"Loaded {len(items)} samples from {split_path}")
    print(
        f"Decoding clips with pyav (clip_duration={clip_duration:.2f}s, num_frames={num_frames}). "
        f"If this looks stuck, try --num_workers 0."
    )
    ds = LabeledVideoDataset(
        labeled_video_paths=items,
        clip_sampler=make_clip_sampler("uniform", clip_duration),
        decode_audio=False,
        decoder="pyav",
        transform=VideoTransform(num_frames=num_frames, train=False),
    )
    use_workers = args.num_workers > 0
    loader = DataLoader(
        ds,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=on_cuda,
        # Both worker options are only passed as "on" when worker processes exist.
        persistent_workers=use_workers,
        prefetch_factor=2 if use_workers else None,
    )

    # Build model architecture and load weights.
    model = build_pretrained_x3d(model_name=model_name, pretrained=False)
    # Ensure head shape matches checkpoint state_dict (trained as 3-class).
    replace_last_linear(model, num_classes=num_classes)
    model.load_state_dict(ckpt["model"], strict=True)
    model = model.to(device)
    model.eval()

    total = 0
    correct3 = 0
    # NOTE(review): the matrix is fixed at 3x3 while num_classes comes from the checkpoint
    # cfg; a label or prediction >= 3 would make confusion_matrix_3 fail mid-run.
    cm = torch.zeros((3, 3), dtype=torch.long)

    with torch.no_grad():
        # perf_counter is monotonic, so interval timings are immune to wall-clock changes.
        t_last = time.perf_counter()
        # NOTE(review): assumes each batch unpacks into (videos, labels) — this depends on
        # what VideoTransform returns; confirm against the training script.
        for batch_idx, (videos, labels3) in enumerate(loader, start=1):
            videos = videos.to(device, non_blocking=True)
            labels3 = labels3.to(device, non_blocking=True)

            logits3 = model(videos)
            preds3 = torch.argmax(logits3, dim=1)

            total += labels3.numel()
            correct3 += (preds3 == labels3).sum().item()
            cm += confusion_matrix_3(preds3.cpu(), labels3.cpu())

            if args.log_interval > 0 and (batch_idx % args.log_interval) == 0:
                # Take a single clock reading so no time is lost between measuring and resetting.
                now = time.perf_counter()
                dt = now - t_last
                t_last = now
                print(
                    f"Processed batch {batch_idx:05d} "
                    f"({total} clips) | acc so far {(correct3/max(total,1))*100:.2f}% "
                    f"({dt:.1f}s/{args.log_interval} batches)"
                )

            if args.max_batches and batch_idx >= args.max_batches:
                break

    # max(total, 1) keeps the division defined when the loader yielded nothing.
    acc3 = correct3 / max(total, 1)
    print(f"3-class accuracy: {acc3*100:.2f}% ({correct3}/{total})")
    print("3-class confusion matrix (rows=true, cols=pred) for [feeding, normal, scared]:")
    print(cm)


# Run the evaluation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
Initial commit: FishServer monorepo (FishAction, FishMeasure, fish_api) Made-with: Cursor 2026-04-08 19:32:23 +08:00			`#!/usr/bin/env python3`
			`"""`
			`Evaluate a trained PyTorchVideo X3D checkpoint as a 3-class classifier.`

			This script loads a `checkpoint_best.pt` produced by `train_pytorchvideo_x3d.py` and runs
			`inference on a CSV split (val.csv or test.csv), then reports 3-class accuracy and a`
			`3x3 confusion matrix.`

			`CSV format (whitespace-separated):`
			`relative/path/to/video.mp4 <label_int>`

			`Example:`
			`python test_pytorchvideo_x3d_3class.py \`
			`--checkpoint /home/ubuntu/projects/FishAction/checkpoints/ptv_x3d_m/checkpoint_best.pt \`
			`--csv_dir /home/ubuntu/data/fish/fish_action_videos \`
			`--path_prefix /home/ubuntu/data/fish/fish_action_videos \`
			`--split val.csv \`
			`--batch_size 4 \`
			`--num_workers 4`
			`"""`

			`from __future__ import annotations`

			`import argparse`
			`import os`
			`import time`
			`import warnings`
			`from typing import List, Tuple`

			`import torch`
			`from torch.utils.data import DataLoader`

			`from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler`

			`from train_pytorchvideo_x3d import (`
			`VideoTransform,`
			`_parse_path_label_line,`
			`build_pretrained_x3d,`
			`replace_last_linear,`
			`)`


			`THREE_CLASS_NAMES = ["feeding", "normal", "scared"]`


			`def read_csv_list(csv_path: str, path_prefix: str) -> List[Tuple[str, Dict]]:`
			`items: List[Tuple[str, Dict]] = []`
			`with open(csv_path, "r") as f:`
			`for line in f:`
			`parsed = _parse_path_label_line(line)`
			`if parsed is None:`
			`continue`
			`rel_path, label = parsed`
			`full = os.path.join(path_prefix, rel_path)`
			`items.append((full, {"label": int(label)}))`
			`return items`


			`def confusion_matrix_3(preds: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:`
			`"""`
			`preds/labels: int tensors of shape (N,) with values in [0,2]`
			`returns: (3,3) where rows=true, cols=pred`
			`"""`
			`cm = torch.zeros((3, 3), dtype=torch.long)`
			`for t, p in zip(labels.tolist(), preds.tolist()):`
			`cm[t][p] += 1`
			`return cm`


			`def main() -> None:`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("--checkpoint", type=str, required=True, help="Path to checkpoint_best.pt")`
			`parser.add_argument("--csv_dir", type=str, required=True, help="Folder containing split CSVs (and label_map.txt if available)")`
			`parser.add_argument("--path_prefix", type=str, required=True, help="Absolute path prefix for videos")`
			`parser.add_argument("--split", type=str, default="val.csv", help="CSV filename inside csv_dir (e.g. val.csv or test.csv)")`
			`parser.add_argument("--batch_size", type=int, default=4)`
			`parser.add_argument("--num_workers", type=int, default=4)`
			`parser.add_argument(`
			`"--log_interval",`
			`type=int,`
			`default=20,`
			`help="Print progress every N batches (default: 20). Set 0 to disable.",`
			`)`
			`parser.add_argument(`
			`"--max_batches",`
			`type=int,`
			`default=0,`
			`help="If >0, stop after this many batches (useful for quick sanity checks).",`
			`)`
			`parser.add_argument("--device", type=str, default="", help="cuda\|cpu (auto if empty)")`
			`args = parser.parse_args()`

			`device = args.device.strip() or ("cuda" if torch.cuda.is_available() else "cpu")`
			`csv_dir = os.path.abspath(os.path.expanduser(args.csv_dir))`
			`path_prefix = os.path.abspath(os.path.expanduser(args.path_prefix))`
			`split_path = os.path.join(csv_dir, args.split)`

			`if not os.path.isfile(split_path):`
			`raise FileNotFoundError(f"Split CSV not found: {split_path}")`

			`ckpt_path = os.path.abspath(os.path.expanduser(args.checkpoint))`
			`# Torch 2.5+ warns about pickle by default. This is your own checkpoint, so we silence it.`
			`with warnings.catch_warnings():`
			warnings.filterwarnings("ignore", message="You are using `torch.load` with `weights_only=False`.*")
			`ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)`
			`cfg = ckpt.get("cfg", {})`
			`model_name = cfg.get("model", "x3d_m")`
			`num_frames = int(cfg.get("num_frames", 16))`
			`sampling_rate = int(cfg.get("sampling_rate", 5))`
			`target_fps = int(cfg.get("target_fps", 30))`
			`num_classes = int(cfg.get("num_classes", 3))`
			`clip_duration = num_frames * sampling_rate / float(target_fps)`

			`# Build dataset`
			`items = read_csv_list(split_path, path_prefix)`
			`print(f"Loaded {len(items)} samples from {split_path}")`
			`print(`
			`f"Decoding clips with pyav (clip_duration={clip_duration:.2f}s, num_frames={num_frames}). "`
			`f"If this looks stuck, try --num_workers 0."`
			`)`
			`ds = LabeledVideoDataset(`
			`labeled_video_paths=items,`
			`clip_sampler=make_clip_sampler("uniform", clip_duration),`
			`decode_audio=False,`
			`decoder="pyav",`
			`transform=VideoTransform(num_frames=num_frames, train=False),`
			`)`
			`loader = DataLoader(`
			`ds,`
			`batch_size=args.batch_size,`
			`num_workers=args.num_workers,`
			`pin_memory=(device == "cuda"),`
			`persistent_workers=bool(args.num_workers > 0),`
			`prefetch_factor=2 if args.num_workers > 0 else None,`
			`)`

			`# Build model architecture and load weights.`
			`model = build_pretrained_x3d(model_name=model_name, pretrained=False)`
			`# Ensure head shape matches checkpoint state_dict (trained as 3-class).`
			`replace_last_linear(model, num_classes=num_classes)`
			`model.load_state_dict(ckpt["model"], strict=True)`
			`model = model.to(device)`
			`model.eval()`

			`total = 0`
			`correct3 = 0`
			`cm = torch.zeros((3, 3), dtype=torch.long)`

			`with torch.no_grad():`
			`t_last = time.time()`
			`for batch_idx, (videos, labels3) in enumerate(loader, start=1):`
			`videos = videos.to(device, non_blocking=True)`
			`labels3 = labels3.to(device, non_blocking=True)`

			`logits3 = model(videos)`
			`preds3 = torch.argmax(logits3, dim=1)`

			`total += labels3.numel()`
			`correct3 += (preds3 == labels3).sum().item()`
			`cm += confusion_matrix_3(preds3.cpu(), labels3.cpu())`

			`if args.log_interval > 0 and (batch_idx % args.log_interval) == 0:`
			`dt = time.time() - t_last`
			`t_last = time.time()`
			`print(`
			`f"Processed batch {batch_idx:05d} "`
			`f"({total} clips) \| acc so far {(correct3/max(total,1))*100:.2f}% "`
			`f"({dt:.1f}s/{args.log_interval} batches)"`
			`)`

			`if args.max_batches and batch_idx >= args.max_batches:`
			`break`

			`acc3 = correct3 / max(total, 1)`
			`print(f"3-class accuracy: {acc3*100:.2f}% ({correct3}/{total})")`
			`print("3-class confusion matrix (rows=true, cols=pred) for [feeding, normal, scared]:")`
			`print(cm)`


			`if __name__ == "__main__":`
			`main()`