Initial commit: FishServer monorepo (FishAction, FishMeasure, fish_api)

Made-with: Cursor
2026-04-08 19:32:23 +08:00
commit 9df21f80ef
180 changed files with 96298 additions and 0 deletions
--- a/FishMeasure/weight_estimator/dataset.py
+++ b/FishMeasure/weight_estimator/dataset.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""
+Build a dataset index mapping point clouds (.ply) to weight labels.
+
+Data:
+- Point clouds live under:
+    /home/ubuntu/data/fish/2025-11-19-output/{sample_id}/cloud/*.ply
+- Labels live at:
+    /home/ubuntu/projects/FishMeasure/measure/data/label.csv
+
+CSV format:
+- Column B (index 1): sample_id (svo2 name / folder name)
+- Column F (index 5): weight in grams (float)
+
+Output JSON:
+{
+  "meta": {...},
+  "items": [
+    {"ply": "<abs_or_rel_path>", "sample_id": "...", "weight_g": 123.45}
+  ],
+  "mapping": {
+    "<abs_or_rel_path>": 123.45
+  }
+}
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+
+DEFAULT_DATA_ROOT = "/home/ubuntu/data/fish/2025-11-19-output"
+DEFAULT_LABEL_CSV = "/home/ubuntu/projects/FishMeasure/measure/data/label.csv"
+
+
+@dataclass(frozen=True)
+class LabelRow:
+    sample_id: str
+    weight_g: Optional[float]
+
+
+def _parse_weight_g(cell: str) -> Optional[float]:
+    cell = (cell or "").strip()
+    if not cell:
+        return None
+    try:
+        return float(cell)
+    except Exception:
+        return None
+
+
+def load_labels(label_csv: Path) -> List[LabelRow]:
+    """
+    Load labels from label.csv.
+
+    Returns a list of LabelRow (keeps duplicates).
+    """
+    rows: List[LabelRow] = []
+    with label_csv.open("r", encoding="utf-8") as f:
+        reader = csv.reader(f)
+        for r in reader:
+            if len(r) < 2:
+                continue
+            sample_id = (r[1] or "").strip()
+            if not sample_id or sample_id.lower() == "xxx":
+                continue
+            weight_g = _parse_weight_g(r[5] if len(r) > 5 else "")
+            rows.append(LabelRow(sample_id=sample_id, weight_g=weight_g))
+    return rows
+
+
+def resolve_sample_weights(
+    label_rows: List[LabelRow],
+    duplicate_policy: str = "mean",
+) -> Tuple[Dict[str, Optional[float]], Dict[str, List[Optional[float]]]]:
+    """
+    Resolve sample_id -> single weight according to policy.
+
+    duplicate_policy:
+      - mean: average of non-null weights
+      - first: first non-null weight
+      - error: raise if a sample_id has multiple distinct non-null weights
+
+    Returns:
+      - resolved: {sample_id: weight_g or None}
+      - raw: {sample_id: [weight_g_or_None, ...]}
+    """
+    raw: Dict[str, List[Optional[float]]] = {}
+    for row in label_rows:
+        raw.setdefault(row.sample_id, []).append(row.weight_g)
+
+    resolved: Dict[str, Optional[float]] = {}
+    for sample_id, weights in raw.items():
+        non_null = [w for w in weights if w is not None]
+        if not non_null:
+            resolved[sample_id] = None
+            continue
+
+        if duplicate_policy == "first":
+            resolved[sample_id] = non_null[0]
+            continue
+
+        if duplicate_policy == "mean":
+            resolved[sample_id] = sum(non_null) / len(non_null)
+            continue
+
+        if duplicate_policy == "error":
+            distinct = sorted(set(non_null))
+            if len(distinct) > 1:
+                raise ValueError(
+                    f"Duplicate sample_id with multiple weights: {sample_id}: {distinct}"
+                )
+            resolved[sample_id] = distinct[0]
+            continue
+
+        raise ValueError(f"Unknown duplicate_policy: {duplicate_policy}")
+
+    return resolved, raw
+
+
+def collect_ply_items(
+    data_root: Path,
+    sample_weights: Dict[str, Optional[float]],
+    relative_paths: bool,
+) -> Tuple[List[dict], Dict[str, float], Dict[str, int]]:
+    """
+    Build (items list, mapping dict, stats dict).
+    """
+    items: List[dict] = []
+    mapping: Dict[str, float] = {}
+
+    missing_folders = 0
+    missing_weights = 0
+    total_plys = 0
+
+    for sample_id in sorted(sample_weights.keys()):
+        weight_g = sample_weights[sample_id]
+        if weight_g is None:
+            missing_weights += 1
+            continue
+
+        cloud_dir = data_root / sample_id / "cloud"
+        if not cloud_dir.exists():
+            missing_folders += 1
+            continue
+
+        ply_files = sorted(cloud_dir.glob("*.ply"))
+        if not ply_files:
+            # still count as missing folder-like data case
+            missing_folders += 1
+            continue
+
+        for ply in ply_files:
+            total_plys += 1
+            ply_path = ply
+            if relative_paths:
+                try:
+                    ply_path = ply.relative_to(data_root)
+                except Exception:
+                    ply_path = ply
+
+            ply_key = str(ply_path)
+            item = {
+                "ply": ply_key,
+                "sample_id": sample_id,
+                "weight_g": float(weight_g),
+            }
+            items.append(item)
+            mapping[ply_key] = float(weight_g)
+
+    stats = {
+        "missing_folders": missing_folders,
+        "missing_weights": missing_weights,
+        "total_plys": total_plys,
+    }
+    return items, mapping, stats
+
+
+def build_index(
+    data_root: Path,
+    label_csv: Path,
+    output_json: Path,
+    duplicate_policy: str = "mean",
+    relative_paths: bool = False,
+) -> dict:
+    label_rows = load_labels(label_csv)
+    sample_weights, raw_weights = resolve_sample_weights(
+        label_rows, duplicate_policy=duplicate_policy
+    )
+
+    items, mapping, stats = collect_ply_items(
+        data_root=data_root,
+        sample_weights=sample_weights,
+        relative_paths=relative_paths,
+    )
+
+    num_duplicates = sum(1 for _, ws in raw_weights.items() if len(ws) > 1)
+
+    out = {
+        "meta": {
+            "data_root": str(data_root),
+            "label_csv": str(label_csv),
+            "weight_column": "F",
+            "duplicate_policy": duplicate_policy,
+            "relative_paths": relative_paths,
+            "num_label_rows": len(label_rows),
+            "num_sample_ids": len(sample_weights),
+            "num_sample_ids_with_duplicate_rows": num_duplicates,
+            **stats,
+        },
+        "items": items,
+        "mapping": mapping,
+    }
+
+    output_json.parent.mkdir(parents=True, exist_ok=True)
+    output_json.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
+    return out
+
+
+def prune_missing_files(index_json: Path, data_root: Optional[Path] = None) -> dict:
+    """
+    Load index JSON, remove items whose PLY file does not exist, and return updated index.
+    """
+    with index_json.open("r", encoding="utf-8") as f:
+        index = json.load(f)
+
+    items = index.get("items", [])
+    meta_root = index.get("meta", {}).get("data_root", None)
+    root = Path(data_root or meta_root or "/").expanduser().resolve()
+
+    kept: List[dict] = []
+    removed = 0
+
+    for item in items:
+        ply_str = item.get("ply", "")
+        ply_path = Path(ply_str)
+        if not ply_path.is_absolute():
+            ply_path = root / ply_path
+        ply_path = ply_path.expanduser().resolve()
+
+        if ply_path.exists():
+            kept.append(item)
+        else:
+            removed += 1
+
+    mapping = {it["ply"]: it["weight_g"] for it in kept}
+    meta = index.get("meta", {})
+    meta["total_plys"] = len(kept)
+    meta["pruned_missing"] = removed
+
+    out = {
+        "meta": meta,
+        "items": kept,
+        "mapping": mapping,
+    }
+    return out
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Create a JSON mapping each .ply to weight (grams) using label.csv column F."
+    )
+    parser.add_argument(
+        "--data-root",
+        type=str,
+        default=DEFAULT_DATA_ROOT,
+        help=f"Dataset root containing { '{sample_id}/cloud/*.ply' } (default: {DEFAULT_DATA_ROOT})",
+    )
+    parser.add_argument(
+        "--label-csv",
+        type=str,
+        default=DEFAULT_LABEL_CSV,
+        help=f"Path to label.csv (default: {DEFAULT_LABEL_CSV})",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="weight_estimator/dataset_index.json",
+        help="Output JSON path (default: weight_estimator/dataset_index.json)",
+    )
+    parser.add_argument(
+        "--duplicate-policy",
+        type=str,
+        default="mean",
+        choices=["mean", "first", "error"],
+        help="How to resolve duplicate sample_id rows in label.csv (default: mean).",
+    )
+    parser.add_argument(
+        "--relative-paths",
+        action="store_true",
+        help="Store PLY paths relative to --data-root instead of absolute paths.",
+    )
+    parser.add_argument(
+        "--prune-missing",
+        action="store_true",
+        help="Load existing index, remove entries for non-existent PLY files, and overwrite.",
+    )
+
+    args = parser.parse_args()
+
+    output_json = Path(args.output).expanduser().resolve()
+
+    if args.prune_missing:
+        if not output_json.exists():
+            raise SystemExit(f"Index JSON not found (required for --prune-missing): {output_json}")
+        data_root = Path(args.data_root).expanduser().resolve() if args.data_root else None
+        out = prune_missing_files(output_json, data_root=data_root)
+        output_json.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
+        meta = out["meta"]
+        print("Pruned dataset index (removed non-existent PLY entries).")
+        print(f"  output: {output_json}")
+        print(f"  total_plys: {meta['total_plys']} (removed {meta.get('pruned_missing', 0)} missing)")
+        return
+
+    data_root = Path(args.data_root).expanduser().resolve()
+    label_csv = Path(args.label_csv).expanduser().resolve()
+
+    if not data_root.exists():
+        raise SystemExit(f"data root does not exist: {data_root}")
+    if not label_csv.exists():
+        raise SystemExit(f"label csv does not exist: {label_csv}")
+
+    out = build_index(
+        data_root=data_root,
+        label_csv=label_csv,
+        output_json=output_json,
+        duplicate_policy=args.duplicate_policy,
+        relative_paths=args.relative_paths,
+    )
+
+    meta = out["meta"]
+    print("Dataset index written.")
+    print(f"  output: {output_json}")
+    print(f"  total_plys: {meta['total_plys']}")
+    print(f"  missing_weights: {meta['missing_weights']}")
+    print(f"  missing_folders: {meta['missing_folders']}")
+    print(f"  duplicate_policy: {meta['duplicate_policy']}")
+
+
+if __name__ == "__main__":
+    main()
+