6.2

2026-06-02 16:59:42 +08:00
commit 07816bd18a
44 changed files with 9035 additions and 0 deletions
--- a/code/video_clip_cls/infer_single_0506/run_segments_consumable_vote.py
+++ b/code/video_clip_cls/infer_single_0506/run_segments_consumable_vote.py
@@ -0,0 +1,564 @@
+#!/usr/bin/env python3
+"""
+仅在「时间段 txt」内跑：人手检测 → **逐帧**好/坏门控（**top1 为 good 且 top1conf>阈值**，默认阈值 0.9）
+→ 仅通过的帧跑 41 类耗材分类；（可选）仅保留 **耗材 softmax 最大值 > --haocai-min-conf** 的帧；
+对保留帧的标签序列做 **滑动窗口多数票平滑**，再 **`consumable` 取平滑后序列众数**。
+
+**avg_softmax_*** ：仅对上述「高置信耗材帧」统计；类别为 softmax 均值分布前三；置信度为三档边际 softmax 在时间上的平均。
+
+不扫全片；每段从视频中按起止时间解码。
+
+用法（建议在 yolo 环境）:
+  python code/video_clip_cls/infer_single_0506/run_segments_consumable_vote.py \\
+    --segments .../03视频_segments_mutual_exclusive_score_gt_0.1.txt \\
+    --video .../03视频.mp4 \\
+    --out .../03视频_segments_consumables.txt
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from collections import Counter
+from pathlib import Path
+
+import cv2
+import numpy as np
+from ultralytics import YOLO
+
+# Walk up the ancestors of this file's resolved path until a directory that
+# contains both repo_root.py and dataset.py is found, and put that directory at
+# the front of sys.path so the `repo_root` import below can resolve.
+for _repo in Path(__file__).resolve().parents:
+    if (_repo / "repo_root.py").is_file() and (_repo / "dataset.py").is_file():
+        if str(_repo) not in sys.path:
+            sys.path.insert(0, str(_repo))
+        break
+else:
+    # for/else: the loop finished without `break`, i.e. no ancestor matched.
+    raise RuntimeError("未定位到仓库 code/ 根目录")
+
+from repo_root import CODE_ROOT  # noqa: E402
+
+
+def parse_segments_txt(path: Path) -> list[tuple[int, float, float]]:
+    """Parse a tab-separated segments file into ``(rank, start_sec, end_sec)`` rows.
+
+    Skipped lines:
+      * blank / whitespace-only lines;
+      * header lines, i.e. lines whose first non-blank text starts with
+        ``rank`` (case-insensitive);
+      * lines with fewer than four tab-separated columns.
+
+    Only the first three columns of a kept line are consumed.
+
+    Args:
+        path: Location of the segments text file (UTF-8, optional BOM).
+
+    Returns:
+        Rows in file order as ``(rank, start_sec, end_sec)``.
+
+    Raises:
+        ValueError: If a kept row has a non-numeric rank, start or end.
+    """
+    rows: list[tuple[int, float, float]] = []
+    # utf-8-sig drops a leading BOM if present; otherwise the BOM would be glued
+    # to the header's first cell, defeat the "rank" check and crash in int().
+    for raw in path.read_text(encoding="utf-8-sig").splitlines():
+        # Only the left side is stripped for the header test: the line itself is
+        # split untouched so trailing (possibly empty) columns still count.
+        if not raw.strip() or raw.lstrip().lower().startswith("rank"):
+            continue
+        parts = raw.split("\t")
+        if len(parts) < 4:
+            continue
+        rows.append(
+            (
+                int(parts[0].strip()),
+                float(parts[1].strip()),
+                float(parts[2].strip()),
+            )
+        )
+    return rows
+
+
+def collect_hand_boxes(det_model: YOLO, boxes) -> list[list[float]]:
+    """Return the xyxy coordinates of every detection whose class name is "hand".
+
+    Args:
+        det_model: Detector whose ``names`` mapping translates class ids to labels.
+        boxes: Iterable of detections exposing ``cls[0]`` and ``xyxy[0]``.
+
+    Returns:
+        One ``[x1, y1, x2, y2]`` list per hand detection, in input order.
+    """
+    id_to_label = det_model.names
+    return [
+        detection.xyxy[0].tolist()
+        for detection in boxes
+        # Unknown class ids map to "" and are therefore never treated as hands.
+        if id_to_label.get(int(detection.cls[0]), "") == "hand"
+    ]
+
+
+def pad_box(
+    xyxy: list[float], img_w: int, img_h: int, pad_ratio: float
+) -> tuple[int, int, int, int]:
+    """Grow a box on all four sides by a fraction of its size, clipped to the image.
+
+    The horizontal margin is ``width * pad_ratio`` and the vertical margin is
+    ``height * pad_ratio``. Results are truncated to ``int`` and clamped to
+    ``[0, img_w]`` horizontally and ``[0, img_h]`` vertically.
+
+    Note: the original author labelled this helper legacy in favour of
+    ``pad_box_bottom_only``; ``process_segment`` in this file still calls it.
+
+    Returns:
+        ``(x1, y1, x2, y2)`` of the padded, clipped box.
+    """
+    left, top, right, bottom = xyxy
+    margin_x = (right - left) * pad_ratio
+    margin_y = (bottom - top) * pad_ratio
+    padded_left = max(0, int(left - margin_x))
+    padded_top = max(0, int(top - margin_y))
+    padded_right = min(img_w, int(right + margin_x))
+    padded_bottom = min(img_h, int(bottom + margin_y))
+    return padded_left, padded_top, padded_right, padded_bottom
+
+
+def pad_box_bottom_only(
+    xyxy: list[float], img_w: int, img_h: int, bottom_ratio: float
+) -> tuple[int, int, int, int]:
+    """Extend a box downwards only; top, left and right edges stay where they are.
+
+    The bottom edge moves by ``box_height * bottom_ratio``. All coordinates are
+    truncated to ``int`` and clamped to the image bounds.
+
+    Returns:
+        ``(x1, y1, x2, y2)`` of the extended, clipped box.
+    """
+    left, top, right, bottom = xyxy
+    extra_below = (bottom - top) * float(bottom_ratio)
+    clipped_left = max(0, int(left))
+    clipped_top = max(0, int(top))
+    clipped_right = min(img_w, int(right))
+    clipped_bottom = min(img_h, int(bottom + extra_below))
+    return clipped_left, clipped_top, clipped_right, clipped_bottom
+
+
+def largest_hand(hands: list[list[float]]) -> list[float]:
+    """Return the xyxy box with the largest area (first one wins on ties).
+
+    Negative widths or heights are treated as zero. Raises ``ValueError`` when
+    ``hands`` is empty, as ``max`` does.
+    """
+    return max(
+        hands,
+        key=lambda box: max(0.0, box[2] - box[0]) * max(0.0, box[3] - box[1]),
+    )
+
+
+def _float_top1conf(pr) -> float:
+    """Read ``pr.top1conf`` as a plain Python float.
+
+    ``None`` becomes ``0.0``. Python / NumPy scalars are converted directly; any
+    other value is unwrapped via ``detach().float().cpu().item()`` first.
+    """
+    conf = pr.top1conf
+    if conf is None:
+        return 0.0
+    if not isinstance(conf, (float, int, np.floating)):
+        # Not a plain scalar: pull the value out through the tensor-style API.
+        conf = conf.detach().float().cpu().item()
+    return float(conf)
+
+
+def passes_good_gate_top1_conf(
+    gb_model: YOLO,
+    crop: np.ndarray,
+    gb_names: dict,
+    imgsz: int,
+    top1_conf_must_exceed: float,
+) -> bool:
+    """Good/bad gate: pass only if top-1 is "good" and its confidence beats the threshold.
+
+    Args:
+        gb_model: Good/bad classifier; ``predict`` is called on the crop.
+        crop: Image region to classify; an empty crop never passes.
+        gb_names: Class-id to class-name mapping of ``gb_model``.
+        imgsz: Inference size forwarded to ``predict``.
+        top1_conf_must_exceed: Confidence must be strictly greater than this.
+
+    Returns:
+        ``True`` when the gate passes; ``False`` otherwise, including when the
+        result carries no ``probs``.
+    """
+    if crop.size == 0:
+        return False
+    probs = gb_model.predict(crop, imgsz=imgsz, verbose=False)[0].probs
+    if probs is None:
+        return False
+    # Class names are compared after trimming whitespace and lower-casing.
+    top_label = str(gb_names.get(int(probs.top1), "")).strip().lower()
+    top_conf = _float_top1conf(probs)
+    if top_label != "good":
+        return False
+    return top_conf > top1_conf_must_exceed
+
+
+def haocai_softmax_probs(
+    cls_model: YOLO, crop: np.ndarray, imgsz: int, n_cls: int
+) -> np.ndarray | None:
+    """Classify a crop and return a length-``n_cls`` probability vector.
+
+    The model's ``probs.data`` is flattened to float64 and cut to at most
+    ``n_cls`` entries. Those entries are normalised, and only then is the vector
+    zero-padded up to ``n_cls``. Padding after normalisation means classes the
+    model does not have get probability 0; repeating real values into the padded
+    slots (what ``np.resize`` does) would dilute the real classes and push the
+    vector through a second softmax.
+
+    Normalisation: if the kept entries already sum to roughly 1 (within 0.08)
+    they are divided by their sum; otherwise they are treated as unnormalised
+    scores and passed through a softmax.
+
+    Args:
+        cls_model: Consumable classifier; ``predict`` is called on the crop.
+        crop: Image region to classify.
+        imgsz: Inference size forwarded to ``predict``.
+        n_cls: Length of the returned vector.
+
+    Returns:
+        The probability vector, or ``None`` when the crop is empty, the result
+        has no ``probs``/``probs.data``, or the kept entries sum to <= 1e-12.
+    """
+    if crop.size == 0:
+        return None
+    probs = cls_model.predict(crop, imgsz=imgsz, verbose=False)[0].probs
+    if probs is None or probs.data is None:
+        return None
+    raw = probs.data.detach().float().cpu().numpy().astype(np.float64).ravel()
+    kept = raw[:n_cls]
+    total = float(np.sum(kept))
+    if total <= 1e-12:
+        return None
+    if abs(total - 1.0) > 0.08:
+        # Not normalised: apply a max-shifted softmax over the real classes.
+        shifted = kept - float(np.max(kept))
+        weights = np.exp(np.clip(shifted, -40.0, 40.0))
+        normalised = weights / float(np.sum(weights))
+    else:
+        normalised = kept / total
+    if normalised.size == n_cls:
+        return normalised
+    padded = np.zeros(n_cls, dtype=np.float64)
+    padded[: normalised.size] = normalised
+    return padded
+
+
+def _cls_name(names: dict, idx: int) -> str:
+    """Look up the class name for ``idx``; fall back to ``str(idx)`` if unmapped."""
+    fallback = str(idx)
+    label = names.get(int(idx), fallback)
+    return str(label)
+
+
+def mean_softmax_top3(
+    probs_list: list[np.ndarray], cls_names: dict
+) -> tuple[list[str], list[float]]:
+    """Summarise per-frame probability vectors into three (name, confidence) slots.
+
+    Names: the vectors are averaged per class across frames, and the three
+    classes with the highest mean probability are reported.
+
+    Confidences (decoupled from the names): each frame's vector is sorted in
+    descending order, and the 1st, 2nd and 3rd largest values are averaged
+    across frames separately.
+
+    Always returns exactly three slots; missing ones are filled with ``""`` and
+    ``0.0``.
+    """
+    slots = 3
+    if not probs_list:
+        return [""] * slots, [0.0] * slots
+
+    frames = np.stack(probs_list, axis=0)
+
+    # Names come from the ranking of the class-wise mean vector.
+    class_means = np.mean(frames, axis=0, dtype=np.float64)
+    ranking = np.argsort(-class_means)
+    top_names = [
+        _cls_name(cls_names, int(ranking[slot])) if slot < ranking.size else ""
+        for slot in range(slots)
+    ]
+
+    # Confidences come from each frame's own descending order, averaged over time.
+    per_frame_desc = np.sort(frames, axis=1)[:, ::-1]
+    width = per_frame_desc.shape[1]
+    top_probs = [
+        float(np.mean(per_frame_desc[:, slot], dtype=np.float64))
+        if slot < width
+        else 0.0
+        for slot in range(slots)
+    ]
+    return top_names, top_probs
+
+
+def smooth_labels_majority(labels: list[str], window: int) -> list[str]:
+    """Smooth a time-ordered label sequence with a sliding majority vote.
+
+    Each position is replaced by the most frequent label in the window centred
+    on it; the window is truncated at the sequence ends. An even ``window`` is
+    treated as the next odd size. Ties go to the label that appears first inside
+    the window. ``window <= 1`` or empty input returns a plain copy.
+    """
+    if window <= 1 or not labels:
+        return list(labels)
+    # Rounding an even window up to odd and halving equals window // 2.
+    radius = window // 2
+    total = len(labels)
+    return [
+        Counter(
+            labels[max(0, pos - radius) : min(total, pos + radius + 1)]
+        ).most_common(1)[0][0]
+        for pos in range(total)
+    ]
+
+
+def process_segment(
+    cap: cv2.VideoCapture,
+    det: YOLO,
+    gb: YOLO,
+    cls_m: YOLO,
+    *,
+    start_sec: float,
+    end_sec: float,
+    seek_margin_sec: float,
+    det_conf: float,
+    pad_ratio: float,
+    imgsz_det: int,
+    imgsz_cls: int,
+    frame_stride: int,
+    good_top1_conf_threshold: float,
+    haocai_min_conf: float,
+    smooth_label_window: int,
+    gb_names: dict,
+    cls_names: dict,
+) -> dict:
+    """Detect hands, gate frames and vote on the consumable class for one segment.
+
+    Decoding starts ``seek_margin_sec`` before ``start_sec`` (clamped at 0);
+    frames are discarded until the reported position reaches ``start_sec``
+    (0.04 s tolerance) and processing stops once the position exceeds
+    ``end_sec + 0.08`` or the stream ends.
+
+    For each frame selected by ``frame_stride``: run the hand detector, take the
+    largest "hand" box, pad it with ``pad_box`` and crop. The crop must pass the
+    good/bad gate before it is classified; a classified frame is kept only when
+    its largest probability is strictly greater than ``haocai_min_conf``.
+
+    Args:
+        cap: Opened capture; its position is changed by this call.
+        det: Hand detector.
+        gb: Good/bad frame classifier used as the gate.
+        cls_m: Consumable classifier.
+        start_sec: Segment start in seconds.
+        end_sec: Segment end in seconds.
+        seek_margin_sec: How far before ``start_sec`` to seek.
+        det_conf: Confidence threshold forwarded to the detector.
+        pad_ratio: Padding ratio forwarded to ``pad_box``.
+        imgsz_det: Inference size for the detector.
+        imgsz_cls: Inference size for both classifiers.
+        frame_stride: Run inference on every N-th in-segment frame.
+        good_top1_conf_threshold: Gate threshold (strictly greater than).
+        haocai_min_conf: Minimum top probability for a frame to be kept.
+        smooth_label_window: Window for ``smooth_labels_majority``.
+        gb_names: Class-id to name mapping of ``gb``.
+        cls_names: Class-id to name mapping of ``cls_m``.
+
+    Returns:
+        A dict with keys ``consumable``, ``n_hand_frames``, ``n_gate_pass``,
+        ``n_predictions``, ``top_vote_count`` and ``avg_top{1,2,3}_cls`` /
+        ``avg_top{1,2,3}_prob``. The ``*_prob`` values are strings with six
+        decimals, or empty strings when no frame was kept. ``consumable`` is a
+        parenthesised explanatory message when no hand was detected or no frame
+        was kept.
+
+    Raises:
+        ValueError: If ``cls_names`` is empty (``max`` over no keys).
+    """
+    # HEVC / some mp4 files: seeking straight to the start tends to produce
+    # frames with broken references, so seek to an earlier point first and then
+    # decode sequentially, discarding frames until the start is reached.
+    probe_from = float(max(0.0, start_sec - seek_margin_sec))
+    cap.set(cv2.CAP_PROP_POS_MSEC, probe_from * 1000.0)
+    synced_frame: np.ndarray | None = None
+    synced_t: float | None = None
+    tol = 0.04
+    while True:
+        ok0, grab = cap.read()
+        if not ok0 or grab is None:
+            # Stream ended before reaching the segment start.
+            synced_frame, synced_t = None, None
+            break
+        # NOTE(review): position is queried after read(); whether it refers to
+        # the frame just returned or the next one is backend-dependent — confirm.
+        t0 = float(cap.get(cv2.CAP_PROP_POS_MSEC)) / 1000.0
+        if t0 + tol >= start_sec:
+            synced_frame, synced_t = grab, t0
+            break
+
+    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+    # Probability vectors are sized to cover the largest class id.
+    n_cls_key_max = max(int(k) for k in cls_names.keys())
+    n_cls = n_cls_key_max + 1
+
+    n_hand_frames = 0
+    # Number of frames with top1 == good and top1conf > threshold (counted as
+    # soon as the gate passes, whether or not a softmax vector is obtained).
+    n_gate_pass = 0
+    cls_labels: list[str] = []
+    cls_prob_rows: list[np.ndarray] = []
+    frames_read_in_segment = 0
+
+    def one_frame(fr: np.ndarray, _t_abs: float) -> None:
+        # Process one in-segment frame and update the counters/lists above.
+        # `_t_abs` (the frame timestamp) is accepted but not used.
+        nonlocal frames_read_in_segment, n_hand_frames, n_gate_pass, cls_labels, cls_prob_rows
+        frames_read_in_segment += 1
+        # Stride: the first in-segment frame is processed, then every N-th one.
+        if frame_stride > 1 and (frames_read_in_segment - 1) % frame_stride != 0:
+            return
+
+        r0 = det.predict(
+            fr,
+            conf=det_conf,
+            imgsz=imgsz_det,
+            verbose=False,
+        )[0]
+        hands = collect_hand_boxes(det, r0.boxes) if r0.boxes else []
+        if not hands:
+            return
+
+        n_hand_frames += 1
+        # Only the largest hand box is used; it is padded on all four sides.
+        xyxy = largest_hand(hands)
+        x1, y1, x2, y2 = pad_box(xyxy, w, h, pad_ratio)
+        crop = fr[y1:y2, x1:x2]
+        ok_gate = passes_good_gate_top1_conf(
+            gb, crop, gb_names, imgsz_cls, good_top1_conf_threshold
+        )
+        if ok_gate:
+            n_gate_pass += 1
+            vec = haocai_softmax_probs(cls_m, crop, imgsz_cls, n_cls)
+            if vec is not None:
+                top_prob = float(np.max(vec))
+                # Keep the frame only if its top probability is strictly above
+                # the minimum.
+                if top_prob <= haocai_min_conf:
+                    return
+                cls_prob_rows.append(vec)
+                cls_labels.append(_cls_name(cls_names, int(np.argmax(vec))))
+
+    # The frame that ended the sync loop is the first in-segment frame.
+    if synced_frame is not None and synced_t is not None:
+        if synced_t <= end_sec + 0.08:
+            one_frame(synced_frame, synced_t)
+
+    while True:
+        ok, frame = cap.read()
+        if not ok or frame is None:
+            break
+        t = float(cap.get(cv2.CAP_PROP_POS_MSEC)) / 1000.0
+        if t > end_sec + 0.08:
+            break
+        if t + 1e-6 < start_sec:
+            continue
+        one_frame(frame, t)
+
+    # No hand was detected in any processed frame of the segment.
+    if n_hand_frames == 0:
+        return {
+            "consumable": "（段内未检测到手部）",
+            "n_hand_frames": 0,
+            "n_gate_pass": 0,
+            "n_predictions": 0,
+            "top_vote_count": 0,
+            "avg_top1_cls": "",
+            "avg_top1_prob": "",
+            "avg_top2_cls": "",
+            "avg_top2_prob": "",
+            "avg_top3_cls": "",
+            "avg_top3_prob": "",
+        }
+
+    # Hands were seen but no frame was kept; the message mentions the
+    # consumable-confidence filter only when that filter is active.
+    if not cls_labels:
+        return {
+            "consumable": (
+                "（无满足条件的耗材帧：好帧置信度或未过门控"
+                + (
+                    "" if haocai_min_conf <= 0.0
+                    else "，或耗材 top1 softmax 不大于阈值"
+                )
+                + "）"
+            ),
+            "n_hand_frames": n_hand_frames,
+            "n_gate_pass": n_gate_pass,
+            "n_predictions": 0,
+            "top_vote_count": 0,
+            "avg_top1_cls": "",
+            "avg_top1_prob": "",
+            "avg_top2_cls": "",
+            "avg_top2_prob": "",
+            "avg_top3_cls": "",
+            "avg_top3_prob": "",
+        }
+
+    # The reported consumable is the mode of the smoothed label sequence;
+    # top_vote_count is that mode's count within the smoothed sequence.
+    smoothed = smooth_labels_majority(cls_labels, smooth_label_window)
+    top_name, vote_n = Counter(smoothed).most_common(1)[0]
+    a1, ap1 = mean_softmax_top3(cls_prob_rows, cls_names)
+    return {
+        "consumable": top_name,
+        "n_hand_frames": n_hand_frames,
+        "n_gate_pass": n_gate_pass,
+        "n_predictions": len(cls_labels),
+        "top_vote_count": int(vote_n),
+        "avg_top1_cls": a1[0],
+        "avg_top1_prob": f"{ap1[0]:.6f}",
+        "avg_top2_cls": a1[1],
+        "avg_top2_prob": f"{ap1[1]:.6f}",
+        "avg_top3_cls": a1[2],
+        "avg_top3_prob": f"{ap1[2]:.6f}",
+    }
+
+
+def main() -> int:
+    """Command-line entry point: classify consumables for every listed segment.
+
+    Parses the arguments, checks that the segments file, the video and the three
+    weight files exist, loads the models, runs ``process_segment`` for each
+    parsed segment and writes one tab-separated row per segment (plus a header)
+    to ``--out``.
+
+    Returns:
+        ``0`` on success; ``1`` if an input file is missing, the segments file
+        yields no rows, or the video cannot be opened.
+    """
+    ap = argparse.ArgumentParser(
+        description="手检 + 逐帧 top1=good 且 top1conf>阈值门控 + 耗材分类；段内众数"
+    )
+    ap.add_argument(
+        "--segments",
+        type=Path,
+        default=Path(__file__).resolve().parent
+        / "results"
+        / "03视频_segments_mutual_exclusive_score_gt_0.1.txt",
+    )
+    ap.add_argument(
+        "--video",
+        type=Path,
+        default=CODE_ROOT.parent
+        / "data/haocai/5月6号视频/5月6日第二次视频/03视频.mp4",
+    )
+    ap.add_argument(
+        "--hand-model",
+        type=Path,
+        default=CODE_ROOT
+        / "hand_detection/runs/hand_det_y11s_multiframe-better/weights/best.pt",
+    )
+    ap.add_argument(
+        "--goodbad-model",
+        type=Path,
+        default=CODE_ROOT
+        / "goodORbad_frame/runs/goodbad_frame_y11m_e50/weights/best.pt",
+    )
+    ap.add_argument(
+        "--haocai-model",
+        type=Path,
+        default=CODE_ROOT
+        / "haocai_classify/runs/haocai_cls_41cls_goodframe_lastest-0.95"
+        / "weights/best.pt",
+    )
+    ap.add_argument(
+        "--out",
+        type=Path,
+        default=Path(__file__).resolve().parent
+        / "results"
+        / "03视频_segments_consumables.txt",
+    )
+    ap.add_argument(
+        "--good-top1-conf-threshold",
+        type=float,
+        default=0.90,
+        dest="good_top1_conf_threshold",
+        help="逐帧：仅当 top1 为 good 且 top1conf **严格大于**该值时才跑耗材分类（默认对应 top1conf>0.9）",
+    )
+    ap.add_argument(
+        "--haocai-min-conf",
+        type=float,
+        default=0.0,
+        metavar="P",
+        help="耗材：仅 softmax 最大值 **严格大于** P 的帧计入标签与 softmax 统计（0 表示不按耗材置信度筛）",
+    )
+    ap.add_argument(
+        "--smooth-label-window",
+        type=int,
+        default=1,
+        metavar="W",
+        help="耗材标签平滑：长度为 W 的奇数滑动窗口内多数票（W≤1 不平滑）；众数取平滑后的序列",
+    )
+    ap.add_argument("--det-conf", type=float, default=0.5)
+    ap.add_argument("--pad-ratio", type=float, default=0.30)
+    ap.add_argument("--imgsz-det", type=int, default=640)
+    ap.add_argument("--imgsz-cls", type=int, default=224)
+    ap.add_argument(
+        "--frame-stride",
+        type=int,
+        default=1,
+        help=">1 时代码逐帧解码但每 N 帧推理一次（省算力，结论可能略粗糙）",
+    )
+    ap.add_argument(
+        "--seek-margin-sec",
+        type=float,
+        default=3.0,
+        help="HEVC 等非关键帧 seek 时往回多跳若干秒再解码到段起点，减轻花屏",
+    )
+    args = ap.parse_args()
+
+    # Validate every input path before the models are loaded.
+    seg_path = args.segments.resolve()
+    vid_path = args.video.resolve()
+    if not seg_path.is_file():
+        print("找不到时间段文件:", seg_path, file=sys.stderr)
+        return 1
+    if not vid_path.is_file():
+        print("找不到视频:", vid_path, file=sys.stderr)
+        return 1
+    for pt, lab in (
+        (args.hand_model, "hand"),
+        (args.goodbad_model, "good/bad"),
+        (args.haocai_model, "haocai cls"),
+    ):
+        if not Path(pt).is_file():
+            print(f"缺少{lab} 权重:", pt, file=sys.stderr)
+            return 1
+
+    segments = parse_segments_txt(seg_path)
+    if not segments:
+        print("时间段为空:", seg_path, file=sys.stderr)
+        return 1
+
+    print("加载模型…", flush=True)
+    det = YOLO(str(args.hand_model))
+    gb = YOLO(str(args.goodbad_model))
+    cls_m = YOLO(str(args.haocai_model))
+    gb_names = gb.names
+    cls_names = cls_m.names
+
+    # One capture is shared by all segments; process_segment seeks per segment.
+    cap = cv2.VideoCapture(str(vid_path))
+    if not cap.isOpened():
+        print("无法打开视频:", vid_path, file=sys.stderr)
+        return 1
+
+    # Output header row; data rows below follow the same column order.
+    sep = "\t"
+    out_lines = [
+        sep.join([
+            "rank",
+            "start_sec",
+            "end_sec",
+            "consumable",
+            "n_hand_frames",
+            "n_frames_top1_good_conf_gt_thresh",
+            "n_consumable_predictions",
+            "top_label_vote_count",
+            "avg_softmax_top1_cls",
+            "avg_softmax_top1_prob",
+            "avg_softmax_top2_cls",
+            "avg_softmax_top2_prob",
+            "avg_softmax_top3_cls",
+            "avg_softmax_top3_prob",
+        ])
+    ]
+
+    try:
+        for rank, t0, t1 in segments:
+            print(f"段落 rank={rank} [{t0:.3f},{t1:.3f}]s …", flush=True)
+            info = process_segment(
+                cap,
+                det,
+                gb,
+                cls_m,
+                start_sec=t0,
+                end_sec=t1,
+                seek_margin_sec=args.seek_margin_sec,
+                det_conf=args.det_conf,
+                pad_ratio=args.pad_ratio,
+                imgsz_det=args.imgsz_det,
+                imgsz_cls=args.imgsz_cls,
+                # Stride and smoothing window are clamped to at least 1.
+                frame_stride=max(1, args.frame_stride),
+                good_top1_conf_threshold=args.good_top1_conf_threshold,
+                haocai_min_conf=args.haocai_min_conf,
+                smooth_label_window=max(1, args.smooth_label_window),
+                gb_names=gb_names,
+                cls_names=cls_names,
+            )
+            row = sep.join([
+                str(rank),
+                f"{t0:.6f}",
+                f"{t1:.6f}",
+                str(info["consumable"]),
+                str(info["n_hand_frames"]),
+                str(info["n_gate_pass"]),
+                str(info["n_predictions"]),
+                str(info["top_vote_count"]),
+                info["avg_top1_cls"],
+                info["avg_top1_prob"],
+                info["avg_top2_cls"],
+                info["avg_top2_prob"],
+                info["avg_top3_cls"],
+                info["avg_top3_prob"],
+            ])
+            out_lines.append(row)
+            print(
+                f"  -> {info['consumable']} "
+                f"(votes {info['top_vote_count']}/{info['n_predictions']}, "
+                f"goodgate {info['n_gate_pass']}/{info['n_hand_frames']} hand frames)",
+                flush=True,
+            )
+    finally:
+        # Release the capture even if a segment raises.
+        cap.release()
+
+    # The output file is written once, after all segments have been processed.
+    out_path = args.out.resolve()
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text("\n".join(out_lines) + "\n", encoding="utf-8")
+    print("已写出:", out_path, flush=True)
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())