This commit is contained in:
hsz
2026-06-02 16:59:42 +08:00
commit 07816bd18a
44 changed files with 9035 additions and 0 deletions

View File

@@ -0,0 +1,564 @@
#!/usr/bin/env python3
"""
仅在「时间段 txt」内跑人手检测 → **逐帧**好/坏门控(**top1 为 good 且 top1conf>阈值**,默认阈值 0.9)
→ 仅通过的帧跑 41 类耗材分类;(可选)仅保留 **耗材 softmax 最大值 > --haocai-min-conf** 的帧;
对保留帧的标签序列做 **滑动窗口多数票平滑**,再 **`consumable` 取平滑后序列众数**。
**avg_softmax_*** :仅对上述「高置信耗材帧」统计;类别为 softmax 均值分布前三;置信度为三档边际 softmax 在时间上的平均。
不扫全片;每段从视频中按起止时间解码。
用法(建议在 yolo 环境):
python code/video_clip_cls/infer_single_0506/run_segments_consumable_vote.py \\
--segments .../03视频_segments_mutual_exclusive_score_gt_0.1.txt \\
--video .../03视频.mp4 \\
--out .../03视频_segments_consumables.txt
"""
from __future__ import annotations
import argparse
import sys
from collections import Counter
from pathlib import Path
import cv2
import numpy as np
from ultralytics import YOLO
# Walk up from this file and pick the first ancestor directory that holds both
# marker files; it is put at the front of sys.path so that `repo_root` (imported
# right below) can be resolved.
_repo = next(
    (
        ancestor
        for ancestor in Path(__file__).resolve().parents
        if (ancestor / "repo_root.py").is_file()
        and (ancestor / "dataset.py").is_file()
    ),
    None,
)
if _repo is None:
    raise RuntimeError("未定位到仓库 code/ 根目录")
if str(_repo) not in sys.path:
    sys.path.insert(0, str(_repo))
from repo_root import CODE_ROOT # noqa: E402
def parse_segments_txt(path: Path) -> list[tuple[int, float, float]]:
    """Read a tab-separated segment list into ``(rank, start_sec, end_sec)`` rows.

    Skipped lines: blank lines, the header line (text starting with "rank",
    case-insensitive, ignoring surrounding whitespace) and lines with fewer
    than four tab-separated columns. Only the first three columns are used.

    Args:
        path: Text file to read.

    Returns:
        Rows in file order.

    Raises:
        ValueError: If one of the first three columns of a data line is not
            numeric.
    """
    rows: list[tuple[int, float, float]] = []
    # "utf-8-sig" also reads plain UTF-8 but drops a leading byte-order mark,
    # which would otherwise hide the "rank" header and end up in int().
    for raw in path.read_text(encoding="utf-8-sig").splitlines():
        stripped = raw.strip()
        if not stripped or stripped.lower().startswith("rank"):
            continue
        # Split the untouched line so trailing empty columns still count.
        parts = raw.split("\t")
        if len(parts) < 4:
            continue
        rows.append(
            (
                int(parts[0].strip()),
                float(parts[1].strip()),
                float(parts[2].strip()),
            )
        )
    return rows
def collect_hand_boxes(det_model: YOLO, boxes) -> list[list[float]]:
    """Return the xyxy coordinates of every detection labelled "hand".

    The class id is read from ``box.cls[0]`` and translated through
    ``det_model.names``; ids missing from that mapping never match.
    """
    id_to_label = det_model.names
    return [
        detection.xyxy[0].tolist()
        for detection in boxes
        if id_to_label.get(int(detection.cls[0]), "") == "hand"
    ]
def pad_box(
    xyxy: list[float], img_w: int, img_h: int, pad_ratio: float
) -> tuple[int, int, int, int]:
    """Grow a box on all four sides and clamp it to the image.

    The horizontal margin is ``width * pad_ratio`` and the vertical margin is
    ``height * pad_ratio``. Coordinates are truncated to int, then limited to
    ``[0, img_w]`` / ``[0, img_h]``.

    Original author note: legacy variant; a "Phase2" pipeline is said to use
    ``pad_box_bottom_only`` instead.
    """
    left, top, right, bottom = xyxy
    margin_x = (right - left) * pad_ratio
    margin_y = (bottom - top) * pad_ratio
    new_left = max(0, int(left - margin_x))
    new_top = max(0, int(top - margin_y))
    new_right = min(img_w, int(right + margin_x))
    new_bottom = min(img_h, int(bottom + margin_y))
    return new_left, new_top, new_right, new_bottom
def pad_box_bottom_only(
    xyxy: list[float], img_w: int, img_h: int, bottom_ratio: float
) -> tuple[int, int, int, int]:
    """Extend a box downwards only and clamp it to the image.

    ``y2`` grows by ``box_height * bottom_ratio``; the top, left and right
    edges keep their position. All coordinates are truncated to int and
    limited to ``[0, img_w]`` / ``[0, img_h]``. The original note describes
    the input as a tight (union) box.
    """
    left, top, right, bottom = xyxy
    extra_down = (bottom - top) * float(bottom_ratio)
    clamped_left = max(0, int(left))
    clamped_top = max(0, int(top))
    clamped_right = min(img_w, int(right))
    clamped_bottom = min(img_h, int(bottom + extra_down))
    return clamped_left, clamped_top, clamped_right, clamped_bottom
def largest_hand(hands: list[list[float]]) -> list[float]:
    """Return the xyxy box with the largest area.

    Negative widths or heights count as zero. On ties the earliest box wins.
    ``hands`` must not be empty (``max`` raises ValueError otherwise).
    """
    return max(
        hands,
        key=lambda box: max(0.0, box[2] - box[0]) * max(0.0, box[3] - box[1]),
    )
def _float_top1conf(pr) -> float:
tc = pr.top1conf
if tc is None:
return 0.0
if isinstance(tc, (float, int, np.floating)):
return float(tc)
return float(tc.detach().float().cpu().item())
def passes_good_gate_top1_conf(
    gb_model: YOLO,
    crop: np.ndarray,
    gb_names: dict,
    imgsz: int,
    top1_conf_must_exceed: float,
) -> bool:
    """Good/bad gate: pass only when top-1 is "good" with high enough confidence.

    Returns False for an empty crop or when the prediction carries no
    ``probs``. Otherwise the top-1 class name (trimmed, lower-cased) must be
    "good" and its confidence strictly greater than ``top1_conf_must_exceed``.
    """
    if crop.size == 0:
        return False
    probs = gb_model.predict(crop, imgsz=imgsz, verbose=False)[0].probs
    if probs is None:
        return False
    predicted = str(gb_names.get(int(probs.top1), "")).strip().lower()
    confidence = _float_top1conf(probs)
    if predicted != "good":
        return False
    return confidence > top1_conf_must_exceed
def haocai_softmax_probs(
    cls_model: YOLO, crop: np.ndarray, imgsz: int, n_cls: int
) -> np.ndarray | None:
    """Run the consumable classifier on a crop and return ``n_cls`` probabilities.

    Args:
        cls_model: Classifier exposing ``predict(...)[0].probs.data``.
        crop: Image region to classify.
        imgsz: Inference size forwarded to ``predict``.
        n_cls: Length of the returned vector.

    Returns:
        A float64 vector of length ``n_cls``, or None when the crop is empty,
        the model yields no probabilities, or the scores sum to (almost) zero
        or less. Scores beyond ``n_cls`` are dropped. When the model returns
        fewer than ``n_cls`` scores the missing classes get probability 0.
    """
    if crop.size == 0:
        return None
    probs = cls_model.predict(crop, imgsz=imgsz, verbose=False)[0].probs
    if probs is None or probs.data is None:
        return None
    scores = probs.data.detach().float().cpu().numpy().astype(np.float64).ravel()
    head = scores[:n_cls]
    total = float(np.sum(head))
    if total <= 1e-12:
        return None
    if abs(total - 1.0) > 0.08:
        # Scores do not look normalised: apply a numerically stable softmax.
        shifted = head - float(np.max(head))
        exps = np.exp(np.clip(shifted, -40.0, 40.0))
        normalised = exps / float(np.sum(exps))
    else:
        normalised = head / total
    # Zero-pad rather than np.resize: np.resize repeats the data cyclically,
    # which would copy real probabilities into the missing class slots.
    out = np.zeros(n_cls, dtype=np.float64)
    out[: normalised.size] = normalised
    return out
def _cls_name(names: dict, idx: int) -> str:
return str(names.get(int(idx), str(idx)))
def mean_softmax_top3(
    probs_list: list[np.ndarray], cls_names: dict
) -> tuple[list[str], list[float]]:
    """Summarise per-frame probability vectors as three names and three values.

    Names: the vectors are averaged per class and the three classes with the
    highest mean are reported.
    Values (independent of the names): each frame's vector is sorted in
    descending order, and the 1st / 2nd / 3rd largest entries are averaged
    over all frames.

    Always returns three slots; missing ones are filled with "" and 0.0.
    """
    if not probs_list:
        return [""] * 3, [0.0] * 3

    frames = np.stack(probs_list, axis=0)

    # Class names come from the ranking of the per-class mean vector.
    mean_vec = np.mean(frames, axis=0, dtype=np.float64)
    ranking = np.argsort(-mean_vec)
    top_names = [
        _cls_name(cls_names, int(ranking[slot])) if slot < ranking.size else ""
        for slot in range(3)
    ]

    # Values come from each frame's own descending order, averaged over frames.
    descending = np.sort(frames, axis=1)[:, ::-1]
    width = descending.shape[1]
    top_probs = [
        float(np.mean(descending[:, slot], dtype=np.float64))
        if slot < width
        else 0.0
        for slot in range(3)
    ]
    return top_names, top_probs
def smooth_labels_majority(labels: list[str], window: int) -> list[str]:
    """Smooth a time-ordered label sequence with a centred majority vote.

    Every position is replaced by the most frequent label inside a window
    centred on it; the window is cut short at both ends of the sequence.
    An even ``window`` is treated as the next odd number. Ties go to the
    label that ``Counter.most_common`` lists first, i.e. the one seen first
    inside the window. With ``window <= 1`` or no labels, a copy of the input
    is returned.
    """
    if window <= 1 or not labels:
        return list(labels)
    # An even window rounded up to the next odd size has the same half-width.
    radius = window // 2
    total = len(labels)
    return [
        Counter(
            labels[max(0, pos - radius): min(total, pos + radius + 1)]
        ).most_common(1)[0][0]
        for pos in range(total)
    ]
def process_segment(
    cap: cv2.VideoCapture,
    det: YOLO,
    gb: YOLO,
    cls_m: YOLO,
    *,
    start_sec: float,
    end_sec: float,
    seek_margin_sec: float,
    det_conf: float,
    pad_ratio: float,
    imgsz_det: int,
    imgsz_cls: int,
    frame_stride: int,
    good_top1_conf_threshold: float,
    haocai_min_conf: float,
    smooth_label_window: int,
    gb_names: dict,
    cls_names: dict,
) -> dict:
    """Decide which consumable is shown during one time segment of a video.

    For every sampled frame of the segment: detect hands, crop around the
    largest hand (``pad_box``), run the good/bad gate, and classify the crop
    when the gate passes. Frames whose top class probability is not strictly
    greater than ``haocai_min_conf`` are dropped. The kept labels are smoothed
    (``smooth_labels_majority``) and the most frequent smoothed label is
    reported.

    Args:
        cap: Opened capture; it is repositioned and read by this call.
        det, gb, cls_m: Hand detector, good/bad classifier, consumable
            classifier.
        start_sec, end_sec: Segment bounds in seconds.
        seek_margin_sec: How far before ``start_sec`` decoding starts.
        det_conf: Confidence threshold passed to the detector.
        pad_ratio: Padding ratio passed to ``pad_box``.
        imgsz_det, imgsz_cls: Inference sizes for detector / classifiers.
        frame_stride: Run inference on every N-th decoded frame when > 1.
        good_top1_conf_threshold: Gate threshold (strictly greater passes).
        haocai_min_conf: Minimum top class probability (strictly greater).
        smooth_label_window: Window size for label smoothing.
        gb_names, cls_names: Class-id to name mappings of the two classifiers.

    Returns:
        dict with keys ``consumable``, ``n_hand_frames``, ``n_gate_pass``,
        ``n_predictions``, ``top_vote_count`` and ``avg_top{1,2,3}_cls`` /
        ``avg_top{1,2,3}_prob``. The ``*_prob`` values are strings: formatted
        with six decimals, or empty when no frame was kept.
    """
    # HEVC / some mp4 files: seeking straight to the start tends to produce
    # broken reference frames, so jump back first and decode forward to it.
    probe_from = float(max(0.0, start_sec - seek_margin_sec))
    cap.set(cv2.CAP_PROP_POS_MSEC, probe_from * 1000.0)

    synced_frame: np.ndarray | None = None
    synced_t: float | None = None
    tol = 0.04
    while True:
        ok0, grab = cap.read()
        if not ok0 or grab is None:
            break
        t0 = float(cap.get(cv2.CAP_PROP_POS_MSEC)) / 1000.0
        if t0 + tol >= start_sec:
            synced_frame, synced_t = grab, t0
            break

    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    n_cls = max(int(k) for k in cls_names.keys()) + 1

    n_hand_frames = 0
    # Frames with top1 == good and top1conf > threshold; counted when the gate
    # passes, whether or not a probability vector is obtained afterwards.
    n_gate_pass = 0
    cls_labels: list[str] = []
    cls_prob_rows: list[np.ndarray] = []
    frames_read_in_segment = 0

    def one_frame(fr: np.ndarray) -> None:
        """Run detection, gate and classification on one in-segment frame."""
        nonlocal frames_read_in_segment, n_hand_frames, n_gate_pass
        frames_read_in_segment += 1
        if frame_stride > 1 and (frames_read_in_segment - 1) % frame_stride != 0:
            return
        r0 = det.predict(
            fr,
            conf=det_conf,
            imgsz=imgsz_det,
            verbose=False,
        )[0]
        hands = collect_hand_boxes(det, r0.boxes) if r0.boxes else []
        if not hands:
            return
        n_hand_frames += 1
        x1, y1, x2, y2 = pad_box(largest_hand(hands), w, h, pad_ratio)
        crop = fr[y1:y2, x1:x2]
        if not passes_good_gate_top1_conf(
            gb, crop, gb_names, imgsz_cls, good_top1_conf_threshold
        ):
            return
        n_gate_pass += 1
        vec = haocai_softmax_probs(cls_m, crop, imgsz_cls, n_cls)
        if vec is None:
            return
        if float(np.max(vec)) <= haocai_min_conf:
            return
        cls_prob_rows.append(vec)
        cls_labels.append(_cls_name(cls_names, int(np.argmax(vec))))

    if synced_frame is not None and synced_t is not None:
        if synced_t <= end_sec + 0.08:
            one_frame(synced_frame)
    while True:
        ok, frame = cap.read()
        if not ok or frame is None:
            break
        t = float(cap.get(cv2.CAP_PROP_POS_MSEC)) / 1000.0
        if t > end_sec + 0.08:
            break
        if t + 1e-6 < start_sec:
            continue
        one_frame(frame)

    def build_result(
        consumable: str,
        *,
        n_predictions: int = 0,
        top_vote_count: int = 0,
        top3_names: tuple = ("", "", ""),
        top3_probs: tuple = ("", "", ""),
    ) -> dict:
        """Assemble the result dict; counters come from the enclosing scope."""
        return {
            "consumable": consumable,
            "n_hand_frames": n_hand_frames,
            "n_gate_pass": n_gate_pass,
            "n_predictions": n_predictions,
            "top_vote_count": top_vote_count,
            "avg_top1_cls": top3_names[0],
            "avg_top1_prob": top3_probs[0],
            "avg_top2_cls": top3_names[1],
            "avg_top2_prob": top3_probs[1],
            "avg_top3_cls": top3_names[2],
            "avg_top3_prob": top3_probs[2],
        }

    if n_hand_frames == 0:
        return build_result("(段内未检测到手部)")

    if not cls_labels:
        # The closing parenthesis balances the message, matching the
        # "no hand detected" message above.
        reason = (
            "(无满足条件的耗材帧:好帧置信度或未过门控"
            + (
                "" if haocai_min_conf <= 0.0
                else ",或耗材 top1 softmax 不大于阈值"
            )
            + ")"
        )
        return build_result(reason)

    smoothed = smooth_labels_majority(cls_labels, smooth_label_window)
    top_name, vote_n = Counter(smoothed).most_common(1)[0]
    a1, ap1 = mean_softmax_top3(cls_prob_rows, cls_names)
    return build_result(
        top_name,
        n_predictions=len(cls_labels),
        top_vote_count=int(vote_n),
        top3_names=(a1[0], a1[1], a1[2]),
        top3_probs=(f"{ap1[0]:.6f}", f"{ap1[1]:.6f}", f"{ap1[2]:.6f}"),
    )
def main() -> int:
    """Command-line entry point.

    Parses the options, checks that the segment file, the video and the three
    weight files exist, loads the models, runs ``process_segment`` for every
    parsed segment and writes one tab-separated row per segment to ``--out``.

    Returns:
        0 on success, 1 when an input is missing, the segment list is empty
        or the video cannot be opened.
    """
    results_dir = Path(__file__).resolve().parent / "results"

    parser = argparse.ArgumentParser(
        description="手检 + 逐帧 top1=good 且 top1conf>阈值门控 + 耗材分类;段内众数"
    )
    parser.add_argument(
        "--segments",
        type=Path,
        default=results_dir / "03视频_segments_mutual_exclusive_score_gt_0.1.txt",
    )
    parser.add_argument(
        "--video",
        type=Path,
        default=CODE_ROOT.parent
        / "data/haocai/5月6号视频/5月6日第二次视频/03视频.mp4",
    )
    parser.add_argument(
        "--hand-model",
        type=Path,
        default=CODE_ROOT
        / "hand_detection/runs/hand_det_y11s_multiframe-better/weights/best.pt",
    )
    parser.add_argument(
        "--goodbad-model",
        type=Path,
        default=CODE_ROOT
        / "goodORbad_frame/runs/goodbad_frame_y11m_e50/weights/best.pt",
    )
    parser.add_argument(
        "--haocai-model",
        type=Path,
        default=CODE_ROOT
        / "haocai_classify/runs/haocai_cls_41cls_goodframe_lastest-0.95"
        / "weights/best.pt",
    )
    parser.add_argument(
        "--out",
        type=Path,
        default=results_dir / "03视频_segments_consumables.txt",
    )
    parser.add_argument(
        "--good-top1-conf-threshold",
        type=float,
        default=0.90,
        dest="good_top1_conf_threshold",
        help="逐帧:仅当 top1 为 good 且 top1conf **严格大于**该值时才跑耗材分类(默认对应 top1conf>0.9",
    )
    parser.add_argument(
        "--haocai-min-conf",
        type=float,
        default=0.0,
        metavar="P",
        help="耗材:仅 softmax 最大值 **严格大于** P 的帧计入标签与 softmax 统计0 表示不按耗材置信度筛)",
    )
    parser.add_argument(
        "--smooth-label-window",
        type=int,
        default=1,
        metavar="W",
        help="耗材标签平滑:长度为 W 的奇数滑动窗口内多数票W≤1 不平滑);众数取平滑后的序列",
    )
    parser.add_argument("--det-conf", type=float, default=0.5)
    parser.add_argument("--pad-ratio", type=float, default=0.30)
    parser.add_argument("--imgsz-det", type=int, default=640)
    parser.add_argument("--imgsz-cls", type=int, default=224)
    parser.add_argument(
        "--frame-stride",
        type=int,
        default=1,
        help=">1 时代码逐帧解码但每 N 帧推理一次(省算力,结论可能略粗糙)",
    )
    parser.add_argument(
        "--seek-margin-sec",
        type=float,
        default=3.0,
        help="HEVC 等非关键帧 seek 时往回多跳若干秒再解码到段起点,减轻花屏",
    )
    opts = parser.parse_args()

    # Validate every input before the (slow) model loading.
    seg_path = opts.segments.resolve()
    vid_path = opts.video.resolve()
    if not seg_path.is_file():
        print("找不到时间段文件:", seg_path, file=sys.stderr)
        return 1
    if not vid_path.is_file():
        print("找不到视频:", vid_path, file=sys.stderr)
        return 1
    weight_files = (
        (opts.hand_model, "hand"),
        (opts.goodbad_model, "good/bad"),
        (opts.haocai_model, "haocai cls"),
    )
    for pt, lab in weight_files:
        if not Path(pt).is_file():
            print(f"缺少{lab} 权重:", pt, file=sys.stderr)
            return 1

    segments = parse_segments_txt(seg_path)
    if not segments:
        print("时间段为空:", seg_path, file=sys.stderr)
        return 1

    print("加载模型…", flush=True)
    det = YOLO(str(opts.hand_model))
    gb = YOLO(str(opts.goodbad_model))
    cls_m = YOLO(str(opts.haocai_model))
    gb_names = gb.names
    cls_names = cls_m.names

    cap = cv2.VideoCapture(str(vid_path))
    if not cap.isOpened():
        print("无法打开视频:", vid_path, file=sys.stderr)
        return 1

    sep = "\t"
    header = (
        "rank",
        "start_sec",
        "end_sec",
        "consumable",
        "n_hand_frames",
        "n_frames_top1_good_conf_gt_thresh",
        "n_consumable_predictions",
        "top_label_vote_count",
        "avg_softmax_top1_cls",
        "avg_softmax_top1_prob",
        "avg_softmax_top2_cls",
        "avg_softmax_top2_prob",
        "avg_softmax_top3_cls",
        "avg_softmax_top3_prob",
    )
    # Result keys in column order; the first group is passed through str(),
    # the second group is already text.
    stringified_keys = (
        "consumable",
        "n_hand_frames",
        "n_gate_pass",
        "n_predictions",
        "top_vote_count",
    )
    text_keys = (
        "avg_top1_cls",
        "avg_top1_prob",
        "avg_top2_cls",
        "avg_top2_prob",
        "avg_top3_cls",
        "avg_top3_prob",
    )
    out_lines = [sep.join(header)]
    try:
        for rank, seg_start, seg_end in segments:
            print(f"段落 rank={rank} [{seg_start:.3f},{seg_end:.3f}]s …", flush=True)
            info = process_segment(
                cap,
                det,
                gb,
                cls_m,
                start_sec=seg_start,
                end_sec=seg_end,
                seek_margin_sec=opts.seek_margin_sec,
                det_conf=opts.det_conf,
                pad_ratio=opts.pad_ratio,
                imgsz_det=opts.imgsz_det,
                imgsz_cls=opts.imgsz_cls,
                frame_stride=max(1, opts.frame_stride),
                good_top1_conf_threshold=opts.good_top1_conf_threshold,
                haocai_min_conf=opts.haocai_min_conf,
                smooth_label_window=max(1, opts.smooth_label_window),
                gb_names=gb_names,
                cls_names=cls_names,
            )
            cells = [str(rank), f"{seg_start:.6f}", f"{seg_end:.6f}"]
            cells.extend(str(info[key]) for key in stringified_keys)
            cells.extend(info[key] for key in text_keys)
            out_lines.append(sep.join(cells))
            print(
                f" -> {info['consumable']} "
                f"(votes {info['top_vote_count']}/{info['n_predictions']}, "
                f"goodgate {info['n_gate_pass']}/{info['n_hand_frames']} hand frames)",
                flush=True,
            )
    finally:
        cap.release()

    out_path = opts.out.resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("\n".join(out_lines) + "\n", encoding="utf-8")
    print("已写出:", out_path, flush=True)
    return 0
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())