From 36f6389d22fc457515c25a5ca4073584073e6b79 Mon Sep 17 00:00:00 2001 From: tian Date: Mon, 16 Mar 2026 12:10:01 +0800 Subject: [PATCH] Use real person boxes for ROI dataset generation --- 01_download_dataset.py | 191 ++++++++++++++++- 05_prepare_ppe_shoe_subset.py | 79 ++++++-- 09_build_roi_shoe_dataset.py | 227 ++++++++++++++++----- 13_preview_roi_samples.ps1 | 371 ++++++++++++++++++++++++++++++++++ README.md | 76 +++++-- 5 files changed, 862 insertions(+), 82 deletions(-) create mode 100644 13_preview_roi_samples.ps1 diff --git a/01_download_dataset.py b/01_download_dataset.py index 9cc91e0..059c263 100644 --- a/01_download_dataset.py +++ b/01_download_dataset.py @@ -19,6 +19,7 @@ Open Images 推荐类别: """ import argparse +import ast import os import random import shutil @@ -32,6 +33,8 @@ from pathlib import Path OPENIMAGES_RECOMMENDED_CLASSES = ["Footwear", "Boot"] OPENIMAGES_OPTIONAL_CLASSES = ["Sandal"] OPENIMAGES_NOT_RECOMMENDED_CLASSES = ["High heels", "Roller skates"] +OPENIMAGES_PERSON_CLASS = "Person" +ROI_SOURCE_DEFAULT_DIR = "datasets/openimages-person-shoes" def download_ultralytics_cppe(dataset_dir: str = "datasets/construction-ppe"): @@ -123,6 +126,32 @@ dataset_info: return yaml_path +def create_roi_source_yaml(dataset_dir: str, source_name: str, dataset_path_value: str = "."): + """创建 person+shoe ROI 源数据集配置。""" + yaml_content = f"""# 人体+鞋子 ROI 源数据集配置 + +path: {dataset_path_value} +train: images/train +val: images/val +test: images/test + +nc: 2 +names: ['person', 'shoe'] + +dataset_info: + name: {source_name} + task: detect_person_and_shoe_for_roi + note: 用真实 Person 框生成脚部 ROI,shoe 为 ROI 内检测目标 +""" + + yaml_path = os.path.join(dataset_dir, "data.yaml") + with open(yaml_path, "w", encoding="utf-8") as f: + f.write(yaml_content) + + print(f"\n✓ ROI 源配置文件创建: {yaml_path}") + return yaml_path + + def rewrite_yaml_for_existing_splits(dataset_dir: str, source_name: str): """根据现有目录结构重写 data.yaml。""" images_root = Path(dataset_dir) / "images" @@ -157,6 +186,68 
@@ dataset_info: return str(yaml_path) +def rewrite_roi_yaml_for_existing_splits(dataset_dir: str, source_name: str): + """根据现有目录结构重写 person+shoe ROI 源 data.yaml。""" + images_root = Path(dataset_dir) / "images" + split_names = [name for name in ("train", "val", "test") if (images_root / name).exists()] + + if not split_names: + raise RuntimeError(f"未找到任何图像 split: {images_root}") + + train_split = "train" if "train" in split_names else split_names[0] + val_split = "val" if "val" in split_names else train_split + test_line = f"test: images/{'test' if 'test' in split_names else val_split}" + + yaml_content = f"""# 人体+鞋子 ROI 源数据集配置 + +path: . +train: images/{train_split} +val: images/{val_split} +{test_line} + +nc: 2 +names: ['person', 'shoe'] + +dataset_info: + name: {source_name} + task: detect_person_and_shoe_for_roi + note: 用真实 Person 框生成脚部 ROI,shoe 为 ROI 内检测目标 +""" + + yaml_path = Path(dataset_dir) / "data.yaml" + yaml_path.write_text(yaml_content, encoding="utf-8") + print(f"\n✓ ROI 源配置文件更新: {yaml_path}") + return str(yaml_path) + + +def load_dataset_name_map(dataset_yaml: Path) -> dict[int, str]: + """读取 YOLO names,不依赖额外 yaml 库。""" + names: dict[int, str] = {} + lines = dataset_yaml.read_text(encoding="utf-8").splitlines() + for index, line in enumerate(lines): + stripped = line.strip() + if not stripped.startswith("names:"): + continue + + inline_value = stripped[len("names:") :].strip() + if inline_value: + value = ast.literal_eval(inline_value) + if isinstance(value, list): + return {idx: str(name) for idx, name in enumerate(value)} + + for child in lines[index + 1 :]: + if not child.startswith(" "): + break + child_stripped = child.strip() + if ":" not in child_stripped: + continue + key_text, value_text = child_stripped.split(":", 1) + if key_text.strip().isdigit(): + names[int(key_text.strip())] = value_text.strip().strip("'\"") + break + return names + + def ensure_openimages_train_val_split(export_dir: str, train_ratio: float = 0.9, seed: int = 42): 
"""如果导出结果只有单个 split,则自动切分为 train/val。""" images_root = Path(export_dir) / "images" @@ -236,8 +327,76 @@ def merge_openimages_to_single_class(export_dir: str): return True -def download_openimages(classes: list, max_samples: int, dataset_dir: str): - """通过 FiftyOne 下载 Open Images 并导出为单类 shoe 数据集。""" +def rewrite_openimages_to_roi_source(export_dir: str): + """将 Open Images 导出的标签改写为 person+shoe ROI 源数据。""" + labels_root = Path(export_dir) / "labels" + dataset_yaml = Path(export_dir) / "dataset.yaml" + if not labels_root.exists(): + print(f"✗ 未找到标签目录: {labels_root}") + return False + if not dataset_yaml.exists(): + print(f"✗ 未找到 dataset.yaml: {dataset_yaml}") + return False + + name_map = load_dataset_name_map(dataset_yaml) + person_ids = {idx for idx, name in name_map.items() if name.lower() == "person"} + shoe_ids = { + idx + for idx, name in name_map.items() + if name.lower() in {"footwear", "boot", "sandal", "high heels", "roller skates"} + } + + if not person_ids: + print("✗ 导出结果中未找到 Person 类") + return False + if not shoe_ids: + print("✗ 导出结果中未找到鞋类") + return False + + kept_files = 0 + total_person = 0 + total_shoe = 0 + + for label_file in labels_root.rglob("*.txt"): + lines = label_file.read_text(encoding="utf-8").splitlines() + rewritten: list[str] = [] + file_person = 0 + file_shoe = 0 + + for line in lines: + parts = line.strip().split() + if len(parts) < 5 or not parts[0].isdigit(): + continue + class_id = int(parts[0]) + if class_id in person_ids: + parts[0] = "0" + rewritten.append(" ".join(parts)) + total_person += 1 + file_person += 1 + elif class_id in shoe_ids: + parts[0] = "1" + rewritten.append(" ".join(parts)) + total_shoe += 1 + file_shoe += 1 + + if file_person > 0 and file_shoe > 0: + label_file.write_text("\n".join(rewritten) + "\n", encoding="utf-8") + kept_files += 1 + else: + label_file.write_text("", encoding="utf-8") + + ensure_openimages_train_val_split(export_dir) + rewrite_roi_yaml_for_existing_splits(export_dir, "Open Images V7 ROI 
Source") + + print("\nROI 源重写完成:") + print(f" 同时含 person + shoe 的标签文件: {kept_files}") + print(f" Person 标注框数: {total_person}") + print(f" Shoe 标注框数: {total_shoe}") + return True + + +def download_openimages(classes: list, max_samples: int, dataset_dir: str, mode: str): + """通过 FiftyOne 下载 Open Images 并导出为单类或 ROI 源数据集。""" try: import fiftyone as fo import fiftyone.zoo as foz @@ -247,11 +406,15 @@ def download_openimages(classes: list, max_samples: int, dataset_dir: str): return False export_dir = dataset_dir + "-yolo" + requested_classes = list(classes) + if mode == "roi-source" and OPENIMAGES_PERSON_CLASS not in requested_classes: + requested_classes = [OPENIMAGES_PERSON_CLASS] + requested_classes print("=" * 70) print("下载 Open Images V7 数据集") print("=" * 70) - print(f"类别: {classes}") + print(f"模式: {mode}") + print(f"类别: {requested_classes}") print(f"最大样本数: {max_samples}") print("原始缓存目录: FiftyOne 默认缓存目录") print(f"YOLO 导出目录: {export_dir}") @@ -266,7 +429,7 @@ def download_openimages(classes: list, max_samples: int, dataset_dir: str): "open-images-v7", split="train", label_types=["detections"], - classes=classes, + classes=requested_classes, max_samples=max_samples, ) @@ -277,8 +440,12 @@ def download_openimages(classes: list, max_samples: int, dataset_dir: str): label_field="ground_truth", ) - if not merge_openimages_to_single_class(export_dir): - return False + if mode == "roi-source": + if not rewrite_openimages_to_roi_source(export_dir): + return False + else: + if not merge_openimages_to_single_class(export_dir): + return False print(f"\n✓ 数据集保存: {export_dir}") return True @@ -352,6 +519,12 @@ def main(): default="datasets/openimages-shoes", help="数据集保存目录", ) + parser.add_argument( + "--mode", + choices=["single-class", "roi-source"], + default="single-class", + help="导出模式: single-class 用于单类训练, roi-source 保留 person + shoe 供 ROI 构建", + ) parser.add_argument( "--max-samples", type=int, @@ -376,6 +549,8 @@ def main(): print(f"推荐类别: 
{OPENIMAGES_RECOMMENDED_CLASSES}") print(f"可选补充: {OPENIMAGES_OPTIONAL_CLASSES}") print(f"默认不建议: {OPENIMAGES_NOT_RECOMMENDED_CLASSES}") + if args.mode == "roi-source": + print(f"将额外保留: {OPENIMAGES_PERSON_CLASS}") print() if args.source == "ultralytics": @@ -385,7 +560,9 @@ def main(): check_dataset(args.dir) elif args.source == "openimages": - success = download_openimages(args.classes, args.max_samples, args.dir) + if args.mode == "roi-source" and args.dir == parser.get_default("dir"): + args.dir = ROI_SOURCE_DEFAULT_DIR + success = download_openimages(args.classes, args.max_samples, args.dir, args.mode) final_dataset_dir = args.dir + "-yolo" if success: check_dataset(final_dataset_dir) diff --git a/05_prepare_ppe_shoe_subset.py b/05_prepare_ppe_shoe_subset.py index 24cf3c2..0d51d45 100644 --- a/05_prepare_ppe_shoe_subset.py +++ b/05_prepare_ppe_shoe_subset.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""从 Construction-PPE 中提取单类 shoe 子集。""" +"""从 Construction-PPE 中提取单类 shoe 或 person+shoe ROI 源数据。""" import argparse import shutil @@ -7,6 +7,7 @@ from pathlib import Path SHOE_CLASSES = {"3", "10"} # boots, no_boots +PERSON_CLASSES = {"6"} # Person def ensure_clean_dir(path: Path): @@ -15,7 +16,7 @@ def ensure_clean_dir(path: Path): path.mkdir(parents=True, exist_ok=True) -def write_yaml(output_dir: Path): +def write_shoe_yaml(output_dir: Path): yaml_path = output_dir / "data.yaml" abs_output = output_dir.resolve().as_posix() yaml_path.write_text( @@ -42,7 +43,34 @@ def write_yaml(output_dir: Path): ) -def convert_split(source_dir: Path, output_dir: Path, split: str): +def write_roi_source_yaml(output_dir: Path): + yaml_path = output_dir / "data.yaml" + abs_output = output_dir.resolve().as_posix() + yaml_path.write_text( + "\n".join( + [ + "# PPE 人体+鞋子 ROI 源数据配置", + "", + f"path: {abs_output}", + "train: images/train", + "val: images/val", + "test: images/test", + "", + "nc: 2", + "names: ['person', 'shoe']", + "", + "dataset_info:", + " name: Construction-PPE ROI 
source", + " source: Construction-PPE", + " note: 保留 Person 和鞋类,用真实人框生成脚部 ROI", + "", + ] + ), + encoding="utf-8", + ) + + +def convert_split(source_dir: Path, output_dir: Path, split: str, mode: str): image_src = source_dir / "images" / split label_src = source_dir / "labels" / split image_dst = output_dir / "images" / split @@ -56,16 +84,32 @@ def convert_split(source_dir: Path, output_dir: Path, split: str): for label_file in sorted(label_src.glob("*.txt")): lines = label_file.read_text(encoding="utf-8").splitlines() - shoe_lines = [] + out_lines = [] + file_person = 0 + file_shoe = 0 for line in lines: parts = line.strip().split() - if len(parts) < 5 or parts[0] not in SHOE_CLASSES: + if len(parts) < 5: continue - parts[0] = "0" - shoe_lines.append(" ".join(parts)) + class_id = parts[0] + if mode == "roi-source": + if class_id in PERSON_CLASSES: + parts[0] = "0" + out_lines.append(" ".join(parts)) + file_person += 1 + elif class_id in SHOE_CLASSES: + parts[0] = "1" + out_lines.append(" ".join(parts)) + file_shoe += 1 + elif class_id in SHOE_CLASSES: + parts[0] = "0" + out_lines.append(" ".join(parts)) + file_shoe += 1 - if not shoe_lines: + if mode == "roi-source" and (file_person == 0 or file_shoe == 0): + continue + if mode != "roi-source" and file_shoe == 0: continue image_file = image_src / f"{label_file.stem}.jpg" @@ -75,9 +119,9 @@ def convert_split(source_dir: Path, output_dir: Path, split: str): continue shutil.copy2(image_file, image_dst / image_file.name) - (label_dst / label_file.name).write_text("\n".join(shoe_lines) + "\n", encoding="utf-8") + (label_dst / label_file.name).write_text("\n".join(out_lines) + "\n", encoding="utf-8") kept_images += 1 - kept_boxes += len(shoe_lines) + kept_boxes += len(out_lines) return kept_images, kept_boxes @@ -94,22 +138,33 @@ def main(): default="datasets/ppe-shoes", help="输出目录", ) + parser.add_argument( + "--mode", + choices=["shoe-only", "roi-source"], + default="shoe-only", + help="shoe-only 输出单类 shoe; 
roi-source 保留 person + shoe 供 ROI 构建使用", + ) args = parser.parse_args() source_dir = Path(args.source) output_dir = Path(args.output) + if args.mode == "roi-source" and args.output == parser.get_default("output"): + output_dir = Path("datasets/ppe-person-shoes") ensure_clean_dir(output_dir) total_images = 0 total_boxes = 0 for split in ("train", "val", "test"): - kept_images, kept_boxes = convert_split(source_dir, output_dir, split) + kept_images, kept_boxes = convert_split(source_dir, output_dir, split, args.mode) total_images += kept_images total_boxes += kept_boxes print(f"[{split}] images={kept_images} boxes={kept_boxes}") - write_yaml(output_dir) + if args.mode == "roi-source": + write_roi_source_yaml(output_dir) + else: + write_shoe_yaml(output_dir) print(f"\n输出目录: {output_dir}") print(f"总图片数: {total_images}") print(f"总框数: {total_boxes}") diff --git a/09_build_roi_shoe_dataset.py b/09_build_roi_shoe_dataset.py index cd5bfaa..147a544 100644 --- a/09_build_roi_shoe_dataset.py +++ b/09_build_roi_shoe_dataset.py @@ -1,9 +1,22 @@ #!/usr/bin/env python3 -"""Build a foot-ROI shoe dataset from existing YOLO shoe datasets.""" +"""Build a foot-ROI shoe dataset from existing YOLO shoe datasets. + +Preferred training input should come from person-bottom ROIs, matching online inference: + + roi_x = x - 0.24w + roi_y = y + 0.64h + roi_w = 1.48w + roi_h = 0.58h + +When person boxes are available, this script uses them directly. +When only shoe boxes are available, it falls back to shoe-based ROI approximation +that still tries to match person-bottom input distribution rather than shoe closeups. 
+""" from __future__ import annotations import argparse +import ast import math import shutil from collections import defaultdict @@ -14,8 +27,9 @@ from PIL import Image DEFAULT_SOURCES = [ + "datasets/ppe-person-shoes", + "datasets/openimages-person-shoes-yolo", "datasets/openimages-shoes-yolo", - "datasets/ppe-shoes", ] IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".bmp", ".webp") @@ -24,8 +38,8 @@ PAIR_MAX_Y_GAP_FACTOR = 1.2 PAIR_MIN_AREA_RATIO = 0.4 PAIR_MAX_AREA_RATIO = 2.5 -SINGLE_AREA_RANGE = (0.15, 0.35) -PAIR_AREA_RANGE = (0.25, 0.50) +SINGLE_AREA_RANGE = (0.10, 0.26) +PAIR_AREA_RANGE = (0.18, 0.40) @dataclass(frozen=True) @@ -72,6 +86,14 @@ class RoiSample: mode: str +@dataclass(frozen=True) +class SourceSpec: + dataset_dir: Path + person_ids: set[int] + shoe_ids: set[int] + uses_person_boxes: bool + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Build a foot-context ROI shoe dataset") parser.add_argument( @@ -107,8 +129,71 @@ def find_image(image_dir: Path, stem: str) -> Path | None: return None -def load_boxes(label_path: Path, image_width: int, image_height: int) -> list[Box]: - boxes: list[Box] = [] +def parse_names_from_yaml(yaml_path: Path) -> dict[int, str]: + names: dict[int, str] = {} + lines = yaml_path.read_text(encoding="utf-8").splitlines() + for index, line in enumerate(lines): + stripped = line.strip() + if not stripped.startswith("names:"): + continue + inline = stripped[len("names:") :].strip() + if inline: + value = ast.literal_eval(inline) + if isinstance(value, list): + return {idx: str(name) for idx, name in enumerate(value)} + for child in lines[index + 1 :]: + if not child.startswith(" "): + break + child_stripped = child.strip() + if ":" not in child_stripped: + continue + key_text, value_text = child_stripped.split(":", 1) + if key_text.strip().isdigit(): + names[int(key_text.strip())] = value_text.strip().strip("'\"") + break + return names + + +def resolve_source_spec(source_dir: Path) -> 
SourceSpec: + names: dict[int, str] = {} + # Prefer data.yaml because ROI-source exports intentionally rewrite class ids. + for candidate in (source_dir / "data.yaml", source_dir / "dataset.yaml"): + if candidate.exists(): + names = parse_names_from_yaml(candidate) + if names: + break + + lowered = {idx: name.lower() for idx, name in names.items()} + person_ids = { + idx for idx, name in lowered.items() if name in {"person", "man", "woman", "boy", "girl"} + } + shoe_ids = { + idx + for idx, name in lowered.items() + if name in {"shoe", "footwear", "boot", "boots", "no_boots", "sandal", "high heels"} + } + + if not names: + shoe_ids = {0} + + if not shoe_ids: + raise RuntimeError(f"未能在 {source_dir} 识别鞋类标签") + + return SourceSpec( + dataset_dir=source_dir, + person_ids=person_ids, + shoe_ids=shoe_ids, + uses_person_boxes=bool(person_ids), + ) + + +def load_annotations( + label_path: Path, + image_width: int, + image_height: int, + allowed_ids: set[int], +) -> list[tuple[int, Box]]: + annotations: list[tuple[int, Box]] = [] for raw_line in label_path.read_text(encoding="utf-8").splitlines(): line = raw_line.strip() if not line: @@ -116,6 +201,9 @@ def load_boxes(label_path: Path, image_width: int, image_height: int) -> list[Bo parts = line.split() if len(parts) < 5: continue + class_id = int(parts[0]) + if class_id not in allowed_ids: + continue _, xc, yc, w, h = parts[:5] box_w = float(w) * image_width box_h = float(h) * image_height @@ -128,17 +216,20 @@ def load_boxes(label_path: Path, image_width: int, image_height: int) -> list[Bo center_y + box_h / 2.0, ).clip(image_width, image_height) if box is not None and box.area > 1.0: - boxes.append(box) - return dedupe_boxes(boxes) + annotations.append((class_id, box)) + return dedupe_annotations(annotations) -def dedupe_boxes(boxes: list[Box], iou_threshold: float = 0.9) -> list[Box]: - kept: list[Box] = [] - for box in sorted(boxes, key=lambda item: item.area, reverse=True): - if any(iou(box, existing) >= iou_threshold 
for existing in kept): +def dedupe_annotations( + annotations: list[tuple[int, Box]], + iou_threshold: float = 0.9, +) -> list[tuple[int, Box]]: + kept: list[tuple[int, Box]] = [] + for class_id, box in sorted(annotations, key=lambda item: item[1].area, reverse=True): + if any(class_id == kept_id and iou(box, existing) >= iou_threshold for kept_id, existing in kept): continue - kept.append(box) - return sorted(kept, key=lambda item: (item.cx, item.cy)) + kept.append((class_id, box)) + return sorted(kept, key=lambda item: (item[1].cx, item[1].cy)) def iou(a: Box, b: Box) -> float: @@ -202,16 +293,20 @@ def greedy_group_boxes(boxes: list[Box]) -> list[tuple[int, ...]]: return groups -def expand_single(box: Box) -> Box: - return Box( - box.x1 - (0.6 * box.w), - box.y1 - (0.5 * box.h), - box.x1 - (0.6 * box.w) + (2.2 * box.w), - box.y1 - (0.5 * box.h) + (2.4 * box.h), - ) +def estimate_person_from_single_shoe(box: Box) -> Box: + """Estimate a loose full-body box from a single shoe box.""" + person_w = max(4.2 * box.w, 2.8 * box.h) + person_h = max(7.6 * box.h, 3.4 * box.w) + person_cx = box.cx + # Put shoe close to the bottom of the estimated person, leaving small ground margin. 
+ person_y2 = box.y2 + (0.08 * person_h) + person_x1 = person_cx - (person_w / 2.0) + person_y1 = person_y2 - person_h + return Box(person_x1, person_y1, person_x1 + person_w, person_y2) -def expand_pair(boxes: list[Box], group: tuple[int, int]) -> Box: +def estimate_person_from_pair(boxes: list[Box], group: tuple[int, int]) -> Box: + """Estimate a loose full-body box from a visible pair of shoes.""" first = boxes[group[0]] second = boxes[group[1]] union_x1 = min(first.x1, second.x1) @@ -220,14 +315,28 @@ def expand_pair(boxes: list[Box], group: tuple[int, int]) -> Box: union_y2 = max(first.y2, second.y2) union_w = union_x2 - union_x1 union_h = union_y2 - union_y1 - roi_x = union_x1 - (0.35 * union_w) - roi_y = union_y1 - (0.45 * union_h) - return Box( - roi_x, - roi_y, - roi_x + (1.7 * union_w), - roi_y + (2.0 * union_h), - ) + person_w = max(1.95 * union_w, 2.6 * union_h) + person_h = max(7.8 * union_h, 2.9 * union_w) + person_cx = (union_x1 + union_x2) / 2.0 + person_y2 = union_y2 + (0.08 * person_h) + person_x1 = person_cx - (person_w / 2.0) + person_y1 = person_y2 - person_h + return Box(person_x1, person_y1, person_x1 + person_w, person_y2) + + +def roi_from_person_box(person_box: Box) -> Box: + """Apply the online person-bottom ROI rule and loosen it slightly.""" + roi_x = person_box.x1 - (0.24 * person_box.w) + roi_y = person_box.y1 + (0.64 * person_box.h) + roi_w = 1.48 * person_box.w + roi_h = 0.58 * person_box.h + + # Slightly enlarge to keep more trouser leg, ground, and side context than online. 
+ roi_x -= 0.08 * roi_w + roi_y -= 0.08 * roi_h + roi_w *= 1.16 + roi_h *= 1.18 + return Box(roi_x, roi_y, roi_x + roi_w, roi_y + roi_h) def clamp_roi(roi: Box, image_width: int, image_height: int) -> Box | None: @@ -302,16 +411,28 @@ def boxes_in_roi(boxes: list[Box], roi: Box) -> list[Box]: return included +def make_person_roi_samples(person_boxes: list[Box], image_width: int, image_height: int) -> list[RoiSample]: + samples: list[RoiSample] = [] + for person_idx, person_box in enumerate(person_boxes): + roi = roi_from_person_box(person_box) + roi = clamp_roi(roi, image_width, image_height) + if roi is not None: + samples.append(RoiSample(roi=roi, members=(person_idx,), mode="person")) + return samples + + def make_roi_samples(boxes: list[Box], image_width: int, image_height: int) -> list[RoiSample]: samples: list[RoiSample] = [] groups = greedy_group_boxes(boxes) for group in groups: if len(group) == 2: - roi = expand_pair(boxes, group) + person_box = estimate_person_from_pair(boxes, group) + roi = roi_from_person_box(person_box) area_range = PAIR_AREA_RANGE mode = "pair" else: - roi = expand_single(boxes[group[0]]) + person_box = estimate_person_from_single_shoe(boxes[group[0]]) + roi = roi_from_person_box(person_box) area_range = SINGLE_AREA_RANGE mode = "single" @@ -358,7 +479,7 @@ def write_yaml(output_dir: Path, sources: list[str]) -> None: " name: shoe-roi-mix", " task: detect_shoe_roi", f" source: {source_names}", - " note: cropped to foot-context ROIs to match online two-stage inference", + " note: prefer person-bottom ROIs; current public data uses shoe-box fallback crops", "", ] ), @@ -366,13 +487,14 @@ def write_yaml(output_dir: Path, sources: list[str]) -> None: ) -def build_split(source_dir: Path, output_dir: Path, split: str) -> dict[str, int]: +def build_split(source_spec: SourceSpec, output_dir: Path, split: str) -> dict[str, int]: + source_dir = source_spec.dataset_dir image_dir = source_dir / "images" / split label_dir = source_dir / "labels" / 
split if not image_dir.exists() or not label_dir.exists(): - return {"images": 0, "boxes": 0, "single": 0, "pair": 0} + return {"images": 0, "boxes": 0, "single": 0, "pair": 0, "person": 0} - stats = {"images": 0, "boxes": 0, "single": 0, "pair": 0} + stats = {"images": 0, "boxes": 0, "single": 0, "pair": 0, "person": 0} prefix = source_dir.name.replace("-", "_") for label_path in sorted(label_dir.glob("*.txt")): @@ -383,13 +505,27 @@ def build_split(source_dir: Path, output_dir: Path, split: str) -> dict[str, int with Image.open(image_path) as image: image = image.convert("RGB") width, height = image.size - boxes = load_boxes(label_path, width, height) - if not boxes: + annotations = load_annotations( + label_path, + width, + height, + source_spec.person_ids | source_spec.shoe_ids, + ) + if not annotations: continue - samples = make_roi_samples(boxes, width, height) + shoe_boxes = [box for class_id, box in annotations if class_id in source_spec.shoe_ids] + person_boxes = [box for class_id, box in annotations if class_id in source_spec.person_ids] + if not shoe_boxes: + continue + + if source_spec.uses_person_boxes and person_boxes: + samples = make_person_roi_samples(person_boxes, width, height) + else: + samples = make_roi_samples(shoe_boxes, width, height) + for sample_idx, sample in enumerate(samples): - roi_boxes = boxes_in_roi(boxes, sample.roi) + roi_boxes = boxes_in_roi(shoe_boxes, sample.roi) if not roi_boxes: continue @@ -420,18 +556,19 @@ def main() -> None: ensure_output_layout(output_dir) summary: dict[str, dict[str, dict[str, int]]] = defaultdict(dict) - totals = {"images": 0, "boxes": 0, "single": 0, "pair": 0} + totals = {"images": 0, "boxes": 0, "single": 0, "pair": 0, "person": 0} for source in args.sources: source_dir = Path(source) if not source_dir.exists(): raise FileNotFoundError(f"Source dataset not found: {source_dir}") + source_spec = resolve_source_spec(source_dir) for split in ("train", "val", "test"): - stats = build_split(source_dir, 
output_dir, split) + stats = build_split(source_spec, output_dir, split) summary[source_dir.name][split] = stats for key in totals: - totals[key] += stats[key] + totals[key] += stats.get(key, 0) write_yaml(output_dir, args.sources) @@ -439,10 +576,10 @@ def main() -> None: for source_name, split_map in summary.items(): print(f"[{source_name}]") for split in ("train", "val", "test"): - stats = split_map.get(split, {"images": 0, "boxes": 0, "single": 0, "pair": 0}) + stats = split_map.get(split, {"images": 0, "boxes": 0, "single": 0, "pair": 0, "person": 0}) print( f" {split}: rois={stats['images']} boxes={stats['boxes']} " - f"single={stats['single']} pair={stats['pair']}" + f"person={stats['person']} single={stats['single']} pair={stats['pair']}" ) print( diff --git a/13_preview_roi_samples.ps1 b/13_preview_roi_samples.ps1 new file mode 100644 index 0000000..fa37013 --- /dev/null +++ b/13_preview_roi_samples.ps1 @@ -0,0 +1,371 @@ +$ErrorActionPreference = "Stop" + +Add-Type -AssemblyName System.Drawing + +$repo = $PSScriptRoot +$outputDir = Join-Path $repo "samples\roi_preview" + +if (Test-Path $outputDir) { + Remove-Item $outputDir -Recurse -Force +} +New-Item -ItemType Directory -Path $outputDir | Out-Null + +$imageExts = @(".jpg", ".jpeg", ".png", ".bmp", ".webp") + +function New-Box { + param( + [double]$X1, + [double]$Y1, + [double]$X2, + [double]$Y2 + ) + [PSCustomObject]@{ + X1 = $X1 + Y1 = $Y1 + X2 = $X2 + Y2 = $Y2 + W = [Math]::Max(0.0, $X2 - $X1) + H = [Math]::Max(0.0, $Y2 - $Y1) + Cx = ($X1 + $X2) / 2.0 + Cy = ($Y1 + $Y2) / 2.0 + Area = [Math]::Max(0.0, $X2 - $X1) * [Math]::Max(0.0, $Y2 - $Y1) + } +} + +function Clip-Box { + param( + $Box, + [double]$Width, + [double]$Height + ) + $x1 = [Math]::Min([Math]::Max($Box.X1, 0.0), $Width) + $y1 = [Math]::Min([Math]::Max($Box.Y1, 0.0), $Height) + $x2 = [Math]::Min([Math]::Max($Box.X2, 0.0), $Width) + $y2 = [Math]::Min([Math]::Max($Box.Y2, 0.0), $Height) + if ($x2 -le $x1 -or $y2 -le $y1) { + return $null + } + 
return New-Box -X1 $x1 -Y1 $y1 -X2 $x2 -Y2 $y2 +} + +function Find-Image { + param( + [string]$ImageDir, + [string]$Stem + ) + foreach ($ext in $imageExts) { + $path = Join-Path $ImageDir ($Stem + $ext) + if (Test-Path $path) { + return $path + } + } + return $null +} + +function Load-Boxes { + param( + [string]$LabelPath, + [int]$ImageWidth, + [int]$ImageHeight + ) + $boxes = @() + foreach ($line in Get-Content $LabelPath) { + $text = $line.Trim() + if (-not $text) { + continue + } + $parts = $text -split "\s+" + if ($parts.Length -lt 5) { + continue + } + $xc = [double]$parts[1] * $ImageWidth + $yc = [double]$parts[2] * $ImageHeight + $w = [double]$parts[3] * $ImageWidth + $h = [double]$parts[4] * $ImageHeight + $box = New-Box -X1 ($xc - $w / 2.0) -Y1 ($yc - $h / 2.0) -X2 ($xc + $w / 2.0) -Y2 ($yc + $h / 2.0) + $clipped = Clip-Box -Box $box -Width $ImageWidth -Height $ImageHeight + if ($null -ne $clipped -and $clipped.Area -gt 1.0) { + $boxes += $clipped + } + } + return $boxes | Sort-Object Cx, Cy +} + +function Should-Pair { + param($Left, $Right) + $widthRef = [Math]::Max($Left.W, $Right.W) + $heightRef = [Math]::Max($Left.H, $Right.H) + if ($widthRef -le 0 -or $heightRef -le 0) { + return $false + } + $dx = [Math]::Abs($Left.Cx - $Right.Cx) + $dy = [Math]::Abs($Left.Cy - $Right.Cy) + $areaRatio = if ($Right.Area -gt 0) { $Left.Area / $Right.Area } else { [double]::PositiveInfinity } + return ( + $dx -le ($widthRef * 3.2) -and + $dy -le ($heightRef * 1.2) -and + $areaRatio -ge 0.4 -and + $areaRatio -le 2.5 + ) +} + +function Get-Groups { + param($Boxes) + $groups = @() + $used = @{} + $candidates = @() + for ($i = 0; $i -lt $Boxes.Count; $i++) { + for ($j = $i + 1; $j -lt $Boxes.Count; $j++) { + if (Should-Pair $Boxes[$i] $Boxes[$j]) { + $score = [Math]::Abs($Boxes[$i].Cx - $Boxes[$j].Cx) + 0.5 * [Math]::Abs($Boxes[$i].Cy - $Boxes[$j].Cy) + $candidates += [PSCustomObject]@{ Score = $score; I = $i; J = $j } + } + } + } + foreach ($candidate in ($candidates | 
Sort-Object Score)) { + if ($used.ContainsKey($candidate.I) -or $used.ContainsKey($candidate.J)) { + continue + } + $used[$candidate.I] = $true + $used[$candidate.J] = $true + $groups += ,@($candidate.I, $candidate.J) + } + for ($i = 0; $i -lt $Boxes.Count; $i++) { + if (-not $used.ContainsKey($i)) { + $groups += ,@($i) + } + } + return $groups +} + +function Estimate-PersonFromSingle { + param($Box) + $personW = [Math]::Max(4.2 * $Box.W, 2.8 * $Box.H) + $personH = [Math]::Max(7.6 * $Box.H, 3.4 * $Box.W) + $personCx = $Box.Cx + $personY2 = $Box.Y2 + 0.08 * $personH + $personX1 = $personCx - $personW / 2.0 + $personY1 = $personY2 - $personH + return New-Box -X1 $personX1 -Y1 $personY1 -X2 ($personX1 + $personW) -Y2 $personY2 +} + +function Estimate-PersonFromPair { + param($A, $B) + $ux1 = [Math]::Min($A.X1, $B.X1) + $uy1 = [Math]::Min($A.Y1, $B.Y1) + $ux2 = [Math]::Max($A.X2, $B.X2) + $uy2 = [Math]::Max($A.Y2, $B.Y2) + $uw = $ux2 - $ux1 + $uh = $uy2 - $uy1 + $personW = [Math]::Max(1.95 * $uw, 2.6 * $uh) + $personH = [Math]::Max(7.8 * $uh, 2.9 * $uw) + $personCx = ($ux1 + $ux2) / 2.0 + $personY2 = $uy2 + 0.08 * $personH + $personX1 = $personCx - $personW / 2.0 + $personY1 = $personY2 - $personH + return New-Box -X1 $personX1 -Y1 $personY1 -X2 ($personX1 + $personW) -Y2 $personY2 +} + +function Get-RoiFromPerson { + param($Person) + $roiX = $Person.X1 - 0.24 * $Person.W + $roiY = $Person.Y1 + 0.64 * $Person.H + $roiW = 1.48 * $Person.W + $roiH = 0.58 * $Person.H + $roiX = $roiX - 0.08 * $roiW + $roiY = $roiY - 0.08 * $roiH + $roiW = $roiW * 1.16 + $roiH = $roiH * 1.18 + return New-Box -X1 $roiX -Y1 $roiY -X2 ($roiX + $roiW) -Y2 ($roiY + $roiH) +} + +function Clamp-Roi { + param( + $Roi, + [int]$ImageWidth, + [int]$ImageHeight + ) + $clipped = Clip-Box -Box $Roi -Width $ImageWidth -Height $ImageHeight + if ($null -eq $clipped) { + return $null + } + $x1 = [Math]::Max(0, [Math]::Min([int][Math]::Floor($clipped.X1), $ImageWidth - 1)) + $y1 = [Math]::Max(0, 
[Math]::Min([int][Math]::Floor($clipped.Y1), $ImageHeight - 1)) + $x2 = [Math]::Max($x1 + 1, [Math]::Min([int][Math]::Ceiling($clipped.X2), $ImageWidth)) + $y2 = [Math]::Max($y1 + 1, [Math]::Min([int][Math]::Ceiling($clipped.Y2), $ImageHeight)) + return New-Box -X1 $x1 -Y1 $y1 -X2 $x2 -Y2 $y2 +} + +function Resize-RoiToTarget { + param( + $Roi, + [int]$ImageWidth, + [int]$ImageHeight, + [double]$ObjectArea, + [double]$MinRatio, + [double]$MaxRatio + ) + $adjusted = $Roi + $targetRatio = ($MinRatio + $MaxRatio) / 2.0 + for ($iter = 0; $iter -lt 3; $iter++) { + $ratio = if ($adjusted.Area -gt 0) { $ObjectArea / $adjusted.Area } else { 0.0 } + if ($ratio -ge $MinRatio -and $ratio -le $MaxRatio) { + return $adjusted + } + $scale = [Math]::Sqrt($ratio / $targetRatio) + if ($ratio -lt $MinRatio) { + $scale = [Math]::Max(0.6, [Math]::Min(0.95, $scale)) + } + else { + $scale = [Math]::Min(1.8, [Math]::Max(1.05, $scale)) + } + $newW = $adjusted.W * $scale + $newH = $adjusted.H * $scale + $cx = $adjusted.Cx + $cy = $adjusted.Cy + $adjusted = Clamp-Roi (New-Box -X1 ($cx - $newW / 2.0) -Y1 ($cy - $newH / 2.0) -X2 ($cx + $newW / 2.0) -Y2 ($cy + $newH / 2.0)) $ImageWidth $ImageHeight + if ($null -eq $adjusted) { + return $null + } + } + return $adjusted +} + +function Draw-Rect { + param( + [System.Drawing.Graphics]$Graphics, + [System.Drawing.Pen]$Pen, + $Box + ) + $Graphics.DrawRectangle($Pen, [float]$Box.X1, [float]$Box.Y1, [float]$Box.W, [float]$Box.H) +} + +$samples = @( + @{ Dataset = "datasets\openimages-shoes-yolo"; Split = "train"; Stem = "00015a7cf95ec19d"; Label = "openimages_single" }, + @{ Dataset = "datasets\openimages-shoes-yolo"; Split = "train"; Stem = "0036655159bdef7f"; Label = "openimages_pair" }, + @{ Dataset = "datasets\openimages-shoes-yolo"; Split = "train"; Stem = "00003223e04e2e66"; Label = "openimages_mixed" }, + @{ Dataset = "datasets\ppe-shoes"; Split = "train"; Stem = "image1001"; Label = "ppe_pair_a" }, + @{ Dataset = "datasets\ppe-shoes"; Split = 
"train"; Stem = "image1011"; Label = "ppe_pair_b" } +) + +$summary = @() + +foreach ($sample in $samples) { + $imageDir = Join-Path $repo (Join-Path $sample.Dataset ("images\" + $sample.Split)) + $labelPath = Join-Path $repo (Join-Path $sample.Dataset ("labels\" + $sample.Split + "\" + $sample.Stem + ".txt")) + $imagePath = Find-Image -ImageDir $imageDir -Stem $sample.Stem + if (-not $imagePath -or -not (Test-Path $labelPath)) { + continue + } + + $bitmap = [System.Drawing.Bitmap]::new($imagePath) + try { + $boxes = Load-Boxes -LabelPath $labelPath -ImageWidth $bitmap.Width -ImageHeight $bitmap.Height + if ($boxes.Count -eq 0) { + continue + } + + $groups = Get-Groups -Boxes $boxes + $canvas = [System.Drawing.Bitmap]::new($bitmap) + $graphics = [System.Drawing.Graphics]::FromImage($canvas) + $graphics.SmoothingMode = [System.Drawing.Drawing2D.SmoothingMode]::HighQuality + $shoePen = [System.Drawing.Pen]::new([System.Drawing.Color]::Lime, 3) + $roiPen = [System.Drawing.Pen]::new([System.Drawing.Color]::Red, 4) + $font = [System.Drawing.Font]::new("Arial", 18, [System.Drawing.FontStyle]::Bold) + $brush = [System.Drawing.SolidBrush]::new([System.Drawing.Color]::Yellow) + + try { + for ($i = 0; $i -lt $boxes.Count; $i++) { + Draw-Rect -Graphics $graphics -Pen $shoePen -Box $boxes[$i] + } + + $roiIndex = 0 + foreach ($group in $groups) { + if ($group.Count -eq 2) { + $person = Estimate-PersonFromPair $boxes[$group[0]] $boxes[$group[1]] + $roi = Get-RoiFromPerson $person + $minRatio = 0.18 + $maxRatio = 0.40 + $mode = "pair" + } + else { + $person = Estimate-PersonFromSingle $boxes[$group[0]] + $roi = Get-RoiFromPerson $person + $minRatio = 0.10 + $maxRatio = 0.26 + $mode = "single" + } + $roi = Clamp-Roi $roi $bitmap.Width $bitmap.Height + if ($null -eq $roi) { + continue + } + $objectArea = 0.0 + foreach ($idx in $group) { + $objectArea += $boxes[$idx].Area + } + $roi = Resize-RoiToTarget $roi $bitmap.Width $bitmap.Height $objectArea $minRatio $maxRatio + if ($null -eq 
$roi) { + continue + } + + Draw-Rect -Graphics $graphics -Pen $roiPen -Box $roi + $graphics.DrawString("ROI$roiIndex", $font, $brush, [float]($roi.X1 + 4), [float]([Math]::Max(0, $roi.Y1 - 28))) + + $cropRect = [System.Drawing.Rectangle]::new([int]$roi.X1, [int]$roi.Y1, [int]$roi.W, [int]$roi.H) + $crop = $bitmap.Clone($cropRect, $bitmap.PixelFormat) + $cropGraphics = [System.Drawing.Graphics]::FromImage($crop) + $cropPen = [System.Drawing.Pen]::new([System.Drawing.Color]::Lime, 3) + try { + foreach ($box in $boxes) { + if ($box.Cx -lt $roi.X1 -or $box.Cx -gt $roi.X2 -or $box.Cy -lt $roi.Y1 -or $box.Cy -gt $roi.Y2) { + continue + } + $local = New-Box -X1 ($box.X1 - $roi.X1) -Y1 ($box.Y1 - $roi.Y1) -X2 ($box.X2 - $roi.X1) -Y2 ($box.Y2 - $roi.Y1) + $local = Clip-Box -Box $local -Width $roi.W -Height $roi.H + if ($null -ne $local) { + Draw-Rect -Graphics $cropGraphics -Pen $cropPen -Box $local + } + } + } + finally { + $cropGraphics.Dispose() + $cropPen.Dispose() + } + + $cropPath = Join-Path $outputDir ($sample.Label + "_roi" + $roiIndex + ".jpg") + $crop.Save($cropPath, [System.Drawing.Imaging.ImageFormat]::Jpeg) + $crop.Dispose() + + $areaRatio = [Math]::Round($objectArea / $roi.Area, 3) + $summary += [PSCustomObject]@{ + Sample = $sample.Label + Roi = "roi$roiIndex" + Mode = $mode + Boxes = $group.Count + AreaRatio = $areaRatio + Crop = $cropPath + } + $roiIndex++ + } + + $overlayPath = Join-Path $outputDir ($sample.Label + "_overlay.jpg") + $canvas.Save($overlayPath, [System.Drawing.Imaging.ImageFormat]::Jpeg) + } + finally { + $graphics.Dispose() + $shoePen.Dispose() + $roiPen.Dispose() + $font.Dispose() + $brush.Dispose() + $canvas.Dispose() + } + } + finally { + $bitmap.Dispose() + } +} + +$summaryPath = Join-Path $outputDir "summary.txt" +$summary | Sort-Object Sample, Roi | Format-Table -AutoSize | Out-String | Set-Content -Path $summaryPath -Encoding UTF8 +Write-Output "Preview images written to: $outputDir" +Write-Output "Summary written to: $summaryPath" 
diff --git a/README.md b/README.md index f008ef9..a9ba194 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ 当前项目的主训练方向已经调整为: - 只训练 `yolov8s`、输入尺寸固定 `640x640` - 训练数据不再直接使用“整张场景图”或“鞋子纯特写图” -先根据鞋框裁出更接近线上输入分布的“脚部 ROI 图”,再训练鞋检测模型 +优先根据真实`人体框`裁出更接近线上输入分布的“脚部 ROI 图”,再训练鞋检测模型 这样做的原因是线上链路并不是直接在整张图上找鞋,而是: 1. 先从人体框生成脚部 ROI @@ -15,30 +15,70 @@ ### ROI 规则 -单鞋 ROI: -- 已知鞋框 `(x, y, w, h)` -- `roi_x = x - 0.6w` -- `roi_y = y - 0.5h` -- `roi_w = 2.2w` -- `roi_h = 2.4h` +优先原则: +- 最优方式:如果有`人体框`,直接按线上人体下部 ROI 规则裁图 +- 次优方式:如果只有`鞋框`,再按鞋框扩图,尽量模拟人体下部脚部 ROI 的视觉分布 -双鞋 ROI: -- 优先把两只鞋裁进同一张 ROI -- 先取两只鞋框并集,再扩框: -- `roi_x = union_x - 0.35 * union_w` -- `roi_y = union_y - 0.45 * union_h` -- `roi_w = 1.7 * union_w` -- `roi_h = 2.0 * union_h` +线上人体下部 ROI 规则: +- 已知人体框 `(x, y, w, h)` +- `roi_x = x - 0.24w` +- `roi_y = y + 0.64h` +- `roi_w = 1.48w` +- `roi_h = 0.58h` -裁图会自动裁剪到图像边界内。 +这条规则的目标是: +- 横向比人体略宽,尽量把双脚都包进去 +- 纵向覆盖人体下部到脚下地面 +- 让鞋模型看到的输入更接近真实线上两阶段链路 + +当前推荐的数据准备方式: + +1. PPE 数据保留 `Person + shoe` + +```bash +python 05_prepare_ppe_shoe_subset.py --mode roi-source +``` + +输出目录: +- `datasets/ppe-person-shoes` + +2. Open Images 重新下载,并保留 `Person + shoe` + +```bash +python 01_download_dataset.py --source openimages --mode roi-source --max-samples 8000 +``` + +默认输出目录: +- `datasets/openimages-person-shoes-yolo` + +3. 构建 ROI 化训练集 + +```bash +python 09_build_roi_shoe_dataset.py --clean +``` + +构建规则: +- 如果源数据里有真实 `Person` 框,直接按人体下部 ROI 规则裁图 +- 只有在没有人体框时,才退回鞋框扩图 fallback + +如果只有鞋框,没有人体框: +- 仍然可以做鞋框 fallback 扩图 +- 但目标不是做鞋子纯特写,而是尽量近似“人体下部脚部 ROI” +- 也就是保留一定裤脚、脚下地面和周围背景 +- 裁图会自动裁剪到图像边界内 + +训练目标总结: +- 不要做鞋子纯特写 +- 要做“脚部局部图” +- 让训练输入尽量贴近线上“人体下部脚部 ROI” ### 新主流程 -1. 准备原始单类鞋数据集 +1. 准备 ROI 源数据集 ```bash -python 01_download_dataset.py --source openimages --max-samples 5000 -python 05_prepare_ppe_shoe_subset.py +python 05_prepare_ppe_shoe_subset.py --mode roi-source +python 01_download_dataset.py --source openimages --mode roi-source --max-samples 8000 ``` 2. 构建 ROI 化训练集