From 1f0d8e1e55eb3e6cb2867b14cc53b211e9c8fc06 Mon Sep 17 00:00:00 2001 From: tian <11429339@qq.com> Date: Sat, 14 Mar 2026 12:40:43 +0800 Subject: [PATCH] Add dynamic ROI mode for shoe detection --- configs/sample_person_shoe_two_stage.json | 131 ++++++++++++++ plugins/ai_shoe_det/README.md | 94 +++++++++- plugins/ai_shoe_det/ai_shoe_det_node.cpp | 209 ++++++++++++++++------ 3 files changed, 381 insertions(+), 53 deletions(-) create mode 100644 configs/sample_person_shoe_two_stage.json diff --git a/configs/sample_person_shoe_two_stage.json b/configs/sample_person_shoe_two_stage.json new file mode 100644 index 0000000..286702b --- /dev/null +++ b/configs/sample_person_shoe_two_stage.json @@ -0,0 +1,131 @@ +{ + "queue": { + "size": 8, + "strategy": "drop_oldest" + }, + "graphs": [ + { + "name": "person_shoe_two_stage", + "nodes": [ + { + "id": "in", + "type": "input_rtsp", + "role": "source", + "enable": true, + "url": "rtsp://10.0.0.49:8554/cam", + "fps": 30, + "width": 1920, + "height": 1080, + "use_mpp": true, + "force_tcp": true + }, + { + "id": "pre_rgb", + "type": "preprocess", + "role": "filter", + "enable": true, + "dst_w": 1920, + "dst_h": 1080, + "dst_format": "rgb", + "dst_packed": true, + "resize_mode": "stretch", + "rga_gate": "person_shoe_two_stage", + "use_rga": true + }, + { + "id": "person_det", + "type": "ai_yolo", + "role": "filter", + "enable": true, + "infer_fps": 5, + "model_path": "./models/yolov8n-640.rknn", + "model_version": "v8", + "model_w": 640, + "model_h": 640, + "num_classes": 80, + "conf": 0.3, + "nms": 0.45, + "class_filter": [0] + }, + { + "id": "person_trk", + "type": "tracker", + "role": "filter", + "enable": true, + "mode": "bytetrack_lite", + "per_class": true, + "track_classes": [0], + "max_age_ms": 1500, + "min_hits": 2 + }, + { + "id": "shoe_det", + "type": "ai_shoe_det", + "role": "filter", + "enable": true, + "infer_fps": 3, + "model_path": "./models/shoe_detector_openimages_ppe_v1.rknn", + "model_w": 640, + "model_h": 640, 
+ "conf": 0.35, + "nms": 0.45, + "append_detections": true, + "dynamic_roi": { + "enable": true, + "person_class_id": 0, + "shoe_class_id": 1, + "max_rois": 6, + "min_person_height": 80, + "x_offset": -0.15, + "y_offset": 0.72, + "width_scale": 1.30, + "height_scale": 0.38 + } + }, + { + "id": "osd", + "type": "osd", + "role": "filter", + "enable": true, + "draw_bbox": true, + "draw_text": true, + "labels": ["person", "shoe"] + }, + { + "id": "post", + "type": "preprocess", + "role": "filter", + "enable": true, + "dst_w": 1920, + "dst_h": 1080, + "dst_format": "nv12", + "resize_mode": "stretch", + "rga_gate": "person_shoe_two_stage", + "use_rga": true + }, + { + "id": "pub", + "type": "publish", + "role": "filter", + "enable": true, + "codec": "h264", + "fps": 30, + "bitrate_kbps": 2000, + "use_mpp": true, + "outputs": [ + {"proto": "rtsp_server", "port": 8555, "path": "/live/cam1"} + ] + } + ], + "edges": [ + ["in", "pre_rgb"], + ["pre_rgb", "person_det"], + ["person_det", "person_trk"], + ["person_trk", "shoe_det"], + ["shoe_det", "osd"], + ["osd", "post"], + ["post", "pub"] + ] + } + ] +} diff --git a/plugins/ai_shoe_det/README.md b/plugins/ai_shoe_det/README.md index 03dce84..60ef4c8 100644 --- a/plugins/ai_shoe_det/README.md +++ b/plugins/ai_shoe_det/README.md @@ -5,9 +5,11 @@ ## 特性 - **滑动窗口支持**:可配置多窗口覆盖全图,提高小鞋子检测精度 +- **动态 ROI 支持**:可直接读取上一阶段的人框,只在脚部区域跑鞋模型 - **单类优化**:专门针对 shoe 单类检测优化 - **自动 NMS**:多窗口结果自动合并去重 - **轻量快速**:基于 RK3588 NPU 加速 +- **低频运行**:支持 `infer_fps` / `infer_interval_ms` 控制推理频次 ## 配置参数 @@ -18,6 +20,7 @@ "model_path": "./models/shoe_detector.rknn", "model_w": 640, "model_h": 640, + "infer_fps": 5, "conf": 0.25, "nms": 0.45, "windows": [ @@ -34,9 +37,13 @@ | `model_path` | string | - | RKNN 模型路径 | | `model_w` | int | 640 | 模型输入宽度 | | `model_h` | int | 640 | 模型输入高度 | +| `infer_fps` | float | 0 | 推理帧率限制,0 表示每帧都跑 | +| `infer_interval_ms` | int | 0 | 推理间隔,优先级高于 `infer_fps` | | `conf` | float | 0.25 | 置信度阈值 | | `nms` | float | 0.45 | NMS IoU 阈值 | +| 
`append_detections` | bool | false | 是否保留前级检测结果并追加鞋框 | | `windows` | array | - | 滑动窗口配置,不配置则使用全图单窗口 | +| `dynamic_roi` | object | - | 动态脚部 ROI 配置,启用后会根据 person 框生成检测窗口 | ### 窗口配置 @@ -49,6 +56,49 @@ ] ``` +## 动态 ROI 配置 + +适合两阶段检测链路:先用 `ai_yolo` 检 `person`,再由 `ai_shoe_det` 读取人框生成脚部 ROI。 + +```json +{ + "id": "shoe_det", + "type": "ai_shoe_det", + "model_path": "./models/shoe_detector_openimages_ppe_v1.rknn", + "model_w": 640, + "model_h": 640, + "infer_fps": 3, + "conf": 0.35, + "nms": 0.45, + "append_detections": true, + "dynamic_roi": { + "enable": true, + "person_class_id": 0, + "shoe_class_id": 1, + "max_rois": 6, + "min_person_height": 80, + "x_offset": -0.15, + "y_offset": 0.72, + "width_scale": 1.30, + "height_scale": 0.38 + } +} +``` + +### `dynamic_roi` 字段说明 + +| 字段 | 默认值 | 说明 | +|------|--------|------| +| `enable` | false | 是否启用基于人框的动态脚部 ROI | +| `person_class_id` | 0 | 前级 person 类别 ID | +| `shoe_class_id` | 0 | 追加到结果中的鞋类别 ID | +| `max_rois` | 8 | 每帧最多处理多少个人 | +| `min_person_height` | 0 | 忽略过小的人框,减少无效 ROI | +| `x_offset` | -0.15 | ROI 左上角相对人框左上角的横向偏移系数 | +| `y_offset` | 0.72 | ROI 左上角相对人框左上角的纵向偏移系数 | +| `width_scale` | 1.30 | ROI 宽度相对人框宽度的放大系数 | +| `height_scale` | 0.38 | ROI 高度相对人框高度的放大系数 | + ## Pipeline 示例 ```json @@ -80,6 +130,46 @@ } } ``` +## 两阶段示例 + +```json +{ + "nodes": [ + {"id": "pre", "type": "preprocess", "dst_w": 1920, "dst_h": 1080, "dst_format": "rgb"}, + { + "id": "person", + "type": "ai_yolo", + "model_path": "./models/yolov8n-640.rknn", + "model_version": "v8", + "model_w": 640, + "model_h": 640, + "num_classes": 80, + "class_filter": [0], + "infer_fps": 5 + }, + { + "id": "shoe_det", + "type": "ai_shoe_det", + "model_path": "./models/shoe_detector_openimages_ppe_v1.rknn", + "model_w": 640, + "model_h": 640, + "infer_fps": 3, + "append_detections": true, + "dynamic_roi": { + "enable": true, + "person_class_id": 0, + "shoe_class_id": 1, + "max_rois": 6, + "x_offset": -0.15, + "y_offset": 0.72, + "width_scale": 1.30, + "height_scale": 0.38 + 
} + } + ] +} +``` + ## 编译 ```bash @@ -92,4 +182,6 @@ make ai_shoe_det -j4 1. 模型必须是单类(shoe)YOLOv8 格式 2. 多窗口会增加 NPU 负载(2窗口 = 2倍推理时间) -3. 窗口之间有重叠时,NMS 会自动去重 +3. 动态 ROI 模式依赖前级 `frame->det` 中已经存在 person 检测结果 +4. 两阶段模式建议开启 `append_detections=true`,否则会覆盖前级的人框 +5. 窗口之间有重叠时,NMS 会自动去重 diff --git a/plugins/ai_shoe_det/ai_shoe_det_node.cpp b/plugins/ai_shoe_det/ai_shoe_det_node.cpp index 0bdbfd8..366501c 100644 --- a/plugins/ai_shoe_det/ai_shoe_det_node.cpp +++ b/plugins/ai_shoe_det/ai_shoe_det_node.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "ai_scheduler.h" @@ -32,6 +33,18 @@ struct DetBox { int class_id; }; +struct DynamicRoiConfig { + bool enable = false; + int person_class_id = 0; + int shoe_class_id = 0; + int max_rois = 8; + int min_person_height = 0; + float x_offset = -0.15f; + float y_offset = 0.72f; + float width_scale = 1.30f; + float height_scale = 0.38f; +}; + class AiShoeDetNode : public INode { public: std::string Id() const override { return id_; } @@ -39,14 +52,40 @@ public: bool Init(const SimpleJson& config, const NodeContext& ctx) override { id_ = config.ValueOr("id", "shoe_det"); - model_path_ = config.ValueOr("model_path", + model_path_ = config.ValueOr("model_path", "./models/shoe_detector.rknn"); - + model_w_ = config.ValueOr("model_w", 640); model_h_ = config.ValueOr("model_h", 640); conf_thresh_ = config.ValueOr("conf", 0.25f); nms_thresh_ = config.ValueOr("nms", 0.45f); - + append_detections_ = config.ValueOr("append_detections", false); + + infer_interval_ms_ = std::max( + 0, static_cast(config.ValueOr("infer_interval_ms", 0))); + if (infer_interval_ms_ <= 0) { + const double infer_fps = config.ValueOr("infer_fps", 0.0); + if (infer_fps > 0.0) { + infer_interval_ms_ = static_cast(1000.0 / infer_fps); + if (infer_interval_ms_ < 1) infer_interval_ms_ = 1; + } + } + + if (const SimpleJson* dyn = config.Find("dynamic_roi"); dyn && dyn->IsObject()) { + dynamic_roi_.enable = dyn->ValueOr("enable", false); + 
dynamic_roi_.person_class_id = + dyn->ValueOr("person_class_id", dynamic_roi_.person_class_id); + dynamic_roi_.shoe_class_id = + dyn->ValueOr("shoe_class_id", dynamic_roi_.shoe_class_id); + dynamic_roi_.max_rois = std::max(1, dyn->ValueOr("max_rois", dynamic_roi_.max_rois)); + dynamic_roi_.min_person_height = + std::max(0, dyn->ValueOr("min_person_height", dynamic_roi_.min_person_height)); + dynamic_roi_.x_offset = dyn->ValueOr("x_offset", dynamic_roi_.x_offset); + dynamic_roi_.y_offset = dyn->ValueOr("y_offset", dynamic_roi_.y_offset); + dynamic_roi_.width_scale = dyn->ValueOr("width_scale", dynamic_roi_.width_scale); + dynamic_roi_.height_scale = dyn->ValueOr("height_scale", dynamic_roi_.height_scale); + } + // 解析窗口配置 windows_.clear(); if (const SimpleJson* win_arr = config.Find("windows"); win_arr && win_arr->IsArray()) { @@ -61,25 +100,25 @@ public: } } } - + // 默认单窗口(全图) - if (windows_.empty()) { + if (!dynamic_roi_.enable && windows_.empty()) { windows_.push_back({0, 0, 0, 0}); // 0表示全图 } - + input_queue_ = ctx.input_queue; output_queues_ = ctx.output_queues; if (!input_queue_) { LogError("[ai_shoe_det] no input queue"); return false; } - + infer_backend_ = ctx.infer_backend; if (!infer_backend_) { LogError("[ai_shoe_det] no infer backend"); return false; } - + #if defined(RK3588_ENABLE_RKNN) std::string err; model_handle_ = infer_backend_->LoadModel(model_path_, err); @@ -92,12 +131,15 @@ public: #else LogWarn("[ai_shoe_det] RKNN disabled"); #endif - + return true; } bool Start() override { - LogInfo("[ai_shoe_det] start, windows=" + std::to_string(windows_.size())); + LogInfo("[ai_shoe_det] start mode=" + std::string(dynamic_roi_.enable ? "dynamic_roi" : "windows") + + " windows=" + std::to_string(windows_.size()) + + " append=" + std::string(append_detections_ ? 
"true" : "false") + + " infer_interval_ms=" + std::to_string(infer_interval_ms_)); return true; } @@ -114,6 +156,14 @@ public: NodeStatus Process(FramePtr frame) override { if (!frame) return NodeStatus::DROP; #if defined(RK3588_ENABLE_RKNN) + if (infer_interval_ms_ > 0 && frame->pts > 0) { + const int64_t pts_ms = static_cast(frame->pts / 1000ULL); + if (last_infer_pts_ms_ > 0 && (pts_ms - last_infer_pts_ms_) < infer_interval_ms_) { + Push(frame); + return NodeStatus::OK; + } + last_infer_pts_ms_ = pts_ms; + } RunDetection(frame); #endif Push(frame); @@ -129,29 +179,40 @@ private: void RunDetection(FramePtr frame) { if (!frame->data || frame->data_size == 0) return; - + const int src_w = frame->width; const int src_h = frame->height; - + std::vector all_dets; - + + std::vector active_windows = windows_; + if (dynamic_roi_.enable) { + active_windows = BuildDynamicWindows(frame, src_w, src_h); + if (active_windows.empty()) { + return; + } + } + // 对每个窗口进行检测 - for (const auto& win : windows_) { + for (const auto& win : active_windows) { auto dets = DetectWindow(frame, src_w, src_h, win); all_dets.insert(all_dets.end(), dets.begin(), dets.end()); } - + // NMS all_dets = ApplyNMS(all_dets, nms_thresh_); - + // 填充结果 if (!frame->det) { frame->det = std::make_shared(); } - frame->det->items.clear(); + + if (!append_detections_) { + frame->det->items.clear(); + } frame->det->img_w = src_w; frame->det->img_h = src_h; - + for (const auto& d : all_dets) { Detection item; item.bbox = {d.x, d.y, d.w, d.h}; @@ -160,10 +221,50 @@ private: frame->det->items.push_back(item); } } - + + std::vector BuildDynamicWindows(FramePtr frame, int src_w, int src_h) const { + std::vector rois; + if (!frame->det) return rois; + + std::vector persons; + persons.reserve(frame->det->items.size()); + for (const auto& det : frame->det->items) { + if (det.cls_id != dynamic_roi_.person_class_id) continue; + if (det.bbox.h < dynamic_roi_.min_person_height) continue; + persons.push_back(det); + } + + if 
(persons.empty()) return rois; + + std::sort(persons.begin(), persons.end(), + [](const Detection& a, const Detection& b) { return a.score > b.score; }); + + const size_t max_rois = std::min( + persons.size(), static_cast(std::max(1, dynamic_roi_.max_rois))); + rois.reserve(max_rois); + + for (size_t i = 0; i < max_rois; ++i) { + const Rect& bbox = persons[i].bbox; + const float roi_x = bbox.x + bbox.w * dynamic_roi_.x_offset; + const float roi_y = bbox.y + bbox.h * dynamic_roi_.y_offset; + const float roi_w = bbox.w * dynamic_roi_.width_scale; + const float roi_h = bbox.h * dynamic_roi_.height_scale; + + const int x0 = std::max(0, static_cast(roi_x)); + const int y0 = std::max(0, static_cast(roi_y)); + const int x1 = std::min(src_w, static_cast(roi_x + roi_w)); + const int y1 = std::min(src_h, static_cast(roi_y + roi_h)); + + if (x1 <= x0 || y1 <= y0) continue; + rois.push_back({x0, y0, x1 - x0, y1 - y0}); + } + + return rois; + } + std::vector DetectWindow(FramePtr frame, int src_w, int src_h, const DetWindow& win) { std::vector dets; - + // 确定裁剪区域 int win_x, win_y, win_w, win_h; if (win.w == 0 || win.h == 0) { @@ -174,16 +275,16 @@ private: win_w = std::min(win.w, src_w - win_x); win_h = std::min(win.h, src_h - win_y); } - + if (win_w <= 0 || win_h <= 0) return dets; - + // 获取源数据 const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data; int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride : (frame->stride > 0 ? 
frame->stride : src_w * 3); - + if (!src || src_stride <= 0) return dets; - + // 裁剪到临时缓冲区 std::vector crop_buf(static_cast(win_w) * win_h * 3); for (int row = 0; row < win_h; ++row) { @@ -191,11 +292,11 @@ private: uint8_t* dst_row = crop_buf.data() + row * win_w * 3; memcpy(dst_row, src_row, static_cast(win_w) * 3); } - + // Resize 到模型输入尺寸 ResizeRgbBilinear(crop_buf.data(), win_w, win_h, win_w * 3, input_buf_.data(), model_w_, model_h_, model_w_ * 3); - + // 推理 InferInput input; input.width = model_w_; @@ -204,66 +305,66 @@ private: input.data = input_buf_.data(); input.size = input_buf_.size(); input.type = RKNN_TENSOR_UINT8; - + auto r = infer_backend_->InferBorrowed(model_handle_, input); if (!r.success || r.outputs.empty() || !r.outputs[0].data) { LogWarn("[ai_shoe_det] inference failed"); return dets; } - + // 解析输出 dets = ParseOutput(r.outputs, win_x, win_y, win_w, win_h); return dets; } - + std::vector ParseOutput(const std::vector& outputs, int win_x, int win_y, int win_w, int win_h) { std::vector dets; if (outputs.empty() || !outputs[0].data) return dets; - + // 假设 YOLOv8 输出格式: [num_boxes, 5] = x, y, w, h, conf // 实际格式可能不同,需要根据模型调整 const float* data = reinterpret_cast(outputs[0].data); int num_boxes = 8400; // YOLOv8 默认 - + float scale_x = static_cast(win_w) / model_w_; float scale_y = static_cast(win_h) / model_h_; - + for (int i = 0; i < num_boxes; ++i) { float x = data[i * 5 + 0]; float y = data[i * 5 + 1]; float w = data[i * 5 + 2]; float h = data[i * 5 + 3]; float conf = data[i * 5 + 4]; - + if (conf < conf_thresh_) continue; - + DetBox box; box.x = win_x + x * scale_x; box.y = win_y + y * scale_y; box.w = w * scale_x; box.h = h * scale_y; box.conf = conf; - box.class_id = 0; + box.class_id = dynamic_roi_.shoe_class_id; dets.push_back(box); } - + return dets; } - + std::vector ApplyNMS(std::vector& dets, float thresh) { if (dets.empty()) return dets; - - std::sort(dets.begin(), dets.end(), + + std::sort(dets.begin(), dets.end(), [](const DetBox& a, 
const DetBox& b) { return a.conf > b.conf; }); - + std::vector keep; std::vector suppressed(dets.size(), false); - + for (size_t i = 0; i < dets.size(); ++i) { if (suppressed[i]) continue; keep.push_back(dets[i]); - + for (size_t j = i + 1; j < dets.size(); ++j) { if (suppressed[j]) continue; float iou = ComputeIoU(dets[i], dets[j]); @@ -272,52 +373,52 @@ private: } } } - + return keep; } - + float ComputeIoU(const DetBox& a, const DetBox& b) { float x1 = std::max(a.x, b.x); float y1 = std::max(a.y, b.y); float x2 = std::min(a.x + a.w, b.x + b.w); float y2 = std::min(a.y + a.h, b.y + b.h); - + float inter = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); float area_a = a.w * a.h; float area_b = b.w * b.h; float uni = area_a + area_b - inter; - + return uni > 0 ? inter / uni : 0; } - + void ResizeRgbBilinear(const uint8_t* src, int src_w, int src_h, int src_stride, uint8_t* dst, int dst_w, int dst_h, int dst_stride) { float scale_x = static_cast(src_w) / dst_w; float scale_y = static_cast(src_h) / dst_h; - + for (int y = 0; y < dst_h; ++y) { float fy = y * scale_y; int y0 = static_cast(fy); int y1 = std::min(y0 + 1, src_h - 1); float dy = fy - y0; - + for (int x = 0; x < dst_w; ++x) { float fx = x * scale_x; int x0 = static_cast(fx); int x1 = std::min(x0 + 1, src_w - 1); float dx = fx - x0; - + for (int c = 0; c < 3; ++c) { float v00 = src[y0 * src_stride + x0 * 3 + c]; float v01 = src[y0 * src_stride + x1 * 3 + c]; float v10 = src[y1 * src_stride + x0 * 3 + c]; float v11 = src[y1 * src_stride + x1 * 3 + c]; - + float v = v00 * (1-dx) * (1-dy) + v01 * dx * (1-dy) + v10 * (1-dx) * dy + v11 * dx * dy; - + dst[y * dst_stride + x * 3 + c] = static_cast(v); } } @@ -332,6 +433,10 @@ private: int model_h_ = 640; float conf_thresh_ = 0.25f; float nms_thresh_ = 0.45f; + bool append_detections_ = false; + int64_t infer_interval_ms_ = 0; + int64_t last_infer_pts_ms_ = 0; + DynamicRoiConfig dynamic_roi_; std::vector windows_; std::vector input_buf_;