Add dynamic ROI mode for shoe detection

This commit is contained in:
tian 2026-03-14 12:40:43 +08:00
parent e17f49c53c
commit 1f0d8e1e55
3 changed files with 381 additions and 53 deletions

View File

@ -0,0 +1,131 @@
{
"queue": {
"size": 8,
"strategy": "drop_oldest"
},
"graphs": [
{
"name": "person_shoe_two_stage",
"nodes": [
{
"id": "in",
"type": "input_rtsp",
"role": "source",
"enable": true,
"url": "rtsp://10.0.0.49:8554/cam",
"fps": 30,
"width": 1920,
"height": 1080,
"use_mpp": true,
"force_tcp": true
},
{
"id": "pre_rgb",
"type": "preprocess",
"role": "filter",
"enable": true,
"dst_w": 1920,
"dst_h": 1080,
"dst_format": "rgb",
"dst_packed": true,
"resize_mode": "stretch",
"rga_gate": "person_shoe_two_stage",
"use_rga": true
},
{
"id": "person_det",
"type": "ai_yolo",
"role": "filter",
"enable": true,
"infer_fps": 5,
"model_path": "./models/yolov8n-640.rknn",
"model_version": "v8",
"model_w": 640,
"model_h": 640,
"num_classes": 80,
"conf": 0.3,
"nms": 0.45,
"class_filter": [0]
},
{
"id": "person_trk",
"type": "tracker",
"role": "filter",
"enable": true,
"mode": "bytetrack_lite",
"per_class": true,
"track_classes": [0],
"max_age_ms": 1500,
"min_hits": 2
},
{
"id": "shoe_det",
"type": "ai_shoe_det",
"role": "filter",
"enable": true,
"infer_fps": 3,
"model_path": "./models/shoe_detector_openimages_ppe_v1.rknn",
"model_w": 640,
"model_h": 640,
"conf": 0.35,
"nms": 0.45,
"append_detections": true,
"dynamic_roi": {
"enable": true,
"person_class_id": 0,
"shoe_class_id": 1,
"max_rois": 6,
"min_person_height": 80,
"x_offset": -0.15,
"y_offset": 0.72,
"width_scale": 1.30,
"height_scale": 0.38
}
},
{
"id": "osd",
"type": "osd",
"role": "filter",
"enable": true,
"draw_bbox": true,
"draw_text": true,
"labels": ["person", "shoe"]
},
{
"id": "post",
"type": "preprocess",
"role": "filter",
"enable": true,
"dst_w": 1920,
"dst_h": 1080,
"dst_format": "nv12",
"resize_mode": "stretch",
"rga_gate": "person_shoe_two_stage",
"use_rga": true
},
{
"id": "pub",
"type": "publish",
"role": "filter",
"enable": true,
"codec": "h264",
"fps": 30,
"bitrate_kbps": 2000,
"use_mpp": true,
"outputs": [
{"proto": "rtsp_server", "port": 8555, "path": "/live/cam1"}
]
}
],
"edges": [
["in", "pre_rgb"],
["pre_rgb", "person_det"],
["person_det", "person_trk"],
["person_trk", "shoe_det"],
["shoe_det", "osd"],
["osd", "post"],
["post", "pub"]
]
}
]
}

View File

@ -5,9 +5,11 @@
## 特性
- **滑动窗口支持**:可配置多窗口覆盖全图,提高小鞋子检测精度
- **动态 ROI 支持**:可直接读取上一阶段的人框,只在脚部区域跑鞋模型
- **单类优化**:专门针对 shoe 单类检测优化
- **自动 NMS**:多窗口结果自动合并去重
- **轻量快速**:基于 RK3588 NPU 加速
- **低频运行**:支持 `infer_fps` / `infer_interval_ms` 控制推理频次
## 配置参数
@ -18,6 +20,7 @@
"model_path": "./models/shoe_detector.rknn",
"model_w": 640,
"model_h": 640,
"infer_fps": 5,
"conf": 0.25,
"nms": 0.45,
"windows": [
@ -34,9 +37,13 @@
| `model_path` | string | - | RKNN 模型路径 |
| `model_w` | int | 640 | 模型输入宽度 |
| `model_h` | int | 640 | 模型输入高度 |
| `infer_fps` | float | 0 | 推理帧率限制,0 表示每帧都跑 |
| `infer_interval_ms` | int | 0 | 推理间隔,优先级高于 `infer_fps` |
| `conf` | float | 0.25 | 置信度阈值 |
| `nms` | float | 0.45 | NMS IoU 阈值 |
| `append_detections` | bool | false | 是否保留前级检测结果并追加鞋框 |
| `windows` | array | - | 滑动窗口配置,不配置则使用全图单窗口 |
| `dynamic_roi` | object | - | 动态脚部 ROI 配置,启用后会根据 person 框生成检测窗口 |
### 窗口配置
@ -49,6 +56,49 @@
]
```
## 动态 ROI 配置
适合两阶段检测链路:先用 `ai_yolo` 检出 `person`,再由 `ai_shoe_det` 读取人框生成脚部 ROI。
```json
{
"id": "shoe_det",
"type": "ai_shoe_det",
"model_path": "./models/shoe_detector_openimages_ppe_v1.rknn",
"model_w": 640,
"model_h": 640,
"infer_fps": 3,
"conf": 0.35,
"nms": 0.45,
"append_detections": true,
"dynamic_roi": {
"enable": true,
"person_class_id": 0,
"shoe_class_id": 0,
"max_rois": 6,
"min_person_height": 80,
"x_offset": -0.15,
"y_offset": 0.72,
"width_scale": 1.30,
"height_scale": 0.38
}
}
```
### `dynamic_roi` 字段说明
| 字段 | 默认值 | 说明 |
|------|--------|------|
| `enable` | false | 是否启用基于人框的动态脚部 ROI |
| `person_class_id` | 0 | 前级 person 类别 ID |
| `shoe_class_id` | 0 | 追加到结果中的鞋类别 ID |
| `max_rois` | 8 | 每帧最多处理多少个人 |
| `min_person_height` | 0 | 忽略过小的人框,减少无效 ROI |
| `x_offset` | -0.15 | ROI 左上角相对人框左上角的横向偏移系数 |
| `y_offset` | 0.72 | ROI 左上角相对人框左上角的纵向偏移系数 |
| `width_scale` | 1.30 | ROI 宽度相对人框宽度的放大系数 |
| `height_scale` | 0.38 | ROI 高度相对人框高度的放大系数 |
## Pipeline 示例
```json
@ -80,6 +130,46 @@
}
```
## 两阶段示例
```json
{
"nodes": [
{"id": "pre", "type": "preprocess", "dst_w": 1920, "dst_h": 1080, "dst_format": "rgb"},
{
"id": "person",
"type": "ai_yolo",
"model_path": "./models/yolov8n-640.rknn",
"model_version": "v8",
"model_w": 640,
"model_h": 640,
"num_classes": 80,
"class_filter": [0],
"infer_fps": 5
},
{
"id": "shoe_det",
"type": "ai_shoe_det",
"model_path": "./models/shoe_detector_openimages_ppe_v1.rknn",
"model_w": 640,
"model_h": 640,
"infer_fps": 3,
"append_detections": true,
"dynamic_roi": {
"enable": true,
"person_class_id": 0,
"shoe_class_id": 1,
"max_rois": 6,
"x_offset": -0.15,
"y_offset": 0.72,
"width_scale": 1.30,
"height_scale": 0.38
}
}
]
}
```
## 编译
```bash
@ -92,4 +182,6 @@ make ai_shoe_det -j4
1. 模型必须是单类(shoe)的 YOLOv8 格式
2. 多窗口会增加 NPU 负载(2 窗口 = 2 倍推理时间)
3. 动态 ROI 模式依赖前级 `frame->det` 中已经存在 person 检测结果
4. 两阶段模式建议开启 `append_detections=true`,否则会覆盖前级的人框
5. 窗口之间有重叠时,NMS 会自动去重

View File

@ -8,6 +8,7 @@
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <vector>
#include "ai_scheduler.h"
@ -32,6 +33,18 @@ struct DetBox {
int class_id;
};
// Tunables for dynamic foot-ROI mode: when enabled, the node derives
// per-frame detection windows from upstream person boxes instead of the
// static `windows` list. Offsets/scales are relative to the person bbox.
struct DynamicRoiConfig {
bool enable = false; // master switch; false falls back to static windows
int person_class_id = 0; // class id of upstream person detections to read
int shoe_class_id = 0; // class id stamped onto emitted shoe boxes
int max_rois = 8; // max persons processed per frame (highest score first)
int min_person_height = 0; // skip person boxes shorter than this (pixels)
float x_offset = -0.15f; // ROI left = person.x + person.w * x_offset
float y_offset = 0.72f; // ROI top  = person.y + person.h * y_offset
float width_scale = 1.30f; // ROI width  = person.w * width_scale
float height_scale = 0.38f; // ROI height = person.h * height_scale
};
class AiShoeDetNode : public INode {
public:
std::string Id() const override { return id_; }
@ -39,14 +52,40 @@ public:
bool Init(const SimpleJson& config, const NodeContext& ctx) override {
id_ = config.ValueOr<std::string>("id", "shoe_det");
model_path_ = config.ValueOr<std::string>("model_path",
model_path_ = config.ValueOr<std::string>("model_path",
"./models/shoe_detector.rknn");
model_w_ = config.ValueOr<int>("model_w", 640);
model_h_ = config.ValueOr<int>("model_h", 640);
conf_thresh_ = config.ValueOr<float>("conf", 0.25f);
nms_thresh_ = config.ValueOr<float>("nms", 0.45f);
append_detections_ = config.ValueOr<bool>("append_detections", false);
infer_interval_ms_ = std::max<int64_t>(
0, static_cast<int64_t>(config.ValueOr<int>("infer_interval_ms", 0)));
if (infer_interval_ms_ <= 0) {
const double infer_fps = config.ValueOr<double>("infer_fps", 0.0);
if (infer_fps > 0.0) {
infer_interval_ms_ = static_cast<int64_t>(1000.0 / infer_fps);
if (infer_interval_ms_ < 1) infer_interval_ms_ = 1;
}
}
if (const SimpleJson* dyn = config.Find("dynamic_roi"); dyn && dyn->IsObject()) {
dynamic_roi_.enable = dyn->ValueOr<bool>("enable", false);
dynamic_roi_.person_class_id =
dyn->ValueOr<int>("person_class_id", dynamic_roi_.person_class_id);
dynamic_roi_.shoe_class_id =
dyn->ValueOr<int>("shoe_class_id", dynamic_roi_.shoe_class_id);
dynamic_roi_.max_rois = std::max(1, dyn->ValueOr<int>("max_rois", dynamic_roi_.max_rois));
dynamic_roi_.min_person_height =
std::max(0, dyn->ValueOr<int>("min_person_height", dynamic_roi_.min_person_height));
dynamic_roi_.x_offset = dyn->ValueOr<float>("x_offset", dynamic_roi_.x_offset);
dynamic_roi_.y_offset = dyn->ValueOr<float>("y_offset", dynamic_roi_.y_offset);
dynamic_roi_.width_scale = dyn->ValueOr<float>("width_scale", dynamic_roi_.width_scale);
dynamic_roi_.height_scale = dyn->ValueOr<float>("height_scale", dynamic_roi_.height_scale);
}
// 解析窗口配置
windows_.clear();
if (const SimpleJson* win_arr = config.Find("windows"); win_arr && win_arr->IsArray()) {
@ -61,25 +100,25 @@ public:
}
}
}
// 默认单窗口(全图)
if (windows_.empty()) {
if (!dynamic_roi_.enable && windows_.empty()) {
windows_.push_back({0, 0, 0, 0}); // 0表示全图
}
input_queue_ = ctx.input_queue;
output_queues_ = ctx.output_queues;
if (!input_queue_) {
LogError("[ai_shoe_det] no input queue");
return false;
}
infer_backend_ = ctx.infer_backend;
if (!infer_backend_) {
LogError("[ai_shoe_det] no infer backend");
return false;
}
#if defined(RK3588_ENABLE_RKNN)
std::string err;
model_handle_ = infer_backend_->LoadModel(model_path_, err);
@ -92,12 +131,15 @@ public:
#else
LogWarn("[ai_shoe_det] RKNN disabled");
#endif
return true;
}
// Logs the effective runtime configuration once at startup.
// Fix: the pre-refactor summary LogInfo line was left in above its detailed
// replacement, emitting two startup log lines; the stale one is removed.
bool Start() override {
LogInfo("[ai_shoe_det] start mode=" + std::string(dynamic_roi_.enable ? "dynamic_roi" : "windows") +
" windows=" + std::to_string(windows_.size()) +
" append=" + std::string(append_detections_ ? "true" : "false") +
" infer_interval_ms=" + std::to_string(infer_interval_ms_));
return true;
}
@ -114,6 +156,14 @@ public:
NodeStatus Process(FramePtr frame) override {
if (!frame) return NodeStatus::DROP;
#if defined(RK3588_ENABLE_RKNN)
if (infer_interval_ms_ > 0 && frame->pts > 0) {
const int64_t pts_ms = static_cast<int64_t>(frame->pts / 1000ULL);
if (last_infer_pts_ms_ > 0 && (pts_ms - last_infer_pts_ms_) < infer_interval_ms_) {
Push(frame);
return NodeStatus::OK;
}
last_infer_pts_ms_ = pts_ms;
}
RunDetection(frame);
#endif
Push(frame);
@ -129,29 +179,40 @@ private:
void RunDetection(FramePtr frame) {
if (!frame->data || frame->data_size == 0) return;
const int src_w = frame->width;
const int src_h = frame->height;
std::vector<DetBox> all_dets;
std::vector<DetWindow> active_windows = windows_;
if (dynamic_roi_.enable) {
active_windows = BuildDynamicWindows(frame, src_w, src_h);
if (active_windows.empty()) {
return;
}
}
// 对每个窗口进行检测
for (const auto& win : windows_) {
for (const auto& win : active_windows) {
auto dets = DetectWindow(frame, src_w, src_h, win);
all_dets.insert(all_dets.end(), dets.begin(), dets.end());
}
// NMS
all_dets = ApplyNMS(all_dets, nms_thresh_);
// 填充结果
if (!frame->det) {
frame->det = std::make_shared<DetectionResult>();
}
frame->det->items.clear();
if (!append_detections_) {
frame->det->items.clear();
}
frame->det->img_w = src_w;
frame->det->img_h = src_h;
for (const auto& d : all_dets) {
Detection item;
item.bbox = {d.x, d.y, d.w, d.h};
@ -160,10 +221,50 @@ private:
frame->det->items.push_back(item);
}
}
// Derives foot-region detection windows from upstream person detections.
// Keeps at most `max_rois` highest-score persons, projects each person box
// through the configured offset/scale factors, clamps the result to the
// frame, and drops degenerate (empty) regions.
std::vector<DetWindow> BuildDynamicWindows(FramePtr frame, int src_w, int src_h) const {
std::vector<DetWindow> out;
if (!frame->det) return out;
// Gather candidate person boxes that pass the class and size filters.
std::vector<Detection> candidates;
candidates.reserve(frame->det->items.size());
for (const auto& cand : frame->det->items) {
if (cand.cls_id != dynamic_roi_.person_class_id) continue;
if (cand.bbox.h < dynamic_roi_.min_person_height) continue;
candidates.push_back(cand);
}
if (candidates.empty()) return out;
// Highest-confidence persons first, so the ROI cap keeps the best ones.
std::sort(candidates.begin(), candidates.end(),
[](const Detection& lhs, const Detection& rhs) { return lhs.score > rhs.score; });
const size_t limit = std::min(
candidates.size(), static_cast<size_t>(std::max(1, dynamic_roi_.max_rois)));
out.reserve(limit);
for (size_t idx = 0; idx < limit; ++idx) {
const Rect& pb = candidates[idx].bbox;
// Project the person box into the (expected) foot region.
const float rx = pb.x + pb.w * dynamic_roi_.x_offset;
const float ry = pb.y + pb.h * dynamic_roi_.y_offset;
const float rw = pb.w * dynamic_roi_.width_scale;
const float rh = pb.h * dynamic_roi_.height_scale;
// Clamp to frame bounds; truncation toward zero matches int casts.
const int left = std::max(0, static_cast<int>(rx));
const int top = std::max(0, static_cast<int>(ry));
const int right = std::min(src_w, static_cast<int>(rx + rw));
const int bottom = std::min(src_h, static_cast<int>(ry + rh));
if (right <= left || bottom <= top) continue;
out.push_back({left, top, right - left, bottom - top});
}
return out;
}
std::vector<DetBox> DetectWindow(FramePtr frame, int src_w, int src_h, const DetWindow& win) {
std::vector<DetBox> dets;
// 确定裁剪区域
int win_x, win_y, win_w, win_h;
if (win.w == 0 || win.h == 0) {
@ -174,16 +275,16 @@ private:
win_w = std::min(win.w, src_w - win_x);
win_h = std::min(win.h, src_h - win_y);
}
if (win_w <= 0 || win_h <= 0) return dets;
// 获取源数据
const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data;
int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride
: (frame->stride > 0 ? frame->stride : src_w * 3);
if (!src || src_stride <= 0) return dets;
// 裁剪到临时缓冲区
std::vector<uint8_t> crop_buf(static_cast<size_t>(win_w) * win_h * 3);
for (int row = 0; row < win_h; ++row) {
@ -191,11 +292,11 @@ private:
uint8_t* dst_row = crop_buf.data() + row * win_w * 3;
memcpy(dst_row, src_row, static_cast<size_t>(win_w) * 3);
}
// Resize 到模型输入尺寸
ResizeRgbBilinear(crop_buf.data(), win_w, win_h, win_w * 3,
input_buf_.data(), model_w_, model_h_, model_w_ * 3);
// 推理
InferInput input;
input.width = model_w_;
@ -204,66 +305,66 @@ private:
input.data = input_buf_.data();
input.size = input_buf_.size();
input.type = RKNN_TENSOR_UINT8;
auto r = infer_backend_->InferBorrowed(model_handle_, input);
if (!r.success || r.outputs.empty() || !r.outputs[0].data) {
LogWarn("[ai_shoe_det] inference failed");
return dets;
}
// 解析输出
dets = ParseOutput(r.outputs, win_x, win_y, win_w, win_h);
return dets;
}
// Decodes the raw model output tensor into DetBox entries, mapping boxes
// from model input space back into source-image coordinates of the window
// at (win_x, win_y) with size win_w x win_h.
// Fix: removed the dead store `box.class_id = 0;` that was immediately
// overwritten by the configured shoe class id (leftover of the old code).
std::vector<DetBox> ParseOutput(const std::vector<AiScheduler::BorrowedOutput>& outputs,
int win_x, int win_y, int win_w, int win_h) {
std::vector<DetBox> dets;
if (outputs.empty() || !outputs[0].data) return dets;
// Assumed YOLOv8 single-class layout: [num_boxes, 5] = x, y, w, h, conf.
// NOTE(review): layout depends on the exported model — TODO confirm.
const float* data = reinterpret_cast<const float*>(outputs[0].data);
const int num_boxes = 8400; // YOLOv8 default anchor-free grid for 640x640
// Window-to-model scale, applied to map boxes back to window pixels.
const float scale_x = static_cast<float>(win_w) / model_w_;
const float scale_y = static_cast<float>(win_h) / model_h_;
for (int i = 0; i < num_boxes; ++i) {
const float conf = data[i * 5 + 4];
if (conf < conf_thresh_) continue; // reject before doing box math
DetBox box;
box.x = win_x + data[i * 5 + 0] * scale_x;
box.y = win_y + data[i * 5 + 1] * scale_y;
box.w = data[i * 5 + 2] * scale_x;
box.h = data[i * 5 + 3] * scale_y;
box.conf = conf;
// Stamp the configured shoe class so downstream OSD/trackers can
// distinguish appended shoe boxes from upstream person boxes.
box.class_id = dynamic_roi_.shoe_class_id;
dets.push_back(box);
}
return dets;
}
std::vector<DetBox> ApplyNMS(std::vector<DetBox>& dets, float thresh) {
if (dets.empty()) return dets;
std::sort(dets.begin(), dets.end(),
std::sort(dets.begin(), dets.end(),
[](const DetBox& a, const DetBox& b) { return a.conf > b.conf; });
std::vector<DetBox> keep;
std::vector<bool> suppressed(dets.size(), false);
for (size_t i = 0; i < dets.size(); ++i) {
if (suppressed[i]) continue;
keep.push_back(dets[i]);
for (size_t j = i + 1; j < dets.size(); ++j) {
if (suppressed[j]) continue;
float iou = ComputeIoU(dets[i], dets[j]);
@ -272,52 +373,52 @@ private:
}
}
}
return keep;
}
// Intersection-over-union of two axis-aligned boxes given as (x, y, w, h).
// Returns 0 when the union area is not positive.
float ComputeIoU(const DetBox& a, const DetBox& b) {
const float ix0 = std::max(a.x, b.x);
const float iy0 = std::max(a.y, b.y);
const float ix1 = std::min(a.x + a.w, b.x + b.w);
const float iy1 = std::min(a.y + a.h, b.y + b.h);
// Negative extents mean no overlap; clamp each axis to zero.
const float inter = std::max(0.0f, ix1 - ix0) * std::max(0.0f, iy1 - iy0);
const float uni = a.w * a.h + b.w * b.h - inter;
return uni > 0 ? inter / uni : 0;
}
// Bilinear resize of packed RGB888. Each destination pixel is mapped back to
// a fractional source coordinate and blended from the 2x2 surrounding texels.
// Strides are in bytes. NOTE(review): the tail of this function (closing of
// the row loop and function) is elided in this diff view; documented as-is.
void ResizeRgbBilinear(const uint8_t* src, int src_w, int src_h, int src_stride,
uint8_t* dst, int dst_w, int dst_h, int dst_stride) {
// Source-per-destination ratio; > 1 means downscaling.
float scale_x = static_cast<float>(src_w) / dst_w;
float scale_y = static_cast<float>(src_h) / dst_h;
for (int y = 0; y < dst_h; ++y) {
// Fractional source row: y0/y1 bracket it, dy is the vertical weight.
float fy = y * scale_y;
int y0 = static_cast<int>(fy);
int y1 = std::min(y0 + 1, src_h - 1); // clamp at the bottom edge
float dy = fy - y0;
for (int x = 0; x < dst_w; ++x) {
// Fractional source column: x0/x1 bracket it, dx is the weight.
float fx = x * scale_x;
int x0 = static_cast<int>(fx);
int x1 = std::min(x0 + 1, src_w - 1); // clamp at the right edge
float dx = fx - x0;
// Blend the 2x2 neighborhood per RGB channel (3 bytes per pixel).
for (int c = 0; c < 3; ++c) {
float v00 = src[y0 * src_stride + x0 * 3 + c];
float v01 = src[y0 * src_stride + x1 * 3 + c];
float v10 = src[y1 * src_stride + x0 * 3 + c];
float v11 = src[y1 * src_stride + x1 * 3 + c];
float v = v00 * (1-dx) * (1-dy) +
v01 * dx * (1-dy) +
v10 * (1-dx) * dy +
v11 * dx * dy;
dst[y * dst_stride + x * 3 + c] = static_cast<uint8_t>(v);
}
}
@ -332,6 +433,10 @@ private:
int model_h_ = 640;
float conf_thresh_ = 0.25f;
float nms_thresh_ = 0.45f;
bool append_detections_ = false;
int64_t infer_interval_ms_ = 0;
int64_t last_infer_pts_ms_ = 0;
DynamicRoiConfig dynamic_roi_;
std::vector<DetWindow> windows_;
std::vector<uint8_t> input_buf_;