// OrangePi3588Media/plugins/ai_shoe_det/ai_shoe_det_node.cpp

/**
 * ai_shoe_det - shoe detection node (supports sliding windows)
 *
 * Based on the ai_yolo implementation, with multi-window detection support added.
 */
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <vector>
#include "ai_scheduler.h"
#include "frame/frame.h"
#include "hw/i_image_processor.h"
#include "hw/i_infer_backend.h"
#include "node.h"
#include "utils/logger.h"
#include "utils/person_shoe_roi.h"
#if defined(RK3588_ENABLE_RKNN)
#include "rknn_api.h"
#endif
namespace rk3588 {
// A rectangular detection window in source-image pixel coordinates.
// A width/height of 0 means "whole frame" (see DetectWindow()).
struct DetWindow {
  int x, y, w, h;
};
// One raw detection in source-image coordinates.
struct DetBox {
  float x, y, w, h;  // top-left corner and size, in pixels
  float conf;        // confidence score
  int class_id;      // class index assigned to the detection
};
// Configuration for dynamic ROI mode, where detection windows are derived
// from upstream person detections instead of a fixed window list.
struct DynamicRoiConfig {
  bool enable = false;              // turn dynamic ROI mode on/off
  int person_class_id = 0;          // class id of upstream person detections
  int shoe_class_id = 0;            // class id assigned to emitted shoe boxes
  int debug_roi_class_id = -1;      // if >= 0, ROIs are also emitted as debug detections
  int max_rois = 8;                 // max ROIs per frame (round-robin over persons)
  int min_person_height = 0;        // skip persons whose bbox height is below this
  float max_box_area_ratio = 0.0f;  // if > 0, drop boxes exceeding this area ratio (model space)
  float y_offset = 0.72f;           // foot-region parameters forwarded to BuildPersonFootRegion
  float width_scale = 1.30f;
  float height_scale = 0.38f;
};
// How the four raw box values of the model output are interpreted
// (Auto picks the most plausible interpretation per box — see DecodeV8Box).
enum class V8BoxFormat { Auto, CxCyWh, XyXy, XyWh };
// Whether a sigmoid still needs to be applied to class scores
// (Auto decides by sampling raw values — see ResolveV8ApplySigmoid).
enum class V8ClsActivation { Auto, None, Sigmoid };
// Size in bytes of a single element of the given RKNN tensor type.
// Unlisted/unknown types fall back to 1 byte.
uint32_t TensorTypeSizeBytes(rknn_tensor_type t) {
  if (t == RKNN_TENSOR_FLOAT32) return 4;
  if (t == RKNN_TENSOR_FLOAT16) return 2;
  // INT8/UINT8 and anything unrecognized are treated as single-byte.
  return 1;
}
// Logistic function: maps any real value into (0, 1).
inline float Sigmoid(float x) {
  const float e = std::exp(-x);
  return 1.0f / (1.0f + e);
}
// Converts a raw IEEE 754 half-precision bit pattern to float.
// Zeros, subnormals, infinities and NaN are handled explicitly.
inline float Fp16ToFp32(uint16_t h) {
  const bool negative = (h & 0x8000) != 0;
  const int exponent = (h >> 10) & 0x1F;
  const int mantissa = h & 0x03FF;
  float magnitude;
  if (exponent == 0) {
    // Zero or subnormal: value = mantissa * 2^-24.
    if (mantissa == 0) return negative ? -0.0f : 0.0f;
    magnitude = std::ldexp(static_cast<float>(mantissa), -24);
  } else if (exponent == 0x1F) {
    // Infinity or NaN (NaN sign is not preserved, matching prior behavior).
    if (mantissa != 0) return std::numeric_limits<float>::quiet_NaN();
    magnitude = INFINITY;
  } else {
    // Normalized: the +1024 restores the implicit leading mantissa bit.
    magnitude = std::ldexp(static_cast<float>(mantissa + 1024), exponent - 25);
  }
  return negative ? -magnitude : magnitude;
}
// Clamps `val` to [min_val, max_val], then truncates to int32.
inline int32_t ClipFloat(float val, float min_val, float max_val) {
  float clamped = val;
  if (clamped <= min_val) {
    clamped = min_val;
  } else if (clamped >= max_val) {
    clamped = max_val;
  }
  return static_cast<int32_t>(clamped);
}
// Quantizes a float to int8 with an affine scheme: q = f / scale + zp,
// saturated to the int8 range [-128, 127] (truncating toward zero).
inline int8_t QuantizeF32ToAffine(float f32, int32_t zp, float scale) {
  const float q = f32 / scale + static_cast<float>(zp);
  if (q <= -128.0f) return static_cast<int8_t>(-128);
  if (q >= 127.0f) return static_cast<int8_t>(127);
  return static_cast<int8_t>(q);
}
// Inverse of QuantizeF32ToAffine: f = (q - zp) * scale.
inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) {
  const float centered = static_cast<float>(qnt) - static_cast<float>(zp);
  return centered * scale;
}
int DefaultV8NumBoxes(int model_h, int model_w) {
if (model_h <= 0 || model_w <= 0) return 0;
return (model_h / 8) * (model_w / 8) +
(model_h / 16) * (model_w / 16) +
(model_h / 32) * (model_w / 32);
}
// Resolved layout of the model's output tensor.
struct V8LayoutInfo {
  int num_boxes = 0;           // number of candidate boxes in the tensor
  bool channels_first = true;  // true: [channels, boxes]; false: [boxes, channels]
};
// Heuristic plausibility score for a decoded box: +3 for a positive size,
// plus one point for each coordinate that stays within (a 20%-padded copy
// of) the model input area. Higher means more plausible.
float ScoreBoxCandidate(float x, float y, float w, float h, int model_w, int model_h) {
  const float pad_w = model_w * 1.2f;
  const float pad_h = model_h * 1.2f;
  float score = (w > 0.0f && h > 0.0f) ? 3.0f : 0.0f;
  if (w <= pad_w) score += 1.0f;
  if (h <= pad_h) score += 1.0f;
  if (x >= -model_w * 0.1f) score += 1.0f;
  if (y >= -model_h * 0.1f) score += 1.0f;
  if ((x + w) <= pad_w) score += 1.0f;
  if ((y + h) <= pad_h) score += 1.0f;
  return score;
}
// True when all four values fall inside [-0.05, 2.5], i.e. they look like
// normalized (roughly [0, 1]) coordinates rather than pixel values.
bool SeemsNormalized(float a, float b, float c, float d) {
  const float values[4] = {a, b, c, d};
  for (float v : values) {
    if (v < -0.05f || v > 2.5f) return false;
  }
  return true;
}
// Decodes one raw YOLOv8 box (a, b, c, d) into a top-left x/y plus w/h in
// model pixels. Values that all look normalized are first scaled up by the
// model size. With fmt == Auto, all three interpretations are decoded and
// the one scoring highest under ScoreBoxCandidate wins.
void DecodeV8Box(float a, float b, float c, float d, int model_w, int model_h, V8BoxFormat fmt,
                 float& out_x, float& out_y, float& out_w, float& out_h) {
  if (SeemsNormalized(a, b, c, d)) {
    a *= static_cast<float>(model_w);
    b *= static_cast<float>(model_h);
    c *= static_cast<float>(model_w);
    d *= static_cast<float>(model_h);
  }
  struct Candidate { float x, y, w, h; };
  const Candidate as_cxcywh{a - c / 2.0f, b - d / 2.0f, c, d};
  const Candidate as_xyxy{a, b, c - a, d - b};
  const Candidate as_xywh{a, b, c, d};
  auto emit = [&](const Candidate& box) {
    out_x = box.x;
    out_y = box.y;
    out_w = box.w;
    out_h = box.h;
  };
  switch (fmt) {
    case V8BoxFormat::CxCyWh: emit(as_cxcywh); return;
    case V8BoxFormat::XyXy:   emit(as_xyxy);   return;
    case V8BoxFormat::XyWh:   emit(as_xywh);   return;
    case V8BoxFormat::Auto:   break;
  }
  // Auto: score each interpretation and take the most plausible one.
  // Ties favour xyxy, then xywh, then cxcywh (original precedence).
  const float s_cxcywh =
      ScoreBoxCandidate(as_cxcywh.x, as_cxcywh.y, as_cxcywh.w, as_cxcywh.h, model_w, model_h);
  const float s_xyxy =
      ScoreBoxCandidate(as_xyxy.x, as_xyxy.y, as_xyxy.w, as_xyxy.h, model_w, model_h);
  const float s_xywh =
      ScoreBoxCandidate(as_xywh.x, as_xywh.y, as_xywh.w, as_xywh.h, model_w, model_h);
  if (s_xyxy >= s_cxcywh && s_xyxy >= s_xywh) {
    emit(as_xyxy);
  } else if (s_xywh >= s_cxcywh && s_xywh >= s_xyxy) {
    emit(as_xywh);
  } else {
    emit(as_cxcywh);
  }
}
// Decides whether class scores still need a sigmoid. Explicit modes win;
// in Auto mode a sample of up to 64 boxes' raw scores is scanned and the
// sigmoid is applied iff any value strays outside the plausible
// probability range (raw logits do, probabilities do not).
bool ResolveV8ApplySigmoid(const float* output, int num_boxes, int num_classes, bool channels_first,
                           V8ClsActivation act_mode) {
  switch (act_mode) {
    case V8ClsActivation::None: return false;
    case V8ClsActivation::Sigmoid: return true;
    case V8ClsActivation::Auto: break;
  }
  if (output == nullptr || num_boxes <= 0 || num_classes <= 0) return false;
  const int stride = 4 + num_classes;  // per-box channel count (box + scores)
  const int sample = std::min(num_boxes, 64);
  float lo = 1e9f;
  float hi = -1e9f;
  for (int box = 0; box < sample; ++box) {
    for (int cls = 0; cls < num_classes; ++cls) {
      const float v = channels_first ? output[(4 + cls) * num_boxes + box]
                                     : output[box * stride + (4 + cls)];
      lo = std::min(lo, v);
      hi = std::max(hi, v);
    }
  }
  return lo < -0.1f || hi > 1.5f;
}
// Determines the candidate-box count and the memory layout (channels-first
// [C, N] vs boxes-first [N, C]) of a YOLOv8 output tensor from its reported
// dims and byte size, with progressively weaker fallbacks when the dims are
// unhelpful.
V8LayoutInfo ResolveV8Layout(const std::vector<uint32_t>& dims, size_t byte_size,
                             rknn_tensor_type type, int num_classes,
                             int model_h, int model_w) {
  V8LayoutInfo info;
  const int num_channels = 4 + num_classes;  // 4 box coords + class scores
  const uint32_t elem_bytes = TensorTypeSizeBytes(type);
  const size_t total_elems = elem_bytes > 0 ? (byte_size / elem_bytes) : 0;
  // Hard upper bound on the box count implied by the actual data size.
  const size_t max_boxes_from_data = static_cast<size_t>(num_channels) > 0
      ? (total_elems / static_cast<size_t>(num_channels))
      : 0;
  // Locate the dimension equal to the channel count.
  int ch_idx = -1;
  for (size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] == static_cast<uint32_t>(num_channels)) {
      ch_idx = static_cast<int>(i);
      break;
    }
  }
  if (ch_idx >= 0 && total_elems >= static_cast<size_t>(num_channels)) {
    info.num_boxes = static_cast<int>(max_boxes_from_data);
    // The nearest non-1 dims on either side of the channel axis indicate
    // whether the box axis comes before or after the channels.
    int prev_non1 = 1;
    for (int i = ch_idx - 1; i >= 0; --i) {
      if (dims[static_cast<size_t>(i)] > 1U) {
        prev_non1 = static_cast<int>(dims[static_cast<size_t>(i)]);
        break;
      }
    }
    int next_non1 = 1;
    for (size_t i = static_cast<size_t>(ch_idx + 1); i < dims.size(); ++i) {
      if (dims[i] > 1U) {
        next_non1 = static_cast<int>(dims[i]);
        break;
      }
    }
    if (next_non1 > 1 && prev_non1 <= 1) {
      info.channels_first = true;
    } else if (prev_non1 > 1 && next_non1 <= 1) {
      info.channels_first = false;
    } else if (next_non1 > 1 && prev_non1 > 1) {
      // Ambiguous: assume the larger neighbouring dim is the box axis.
      info.channels_first = next_non1 >= prev_non1;
    }
  }
  // Fallback chain: data-derived count, stride-derived count, then the
  // canonical YOLOv8 count for a 640x640 input.
  if (info.num_boxes <= 0 && max_boxes_from_data > 0) {
    info.num_boxes = static_cast<int>(max_boxes_from_data);
  }
  if (info.num_boxes <= 0) {
    info.num_boxes = DefaultV8NumBoxes(model_h, model_w);
  }
  if (info.num_boxes <= 0) {
    info.num_boxes = 8400;
  }
  // Never claim more boxes than the buffer actually holds.
  if (max_boxes_from_data > 0 && static_cast<size_t>(info.num_boxes) > max_boxes_from_data) {
    info.num_boxes = static_cast<int>(max_boxes_from_data);
  }
  return info;
}
// Shoe-detection node (single-class YOLOv8-style model). Runs detection
// either over a fixed list of windows or over dynamic ROIs derived from
// upstream person detections, then merges results into frame->det.
class AiShoeDetNode : public INode {
 public:
  std::string Id() const override { return id_; }
  std::string Type() const override { return "ai_shoe_det"; }
  // Parses node configuration (model, thresholds, windows, dynamic ROI,
  // inference rate limiting), wires up queues/backends from the context
  // and loads the model when RKNN support is compiled in.
  bool Init(const SimpleJson& config, const NodeContext& ctx) override {
    id_ = config.ValueOr<std::string>("id", "shoe_det");
    model_path_ = config.ValueOr<std::string>("model_path",
                                              "./models/shoe_detector.rknn");
    model_w_ = config.ValueOr<int>("model_w", 640);
    model_h_ = config.ValueOr<int>("model_h", 640);
    conf_thresh_ = config.ValueOr<float>("conf", 0.25f);
    nms_thresh_ = config.ValueOr<float>("nms", 0.45f);
    append_detections_ = config.ValueOr<bool>("append_detections", false);
    {
      // Box decode format; anything unrecognized falls back to Auto.
      const std::string bf = config.ValueOr<std::string>("v8_box_format", "auto");
      if (bf == "xyxy") {
        v8_box_format_ = V8BoxFormat::XyXy;
      } else if (bf == "xywh") {
        v8_box_format_ = V8BoxFormat::XyWh;
      } else if (bf == "cxcywh") {
        v8_box_format_ = V8BoxFormat::CxCyWh;
      } else {
        v8_box_format_ = V8BoxFormat::Auto;
      }
    }
    {
      // Class-score activation; anything unrecognized falls back to Auto.
      const std::string act = config.ValueOr<std::string>("v8_cls_activation", "auto");
      if (act == "sigmoid") {
        v8_cls_activation_ = V8ClsActivation::Sigmoid;
      } else if (act == "none") {
        v8_cls_activation_ = V8ClsActivation::None;
      } else {
        v8_cls_activation_ = V8ClsActivation::Auto;
      }
    }
    // Inference rate limit: an explicit interval wins; otherwise derive it
    // from "infer_fps" when given.
    infer_interval_ms_ = std::max<int64_t>(
        0, static_cast<int64_t>(config.ValueOr<int>("infer_interval_ms", 0)));
    if (infer_interval_ms_ <= 0) {
      const double infer_fps = config.ValueOr<double>("infer_fps", 0.0);
      if (infer_fps > 0.0) {
        infer_interval_ms_ = static_cast<int64_t>(1000.0 / infer_fps);
        if (infer_interval_ms_ < 1) infer_interval_ms_ = 1;
      }
    }
    // Phase offset shifts this node's inference slots in time; it is folded
    // into the interval so it never exceeds one period.
    infer_phase_ms_ = std::max<int64_t>(
        0, static_cast<int64_t>(config.ValueOr<int>("infer_phase_ms", 0)));
    if (infer_interval_ms_ > 0 && infer_phase_ms_ >= infer_interval_ms_) {
      infer_phase_ms_ %= infer_interval_ms_;
    }
    use_dma_input_ = config.ValueOr<bool>("use_dma_input", use_dma_input_);
    if (const SimpleJson* dyn = config.Find("dynamic_roi"); dyn && dyn->IsObject()) {
      dynamic_roi_.enable = dyn->ValueOr<bool>("enable", false);
      dynamic_roi_.person_class_id =
          dyn->ValueOr<int>("person_class_id", dynamic_roi_.person_class_id);
      dynamic_roi_.shoe_class_id =
          dyn->ValueOr<int>("shoe_class_id", dynamic_roi_.shoe_class_id);
      dynamic_roi_.debug_roi_class_id =
          dyn->ValueOr<int>("debug_roi_class_id", dynamic_roi_.debug_roi_class_id);
      dynamic_roi_.max_rois = std::max(1, dyn->ValueOr<int>("max_rois", dynamic_roi_.max_rois));
      dynamic_roi_.min_person_height =
          std::max(0, dyn->ValueOr<int>("min_person_height", dynamic_roi_.min_person_height));
      dynamic_roi_.max_box_area_ratio =
          std::max(0.0f, dyn->ValueOr<float>("max_box_area_ratio", dynamic_roi_.max_box_area_ratio));
      dynamic_roi_.y_offset = dyn->ValueOr<float>("y_offset", dynamic_roi_.y_offset);
      dynamic_roi_.width_scale = dyn->ValueOr<float>("width_scale", dynamic_roi_.width_scale);
      dynamic_roi_.height_scale = dyn->ValueOr<float>("height_scale", dynamic_roi_.height_scale);
    }
    // Parse the window configuration.
    windows_.clear();
    if (const SimpleJson* win_arr = config.Find("windows"); win_arr && win_arr->IsArray()) {
      for (const auto& w : win_arr->AsArray()) {
        if (w.IsObject()) {
          DetWindow win;
          win.x = w.ValueOr<int>("x", 0);
          win.y = w.ValueOr<int>("y", 0);
          win.w = w.ValueOr<int>("w", 640);
          win.h = w.ValueOr<int>("h", 640);
          windows_.push_back(win);
        }
      }
    }
    // Default to a single window (whole frame).
    if (!dynamic_roi_.enable && windows_.empty()) {
      windows_.push_back({0, 0, 0, 0});  // 0 means whole frame
    }
    input_queue_ = ctx.input_queue;
    output_queues_ = ctx.output_queues;
    if (!input_queue_) {
      LogError("[ai_shoe_det] no input queue");
      return false;
    }
    // NOTE(review): infer_backend_ is declared only under RK3588_ENABLE_RKNN
    // (see the member list) but is assigned and checked here unconditionally;
    // a build without RKNN likely fails to compile — confirm guard coverage.
    infer_backend_ = ctx.infer_backend;
    if (!infer_backend_) {
      LogError("[ai_shoe_det] no infer backend");
      return false;
    }
    image_processor_ = ctx.image_processor;
#if defined(RK3588_ENABLE_RKNN)
    std::string err;
    model_handle_ = infer_backend_->LoadModel(model_path_, err);
    if (model_handle_ == kInvalidModelHandle) {
      LogError("[ai_shoe_det] failed to load model: " + err);
      return false;
    }
    input_buf_.resize(model_w_ * model_h_ * 3);  // packed RGB staging buffer
    LogInfo("[ai_shoe_det] model loaded: " + model_path_);
#else
    LogWarn("[ai_shoe_det] RKNN disabled");
#endif
    return true;
  }
  bool Start() override {
    LogInfo("[ai_shoe_det] start mode=" + std::string(dynamic_roi_.enable ? "dynamic_roi" : "windows") +
            " windows=" + std::to_string(windows_.size()) +
            " append=" + std::string(append_detections_ ? "true" : "false") +
            " infer_interval_ms=" + std::to_string(infer_interval_ms_));
    return true;
  }
  void Stop() override {
#if defined(RK3588_ENABLE_RKNN)
    if (model_handle_ != kInvalidModelHandle) {
      infer_backend_->UnloadModel(model_handle_);
      model_handle_ = kInvalidModelHandle;
    }
#endif
    LogInfo("[ai_shoe_det] stop");
  }
  // Per-frame entry point: optionally rate-limits inference by frame PTS,
  // runs detection, then forwards the frame downstream unconditionally.
  NodeStatus Process(FramePtr frame) override {
    if (!frame) return NodeStatus::DROP;
#if defined(RK3588_ENABLE_RKNN)
    if (infer_interval_ms_ > 0 && frame->pts > 0) {
      // pts / 1000 — presumably microseconds to milliseconds; confirm units.
      const int64_t pts_ms = static_cast<int64_t>(frame->pts / 1000ULL);
      const int64_t effective_pts_ms = pts_ms + infer_phase_ms_;
      const int64_t delta_ms = effective_pts_ms - last_infer_pts_ms_;
      if (last_infer_pts_ms_ > 0 && delta_ms > 0 && delta_ms < infer_interval_ms_) {
        // Still within the minimum interval: skip inference, pass frame on.
        Push(frame);
        return NodeStatus::OK;
      }
      last_infer_pts_ms_ = effective_pts_ms;
    }
    RunDetection(frame);
#endif
    Push(frame);
    return NodeStatus::OK;
  }
 private:
  // Fans the frame out to every output queue.
  void Push(FramePtr frame) {
    for (auto& q : output_queues_) q->Push(frame);
  }
#if defined(RK3588_ENABLE_RKNN)
  // One full detection pass: build the active window list, detect per
  // window, merge via NMS and write the results into frame->det.
  void RunDetection(FramePtr frame) {
    if (!frame->data || frame->data_size == 0) return;
    const int src_w = frame->width;
    const int src_h = frame->height;
    std::vector<DetBox> all_dets;
    std::vector<DetWindow> active_windows = windows_;
    if (dynamic_roi_.enable) {
      active_windows = BuildDynamicWindows(frame, src_w, src_h);
      if (active_windows.empty()) {
        // No person-derived ROIs this frame; frame->det is left untouched.
        return;
      }
    }
    // Run detection on each window.
    for (const auto& win : active_windows) {
      auto dets = DetectWindow(frame, src_w, src_h, win);
      all_dets.insert(all_dets.end(), dets.begin(), dets.end());
    }
    // NMS across all windows (overlapping windows can duplicate boxes).
    all_dets = ApplyNMS(all_dets, nms_thresh_);
    // Fill in the result.
    if (!frame->det) {
      frame->det = std::make_shared<DetectionResult>();
    }
    if (!append_detections_) {
      frame->det->items.clear();
    }
    frame->det->img_w = src_w;
    frame->det->img_h = src_h;
    if (dynamic_roi_.enable && dynamic_roi_.debug_roi_class_id >= 0) {
      AddDebugRoiDetections(frame, active_windows);
    }
    for (const auto& d : all_dets) {
      Detection item;
      item.bbox = {d.x, d.y, d.w, d.h};
      item.score = d.conf;
      item.cls_id = d.class_id;
      frame->det->items.push_back(item);
    }
  }
  // Builds foot-region ROIs from upstream person detections. Persons are
  // sorted by score and consumed round-robin across frames (state kept in
  // roi_round_robin_offset_) so that every person eventually gets a turn
  // when there are more persons than max_rois.
  std::vector<DetWindow> BuildDynamicWindows(FramePtr frame, int src_w, int src_h) {
    std::vector<DetWindow> rois;
    if (!frame->det) return rois;
    std::vector<Detection> persons;
    persons.reserve(frame->det->items.size());
    for (const auto& det : frame->det->items) {
      if (det.cls_id != dynamic_roi_.person_class_id) continue;
      if (det.bbox.h < dynamic_roi_.min_person_height) continue;  // too small
      persons.push_back(det);
    }
    if (persons.empty()) return rois;
    std::sort(persons.begin(), persons.end(),
              [](const Detection& a, const Detection& b) { return a.score > b.score; });
    const size_t max_rois = std::min(
        persons.size(), static_cast<size_t>(std::max(1, dynamic_roi_.max_rois)));
    rois.reserve(max_rois);
    const size_t start_index = persons.empty() ? 0 : (roi_round_robin_offset_ % persons.size());
    for (size_t i = 0; i < max_rois; ++i) {
      const Detection& person = persons[(start_index + i) % persons.size()];
      const Rect& bbox = person.bbox;
      const Rect roi = BuildPersonFootRegion(bbox,
                                             src_w,
                                             src_h,
                                             dynamic_roi_.y_offset,
                                             dynamic_roi_.width_scale,
                                             dynamic_roi_.height_scale);
      // Clip the ROI to the frame and drop degenerate rectangles.
      const int x0 = std::max(0, static_cast<int>(roi.x));
      const int y0 = std::max(0, static_cast<int>(roi.y));
      const int x1 = std::min(src_w, static_cast<int>(roi.x + roi.w));
      const int y1 = std::min(src_h, static_cast<int>(roi.y + roi.h));
      if (x1 <= x0 || y1 <= y0) continue;
      rois.push_back({x0, y0, x1 - x0, y1 - y0});
    }
    if (!persons.empty()) {
      roi_round_robin_offset_ = (start_index + max_rois) % persons.size();
    }
    return rois;
  }
  // Emits each ROI as a pseudo-detection carrying debug_roi_class_id so the
  // ROIs can be visualized downstream.
  void AddDebugRoiDetections(FramePtr frame, const std::vector<DetWindow>& rois) const {
    if (!frame || !frame->det) return;
    for (const auto& roi : rois) {
      Detection item;
      item.cls_id = dynamic_roi_.debug_roi_class_id;
      item.score = 1.0f;
      item.track_id = -1;
      item.bbox = {
          static_cast<float>(roi.x),
          static_cast<float>(roi.y),
          static_cast<float>(roi.w),
          static_cast<float>(roi.h)
      };
      frame->det->items.push_back(item);
    }
  }
  // Detects shoes inside one window: crops by pointer arithmetic, resizes
  // to the model input size (hardware path via the image processor when
  // available, CPU bilinear fallback otherwise), runs inference and parses
  // the output back into source-image coordinates.
  std::vector<DetBox> DetectWindow(FramePtr frame, int src_w, int src_h, const DetWindow& win) {
    std::vector<DetBox> dets;
    // Determine the crop region.
    int win_x, win_y, win_w, win_h;
    if (win.w == 0 || win.h == 0) {
      win_x = 0; win_y = 0; win_w = src_w; win_h = src_h;  // whole frame
    } else {
      win_x = std::max(0, std::min(win.x, src_w - 1));
      win_y = std::max(0, std::min(win.y, src_h - 1));
      win_w = std::min(win.w, src_w - win_x);
      win_h = std::min(win.h, src_h - win_y);
    }
    if (win_w <= 0 || win_h <= 0) return dets;
    // Fetch source pixel data (the * 3 arithmetic assumes packed RGB).
    const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data;
    int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride
                                                 : (frame->stride > 0 ? frame->stride : src_w * 3);
    if (!src || src_stride <= 0) return dets;
    const uint8_t* roi_src = src + static_cast<size_t>(win_y) * static_cast<size_t>(src_stride) +
                             static_cast<size_t>(win_x) * 3;
    const uint8_t* model_input_data = nullptr;
    size_t model_input_size = 0;
    int model_input_dma_fd = -1;
    Frame resized_frame;
    if (image_processor_ && win_w > 0 && win_h > 0) {
      // Describe the crop as a borrowed single-plane frame (no copy).
      Frame crop_frame;
      crop_frame.width = win_w;
      crop_frame.height = win_h;
      crop_frame.format = PixelFormat::RGB;
      crop_frame.stride = src_stride;
      crop_frame.data = const_cast<uint8_t*>(roi_src);
      crop_frame.data_size = frame->data_size;
      crop_frame.plane_count = 1;
      crop_frame.planes[0] = {crop_frame.data, crop_frame.stride, static_cast<int>(frame->data_size), 0};
      resized_frame.width = model_w_;
      resized_frame.height = model_h_;
      resized_frame.format = PixelFormat::RGB;
      Status st = image_processor_->Resize(crop_frame, resized_frame);
      if (st.Ok()) {
        const uint8_t* resized_data = resized_frame.planes[0].data ? resized_frame.planes[0].data : resized_frame.data;
        const int resized_stride = resized_frame.planes[0].stride > 0
                                       ? resized_frame.planes[0].stride
                                       : (resized_frame.stride > 0 ? resized_frame.stride : model_w_ * 3);
        const size_t packed_size = static_cast<size_t>(model_w_) * static_cast<size_t>(model_h_) * 3;
        if (use_dma_input_ && resized_frame.DmaFd() >= 0 && resized_data) {
          // Zero-copy: hand the DMA buffer straight to the inference backend.
          model_input_data = resized_data;
          model_input_size = resized_frame.data_size > 0 ? resized_frame.data_size : packed_size;
          model_input_dma_fd = resized_frame.DmaFd();
        } else if (resized_data && resized_stride > 0) {
          // Repack row by row to strip any stride padding.
          if (packed_size > input_buf_.size()) input_buf_.resize(packed_size);
          for (int row = 0; row < model_h_; ++row) {
            memcpy(input_buf_.data() + static_cast<size_t>(row) * static_cast<size_t>(model_w_) * 3,
                   resized_data + static_cast<size_t>(row) * static_cast<size_t>(resized_stride),
                   static_cast<size_t>(model_w_) * 3);
          }
          model_input_data = input_buf_.data();
          model_input_size = packed_size;
        }
      }
    }
    if (!model_input_data) {
      // CPU fallback: no image processor, or the hardware resize failed.
      ResizeRgbBilinear(roi_src, win_w, win_h, src_stride,
                        input_buf_.data(), model_w_, model_h_, model_w_ * 3);
      model_input_data = input_buf_.data();
      model_input_size = input_buf_.size();
    }
    // Inference.
    InferInput input;
    input.width = model_w_;
    input.height = model_h_;
    input.is_nhwc = true;
    input.data = model_input_data;
    input.size = model_input_size;
    input.dma_fd = model_input_dma_fd;
    input.type = RKNN_TENSOR_UINT8;
    auto r = infer_backend_->InferBorrowed(model_handle_, input);
    if (!r.success || r.outputs.empty() || !r.outputs[0].data) {
      LogWarn("[ai_shoe_det] inference failed");
      return dets;
    }
    // Parse output.
    dets = ParseOutput(r.outputs, win_x, win_y, win_w, win_h);
    return dets;
  }
  // Decodes the model's raw output tensor (fp32/fp16/int8 supported) into
  // DetBoxes mapped back to source-image coordinates via the window offset
  // and the window/model scale factors.
  std::vector<DetBox> ParseOutput(const std::vector<AiScheduler::BorrowedOutput>& outputs,
                                  int win_x, int win_y, int win_w, int win_h) {
    std::vector<DetBox> dets;
    if (outputs.empty() || !outputs[0].data) return dets;
    constexpr int kNumClasses = 1;  // single-class (shoe) model
    const V8LayoutInfo layout = ResolveV8Layout(
        outputs[0].dims, outputs[0].size, outputs[0].type, kNumClasses, model_h_, model_w_);
    const int num_boxes = layout.num_boxes;
    if (num_boxes <= 0) return dets;
    // Model space -> window space scale factors.
    float scale_x = static_cast<float>(win_w) / model_w_;
    float scale_y = static_cast<float>(win_h) / model_h_;
    const int num_channels = 4 + kNumClasses;
    // Decodes one raw box, filters implausible ones, maps to source coords.
    auto append_box = [&](float a, float b, float c, float d, float conf) {
      if (!std::isfinite(a) || !std::isfinite(b) || !std::isfinite(c) || !std::isfinite(d)) return;
      float x = 0.0f, y = 0.0f, w = 0.0f, h = 0.0f;
      DecodeV8Box(a, b, c, d, model_w_, model_h_, v8_box_format_, x, y, w, h);
      if (!std::isfinite(x) || !std::isfinite(y) || !std::isfinite(w) || !std::isfinite(h)) return;
      if (w <= 1e-3f || h <= 1e-3f) return;  // degenerate box
      if (w > model_w_ * 1.2f || h > model_h_ * 1.2f) return;  // implausibly large
      if (dynamic_roi_.enable && dynamic_roi_.max_box_area_ratio > 0.0f && win_w > 0 && win_h > 0) {
        // Area filter, evaluated in model space.
        const float mapped_area_ratio =
            (w * h) / (static_cast<float>(model_w_) * static_cast<float>(model_h_));
        if (mapped_area_ratio > dynamic_roi_.max_box_area_ratio) return;
      }
      DetBox box;
      box.x = win_x + x * scale_x;
      box.y = win_y + y * scale_y;
      box.w = w * scale_x;
      box.h = h * scale_y;
      box.conf = conf;
      box.class_id = dynamic_roi_.shoe_class_id;
      dets.push_back(box);
    };
    if (outputs[0].type == RKNN_TENSOR_FLOAT32) {
      const float* data = reinterpret_cast<const float*>(outputs[0].data);
      const bool apply_sigmoid = ResolveV8ApplySigmoid(
          data, num_boxes, kNumClasses, layout.channels_first, v8_cls_activation_);
      for (int i = 0; i < num_boxes; ++i) {
        const float score = layout.channels_first ? data[4 * num_boxes + i]
                                                  : data[i * num_channels + 4];
        const float conf = apply_sigmoid ? Sigmoid(score) : score;
        if (conf < conf_thresh_) continue;
        const float a = layout.channels_first ? data[0 * num_boxes + i] : data[i * num_channels + 0];
        const float b = layout.channels_first ? data[1 * num_boxes + i] : data[i * num_channels + 1];
        const float c = layout.channels_first ? data[2 * num_boxes + i] : data[i * num_channels + 2];
        const float d = layout.channels_first ? data[3 * num_boxes + i] : data[i * num_channels + 3];
        append_box(a, b, c, d, conf);
      }
    } else if (outputs[0].type == RKNN_TENSOR_FLOAT16) {
      // fp16: bulk-convert the whole tensor to fp32 first.
      const uint16_t* data = reinterpret_cast<const uint16_t*>(outputs[0].data);
      fp32_decode_buf_.resize(outputs[0].size / sizeof(uint16_t));
      for (size_t i = 0; i < fp32_decode_buf_.size(); ++i) {
        fp32_decode_buf_[i] = Fp16ToFp32(data[i]);
      }
      const bool apply_sigmoid = ResolveV8ApplySigmoid(
          fp32_decode_buf_.data(), num_boxes, kNumClasses, layout.channels_first, v8_cls_activation_);
      for (int i = 0; i < num_boxes; ++i) {
        const float score = layout.channels_first ? fp32_decode_buf_[4 * num_boxes + i]
                                                  : fp32_decode_buf_[i * num_channels + 4];
        const float conf = apply_sigmoid ? Sigmoid(score) : score;
        if (conf < conf_thresh_) continue;
        const float a = layout.channels_first ? fp32_decode_buf_[0 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 0];
        const float b = layout.channels_first ? fp32_decode_buf_[1 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 1];
        const float c = layout.channels_first ? fp32_decode_buf_[2 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 2];
        const float d = layout.channels_first ? fp32_decode_buf_[3 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 3];
        append_box(a, b, c, d, conf);
      }
    } else if (outputs[0].type == RKNN_TENSOR_INT8) {
      const int8_t* data = reinterpret_cast<const int8_t*>(outputs[0].data);
      // Threshold in quantized space, so most boxes skip dequantization.
      const int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh_, outputs[0].zp, outputs[0].scale);
      for (int i = 0; i < num_boxes; ++i) {
        const int8_t score_i8 = layout.channels_first ? data[4 * num_boxes + i]
                                                      : data[i * num_channels + 4];
        if (score_i8 < thresh_i8) continue;
        const float conf = DequantizeAffineToF32(score_i8, outputs[0].zp, outputs[0].scale);
        const float a = DequantizeAffineToF32(layout.channels_first ? data[0 * num_boxes + i] : data[i * num_channels + 0], outputs[0].zp, outputs[0].scale);
        const float b = DequantizeAffineToF32(layout.channels_first ? data[1 * num_boxes + i] : data[i * num_channels + 1], outputs[0].zp, outputs[0].scale);
        const float c = DequantizeAffineToF32(layout.channels_first ? data[2 * num_boxes + i] : data[i * num_channels + 2], outputs[0].zp, outputs[0].scale);
        const float d = DequantizeAffineToF32(layout.channels_first ? data[3 * num_boxes + i] : data[i * num_channels + 3], outputs[0].zp, outputs[0].scale);
        append_box(a, b, c, d, conf);
      }
    } else {
      LogWarn("[ai_shoe_det] unsupported output tensor type");
    }
    return dets;
  }
  // Greedy IoU-based non-maximum suppression. Sorts `dets` in place by
  // descending confidence, then keeps each box that is not suppressed by a
  // higher-confidence overlapping one.
  std::vector<DetBox> ApplyNMS(std::vector<DetBox>& dets, float thresh) {
    if (dets.empty()) return dets;
    std::sort(dets.begin(), dets.end(),
              [](const DetBox& a, const DetBox& b) { return a.conf > b.conf; });
    std::vector<DetBox> keep;
    std::vector<bool> suppressed(dets.size(), false);
    for (size_t i = 0; i < dets.size(); ++i) {
      if (suppressed[i]) continue;
      keep.push_back(dets[i]);
      for (size_t j = i + 1; j < dets.size(); ++j) {
        if (suppressed[j]) continue;
        float iou = ComputeIoU(dets[i], dets[j]);
        if (iou > thresh) {
          suppressed[j] = true;
        }
      }
    }
    return keep;
  }
  // Intersection-over-union of two boxes; 0 when the union is empty.
  float ComputeIoU(const DetBox& a, const DetBox& b) {
    float x1 = std::max(a.x, b.x);
    float y1 = std::max(a.y, b.y);
    float x2 = std::min(a.x + a.w, b.x + b.w);
    float y2 = std::min(a.y + a.h, b.y + b.h);
    float inter = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
    float area_a = a.w * a.h;
    float area_b = b.w * b.h;
    float uni = area_a + area_b - inter;
    return uni > 0 ? inter / uni : 0;
  }
  // CPU bilinear resize for packed RGB data — fallback path when no
  // hardware image processor is available.
  void ResizeRgbBilinear(const uint8_t* src, int src_w, int src_h, int src_stride,
                         uint8_t* dst, int dst_w, int dst_h, int dst_stride) {
    float scale_x = static_cast<float>(src_w) / dst_w;
    float scale_y = static_cast<float>(src_h) / dst_h;
    for (int y = 0; y < dst_h; ++y) {
      float fy = y * scale_y;
      int y0 = static_cast<int>(fy);
      int y1 = std::min(y0 + 1, src_h - 1);
      float dy = fy - y0;
      for (int x = 0; x < dst_w; ++x) {
        float fx = x * scale_x;
        int x0 = static_cast<int>(fx);
        int x1 = std::min(x0 + 1, src_w - 1);
        float dx = fx - x0;
        for (int c = 0; c < 3; ++c) {
          // Weighted blend of the 2x2 source neighbourhood.
          float v00 = src[y0 * src_stride + x0 * 3 + c];
          float v01 = src[y0 * src_stride + x1 * 3 + c];
          float v10 = src[y1 * src_stride + x0 * 3 + c];
          float v11 = src[y1 * src_stride + x1 * 3 + c];
          float v = v00 * (1-dx) * (1-dy) +
                    v01 * dx * (1-dy) +
                    v10 * (1-dx) * dy +
                    v11 * dx * dy;
          dst[y * dst_stride + x * 3 + c] = static_cast<uint8_t>(v);
        }
      }
    }
  }
#endif
  std::string id_;
  std::string model_path_;
  int model_w_ = 640;               // model input width
  int model_h_ = 640;               // model input height
  float conf_thresh_ = 0.25f;       // confidence threshold
  float nms_thresh_ = 0.45f;        // NMS IoU threshold
  bool append_detections_ = false;  // append to vs. replace frame->det->items
  V8BoxFormat v8_box_format_ = V8BoxFormat::Auto;
  V8ClsActivation v8_cls_activation_ = V8ClsActivation::Auto;
  int64_t infer_interval_ms_ = 0;   // min interval between inferences (0 = every frame)
  int64_t infer_phase_ms_ = 0;      // phase offset applied to frame PTS
  int64_t last_infer_pts_ms_ = 0;   // effective PTS of the last inference
  bool use_dma_input_ = true;       // feed DMA buffers directly when possible
  DynamicRoiConfig dynamic_roi_;
  std::vector<DetWindow> windows_;  // static windows (unused in dynamic ROI mode)
  std::vector<uint8_t> input_buf_;  // packed RGB staging buffer for the model
  std::vector<float> fp32_decode_buf_;  // scratch for fp16 -> fp32 conversion
  size_t roi_round_robin_offset_ = 0;   // rotation state for dynamic ROI fairness
  std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
  std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;
  std::shared_ptr<IImageProcessor> image_processor_;
#if defined(RK3588_ENABLE_RKNN)
  std::shared_ptr<IInferBackend> infer_backend_;
  ModelHandle model_handle_ = kInvalidModelHandle;
#endif
};
REGISTER_NODE(AiShoeDetNode, "ai_shoe_det");
} // namespace rk3588