849 lines
33 KiB
C++
849 lines
33 KiB
C++
/**
|
|
* ai_shoe_det - 鞋子检测节点(支持滑动窗口)
|
|
*
|
|
* 基于 ai_yolo 实现,添加多窗口检测支持
|
|
*/
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <cstdint>
|
|
#include <cstring>
|
|
#include <limits>
|
|
#include <memory>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "ai_scheduler.h"
|
|
#include "frame/frame.h"
|
|
#include "hw/i_image_processor.h"
|
|
#include "hw/i_infer_backend.h"
|
|
#include "node.h"
|
|
#include "utils/logger.h"
|
|
#include "utils/person_shoe_roi.h"
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
#include "rknn_api.h"
|
|
#endif
|
|
|
|
namespace rk3588 {
|
|
|
|
struct DetWindow {
|
|
int x, y, w, h;
|
|
};
|
|
|
|
struct DetBox {
|
|
float x, y, w, h;
|
|
float conf;
|
|
int class_id;
|
|
};
|
|
|
|
struct DynamicRoiConfig {
|
|
bool enable = false;
|
|
int person_class_id = 0;
|
|
int shoe_class_id = 0;
|
|
int debug_roi_class_id = -1;
|
|
int max_rois = 8;
|
|
int min_person_height = 0;
|
|
float max_box_area_ratio = 0.0f;
|
|
float y_offset = 0.72f;
|
|
float width_scale = 1.30f;
|
|
float height_scale = 0.38f;
|
|
};
|
|
|
|
enum class V8BoxFormat { Auto, CxCyWh, XyXy, XyWh };
|
|
enum class V8ClsActivation { Auto, None, Sigmoid };
|
|
|
|
uint32_t TensorTypeSizeBytes(rknn_tensor_type t) {
|
|
switch (t) {
|
|
case RKNN_TENSOR_INT8:
|
|
case RKNN_TENSOR_UINT8:
|
|
return 1;
|
|
case RKNN_TENSOR_FLOAT16:
|
|
return 2;
|
|
case RKNN_TENSOR_FLOAT32:
|
|
return 4;
|
|
default:
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
inline float Sigmoid(float x) {
|
|
return 1.0f / (1.0f + std::exp(-x));
|
|
}
|
|
|
|
inline float Fp16ToFp32(uint16_t h) {
|
|
const int sign = (h & 0x8000) ? -1 : 1;
|
|
const int exp = (h >> 10) & 0x1F;
|
|
const int mant = h & 0x03FF;
|
|
|
|
if (exp == 0) {
|
|
if (mant == 0) return sign < 0 ? -0.0f : 0.0f;
|
|
return static_cast<float>(sign) * std::ldexp(static_cast<float>(mant), -24);
|
|
}
|
|
if (exp == 0x1F) {
|
|
if (mant == 0) return sign < 0 ? -INFINITY : INFINITY;
|
|
return std::numeric_limits<float>::quiet_NaN();
|
|
}
|
|
return static_cast<float>(sign) *
|
|
std::ldexp(static_cast<float>(mant + 1024), exp - 25);
|
|
}
|
|
|
|
inline int32_t ClipFloat(float val, float min_val, float max_val) {
|
|
return static_cast<int32_t>(val <= min_val ? min_val : (val >= max_val ? max_val : val));
|
|
}
|
|
|
|
inline int8_t QuantizeF32ToAffine(float f32, int32_t zp, float scale) {
|
|
float dst_val = (f32 / scale) + zp;
|
|
return static_cast<int8_t>(ClipFloat(dst_val, -128, 127));
|
|
}
|
|
|
|
inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) {
|
|
return (static_cast<float>(qnt) - static_cast<float>(zp)) * scale;
|
|
}
|
|
|
|
int DefaultV8NumBoxes(int model_h, int model_w) {
|
|
if (model_h <= 0 || model_w <= 0) return 0;
|
|
return (model_h / 8) * (model_w / 8) +
|
|
(model_h / 16) * (model_w / 16) +
|
|
(model_h / 32) * (model_w / 32);
|
|
}
|
|
|
|
struct V8LayoutInfo {
|
|
int num_boxes = 0;
|
|
bool channels_first = true;
|
|
};
|
|
|
|
float ScoreBoxCandidate(float x, float y, float w, float h, int model_w, int model_h) {
|
|
float s = 0.0f;
|
|
if (w > 0.0f && h > 0.0f) s += 3.0f;
|
|
if (w <= model_w * 1.2f) s += 1.0f;
|
|
if (h <= model_h * 1.2f) s += 1.0f;
|
|
if (x >= -model_w * 0.1f) s += 1.0f;
|
|
if (y >= -model_h * 0.1f) s += 1.0f;
|
|
if ((x + w) <= model_w * 1.2f) s += 1.0f;
|
|
if ((y + h) <= model_h * 1.2f) s += 1.0f;
|
|
return s;
|
|
}
|
|
|
|
bool SeemsNormalized(float a, float b, float c, float d) {
|
|
auto in_range = [](float v) { return v >= -0.05f && v <= 2.5f; };
|
|
return in_range(a) && in_range(b) && in_range(c) && in_range(d);
|
|
}
|
|
|
|
void DecodeV8Box(float a, float b, float c, float d, int model_w, int model_h, V8BoxFormat fmt,
|
|
float& out_x, float& out_y, float& out_w, float& out_h) {
|
|
if (SeemsNormalized(a, b, c, d)) {
|
|
a *= static_cast<float>(model_w);
|
|
b *= static_cast<float>(model_h);
|
|
c *= static_cast<float>(model_w);
|
|
d *= static_cast<float>(model_h);
|
|
}
|
|
|
|
auto decode_cxcywh = [&](float& x, float& y, float& w, float& h) {
|
|
x = a - c / 2.0f;
|
|
y = b - d / 2.0f;
|
|
w = c;
|
|
h = d;
|
|
};
|
|
auto decode_xyxy = [&](float& x, float& y, float& w, float& h) {
|
|
x = a;
|
|
y = b;
|
|
w = c - a;
|
|
h = d - b;
|
|
};
|
|
auto decode_xywh = [&](float& x, float& y, float& w, float& h) {
|
|
x = a;
|
|
y = b;
|
|
w = c;
|
|
h = d;
|
|
};
|
|
|
|
if (fmt == V8BoxFormat::CxCyWh) {
|
|
decode_cxcywh(out_x, out_y, out_w, out_h);
|
|
return;
|
|
}
|
|
if (fmt == V8BoxFormat::XyXy) {
|
|
decode_xyxy(out_x, out_y, out_w, out_h);
|
|
return;
|
|
}
|
|
if (fmt == V8BoxFormat::XyWh) {
|
|
decode_xywh(out_x, out_y, out_w, out_h);
|
|
return;
|
|
}
|
|
|
|
float x1 = 0.0f, y1 = 0.0f, w1 = 0.0f, h1 = 0.0f;
|
|
float x2 = 0.0f, y2 = 0.0f, w2 = 0.0f, h2 = 0.0f;
|
|
float x3 = 0.0f, y3 = 0.0f, w3 = 0.0f, h3 = 0.0f;
|
|
decode_cxcywh(x1, y1, w1, h1);
|
|
decode_xyxy(x2, y2, w2, h2);
|
|
decode_xywh(x3, y3, w3, h3);
|
|
|
|
const float s1 = ScoreBoxCandidate(x1, y1, w1, h1, model_w, model_h);
|
|
const float s2 = ScoreBoxCandidate(x2, y2, w2, h2, model_w, model_h);
|
|
const float s3 = ScoreBoxCandidate(x3, y3, w3, h3, model_w, model_h);
|
|
if (s2 >= s1 && s2 >= s3) {
|
|
out_x = x2; out_y = y2; out_w = w2; out_h = h2;
|
|
} else if (s3 >= s1 && s3 >= s2) {
|
|
out_x = x3; out_y = y3; out_w = w3; out_h = h3;
|
|
} else {
|
|
out_x = x1; out_y = y1; out_w = w1; out_h = h1;
|
|
}
|
|
}
|
|
|
|
bool ResolveV8ApplySigmoid(const float* output, int num_boxes, int num_classes, bool channels_first,
|
|
V8ClsActivation act_mode) {
|
|
if (act_mode == V8ClsActivation::None) return false;
|
|
if (act_mode == V8ClsActivation::Sigmoid) return true;
|
|
if (!output || num_boxes <= 0 || num_classes <= 0) return false;
|
|
|
|
const int num_channels = 4 + num_classes;
|
|
const int sample_boxes = std::min(num_boxes, 64);
|
|
float min_v = 1e9f;
|
|
float max_v = -1e9f;
|
|
for (int i = 0; i < sample_boxes; ++i) {
|
|
for (int c = 0; c < num_classes; ++c) {
|
|
const float v = channels_first ? output[(4 + c) * num_boxes + i]
|
|
: output[i * num_channels + (4 + c)];
|
|
if (v < min_v) min_v = v;
|
|
if (v > max_v) max_v = v;
|
|
}
|
|
}
|
|
return (min_v < -0.1f || max_v > 1.5f);
|
|
}
|
|
|
|
V8LayoutInfo ResolveV8Layout(const std::vector<uint32_t>& dims, size_t byte_size,
|
|
rknn_tensor_type type, int num_classes,
|
|
int model_h, int model_w) {
|
|
V8LayoutInfo info;
|
|
const int num_channels = 4 + num_classes;
|
|
const uint32_t elem_bytes = TensorTypeSizeBytes(type);
|
|
const size_t total_elems = elem_bytes > 0 ? (byte_size / elem_bytes) : 0;
|
|
const size_t max_boxes_from_data = static_cast<size_t>(num_channels) > 0
|
|
? (total_elems / static_cast<size_t>(num_channels))
|
|
: 0;
|
|
|
|
int ch_idx = -1;
|
|
for (size_t i = 0; i < dims.size(); ++i) {
|
|
if (dims[i] == static_cast<uint32_t>(num_channels)) {
|
|
ch_idx = static_cast<int>(i);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (ch_idx >= 0 && total_elems >= static_cast<size_t>(num_channels)) {
|
|
info.num_boxes = static_cast<int>(max_boxes_from_data);
|
|
int prev_non1 = 1;
|
|
for (int i = ch_idx - 1; i >= 0; --i) {
|
|
if (dims[static_cast<size_t>(i)] > 1U) {
|
|
prev_non1 = static_cast<int>(dims[static_cast<size_t>(i)]);
|
|
break;
|
|
}
|
|
}
|
|
int next_non1 = 1;
|
|
for (size_t i = static_cast<size_t>(ch_idx + 1); i < dims.size(); ++i) {
|
|
if (dims[i] > 1U) {
|
|
next_non1 = static_cast<int>(dims[i]);
|
|
break;
|
|
}
|
|
}
|
|
if (next_non1 > 1 && prev_non1 <= 1) {
|
|
info.channels_first = true;
|
|
} else if (prev_non1 > 1 && next_non1 <= 1) {
|
|
info.channels_first = false;
|
|
} else if (next_non1 > 1 && prev_non1 > 1) {
|
|
info.channels_first = next_non1 >= prev_non1;
|
|
}
|
|
}
|
|
|
|
if (info.num_boxes <= 0 && max_boxes_from_data > 0) {
|
|
info.num_boxes = static_cast<int>(max_boxes_from_data);
|
|
}
|
|
if (info.num_boxes <= 0) {
|
|
info.num_boxes = DefaultV8NumBoxes(model_h, model_w);
|
|
}
|
|
if (info.num_boxes <= 0) {
|
|
info.num_boxes = 8400;
|
|
}
|
|
if (max_boxes_from_data > 0 && static_cast<size_t>(info.num_boxes) > max_boxes_from_data) {
|
|
info.num_boxes = static_cast<int>(max_boxes_from_data);
|
|
}
|
|
return info;
|
|
}
|
|
|
|
class AiShoeDetNode : public INode {
|
|
public:
|
|
std::string Id() const override { return id_; }
|
|
std::string Type() const override { return "ai_shoe_det"; }
|
|
|
|
bool Init(const SimpleJson& config, const NodeContext& ctx) override {
|
|
id_ = config.ValueOr<std::string>("id", "shoe_det");
|
|
model_path_ = config.ValueOr<std::string>("model_path",
|
|
"./models/shoe_detector.rknn");
|
|
|
|
model_w_ = config.ValueOr<int>("model_w", 640);
|
|
model_h_ = config.ValueOr<int>("model_h", 640);
|
|
conf_thresh_ = config.ValueOr<float>("conf", 0.25f);
|
|
nms_thresh_ = config.ValueOr<float>("nms", 0.45f);
|
|
append_detections_ = config.ValueOr<bool>("append_detections", false);
|
|
{
|
|
const std::string bf = config.ValueOr<std::string>("v8_box_format", "auto");
|
|
if (bf == "xyxy") {
|
|
v8_box_format_ = V8BoxFormat::XyXy;
|
|
} else if (bf == "xywh") {
|
|
v8_box_format_ = V8BoxFormat::XyWh;
|
|
} else if (bf == "cxcywh") {
|
|
v8_box_format_ = V8BoxFormat::CxCyWh;
|
|
} else {
|
|
v8_box_format_ = V8BoxFormat::Auto;
|
|
}
|
|
}
|
|
{
|
|
const std::string act = config.ValueOr<std::string>("v8_cls_activation", "auto");
|
|
if (act == "sigmoid") {
|
|
v8_cls_activation_ = V8ClsActivation::Sigmoid;
|
|
} else if (act == "none") {
|
|
v8_cls_activation_ = V8ClsActivation::None;
|
|
} else {
|
|
v8_cls_activation_ = V8ClsActivation::Auto;
|
|
}
|
|
}
|
|
|
|
infer_interval_ms_ = std::max<int64_t>(
|
|
0, static_cast<int64_t>(config.ValueOr<int>("infer_interval_ms", 0)));
|
|
if (infer_interval_ms_ <= 0) {
|
|
const double infer_fps = config.ValueOr<double>("infer_fps", 0.0);
|
|
if (infer_fps > 0.0) {
|
|
infer_interval_ms_ = static_cast<int64_t>(1000.0 / infer_fps);
|
|
if (infer_interval_ms_ < 1) infer_interval_ms_ = 1;
|
|
}
|
|
}
|
|
infer_phase_ms_ = std::max<int64_t>(
|
|
0, static_cast<int64_t>(config.ValueOr<int>("infer_phase_ms", 0)));
|
|
if (infer_interval_ms_ > 0 && infer_phase_ms_ >= infer_interval_ms_) {
|
|
infer_phase_ms_ %= infer_interval_ms_;
|
|
}
|
|
use_dma_input_ = config.ValueOr<bool>("use_dma_input", use_dma_input_);
|
|
|
|
if (const SimpleJson* dyn = config.Find("dynamic_roi"); dyn && dyn->IsObject()) {
|
|
dynamic_roi_.enable = dyn->ValueOr<bool>("enable", false);
|
|
dynamic_roi_.person_class_id =
|
|
dyn->ValueOr<int>("person_class_id", dynamic_roi_.person_class_id);
|
|
dynamic_roi_.shoe_class_id =
|
|
dyn->ValueOr<int>("shoe_class_id", dynamic_roi_.shoe_class_id);
|
|
dynamic_roi_.debug_roi_class_id =
|
|
dyn->ValueOr<int>("debug_roi_class_id", dynamic_roi_.debug_roi_class_id);
|
|
dynamic_roi_.max_rois = std::max(1, dyn->ValueOr<int>("max_rois", dynamic_roi_.max_rois));
|
|
dynamic_roi_.min_person_height =
|
|
std::max(0, dyn->ValueOr<int>("min_person_height", dynamic_roi_.min_person_height));
|
|
dynamic_roi_.max_box_area_ratio =
|
|
std::max(0.0f, dyn->ValueOr<float>("max_box_area_ratio", dynamic_roi_.max_box_area_ratio));
|
|
dynamic_roi_.y_offset = dyn->ValueOr<float>("y_offset", dynamic_roi_.y_offset);
|
|
dynamic_roi_.width_scale = dyn->ValueOr<float>("width_scale", dynamic_roi_.width_scale);
|
|
dynamic_roi_.height_scale = dyn->ValueOr<float>("height_scale", dynamic_roi_.height_scale);
|
|
}
|
|
|
|
// 解析窗口配置
|
|
windows_.clear();
|
|
if (const SimpleJson* win_arr = config.Find("windows"); win_arr && win_arr->IsArray()) {
|
|
for (const auto& w : win_arr->AsArray()) {
|
|
if (w.IsObject()) {
|
|
DetWindow win;
|
|
win.x = w.ValueOr<int>("x", 0);
|
|
win.y = w.ValueOr<int>("y", 0);
|
|
win.w = w.ValueOr<int>("w", 640);
|
|
win.h = w.ValueOr<int>("h", 640);
|
|
windows_.push_back(win);
|
|
}
|
|
}
|
|
}
|
|
|
|
// 默认单窗口(全图)
|
|
if (!dynamic_roi_.enable && windows_.empty()) {
|
|
windows_.push_back({0, 0, 0, 0}); // 0表示全图
|
|
}
|
|
|
|
input_queue_ = ctx.input_queue;
|
|
output_queues_ = ctx.output_queues;
|
|
if (!input_queue_) {
|
|
LogError("[ai_shoe_det] no input queue");
|
|
return false;
|
|
}
|
|
|
|
infer_backend_ = ctx.infer_backend;
|
|
if (!infer_backend_) {
|
|
LogError("[ai_shoe_det] no infer backend");
|
|
return false;
|
|
}
|
|
image_processor_ = ctx.image_processor;
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
std::string err;
|
|
model_handle_ = infer_backend_->LoadModel(model_path_, err);
|
|
if (model_handle_ == kInvalidModelHandle) {
|
|
LogError("[ai_shoe_det] failed to load model: " + err);
|
|
return false;
|
|
}
|
|
input_buf_.resize(model_w_ * model_h_ * 3);
|
|
LogInfo("[ai_shoe_det] model loaded: " + model_path_);
|
|
#else
|
|
LogWarn("[ai_shoe_det] RKNN disabled");
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Start() override {
|
|
LogInfo("[ai_shoe_det] start mode=" + std::string(dynamic_roi_.enable ? "dynamic_roi" : "windows") +
|
|
" windows=" + std::to_string(windows_.size()) +
|
|
" append=" + std::string(append_detections_ ? "true" : "false") +
|
|
" infer_interval_ms=" + std::to_string(infer_interval_ms_));
|
|
return true;
|
|
}
|
|
|
|
void Stop() override {
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
if (model_handle_ != kInvalidModelHandle) {
|
|
infer_backend_->UnloadModel(model_handle_);
|
|
model_handle_ = kInvalidModelHandle;
|
|
}
|
|
#endif
|
|
LogInfo("[ai_shoe_det] stop");
|
|
}
|
|
|
|
NodeStatus Process(FramePtr frame) override {
|
|
if (!frame) return NodeStatus::DROP;
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
if (infer_interval_ms_ > 0 && frame->pts > 0) {
|
|
const int64_t pts_ms = static_cast<int64_t>(frame->pts / 1000ULL);
|
|
const int64_t effective_pts_ms = pts_ms + infer_phase_ms_;
|
|
const int64_t delta_ms = effective_pts_ms - last_infer_pts_ms_;
|
|
if (last_infer_pts_ms_ > 0 && delta_ms > 0 && delta_ms < infer_interval_ms_) {
|
|
Push(frame);
|
|
return NodeStatus::OK;
|
|
}
|
|
last_infer_pts_ms_ = effective_pts_ms;
|
|
}
|
|
RunDetection(frame);
|
|
#endif
|
|
Push(frame);
|
|
return NodeStatus::OK;
|
|
}
|
|
|
|
private:
|
|
void Push(FramePtr frame) {
|
|
for (auto& q : output_queues_) q->Push(frame);
|
|
}
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
|
|
void RunDetection(FramePtr frame) {
|
|
if (!frame->data || frame->data_size == 0) return;
|
|
|
|
const int src_w = frame->width;
|
|
const int src_h = frame->height;
|
|
|
|
std::vector<DetBox> all_dets;
|
|
|
|
std::vector<DetWindow> active_windows = windows_;
|
|
if (dynamic_roi_.enable) {
|
|
active_windows = BuildDynamicWindows(frame, src_w, src_h);
|
|
if (active_windows.empty()) {
|
|
return;
|
|
}
|
|
}
|
|
|
|
// 对每个窗口进行检测
|
|
for (const auto& win : active_windows) {
|
|
auto dets = DetectWindow(frame, src_w, src_h, win);
|
|
all_dets.insert(all_dets.end(), dets.begin(), dets.end());
|
|
}
|
|
|
|
// NMS
|
|
all_dets = ApplyNMS(all_dets, nms_thresh_);
|
|
|
|
// 填充结果
|
|
if (!frame->det) {
|
|
frame->det = std::make_shared<DetectionResult>();
|
|
}
|
|
|
|
if (!append_detections_) {
|
|
frame->det->items.clear();
|
|
}
|
|
frame->det->img_w = src_w;
|
|
frame->det->img_h = src_h;
|
|
|
|
if (dynamic_roi_.enable && dynamic_roi_.debug_roi_class_id >= 0) {
|
|
AddDebugRoiDetections(frame, active_windows);
|
|
}
|
|
|
|
for (const auto& d : all_dets) {
|
|
Detection item;
|
|
item.bbox = {d.x, d.y, d.w, d.h};
|
|
item.score = d.conf;
|
|
item.cls_id = d.class_id;
|
|
frame->det->items.push_back(item);
|
|
}
|
|
}
|
|
|
|
std::vector<DetWindow> BuildDynamicWindows(FramePtr frame, int src_w, int src_h) {
|
|
std::vector<DetWindow> rois;
|
|
if (!frame->det) return rois;
|
|
|
|
std::vector<Detection> persons;
|
|
persons.reserve(frame->det->items.size());
|
|
for (const auto& det : frame->det->items) {
|
|
if (det.cls_id != dynamic_roi_.person_class_id) continue;
|
|
if (det.bbox.h < dynamic_roi_.min_person_height) continue;
|
|
persons.push_back(det);
|
|
}
|
|
|
|
if (persons.empty()) return rois;
|
|
|
|
std::sort(persons.begin(), persons.end(),
|
|
[](const Detection& a, const Detection& b) { return a.score > b.score; });
|
|
|
|
const size_t max_rois = std::min(
|
|
persons.size(), static_cast<size_t>(std::max(1, dynamic_roi_.max_rois)));
|
|
rois.reserve(max_rois);
|
|
const size_t start_index = persons.empty() ? 0 : (roi_round_robin_offset_ % persons.size());
|
|
|
|
for (size_t i = 0; i < max_rois; ++i) {
|
|
const Detection& person = persons[(start_index + i) % persons.size()];
|
|
const Rect& bbox = person.bbox;
|
|
const Rect roi = BuildPersonFootRegion(bbox,
|
|
src_w,
|
|
src_h,
|
|
dynamic_roi_.y_offset,
|
|
dynamic_roi_.width_scale,
|
|
dynamic_roi_.height_scale);
|
|
|
|
const int x0 = std::max(0, static_cast<int>(roi.x));
|
|
const int y0 = std::max(0, static_cast<int>(roi.y));
|
|
const int x1 = std::min(src_w, static_cast<int>(roi.x + roi.w));
|
|
const int y1 = std::min(src_h, static_cast<int>(roi.y + roi.h));
|
|
|
|
if (x1 <= x0 || y1 <= y0) continue;
|
|
rois.push_back({x0, y0, x1 - x0, y1 - y0});
|
|
}
|
|
if (!persons.empty()) {
|
|
roi_round_robin_offset_ = (start_index + max_rois) % persons.size();
|
|
}
|
|
|
|
return rois;
|
|
}
|
|
|
|
void AddDebugRoiDetections(FramePtr frame, const std::vector<DetWindow>& rois) const {
|
|
if (!frame || !frame->det) return;
|
|
for (const auto& roi : rois) {
|
|
Detection item;
|
|
item.cls_id = dynamic_roi_.debug_roi_class_id;
|
|
item.score = 1.0f;
|
|
item.track_id = -1;
|
|
item.bbox = {
|
|
static_cast<float>(roi.x),
|
|
static_cast<float>(roi.y),
|
|
static_cast<float>(roi.w),
|
|
static_cast<float>(roi.h)
|
|
};
|
|
frame->det->items.push_back(item);
|
|
}
|
|
}
|
|
|
|
std::vector<DetBox> DetectWindow(FramePtr frame, int src_w, int src_h, const DetWindow& win) {
|
|
std::vector<DetBox> dets;
|
|
|
|
// 确定裁剪区域
|
|
int win_x, win_y, win_w, win_h;
|
|
if (win.w == 0 || win.h == 0) {
|
|
win_x = 0; win_y = 0; win_w = src_w; win_h = src_h;
|
|
} else {
|
|
win_x = std::max(0, std::min(win.x, src_w - 1));
|
|
win_y = std::max(0, std::min(win.y, src_h - 1));
|
|
win_w = std::min(win.w, src_w - win_x);
|
|
win_h = std::min(win.h, src_h - win_y);
|
|
}
|
|
|
|
if (win_w <= 0 || win_h <= 0) return dets;
|
|
|
|
// 获取源数据
|
|
const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data;
|
|
int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride
|
|
: (frame->stride > 0 ? frame->stride : src_w * 3);
|
|
|
|
if (!src || src_stride <= 0) return dets;
|
|
|
|
const uint8_t* roi_src = src + static_cast<size_t>(win_y) * static_cast<size_t>(src_stride) +
|
|
static_cast<size_t>(win_x) * 3;
|
|
|
|
const uint8_t* model_input_data = nullptr;
|
|
size_t model_input_size = 0;
|
|
int model_input_dma_fd = -1;
|
|
Frame resized_frame;
|
|
|
|
if (image_processor_ && win_w > 0 && win_h > 0) {
|
|
Frame crop_frame;
|
|
crop_frame.width = win_w;
|
|
crop_frame.height = win_h;
|
|
crop_frame.format = PixelFormat::RGB;
|
|
crop_frame.stride = src_stride;
|
|
crop_frame.data = const_cast<uint8_t*>(roi_src);
|
|
crop_frame.data_size = frame->data_size;
|
|
crop_frame.plane_count = 1;
|
|
crop_frame.planes[0] = {crop_frame.data, crop_frame.stride, static_cast<int>(frame->data_size), 0};
|
|
|
|
resized_frame.width = model_w_;
|
|
resized_frame.height = model_h_;
|
|
resized_frame.format = PixelFormat::RGB;
|
|
|
|
Status st = image_processor_->Resize(crop_frame, resized_frame);
|
|
if (st.Ok()) {
|
|
const uint8_t* resized_data = resized_frame.planes[0].data ? resized_frame.planes[0].data : resized_frame.data;
|
|
const int resized_stride = resized_frame.planes[0].stride > 0
|
|
? resized_frame.planes[0].stride
|
|
: (resized_frame.stride > 0 ? resized_frame.stride : model_w_ * 3);
|
|
const size_t packed_size = static_cast<size_t>(model_w_) * static_cast<size_t>(model_h_) * 3;
|
|
if (use_dma_input_ && resized_frame.DmaFd() >= 0 && resized_data) {
|
|
model_input_data = resized_data;
|
|
model_input_size = resized_frame.data_size > 0 ? resized_frame.data_size : packed_size;
|
|
model_input_dma_fd = resized_frame.DmaFd();
|
|
} else if (resized_data && resized_stride > 0) {
|
|
if (packed_size > input_buf_.size()) input_buf_.resize(packed_size);
|
|
for (int row = 0; row < model_h_; ++row) {
|
|
memcpy(input_buf_.data() + static_cast<size_t>(row) * static_cast<size_t>(model_w_) * 3,
|
|
resized_data + static_cast<size_t>(row) * static_cast<size_t>(resized_stride),
|
|
static_cast<size_t>(model_w_) * 3);
|
|
}
|
|
model_input_data = input_buf_.data();
|
|
model_input_size = packed_size;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!model_input_data) {
|
|
ResizeRgbBilinear(roi_src, win_w, win_h, src_stride,
|
|
input_buf_.data(), model_w_, model_h_, model_w_ * 3);
|
|
model_input_data = input_buf_.data();
|
|
model_input_size = input_buf_.size();
|
|
}
|
|
|
|
// 推理
|
|
InferInput input;
|
|
input.width = model_w_;
|
|
input.height = model_h_;
|
|
input.is_nhwc = true;
|
|
input.data = model_input_data;
|
|
input.size = model_input_size;
|
|
input.dma_fd = model_input_dma_fd;
|
|
input.type = RKNN_TENSOR_UINT8;
|
|
|
|
auto r = infer_backend_->InferBorrowed(model_handle_, input);
|
|
if (!r.success || r.outputs.empty() || !r.outputs[0].data) {
|
|
LogWarn("[ai_shoe_det] inference failed");
|
|
return dets;
|
|
}
|
|
|
|
// 解析输出
|
|
dets = ParseOutput(r.outputs, win_x, win_y, win_w, win_h);
|
|
return dets;
|
|
}
|
|
|
|
std::vector<DetBox> ParseOutput(const std::vector<AiScheduler::BorrowedOutput>& outputs,
|
|
int win_x, int win_y, int win_w, int win_h) {
|
|
std::vector<DetBox> dets;
|
|
if (outputs.empty() || !outputs[0].data) return dets;
|
|
constexpr int kNumClasses = 1;
|
|
const V8LayoutInfo layout = ResolveV8Layout(
|
|
outputs[0].dims, outputs[0].size, outputs[0].type, kNumClasses, model_h_, model_w_);
|
|
const int num_boxes = layout.num_boxes;
|
|
if (num_boxes <= 0) return dets;
|
|
|
|
float scale_x = static_cast<float>(win_w) / model_w_;
|
|
float scale_y = static_cast<float>(win_h) / model_h_;
|
|
const int num_channels = 4 + kNumClasses;
|
|
|
|
auto append_box = [&](float a, float b, float c, float d, float conf) {
|
|
if (!std::isfinite(a) || !std::isfinite(b) || !std::isfinite(c) || !std::isfinite(d)) return;
|
|
float x = 0.0f, y = 0.0f, w = 0.0f, h = 0.0f;
|
|
DecodeV8Box(a, b, c, d, model_w_, model_h_, v8_box_format_, x, y, w, h);
|
|
if (!std::isfinite(x) || !std::isfinite(y) || !std::isfinite(w) || !std::isfinite(h)) return;
|
|
if (w <= 1e-3f || h <= 1e-3f) return;
|
|
if (w > model_w_ * 1.2f || h > model_h_ * 1.2f) return;
|
|
if (dynamic_roi_.enable && dynamic_roi_.max_box_area_ratio > 0.0f && win_w > 0 && win_h > 0) {
|
|
const float mapped_area_ratio =
|
|
(w * h) / (static_cast<float>(model_w_) * static_cast<float>(model_h_));
|
|
if (mapped_area_ratio > dynamic_roi_.max_box_area_ratio) return;
|
|
}
|
|
|
|
DetBox box;
|
|
box.x = win_x + x * scale_x;
|
|
box.y = win_y + y * scale_y;
|
|
box.w = w * scale_x;
|
|
box.h = h * scale_y;
|
|
box.conf = conf;
|
|
box.class_id = dynamic_roi_.shoe_class_id;
|
|
dets.push_back(box);
|
|
};
|
|
|
|
if (outputs[0].type == RKNN_TENSOR_FLOAT32) {
|
|
const float* data = reinterpret_cast<const float*>(outputs[0].data);
|
|
const bool apply_sigmoid = ResolveV8ApplySigmoid(
|
|
data, num_boxes, kNumClasses, layout.channels_first, v8_cls_activation_);
|
|
for (int i = 0; i < num_boxes; ++i) {
|
|
const float score = layout.channels_first ? data[4 * num_boxes + i]
|
|
: data[i * num_channels + 4];
|
|
const float conf = apply_sigmoid ? Sigmoid(score) : score;
|
|
if (conf < conf_thresh_) continue;
|
|
const float a = layout.channels_first ? data[0 * num_boxes + i] : data[i * num_channels + 0];
|
|
const float b = layout.channels_first ? data[1 * num_boxes + i] : data[i * num_channels + 1];
|
|
const float c = layout.channels_first ? data[2 * num_boxes + i] : data[i * num_channels + 2];
|
|
const float d = layout.channels_first ? data[3 * num_boxes + i] : data[i * num_channels + 3];
|
|
append_box(a, b, c, d, conf);
|
|
}
|
|
} else if (outputs[0].type == RKNN_TENSOR_FLOAT16) {
|
|
const uint16_t* data = reinterpret_cast<const uint16_t*>(outputs[0].data);
|
|
fp32_decode_buf_.resize(outputs[0].size / sizeof(uint16_t));
|
|
for (size_t i = 0; i < fp32_decode_buf_.size(); ++i) {
|
|
fp32_decode_buf_[i] = Fp16ToFp32(data[i]);
|
|
}
|
|
const bool apply_sigmoid = ResolveV8ApplySigmoid(
|
|
fp32_decode_buf_.data(), num_boxes, kNumClasses, layout.channels_first, v8_cls_activation_);
|
|
for (int i = 0; i < num_boxes; ++i) {
|
|
const float score = layout.channels_first ? fp32_decode_buf_[4 * num_boxes + i]
|
|
: fp32_decode_buf_[i * num_channels + 4];
|
|
const float conf = apply_sigmoid ? Sigmoid(score) : score;
|
|
if (conf < conf_thresh_) continue;
|
|
const float a = layout.channels_first ? fp32_decode_buf_[0 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 0];
|
|
const float b = layout.channels_first ? fp32_decode_buf_[1 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 1];
|
|
const float c = layout.channels_first ? fp32_decode_buf_[2 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 2];
|
|
const float d = layout.channels_first ? fp32_decode_buf_[3 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 3];
|
|
append_box(a, b, c, d, conf);
|
|
}
|
|
} else if (outputs[0].type == RKNN_TENSOR_INT8) {
|
|
const int8_t* data = reinterpret_cast<const int8_t*>(outputs[0].data);
|
|
const int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh_, outputs[0].zp, outputs[0].scale);
|
|
for (int i = 0; i < num_boxes; ++i) {
|
|
const int8_t score_i8 = layout.channels_first ? data[4 * num_boxes + i]
|
|
: data[i * num_channels + 4];
|
|
if (score_i8 < thresh_i8) continue;
|
|
const float conf = DequantizeAffineToF32(score_i8, outputs[0].zp, outputs[0].scale);
|
|
const float a = DequantizeAffineToF32(layout.channels_first ? data[0 * num_boxes + i] : data[i * num_channels + 0], outputs[0].zp, outputs[0].scale);
|
|
const float b = DequantizeAffineToF32(layout.channels_first ? data[1 * num_boxes + i] : data[i * num_channels + 1], outputs[0].zp, outputs[0].scale);
|
|
const float c = DequantizeAffineToF32(layout.channels_first ? data[2 * num_boxes + i] : data[i * num_channels + 2], outputs[0].zp, outputs[0].scale);
|
|
const float d = DequantizeAffineToF32(layout.channels_first ? data[3 * num_boxes + i] : data[i * num_channels + 3], outputs[0].zp, outputs[0].scale);
|
|
append_box(a, b, c, d, conf);
|
|
}
|
|
} else {
|
|
LogWarn("[ai_shoe_det] unsupported output tensor type");
|
|
}
|
|
|
|
return dets;
|
|
}
|
|
|
|
std::vector<DetBox> ApplyNMS(std::vector<DetBox>& dets, float thresh) {
|
|
if (dets.empty()) return dets;
|
|
|
|
std::sort(dets.begin(), dets.end(),
|
|
[](const DetBox& a, const DetBox& b) { return a.conf > b.conf; });
|
|
|
|
std::vector<DetBox> keep;
|
|
std::vector<bool> suppressed(dets.size(), false);
|
|
|
|
for (size_t i = 0; i < dets.size(); ++i) {
|
|
if (suppressed[i]) continue;
|
|
keep.push_back(dets[i]);
|
|
|
|
for (size_t j = i + 1; j < dets.size(); ++j) {
|
|
if (suppressed[j]) continue;
|
|
float iou = ComputeIoU(dets[i], dets[j]);
|
|
if (iou > thresh) {
|
|
suppressed[j] = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
return keep;
|
|
}
|
|
|
|
float ComputeIoU(const DetBox& a, const DetBox& b) {
|
|
float x1 = std::max(a.x, b.x);
|
|
float y1 = std::max(a.y, b.y);
|
|
float x2 = std::min(a.x + a.w, b.x + b.w);
|
|
float y2 = std::min(a.y + a.h, b.y + b.h);
|
|
|
|
float inter = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
|
|
float area_a = a.w * a.h;
|
|
float area_b = b.w * b.h;
|
|
float uni = area_a + area_b - inter;
|
|
|
|
return uni > 0 ? inter / uni : 0;
|
|
}
|
|
|
|
void ResizeRgbBilinear(const uint8_t* src, int src_w, int src_h, int src_stride,
|
|
uint8_t* dst, int dst_w, int dst_h, int dst_stride) {
|
|
float scale_x = static_cast<float>(src_w) / dst_w;
|
|
float scale_y = static_cast<float>(src_h) / dst_h;
|
|
|
|
for (int y = 0; y < dst_h; ++y) {
|
|
float fy = y * scale_y;
|
|
int y0 = static_cast<int>(fy);
|
|
int y1 = std::min(y0 + 1, src_h - 1);
|
|
float dy = fy - y0;
|
|
|
|
for (int x = 0; x < dst_w; ++x) {
|
|
float fx = x * scale_x;
|
|
int x0 = static_cast<int>(fx);
|
|
int x1 = std::min(x0 + 1, src_w - 1);
|
|
float dx = fx - x0;
|
|
|
|
for (int c = 0; c < 3; ++c) {
|
|
float v00 = src[y0 * src_stride + x0 * 3 + c];
|
|
float v01 = src[y0 * src_stride + x1 * 3 + c];
|
|
float v10 = src[y1 * src_stride + x0 * 3 + c];
|
|
float v11 = src[y1 * src_stride + x1 * 3 + c];
|
|
|
|
float v = v00 * (1-dx) * (1-dy) +
|
|
v01 * dx * (1-dy) +
|
|
v10 * (1-dx) * dy +
|
|
v11 * dx * dy;
|
|
|
|
dst[y * dst_stride + x * 3 + c] = static_cast<uint8_t>(v);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
std::string id_;
|
|
std::string model_path_;
|
|
int model_w_ = 640;
|
|
int model_h_ = 640;
|
|
float conf_thresh_ = 0.25f;
|
|
float nms_thresh_ = 0.45f;
|
|
bool append_detections_ = false;
|
|
V8BoxFormat v8_box_format_ = V8BoxFormat::Auto;
|
|
V8ClsActivation v8_cls_activation_ = V8ClsActivation::Auto;
|
|
int64_t infer_interval_ms_ = 0;
|
|
int64_t infer_phase_ms_ = 0;
|
|
int64_t last_infer_pts_ms_ = 0;
|
|
bool use_dma_input_ = true;
|
|
DynamicRoiConfig dynamic_roi_;
|
|
std::vector<DetWindow> windows_;
|
|
std::vector<uint8_t> input_buf_;
|
|
std::vector<float> fp32_decode_buf_;
|
|
size_t roi_round_robin_offset_ = 0;
|
|
|
|
std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
|
|
std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;
|
|
std::shared_ptr<IImageProcessor> image_processor_;
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
std::shared_ptr<IInferBackend> infer_backend_;
|
|
ModelHandle model_handle_ = kInvalidModelHandle;
|
|
#endif
|
|
};
|
|
|
|
REGISTER_NODE(AiShoeDetNode, "ai_shoe_det");
|
|
|
|
} // namespace rk3588
|