1277 lines
54 KiB
C++
1277 lines
54 KiB
C++
#include <algorithm>
|
|
#include <atomic>
|
|
#include <chrono>
|
|
#include <cmath>
|
|
#include <cstddef>
|
|
#include <cstring>
|
|
#include <limits>
|
|
#include <memory>
|
|
#include <set>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
#include "hw/i_infer_backend.h"
|
|
#include "node.h"
|
|
#include "utils/dma_alloc.h"
|
|
#include "utils/logger.h"
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
#include "rknn_api.h"
|
|
#endif
|
|
|
|
namespace rk3588 {
|
|
|
|
namespace {
|
|
|
|
// Number of object classes assumed by the label table and the YOLOv5 decoder.
constexpr int kObjClassNum{80};
// Channels per YOLOv5 anchor proposal: x, y, w, h, objectness + one score per class.
constexpr int kPropBoxSizeV5{kObjClassNum + 5};
// Channels per YOLOv8 proposal: x, y, w, h + one score per class (no objectness).
constexpr int kPropBoxSizeV8{kObjClassNum + 4};
// Named cap of 64 detections.
constexpr int kMaxDetections{64};
|
|
|
|
// YOLOv5 anchor tables. Each table holds three (width, height) pairs; the V5
// decoder reads entry [2 * a] as the anchor width and [2 * a + 1] as its height.
const int kAnchor0[6] = {
    10, 13,
    16, 30,
    33, 23,
};
const int kAnchor1[6] = {
    30, 61,
    62, 45,
    59, 119,
};
const int kAnchor2[6] = {
    116, 90,
    156, 198,
    373, 326,
};
|
|
|
|
// Which post-processing path is used for the model outputs.
enum class YoloVersion {
    V5,
    V8,
};

// Interpretation of the four box channels of a YOLOv8 output.
// Auto lets the decoder pick the most plausible layout per box.
enum class V8BoxFormat {
    Auto,
    CxCyWh,
    XyXy,
    XyWh,
};

// Whether YOLOv8 class scores need a sigmoid; Auto decides from sampled values.
enum class V8ClsActivation {
    Auto,
    None,
    Sigmoid,
};
|
|
|
|
// Class-id -> label table (80 entries, indexed by class id).
const char* kCocoLabels[kObjClassNum] = {
    // 0 - 7
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
    // 8 - 15
    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    // 16 - 23
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
    // 24 - 31
    "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
    // 32 - 39
    "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
    "tennis racket", "bottle",
    // 40 - 47
    "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    // 48 - 55
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    // 56 - 63
    "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
    // 64 - 71
    "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    // 72 - 79
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
};
|
|
|
|
// Clamps `val` to [min_val, max_val] and truncates it to int.
// Values that do not compare greater than min_val (including NaN) yield min_val.
inline int Clamp(float val, int min_val, int max_val) {
    if (!(val > min_val)) {
        return min_val;
    }
    if (!(val < max_val)) {
        return max_val;
    }
    return static_cast<int>(val);
}
|
|
|
|
// Parameters needed to map a box from model-input coordinates to output coordinates.
struct DetCoordContext {
    // True when scale_x/scale_y/pad_x/pad_y should be used instead of the fallback scales.
    bool has_transform{false};
    // Size of the coordinate space detections are clamped to.
    int out_w{0};
    int out_h{0};
    // Transform path: output = (model - pad) / scale.
    float scale_x{1.0f};
    float scale_y{1.0f};
    float pad_x{0.0f};
    float pad_y{0.0f};
    // Fallback path: output = model / fallback_scale.
    float fallback_scale_w{1.0f};
    float fallback_scale_h{1.0f};
};
|
|
|
|
// Builds the coordinate-mapping context for one frame.
//
// The fallback scales are always model_input / frame size (1.0 when the frame
// dimension is not positive) and the output space defaults to the frame size.
// When the frame already has the model's input size and carries valid transform
// metadata with positive source size and scales, that metadata is used instead
// and the output space becomes the metadata's source size.
DetCoordContext BuildDetCoordContext(const Frame& frame, int model_input_w, int model_input_h) {
    DetCoordContext result{};
    result.out_w = frame.width;
    result.out_h = frame.height;

    if (frame.width > 0) {
        result.fallback_scale_w = static_cast<float>(model_input_w) / frame.width;
    } else {
        result.fallback_scale_w = 1.0f;
    }
    if (frame.height > 0) {
        result.fallback_scale_h = static_cast<float>(model_input_h) / frame.height;
    } else {
        result.fallback_scale_h = 1.0f;
    }

    const bool frame_is_model_sized =
        (frame.width == model_input_w && frame.height == model_input_h);
    const bool metadata_usable =
        frame_is_model_sized &&
        frame.transform_meta && frame.transform_meta->valid &&
        frame.transform_meta->src_w > 0 && frame.transform_meta->src_h > 0 &&
        frame.transform_meta->scale_x > 1e-6f && frame.transform_meta->scale_y > 1e-6f;

    if (metadata_usable) {
        const auto& meta = *frame.transform_meta;
        result.has_transform = true;
        result.out_w = meta.src_w;
        result.out_h = meta.src_h;
        result.scale_x = meta.scale_x;
        result.scale_y = meta.scale_y;
        result.pad_x = meta.pad_x;
        result.pad_y = meta.pad_y;
    }
    return result;
}
|
|
|
|
// Optional per-side box enlargement, expressed as fractions of the box size.
struct BboxExpandConfig {
    bool enable{false};
    // Class id this configuration refers to.
    int class_id{0};
    // Fractions of the box width added on the left / right.
    float left{0.05f};
    float right{0.05f};
    // Fractions of the box height added on the top / bottom.
    float top{0.05f};
    float bottom{0.12f};
};
|
|
|
|
// Maps a box (x, y, w, h) from model-input coordinates into the output space
// described by `ctx` and clamps it to [0, out_w] x [0, out_h].
//
// With ctx.has_transform the mapping is (v - pad) / scale; otherwise the
// fallback scales are used. The returned rect holds whole-pixel values.
//
// The float results are handed to Clamp() directly rather than being cast to
// int first: converting a float that is NaN, infinite or outside the int range
// is undefined behaviour, and such values are reachable here (e.g. a zero
// fallback scale). Clamp() compares in float and only truncates values that
// are already inside the range, so in-range results are unchanged.
Rect DecodeToOutputRect(float x, float y, float w, float h, const DetCoordContext& ctx) {
    float ox = x;
    float oy = y;
    float ow = w;
    float oh = h;

    if (ctx.has_transform) {
        ox = (x - ctx.pad_x) / ctx.scale_x;
        oy = (y - ctx.pad_y) / ctx.scale_y;
        ow = w / ctx.scale_x;
        oh = h / ctx.scale_y;
    } else {
        ox = x / ctx.fallback_scale_w;
        oy = y / ctx.fallback_scale_h;
        ow = w / ctx.fallback_scale_w;
        oh = h / ctx.fallback_scale_h;
    }

    // Never clamp against an empty output space.
    const int out_w = std::max(1, ctx.out_w);
    const int out_h = std::max(1, ctx.out_h);

    Rect r{};
    r.x = static_cast<float>(Clamp(ox, 0, out_w));
    r.y = static_cast<float>(Clamp(oy, 0, out_h));
    // Width/height are limited to what is left between the origin and the border.
    r.w = static_cast<float>(Clamp(ow, 0, out_w - static_cast<int>(r.x)));
    r.h = static_cast<float>(Clamp(oh, 0, out_h - static_cast<int>(r.y)));
    return r;
}
|
|
|
|
// Grows `in` by the per-side fractions in `cfg`, keeping the result inside an
// img_w x img_h image. Returns `in` unchanged when expansion is disabled or the
// image size is not positive.
Rect ExpandRect(const Rect& in, int img_w, int img_h, const BboxExpandConfig& cfg) {
    const bool can_expand = cfg.enable && img_w > 0 && img_h > 0;
    if (!can_expand) {
        return in;
    }

    // Shift of the top-left corner and total growth of each dimension.
    const float shift_x = in.w * cfg.left;
    const float shift_y = in.h * cfg.top;
    const float extra_w = in.w * (cfg.left + cfg.right);
    const float extra_h = in.h * (cfg.top + cfg.bottom);

    Rect grown{};
    grown.x = std::max(0.0f, in.x - shift_x);
    grown.y = std::max(0.0f, in.y - shift_y);

    // Limit the size to the room left between the new origin and the image border.
    grown.w = std::min(static_cast<float>(img_w) - grown.x, in.w + extra_w);
    grown.h = std::min(static_cast<float>(img_h) - grown.y, in.h + extra_h);
    grown.w = std::max(0.0f, grown.w);
    grown.h = std::max(0.0f, grown.h);
    return grown;
}
|
|
|
|
// Saturates `val` to [min_val, max_val] and truncates the result to int32_t.
inline int32_t ClipFloat(float val, float min_val, float max_val) {
    if (val <= min_val) {
        return static_cast<int32_t>(min_val);
    }
    if (val >= max_val) {
        return static_cast<int32_t>(max_val);
    }
    return static_cast<int32_t>(val);
}
|
|
|
|
inline int8_t QuantizeF32ToAffine(float f32, int32_t zp, float scale) {
|
|
float dst_val = (f32 / scale) + zp;
|
|
return static_cast<int8_t>(ClipFloat(dst_val, -128, 127));
|
|
}
|
|
|
|
// Inverse affine quantization: f = (q - zp) * scale.
inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) {
    const float centered = static_cast<float>(qnt) - static_cast<float>(zp);
    return centered * scale;
}
|
|
|
|
// Logistic function 1 / (1 + e^-x).
inline float Sigmoid(float x) {
    const float decay = std::exp(-x);
    return 1.0f / (1.0f + decay);
}
|
|
|
|
// Converts an IEEE 754 half-precision bit pattern to float.
// The value is rebuilt arithmetically with ldexp instead of by bit manipulation:
//   subnormal: mant * 2^-24
//   normal:    (mant + 1024) * 2^(exp - 25)
// Zero keeps its sign, exp == 0x1F maps to +/-infinity or a quiet NaN.
inline float Fp16ToFp32(uint16_t h) {
    const bool negative = (h & 0x8000) != 0;
    const int exponent = (h >> 10) & 0x1F;
    const int fraction = h & 0x03FF;
    const float sign = negative ? -1.0f : 1.0f;

    if (exponent == 0x1F) {
        if (fraction != 0) {
            return std::numeric_limits<float>::quiet_NaN();
        }
        return negative ? -INFINITY : INFINITY;
    }

    if (exponent == 0) {
        if (fraction == 0) {
            return negative ? -0.0f : 0.0f;
        }
        return sign * std::ldexp(static_cast<float>(fraction), -24);
    }

    return sign * std::ldexp(static_cast<float>(fraction + 1024), exponent - 25);
}
|
|
|
|
// Intersection-over-union of two axis-aligned boxes given by their min/max
// corners. Extents are measured inclusively (max - min + 1). Returns 0 when the
// union is not positive.
float CalculateIoU(float x1_min, float y1_min, float x1_max, float y1_max,
                   float x2_min, float y2_min, float x2_max, float y2_max) {
    const float left = std::fmax(x1_min, x2_min);
    const float right = std::fmin(x1_max, x2_max);
    const float top = std::fmax(y1_min, y2_min);
    const float bottom = std::fmin(y1_max, y2_max);

    const float overlap_w = std::fmax(0.f, right - left + 1.0f);
    const float overlap_h = std::fmax(0.f, bottom - top + 1.0f);
    const float intersection = overlap_w * overlap_h;

    const float area_a = (x1_max - x1_min + 1.0f) * (y1_max - y1_min + 1.0f);
    const float area_b = (x2_max - x2_min + 1.0f) * (y2_max - y2_min + 1.0f);
    const float union_area = area_a + area_b - intersection;

    if (union_area <= 0.f) {
        return 0.f;
    }
    return intersection / union_area;
}
|
|
|
|
// In-place quicksort of values[left..right] (inclusive) into descending order.
// `indices` is permuted in lockstep so indices[k] keeps following values[k].
// The first element of the range is the pivot; it is lifted out, leaving a
// "hole" that alternates between the two ends until the scans meet.
void QuickSortDescending(std::vector<float>& values, int left, int right, std::vector<int>& indices) {
    if (left >= right) {
        return;
    }

    const float pivot_value = values[left];
    const int pivot_index = indices[left];
    int lo = left;
    int hi = right;

    while (lo < hi) {
        // Scan from the right for an element larger than the pivot.
        for (; lo < hi && values[hi] <= pivot_value; --hi) {
        }
        values[lo] = values[hi];
        indices[lo] = indices[hi];

        // Scan from the left for an element smaller than the pivot.
        for (; lo < hi && values[lo] >= pivot_value; ++lo) {
        }
        values[hi] = values[lo];
        indices[hi] = indices[lo];
    }

    // lo == hi is the pivot's final position.
    values[lo] = pivot_value;
    indices[lo] = pivot_index;

    QuickSortDescending(values, left, lo - 1, indices);
    QuickSortDescending(values, lo + 1, right, indices);
}
|
|
|
|
void NMS(int valid_count, std::vector<float>& boxes, std::vector<int>& class_ids,
|
|
std::vector<int>& order, int filter_id, float threshold) {
|
|
for (int i = 0; i < valid_count; ++i) {
|
|
int n = order[i];
|
|
if (n < 0 || n >= valid_count) continue;
|
|
if (class_ids[n] != filter_id) continue;
|
|
for (int j = i + 1; j < valid_count; ++j) {
|
|
int m = order[j];
|
|
if (m < 0 || m >= valid_count) continue;
|
|
if (class_ids[m] != filter_id) continue;
|
|
float x1_min = boxes[n * 4 + 0];
|
|
float y1_min = boxes[n * 4 + 1];
|
|
float x1_max = x1_min + boxes[n * 4 + 2];
|
|
float y1_max = y1_min + boxes[n * 4 + 3];
|
|
float x2_min = boxes[m * 4 + 0];
|
|
float y2_min = boxes[m * 4 + 1];
|
|
float x2_max = x2_min + boxes[m * 4 + 2];
|
|
float y2_max = y2_min + boxes[m * 4 + 3];
|
|
if (CalculateIoU(x1_min, y1_min, x1_max, y1_max, x2_min, y2_min, x2_max, y2_max) > threshold) {
|
|
order[j] = -1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
// YOLOv5 feature map processing (anchor-based)
|
|
int ProcessFeatureMapV5(int8_t* input, const int* anchor, int grid_h, int grid_w,
|
|
int model_h, int model_w, int stride,
|
|
std::vector<float>& boxes, std::vector<float>& obj_probs,
|
|
std::vector<int>& class_ids, float conf_thresh, int32_t zp, float scale) {
|
|
int valid_count = 0;
|
|
int grid_len = grid_h * grid_w;
|
|
int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh, zp, scale);
|
|
|
|
for (int a = 0; a < 3; ++a) {
|
|
for (int i = 0; i < grid_h; ++i) {
|
|
for (int j = 0; j < grid_w; ++j) {
|
|
int8_t box_conf = input[(kPropBoxSizeV5 * a + 4) * grid_len + i * grid_w + j];
|
|
if (box_conf >= thresh_i8) {
|
|
int offset = (kPropBoxSizeV5 * a) * grid_len + i * grid_w + j;
|
|
int8_t* ptr = input + offset;
|
|
|
|
float bx = DequantizeAffineToF32(*ptr, zp, scale) * 2.0f - 0.5f;
|
|
float by = DequantizeAffineToF32(ptr[grid_len], zp, scale) * 2.0f - 0.5f;
|
|
float bw = DequantizeAffineToF32(ptr[2 * grid_len], zp, scale) * 2.0f;
|
|
float bh = DequantizeAffineToF32(ptr[3 * grid_len], zp, scale) * 2.0f;
|
|
|
|
bx = (bx + j) * stride;
|
|
by = (by + i) * stride;
|
|
bw = bw * bw * anchor[a * 2];
|
|
bh = bh * bh * anchor[a * 2 + 1];
|
|
bx -= bw / 2.0f;
|
|
by -= bh / 2.0f;
|
|
|
|
int8_t max_cls_prob = ptr[5 * grid_len];
|
|
int max_cls_id = 0;
|
|
for (int k = 1; k < kObjClassNum; ++k) {
|
|
int8_t prob = ptr[(5 + k) * grid_len];
|
|
if (prob > max_cls_prob) {
|
|
max_cls_id = k;
|
|
max_cls_prob = prob;
|
|
}
|
|
}
|
|
|
|
if (max_cls_prob > thresh_i8) {
|
|
float score = DequantizeAffineToF32(max_cls_prob, zp, scale) *
|
|
DequantizeAffineToF32(box_conf, zp, scale);
|
|
obj_probs.push_back(score);
|
|
class_ids.push_back(max_cls_id);
|
|
boxes.push_back(bx);
|
|
boxes.push_back(by);
|
|
boxes.push_back(bw);
|
|
boxes.push_back(bh);
|
|
++valid_count;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return valid_count;
|
|
}
|
|
|
|
// Size in bytes of one element of the given RKNN tensor type.
// FLOAT32 -> 4, FLOAT16 -> 2; INT8, UINT8 and every other type -> 1.
uint32_t TensorTypeSizeBytes(rknn_tensor_type t) {
    if (t == RKNN_TENSOR_FLOAT32) {
        return 4;
    }
    if (t == RKNN_TENSOR_FLOAT16) {
        return 2;
    }
    return 1;
}
|
|
|
|
// Number of proposals for a model_h x model_w input when summing the grids at
// strides 8, 16 and 32. Returns 0 for non-positive dimensions.
int DefaultV8NumBoxes(int model_h, int model_w) {
    if (model_h <= 0 || model_w <= 0) {
        return 0;
    }
    constexpr int kStrides[] = {8, 16, 32};
    int total = 0;
    for (const int stride : kStrides) {
        total += (model_h / stride) * (model_w / stride);
    }
    return total;
}
|
|
|
|
// Resolved memory layout of a YOLOv8 output tensor.
struct V8LayoutInfo {
    // Number of proposals in the tensor.
    int num_boxes{0};
    // true: channels x boxes (CxN); false: boxes x channels (NxC).
    bool channels_first{true};
};
|
|
|
|
// Plausibility score of a decoded box relative to the model input size.
// Positive width and height earn 3 points; each of six bounds checks (size and
// position within a 10-20% tolerance of the model frame) earns 1 more.
float ScoreBoxCandidate(float x, float y, float w, float h, int model_w, int model_h) {
    float score = (w > 0.0f && h > 0.0f) ? 3.0f : 0.0f;

    const bool bounds_ok[] = {
        w <= model_w * 1.2f,
        h <= model_h * 1.2f,
        x >= -model_w * 0.1f,
        y >= -model_h * 0.1f,
        (x + w) <= model_w * 1.2f,
        (y + h) <= model_h * 1.2f,
    };
    for (const bool ok : bounds_ok) {
        if (ok) {
            score += 1.0f;
        }
    }
    return score;
}
|
|
|
|
// Heuristic: true when all four values lie in [-0.05, 2.5], which the decoder
// treats as "normalized" coordinates. Any NaN makes the result false.
bool SeemsNormalized(float a, float b, float c, float d) {
    const float values[] = {a, b, c, d};
    for (const float v : values) {
        if (!(v >= -0.05f && v <= 2.5f)) {
            return false;
        }
    }
    return true;
}
|
|
|
|
// Lower-case name of a box format for log output; anything that is not one of
// the three explicit formats is reported as "auto".
const char* V8BoxFormatName(V8BoxFormat fmt) {
    if (fmt == V8BoxFormat::CxCyWh) {
        return "cxcywh";
    }
    if (fmt == V8BoxFormat::XyXy) {
        return "xyxy";
    }
    if (fmt == V8BoxFormat::XyWh) {
        return "xywh";
    }
    return "auto";
}
|
|
|
|
void DecodeV8Box(float a, float b, float c, float d, int model_w, int model_h, V8BoxFormat fmt,
|
|
float& out_x, float& out_y, float& out_w, float& out_h, V8BoxFormat* used_fmt = nullptr) {
|
|
if (SeemsNormalized(a, b, c, d)) {
|
|
a *= static_cast<float>(model_w);
|
|
b *= static_cast<float>(model_h);
|
|
c *= static_cast<float>(model_w);
|
|
d *= static_cast<float>(model_h);
|
|
}
|
|
|
|
auto decode_cxcywh = [&](float& x, float& y, float& w, float& h) {
|
|
x = a - c / 2.0f;
|
|
y = b - d / 2.0f;
|
|
w = c;
|
|
h = d;
|
|
};
|
|
auto decode_xyxy = [&](float& x, float& y, float& w, float& h) {
|
|
x = a;
|
|
y = b;
|
|
w = c - a;
|
|
h = d - b;
|
|
};
|
|
auto decode_xywh = [&](float& x, float& y, float& w, float& h) {
|
|
x = a;
|
|
y = b;
|
|
w = c;
|
|
h = d;
|
|
};
|
|
|
|
if (fmt == V8BoxFormat::CxCyWh) {
|
|
decode_cxcywh(out_x, out_y, out_w, out_h);
|
|
if (used_fmt) *used_fmt = V8BoxFormat::CxCyWh;
|
|
return;
|
|
}
|
|
if (fmt == V8BoxFormat::XyXy) {
|
|
decode_xyxy(out_x, out_y, out_w, out_h);
|
|
if (used_fmt) *used_fmt = V8BoxFormat::XyXy;
|
|
return;
|
|
}
|
|
if (fmt == V8BoxFormat::XyWh) {
|
|
decode_xywh(out_x, out_y, out_w, out_h);
|
|
if (used_fmt) *used_fmt = V8BoxFormat::XyWh;
|
|
return;
|
|
}
|
|
|
|
float x1 = 0.0f, y1 = 0.0f, w1 = 0.0f, h1 = 0.0f;
|
|
float x2 = 0.0f, y2 = 0.0f, w2 = 0.0f, h2 = 0.0f;
|
|
float x3 = 0.0f, y3 = 0.0f, w3 = 0.0f, h3 = 0.0f;
|
|
decode_cxcywh(x1, y1, w1, h1);
|
|
decode_xyxy(x2, y2, w2, h2);
|
|
decode_xywh(x3, y3, w3, h3);
|
|
|
|
const float s1 = ScoreBoxCandidate(x1, y1, w1, h1, model_w, model_h);
|
|
const float s2 = ScoreBoxCandidate(x2, y2, w2, h2, model_w, model_h);
|
|
const float s3 = ScoreBoxCandidate(x3, y3, w3, h3, model_w, model_h);
|
|
if (s2 >= s1 && s2 >= s3) {
|
|
out_x = x2; out_y = y2; out_w = w2; out_h = h2;
|
|
if (used_fmt) *used_fmt = V8BoxFormat::XyXy;
|
|
} else if (s3 >= s1 && s3 >= s2) {
|
|
out_x = x3; out_y = y3; out_w = w3; out_h = h3;
|
|
if (used_fmt) *used_fmt = V8BoxFormat::XyWh;
|
|
} else {
|
|
out_x = x1; out_y = y1; out_w = w1; out_h = h1;
|
|
if (used_fmt) *used_fmt = V8BoxFormat::CxCyWh;
|
|
}
|
|
}
|
|
|
|
bool ResolveV8ApplySigmoid(const float* output, int num_boxes, int num_classes, bool channels_first,
|
|
V8ClsActivation act_mode) {
|
|
if (act_mode == V8ClsActivation::None) return false;
|
|
if (act_mode == V8ClsActivation::Sigmoid) return true;
|
|
if (!output || num_boxes <= 0 || num_classes <= 0) return false;
|
|
|
|
const int num_channels = 4 + num_classes;
|
|
const int sample_boxes = std::min(num_boxes, 64);
|
|
float min_v = 1e9f;
|
|
float max_v = -1e9f;
|
|
for (int i = 0; i < sample_boxes; ++i) {
|
|
for (int c = 0; c < num_classes; ++c) {
|
|
const float v = channels_first ? output[(4 + c) * num_boxes + i]
|
|
: output[i * num_channels + (4 + c)];
|
|
if (v < min_v) min_v = v;
|
|
if (v > max_v) max_v = v;
|
|
}
|
|
}
|
|
// If class outputs clearly look like logits, enable sigmoid.
|
|
return (min_v < -0.1f || max_v > 1.5f);
|
|
}
|
|
|
|
// Works out how many boxes a YOLOv8 output tensor holds and whether it is laid
// out channels-first (CxN) or boxes-first (NxC).
//
// The channel axis is the first dimension equal to 4 + num_classes. The
// nearest non-unit dimension before / after it decides the layout: only-after
// -> channels-first, only-before -> boxes-first, both -> whichever side is
// larger (ties: channels-first), none -> channels-first. The box count is
// derived from the buffer size; if that fails the rank-3 dims, then the
// stride-8/16/32 grid count, then 8400 are used as fallbacks. The result never
// exceeds what the buffer can hold when the buffer size is known.
V8LayoutInfo ResolveV8Layout(const std::vector<uint32_t>& dims, size_t byte_size,
                             rknn_tensor_type type, int num_classes,
                             int model_h, int model_w) {
    V8LayoutInfo layout;
    const int channels = num_classes + 4;
    if (channels <= 0) {
        return layout;
    }

    const uint32_t bytes_per_elem = TensorTypeSizeBytes(type);
    const size_t elem_count = bytes_per_elem > 0 ? (byte_size / bytes_per_elem) : 0;
    // channels > 0 is guaranteed by the early return above.
    const size_t buffer_capacity = elem_count / static_cast<size_t>(channels);

    // Locate the channel axis.
    const uint32_t wanted = static_cast<uint32_t>(channels);
    size_t channel_axis = dims.size();
    for (size_t k = 0; k < dims.size(); ++k) {
        if (dims[k] == wanted) {
            channel_axis = k;
            break;
        }
    }
    const bool has_channel_axis = channel_axis < dims.size();

    if (has_channel_axis && elem_count >= static_cast<size_t>(channels)) {
        layout.num_boxes = static_cast<int>(buffer_capacity);

        // Nearest non-unit extent on each side of the channel axis (1 if none).
        int before = 1;
        for (size_t k = channel_axis; k-- > 0;) {
            if (dims[k] > 1U) {
                before = static_cast<int>(dims[k]);
                break;
            }
        }
        int after = 1;
        for (size_t k = channel_axis + 1; k < dims.size(); ++k) {
            if (dims[k] > 1U) {
                after = static_cast<int>(dims[k]);
                break;
            }
        }

        const bool has_before = before > 1;
        const bool has_after = after > 1;
        if (has_before && !has_after) {
            layout.channels_first = false;
        } else if (has_before && has_after) {
            layout.channels_first = after >= before;
        } else {
            layout.channels_first = true;
        }
    } else if (dims.size() >= 3) {
        // Compatibility with old rank-3 assumptions.
        if (dims[1] == wanted) {
            layout.num_boxes = static_cast<int>(dims[2]);
            layout.channels_first = true;
        } else if (dims[2] == wanted) {
            layout.num_boxes = static_cast<int>(dims[1]);
            layout.channels_first = false;
        }
    }

    // Fallback chain for the box count.
    if (layout.num_boxes <= 0 && buffer_capacity > 0) {
        layout.num_boxes = static_cast<int>(buffer_capacity);
    }
    if (layout.num_boxes <= 0) {
        layout.num_boxes = DefaultV8NumBoxes(model_h, model_w);
    }
    if (layout.num_boxes <= 0) {
        layout.num_boxes = 8400;
    }

    // Never claim more boxes than the buffer can hold.
    if (buffer_capacity > 0 && static_cast<size_t>(layout.num_boxes) > buffer_capacity) {
        layout.num_boxes = static_cast<int>(buffer_capacity);
    }
    if (layout.num_boxes < 0) {
        layout.num_boxes = 0;
    }
    return layout;
}
|
|
|
|
// YOLOv8 output processing (anchor-free, single output tensor)
|
|
int ProcessOutputV8(float* output, int num_boxes, int num_classes,
|
|
int model_h, int model_w,
|
|
std::vector<float>& boxes, std::vector<float>& obj_probs,
|
|
std::vector<int>& class_ids, float conf_thresh,
|
|
bool channels_first, V8BoxFormat box_format, bool apply_sigmoid,
|
|
bool debug_decode, int* debug_left) {
|
|
int valid_count = 0;
|
|
const int num_channels = 4 + num_classes;
|
|
|
|
for (int i = 0; i < num_boxes; ++i) {
|
|
float max_score = 0.0f;
|
|
int max_cls_id = 0;
|
|
for (int c = 0; c < num_classes; ++c) {
|
|
float score = channels_first ? output[(4 + c) * num_boxes + i]
|
|
: output[i * num_channels + (4 + c)];
|
|
if (apply_sigmoid) score = Sigmoid(score);
|
|
if (score > max_score) {
|
|
max_score = score;
|
|
max_cls_id = c;
|
|
}
|
|
}
|
|
|
|
if (max_score >= conf_thresh) {
|
|
const float a = channels_first ? output[0 * num_boxes + i] : output[i * num_channels + 0];
|
|
const float b = channels_first ? output[1 * num_boxes + i] : output[i * num_channels + 1];
|
|
const float c = channels_first ? output[2 * num_boxes + i] : output[i * num_channels + 2];
|
|
const float d = channels_first ? output[3 * num_boxes + i] : output[i * num_channels + 3];
|
|
if (!std::isfinite(a) || !std::isfinite(b) || !std::isfinite(c) || !std::isfinite(d)) {
|
|
continue;
|
|
}
|
|
float x1 = 0.0f, y1 = 0.0f, w = 0.0f, h = 0.0f;
|
|
V8BoxFormat used_fmt = box_format;
|
|
DecodeV8Box(a, b, c, d, model_w, model_h, box_format, x1, y1, w, h, &used_fmt);
|
|
if (!std::isfinite(x1) || !std::isfinite(y1) || !std::isfinite(w) || !std::isfinite(h)) {
|
|
continue;
|
|
}
|
|
if (w <= 1e-3f || h <= 1e-3f) continue;
|
|
if (debug_decode && debug_left && *debug_left > 0) {
|
|
--(*debug_left);
|
|
LogInfo("[ai_yolo] v8 decode f32: raw4(" + std::to_string(a) + "," + std::to_string(b) + "," +
|
|
std::to_string(c) + "," + std::to_string(d) + ") fmt=" + V8BoxFormatName(used_fmt) +
|
|
" -> xywh(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
|
|
std::to_string(w) + "," + std::to_string(h) + ") cls=" +
|
|
std::to_string(max_cls_id) + " score=" + std::to_string(max_score));
|
|
}
|
|
|
|
boxes.push_back(x1);
|
|
boxes.push_back(y1);
|
|
boxes.push_back(w);
|
|
boxes.push_back(h);
|
|
obj_probs.push_back(max_score);
|
|
class_ids.push_back(max_cls_id);
|
|
++valid_count;
|
|
}
|
|
}
|
|
return valid_count;
|
|
}
|
|
|
|
// YOLOv8 INT8 output processing
|
|
int ProcessOutputV8Int8(int8_t* output, int num_boxes, int num_classes,
|
|
int model_h, int model_w,
|
|
std::vector<float>& boxes, std::vector<float>& obj_probs,
|
|
std::vector<int>& class_ids, float conf_thresh,
|
|
int32_t zp, float scale, bool channels_first, V8BoxFormat box_format,
|
|
bool debug_decode, int* debug_left) {
|
|
int valid_count = 0;
|
|
int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh, zp, scale);
|
|
const int num_channels = 4 + num_classes;
|
|
|
|
for (int i = 0; i < num_boxes; ++i) {
|
|
int8_t max_score_i8 = -128;
|
|
int max_cls_id = 0;
|
|
for (int c = 0; c < num_classes; ++c) {
|
|
int8_t score = channels_first ? output[(4 + c) * num_boxes + i]
|
|
: output[i * num_channels + (4 + c)];
|
|
if (score > max_score_i8) {
|
|
max_score_i8 = score;
|
|
max_cls_id = c;
|
|
}
|
|
}
|
|
|
|
if (max_score_i8 >= thresh_i8) {
|
|
float a = DequantizeAffineToF32(
|
|
channels_first ? output[0 * num_boxes + i] : output[i * num_channels + 0], zp, scale);
|
|
float b = DequantizeAffineToF32(
|
|
channels_first ? output[1 * num_boxes + i] : output[i * num_channels + 1], zp, scale);
|
|
float c = DequantizeAffineToF32(
|
|
channels_first ? output[2 * num_boxes + i] : output[i * num_channels + 2], zp, scale);
|
|
float d = DequantizeAffineToF32(
|
|
channels_first ? output[3 * num_boxes + i] : output[i * num_channels + 3], zp, scale);
|
|
float max_score = DequantizeAffineToF32(max_score_i8, zp, scale);
|
|
if (!std::isfinite(a) || !std::isfinite(b) || !std::isfinite(c) || !std::isfinite(d)) {
|
|
continue;
|
|
}
|
|
float x1 = 0.0f, y1 = 0.0f, w = 0.0f, h = 0.0f;
|
|
V8BoxFormat used_fmt = box_format;
|
|
DecodeV8Box(a, b, c, d, model_w, model_h, box_format, x1, y1, w, h, &used_fmt);
|
|
if (!std::isfinite(x1) || !std::isfinite(y1) || !std::isfinite(w) || !std::isfinite(h)) {
|
|
continue;
|
|
}
|
|
if (w <= 1e-3f || h <= 1e-3f) continue;
|
|
if (debug_decode && debug_left && *debug_left > 0) {
|
|
--(*debug_left);
|
|
LogInfo("[ai_yolo] v8 decode int8: raw4(" + std::to_string(a) + "," + std::to_string(b) + "," +
|
|
std::to_string(c) + "," + std::to_string(d) + ") fmt=" + V8BoxFormatName(used_fmt) +
|
|
" -> xywh(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
|
|
std::to_string(w) + "," + std::to_string(h) + ") cls=" +
|
|
std::to_string(max_cls_id) + " score=" + std::to_string(max_score));
|
|
}
|
|
|
|
boxes.push_back(x1);
|
|
boxes.push_back(y1);
|
|
boxes.push_back(w);
|
|
boxes.push_back(h);
|
|
obj_probs.push_back(max_score);
|
|
class_ids.push_back(max_cls_id);
|
|
++valid_count;
|
|
}
|
|
}
|
|
return valid_count;
|
|
}
|
|
#endif
|
|
|
|
} // namespace
|
|
|
|
class AiYoloNode : public INode {
|
|
public:
|
|
// Returns the node id, which Init() reads from the "id" config key.
std::string Id() const override { return id_; }
|
|
// Returns the fixed type tag of this node, "ai_yolo".
std::string Type() const override { return "ai_yolo"; }
|
|
|
|
bool Init(const SimpleJson& config, const NodeContext& ctx) override {
|
|
id_ = config.ValueOr<std::string>("id", "ai_yolo");
|
|
model_path_ = config.ValueOr<std::string>("model_path", "");
|
|
conf_thresh_ = config.ValueOr<float>("conf", 0.25f);
|
|
nms_thresh_ = config.ValueOr<float>("nms", 0.45f);
|
|
model_input_w_ = config.ValueOr<int>("model_w", 640);
|
|
model_input_h_ = config.ValueOr<int>("model_h", 640);
|
|
num_classes_ = config.ValueOr<int>("num_classes", 80);
|
|
{
|
|
const std::string bf = config.ValueOr<std::string>("v8_box_format", "cxcywh");
|
|
if (bf == "xyxy") {
|
|
v8_box_format_ = V8BoxFormat::XyXy;
|
|
} else if (bf == "xywh") {
|
|
v8_box_format_ = V8BoxFormat::XyWh;
|
|
} else if (bf == "cxcywh") {
|
|
v8_box_format_ = V8BoxFormat::CxCyWh;
|
|
} else {
|
|
v8_box_format_ = V8BoxFormat::Auto;
|
|
}
|
|
}
|
|
{
|
|
const std::string act = config.ValueOr<std::string>("v8_cls_activation", "auto");
|
|
if (act == "sigmoid") {
|
|
v8_cls_activation_ = V8ClsActivation::Sigmoid;
|
|
} else if (act == "none") {
|
|
v8_cls_activation_ = V8ClsActivation::None;
|
|
} else {
|
|
v8_cls_activation_ = V8ClsActivation::Auto;
|
|
}
|
|
}
|
|
|
|
if (const SimpleJson* dbg = config.Find("debug"); dbg && dbg->IsObject()) {
|
|
stats_log_ = dbg->ValueOr<bool>("stats", stats_log_);
|
|
stats_interval_ = std::max<uint64_t>(
|
|
1, static_cast<uint64_t>(dbg->ValueOr<int>("stats_interval", static_cast<int>(stats_interval_))));
|
|
debug_det_ = dbg->ValueOr<bool>("detections", debug_det_);
|
|
}
|
|
|
|
// Optional inference throttle. 0 = run every frame.
|
|
infer_interval_ms_ = std::max<int64_t>(0, static_cast<int64_t>(config.ValueOr<int>("infer_interval_ms", 0)));
|
|
if (infer_interval_ms_ <= 0) {
|
|
const double infer_fps = config.ValueOr<double>("infer_fps", 0.0);
|
|
if (infer_fps > 0.0) {
|
|
infer_interval_ms_ = static_cast<int64_t>(1000.0 / infer_fps);
|
|
if (infer_interval_ms_ < 1) infer_interval_ms_ = 1;
|
|
}
|
|
}
|
|
|
|
std::string ver = config.ValueOr<std::string>("model_version", "auto");
|
|
if (ver == "v5") {
|
|
yolo_version_ = YoloVersion::V5;
|
|
} else if (ver == "v8") {
|
|
yolo_version_ = YoloVersion::V8;
|
|
} else {
|
|
yolo_version_ = YoloVersion::V8;
|
|
auto_detect_version_ = true;
|
|
}
|
|
|
|
if (const SimpleJson* filter = config.Find("class_filter")) {
|
|
for (const auto& item : filter->AsArray()) {
|
|
class_filter_.insert(item.AsInt(-1));
|
|
}
|
|
}
|
|
|
|
if (const SimpleJson* expand = config.Find("bbox_expand"); expand && expand->IsObject()) {
|
|
bbox_expand_.enable = expand->ValueOr<bool>("enable", false);
|
|
bbox_expand_.class_id = expand->ValueOr<int>("class_id", bbox_expand_.class_id);
|
|
bbox_expand_.left = expand->ValueOr<float>("left", bbox_expand_.left);
|
|
bbox_expand_.right = expand->ValueOr<float>("right", bbox_expand_.right);
|
|
bbox_expand_.top = expand->ValueOr<float>("top", bbox_expand_.top);
|
|
bbox_expand_.bottom = expand->ValueOr<float>("bottom", bbox_expand_.bottom);
|
|
}
|
|
|
|
input_queue_ = ctx.input_queue;
|
|
if (!input_queue_) {
|
|
LogError("[ai_yolo] no input queue for node " + id_);
|
|
return false;
|
|
}
|
|
if (ctx.output_queues.empty()) {
|
|
LogError("[ai_yolo] no output queue for node " + id_);
|
|
return false;
|
|
}
|
|
output_queues_ = ctx.output_queues;
|
|
|
|
infer_backend_ = ctx.infer_backend;
|
|
if (!infer_backend_) {
|
|
LogError("[ai_yolo] no infer backend for node " + id_);
|
|
return false;
|
|
}
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
if (model_path_.empty()) {
|
|
LogError("[ai_yolo] model_path is required");
|
|
return false;
|
|
}
|
|
|
|
std::string err;
|
|
model_handle_ = infer_backend_->LoadModel(model_path_, err);
|
|
if (model_handle_ == kInvalidModelHandle) {
|
|
LogError("[ai_yolo] failed to load model: " + err);
|
|
return false;
|
|
}
|
|
|
|
ModelInfo info;
|
|
if (infer_backend_->GetModelInfo(model_handle_, info)) {
|
|
model_input_w_ = info.input_width;
|
|
model_input_h_ = info.input_height;
|
|
n_output_ = info.n_output;
|
|
|
|
if (auto_detect_version_) {
|
|
if (n_output_ == 1) {
|
|
yolo_version_ = YoloVersion::V8;
|
|
} else if (n_output_ >= 3) {
|
|
yolo_version_ = YoloVersion::V5;
|
|
}
|
|
}
|
|
}
|
|
|
|
LogInfo("[ai_yolo] model loaded via InferBackend: " + model_path_ +
|
|
" (handle=" + std::to_string(model_handle_) + ", version=" +
|
|
(yolo_version_ == YoloVersion::V5 ? "v5" : "v8") + ")");
|
|
#else
|
|
LogWarn("[ai_yolo] RKNN disabled, will passthrough frames");
|
|
#endif
|
|
return true;
|
|
}
|
|
|
|
// Logs the node id and the confidence / NMS thresholds; always succeeds.
bool Start() override {
    std::string banner = "[ai_yolo] start id=";
    banner += id_;
    banner += " conf=";
    banner += std::to_string(conf_thresh_);
    banner += " nms=";
    banner += std::to_string(nms_thresh_);
    LogInfo(banner);
    return true;
}
|
|
|
|
// Unloads the model (RKNN builds, if one is loaded) and logs the stop.
void Stop() override {
#if defined(RK3588_ENABLE_RKNN)
    const bool model_loaded = (model_handle_ != kInvalidModelHandle);
    if (model_loaded) {
        infer_backend_->UnloadModel(model_handle_);
        // Mark the handle as released so a second Stop() does not unload again.
        model_handle_ = kInvalidModelHandle;
    }
#endif
    LogInfo(std::string("[ai_yolo] stop id=") + id_);
}
|
|
|
|
// Handles one frame: optionally runs inference on it, then forwards it to all
// output queues. Null frames are dropped.
//
// When a throttle interval is configured and the frame has a positive pts,
// inference is skipped if less than the interval has elapsed since the last
// inferred frame (pts is divided by 1000 to obtain milliseconds). Skipped
// frames are still forwarded and counted.
NodeStatus Process(FramePtr frame) override {
    if (!frame) {
        return NodeStatus::DROP;
    }

    bool run_inference = true;
    if (infer_interval_ms_ > 0 && frame->pts > 0) {
        const int64_t now_ms = static_cast<int64_t>(frame->pts / 1000ULL);
        const int64_t elapsed_ms = now_ms - last_infer_pts_ms_;
        const bool too_soon =
            last_infer_pts_ms_ > 0 && elapsed_ms > 0 && elapsed_ms < infer_interval_ms_;
        if (too_soon) {
            run_inference = false;
        } else {
            last_infer_pts_ms_ = now_ms;
        }
    }

#if defined(RK3588_ENABLE_RKNN)
    if (run_inference) {
        RunInference(frame);
    }
#endif
    (void)run_inference;

    PushToDownstream(frame);
    ++processed_;

    // Stats logging disabled
    (void)stats_log_;
    (void)stats_interval_;
    return NodeStatus::OK;
}
|
|
|
|
private:
|
|
// Pushes the frame to every configured output queue, in order.
void PushToDownstream(FramePtr frame) {
    for (auto it = output_queues_.begin(); it != output_queues_.end(); ++it) {
        (*it)->Push(frame);
    }
}
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
// Bilinear resize of a packed 3-bytes-per-pixel image.
//
// src / dst:   first byte of the source / destination image
// *_w, *_h:    image size in pixels
// *_stride:    distance between rows in bytes
//
// Destination pixel (x, y) samples the source at (x * src_w / dst_w,
// y * src_h / dst_h); the right/bottom neighbours are clamped to the last
// source column/row. Results are truncated to uint8_t.
//
// Does nothing when a pointer is null or any dimension is not positive.
// Without that guard a non-positive source size makes the clamped neighbour
// index negative and reads before the start of `src`.
void ResizeRgbBilinear(const uint8_t* src, int src_w, int src_h, int src_stride,
                       uint8_t* dst, int dst_w, int dst_h, int dst_stride) {
    if (!src || !dst || src_w <= 0 || src_h <= 0 || dst_w <= 0 || dst_h <= 0) {
        return;
    }

    const float scale_x = static_cast<float>(src_w) / static_cast<float>(dst_w);
    const float scale_y = static_cast<float>(src_h) / static_cast<float>(dst_h);

    for (int y = 0; y < dst_h; ++y) {
        const float fy = static_cast<float>(y) * scale_y;
        const int y0 = static_cast<int>(fy);
        const int y1 = std::min(y0 + 1, src_h - 1);
        const float dy = fy - static_cast<float>(y0);

        // Row pointers are loop-invariant for the whole destination row.
        const uint8_t* const row0 = src + static_cast<std::ptrdiff_t>(y0) * src_stride;
        const uint8_t* const row1 = src + static_cast<std::ptrdiff_t>(y1) * src_stride;
        uint8_t* const out_row = dst + static_cast<std::ptrdiff_t>(y) * dst_stride;

        for (int x = 0; x < dst_w; ++x) {
            const float fx = static_cast<float>(x) * scale_x;
            const int x0 = static_cast<int>(fx);
            const int x1 = std::min(x0 + 1, src_w - 1);
            const float dx = fx - static_cast<float>(x0);

            for (int c = 0; c < 3; ++c) {
                const float v00 = row0[x0 * 3 + c];
                const float v01 = row0[x1 * 3 + c];
                const float v10 = row1[x0 * 3 + c];
                const float v11 = row1[x1 * 3 + c];
                const float v = v00 * (1.0f - dx) * (1.0f - dy) +
                                v01 * dx * (1.0f - dy) +
                                v10 * (1.0f - dx) * dy +
                                v11 * dx * dy;
                out_row[x * 3 + c] = static_cast<uint8_t>(v);
            }
        }
    }
}
|
|
|
|
// Prepares the model input from an RGB/BGR frame, runs inference through the
// backend and hands the outputs to PostProcessBorrowed().
//
// Fast path: a frame that already has the model's input size, tightly packed
// rows and enough data is passed to the backend as-is, with its DMA-BUF fd and
// offset attached when available. Otherwise the pixels are copied row by row
// (same size, padded rows) or resized bilinearly into `resized_input_`.
//
// Returns without inference when the frame has no data, is not RGB/BGR, has a
// non-positive size, a stride smaller than one packed row, or a buffer too
// small for stride * height. The size and stride checks keep malformed frames
// away from the copy/resize code, which would otherwise read out of bounds.
void RunInference(FramePtr frame) {
    if (!frame->data || frame->data_size == 0) return;

    bool is_rgb = (frame->format == PixelFormat::RGB || frame->format == PixelFormat::BGR);
    if (!is_rgb) {
        LogWarn("[ai_yolo] input must be RGB/BGR, got other format");
        return;
    }

    const int w = frame->width;
    const int h = frame->height;
    if (w <= 0 || h <= 0) return;

    const size_t packed_row = static_cast<size_t>(w) * 3;
    const size_t packed_size = packed_row * static_cast<size_t>(h);

    // Prefer plane 0 when it is populated, otherwise fall back to the frame-level fields.
    const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data;
    int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride
                                                 : (frame->stride > 0 ? frame->stride : static_cast<int>(packed_row));
    if (!src || src_stride <= 0) return;
    // A row cannot be shorter than its packed pixels.
    if (static_cast<size_t>(src_stride) < packed_row) return;

    InferInput input;
    const bool exact_model_input = (w == model_input_w_ && h == model_input_h_);
    if (exact_model_input && static_cast<size_t>(src_stride) == packed_row && frame->data_size >= packed_size) {
        input.data = src;
        input.size = packed_size;
        input.width = w;
        input.height = h;

        // Best-effort RKNN DMA-BUF zero-copy path.
        if (frame->DmaFd() >= 0 && frame->data) {
            const ptrdiff_t off = src - frame->data;
            if (off >= 0 && static_cast<size_t>(off) + packed_size <= frame->data_size) {
                input.dma_fd = frame->DmaFd();
                input.dma_offset = static_cast<int>(off);
            }
        }
    } else {
        if (frame->data_size < static_cast<size_t>(src_stride) * static_cast<size_t>(h)) {
            LogWarn("[ai_yolo] invalid RGB buffer size/stride (data_size=" + std::to_string(frame->data_size) +
                    ", stride=" + std::to_string(src_stride) +
                    ", h=" + std::to_string(h) + ")");
            return;
        }

        // The source pixels are read by the CPU below; bracket the access for DMA-backed frames.
        if (frame->DmaFd() >= 0) frame->SyncStart();
        const size_t input_row = static_cast<size_t>(model_input_w_) * 3;
        const size_t input_size = input_row * static_cast<size_t>(model_input_h_);
        resized_input_.resize(input_size);

        if (exact_model_input) {
            // Same size, padded rows: drop the padding while copying.
            for (int y = 0; y < h; ++y) {
                memcpy(resized_input_.data() + static_cast<size_t>(y) * input_row,
                       src + static_cast<size_t>(y) * static_cast<size_t>(src_stride),
                       input_row);
            }
        } else {
            ResizeRgbBilinear(src, w, h, src_stride,
                              resized_input_.data(), model_input_w_, model_input_h_,
                              static_cast<int>(input_row));
        }
        if (frame->DmaFd() >= 0) frame->SyncEnd();
        input.data = resized_input_.data();
        input.size = input_size;
        input.width = model_input_w_;
        input.height = model_input_h_;
    }
    input.is_nhwc = true;

    auto result = infer_backend_->InferBorrowed(model_handle_, input);
    if (!result.success) {
        LogWarn("[ai_yolo] inference failed: " + result.error);
        return;
    }

    PostProcessBorrowed(result.outputs, frame);
}
|
|
|
|
/// Converts borrowed backend output tensors into a DetectionResult stored on `frame`.
///
/// Pipeline: decode candidates for the configured YOLO version, sort them with
/// QuickSortDescending(), run NMS once per class present, then map the remaining
/// boxes into output coordinates (at most kMaxDetections entries are emitted).
/// `frame->det` is left untouched when the outputs are unusable or no candidate is
/// produced.
///
/// @param outputs Backend tensors: at least three for YOLOv5 (strides 8/16/32 are
///                applied to outputs 0/1/2), at least one for YOLOv8.
/// @param frame   Frame the coordinates are computed for and the result is attached to.
void PostProcessBorrowed(const std::vector<AiScheduler::BorrowedOutput>& outputs, FramePtr frame) {
    std::vector<float> boxes;
    std::vector<float> obj_probs;
    std::vector<int> class_ids;
    int valid_count = 0;

    if (yolo_version_ == YoloVersion::V5) {
        // All three heads must be present before any of them is decoded.
        if (outputs.size() < 3) return;
        for (size_t head = 0; head < 3; ++head) {
            if (!outputs[head].data) return;
        }

        // Decodes one head; the grid size follows from the model input size and stride.
        const auto decode_head = [&](const AiScheduler::BorrowedOutput& out,
                                     const int (&anchors)[6], int stride) -> int {
            const int grid_h = model_input_h_ / stride;
            const int grid_w = model_input_w_ / stride;
            return ProcessFeatureMapV5(reinterpret_cast<int8_t*>(const_cast<uint8_t*>(out.data)), anchors,
                                       grid_h, grid_w, model_input_h_, model_input_w_, stride,
                                       boxes, obj_probs, class_ids, conf_thresh_,
                                       out.zp, out.scale);
        };
        valid_count += decode_head(outputs[0], kAnchor0, 8);
        valid_count += decode_head(outputs[1], kAnchor1, 16);
        valid_count += decode_head(outputs[2], kAnchor2, 32);
    } else {
        if (outputs.empty()) return;
        const AiScheduler::BorrowedOutput& out = outputs[0];
        if (!out.data || out.size == 0) return;

        const V8LayoutInfo layout = ResolveV8Layout(out.dims, out.size, out.type, num_classes_,
                                                    model_input_h_, model_input_w_);
        const int num_boxes = layout.num_boxes;
        if (num_boxes <= 0) return;

        // Budget of per-box decode traces handed to the decoders (debug only).
        int debug_decode_left = (debug_det_ && processed_ < 20) ? 5 : 0;
        const bool log_setup = debug_det_ && processed_ < 5;

        if (log_setup) {
            std::string dims_s;
            for (size_t di = 0; di < out.dims.size(); ++di) {
                dims_s += (di == 0 ? "[" : ",");
                dims_s += std::to_string(out.dims[di]);
            }
            dims_s += "]";

            std::string msg = "[ai_yolo] v8 out type=" + std::to_string(static_cast<int>(out.type));
            msg += " size=" + std::to_string(out.size);
            msg += " dims=" + dims_s;
            msg += " num_boxes=" + std::to_string(num_boxes);
            msg += " layout=" + std::string(layout.channels_first ? "CxN" : "NxC");
            LogInfo(msg);
        }

        // Shared by the FP32 and FP16 branches: resolve the class activation, then decode.
        const auto decode_float = [&](float* values) -> int {
            const bool apply_sigmoid = ResolveV8ApplySigmoid(
                values, num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
            if (log_setup) {
                LogInfo("[ai_yolo] v8 cls activation=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
            }
            return ProcessOutputV8(values,
                                   num_boxes, num_classes_,
                                   model_input_h_, model_input_w_,
                                   boxes, obj_probs, class_ids, conf_thresh_,
                                   layout.channels_first, v8_box_format_, apply_sigmoid,
                                   debug_det_, &debug_decode_left);
        };

        if (out.type == RKNN_TENSOR_FLOAT32) {
            valid_count = decode_float(reinterpret_cast<float*>(const_cast<uint8_t*>(out.data)));
        } else if (out.type == RKNN_TENSOR_FLOAT16) {
            // Widen the half-precision tensor into fp32_buffer_ first.
            const size_t num_elements = out.size / sizeof(uint16_t);
            fp32_buffer_.resize(num_elements);
            const uint16_t* fp16_data = reinterpret_cast<const uint16_t*>(out.data);
            std::transform(fp16_data, fp16_data + num_elements, fp32_buffer_.begin(),
                           [&](uint16_t half) { return Fp16ToFp32(half); });
            valid_count = decode_float(fp32_buffer_.data());
        } else {
            // Every other tensor type goes through the quantized int8 decoder.
            valid_count = ProcessOutputV8Int8(reinterpret_cast<int8_t*>(const_cast<uint8_t*>(out.data)),
                                              num_boxes, num_classes_,
                                              model_input_h_, model_input_w_,
                                              boxes, obj_probs, class_ids, conf_thresh_,
                                              out.zp, out.scale,
                                              layout.channels_first, v8_box_format_,
                                              debug_det_, &debug_decode_left);
        }
    }

    if (valid_count <= 0) return;

    // indices starts as 0..valid_count-1 and is permuted by the sort below.
    std::vector<int> indices(valid_count);
    int next_index = 0;
    std::generate(indices.begin(), indices.end(), [&next_index] { return next_index++; });

    QuickSortDescending(obj_probs, 0, valid_count - 1, indices);

    const std::set<int> class_set(class_ids.begin(), class_ids.end());
    for (const int cls : class_set) {
        NMS(valid_count, boxes, class_ids, indices, cls, nms_thresh_);
    }

    const DetCoordContext coord_ctx = BuildDetCoordContext(*frame, model_input_w_, model_input_h_);

    auto det_result = std::make_shared<DetectionResult>();
    det_result->img_w = coord_ctx.out_w;
    det_result->img_h = coord_ctx.out_h;
    det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8";

    const bool log_dets = debug_det_ && processed_ < 20;
    auto& items = det_result->items;
    for (int rank = 0; rank < valid_count; ++rank) {
        if (items.size() >= kMaxDetections) break;

        // Entries marked -1 are skipped (presumably suppressed by NMS — confirm in NMS()).
        const int n = indices[rank];
        if (n == -1) continue;

        const int cls_id = class_ids[n];
        if (!class_filter_.empty() && class_filter_.count(cls_id) == 0) continue;

        const float* box = &boxes[n * 4];
        const float x1 = box[0];
        const float y1 = box[1];
        const float w = box[2];
        const float h = box[3];

        Detection det;
        det.cls_id = cls_id;
        // Score is read by rank, not by n; this relies on QuickSortDescending reordering
        // obj_probs in step with indices — NOTE(review): confirm against its definition.
        det.score = obj_probs[rank];
        det.bbox = DecodeToOutputRect(x1, y1, w, h, coord_ctx);
        if (bbox_expand_.enable && det.cls_id == bbox_expand_.class_id) {
            det.bbox = ExpandRect(det.bbox, coord_ctx.out_w, coord_ctx.out_h, bbox_expand_);
        }
        det.track_id = -1;

        if (log_dets && items.size() < 5) {
            std::string msg = "[ai_yolo] det: raw(";
            msg += std::to_string(x1) + "," + std::to_string(y1) + ",";
            msg += std::to_string(w) + "," + std::to_string(h) + ") -> bbox(";
            msg += std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + ",";
            msg += std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=";
            msg += std::to_string(cls_id) + " score=" + std::to_string(det.score);
            LogInfo(msg);
        }

        items.push_back(det);
    }

    if (log_dets) {
        LogInfo("[ai_yolo] det summary: valid_count=" + std::to_string(valid_count) +
                " final=" + std::to_string(items.size()));
    }

    frame->det = det_result;
}
|
|
|
|
void PostProcess(std::vector<InferOutput>& outputs, FramePtr frame) {
|
|
std::vector<float> boxes;
|
|
std::vector<float> obj_probs;
|
|
std::vector<int> class_ids;
|
|
int valid_count = 0;
|
|
|
|
if (yolo_version_ == YoloVersion::V5) {
|
|
if (outputs.size() < 3) return;
|
|
|
|
int stride0 = 8, stride1 = 16, stride2 = 32;
|
|
int grid_h0 = model_input_h_ / stride0, grid_w0 = model_input_w_ / stride0;
|
|
int grid_h1 = model_input_h_ / stride1, grid_w1 = model_input_w_ / stride1;
|
|
int grid_h2 = model_input_h_ / stride2, grid_w2 = model_input_w_ / stride2;
|
|
|
|
int cnt0 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[0].data.data()), kAnchor0,
|
|
grid_h0, grid_w0, model_input_h_, model_input_w_, stride0,
|
|
boxes, obj_probs, class_ids, conf_thresh_,
|
|
outputs[0].zp, outputs[0].scale);
|
|
int cnt1 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[1].data.data()), kAnchor1,
|
|
grid_h1, grid_w1, model_input_h_, model_input_w_, stride1,
|
|
boxes, obj_probs, class_ids, conf_thresh_,
|
|
outputs[1].zp, outputs[1].scale);
|
|
int cnt2 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[2].data.data()), kAnchor2,
|
|
grid_h2, grid_w2, model_input_h_, model_input_w_, stride2,
|
|
boxes, obj_probs, class_ids, conf_thresh_,
|
|
outputs[2].zp, outputs[2].scale);
|
|
valid_count = cnt0 + cnt1 + cnt2;
|
|
} else {
|
|
if (outputs.empty()) return;
|
|
|
|
const V8LayoutInfo layout = ResolveV8Layout(outputs[0].dims, outputs[0].data.size(),
|
|
outputs[0].type, num_classes_,
|
|
model_input_h_, model_input_w_);
|
|
const int num_boxes = layout.num_boxes;
|
|
int debug_decode_left = (debug_det_ && processed_ < 20) ? 5 : 0;
|
|
if (num_boxes <= 0) return;
|
|
if (debug_det_ && processed_ < 5) {
|
|
std::string dims_s;
|
|
for (size_t di = 0; di < outputs[0].dims.size(); ++di) {
|
|
dims_s += (di == 0 ? "[" : ",");
|
|
dims_s += std::to_string(outputs[0].dims[di]);
|
|
}
|
|
dims_s += "]";
|
|
LogInfo("[ai_yolo] v8 out(type copy) type=" + std::to_string(static_cast<int>(outputs[0].type)) +
|
|
" size=" + std::to_string(outputs[0].data.size()) +
|
|
" dims=" + dims_s +
|
|
" num_boxes=" + std::to_string(num_boxes) +
|
|
" layout=" + std::string(layout.channels_first ? "CxN" : "NxC"));
|
|
}
|
|
|
|
if (outputs[0].type == RKNN_TENSOR_FLOAT32) {
|
|
const bool apply_sigmoid = ResolveV8ApplySigmoid(
|
|
reinterpret_cast<float*>(outputs[0].data.data()),
|
|
num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
|
|
if (debug_det_ && processed_ < 5) {
|
|
LogInfo("[ai_yolo] v8 cls activation(copy)=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
|
|
}
|
|
valid_count = ProcessOutputV8(reinterpret_cast<float*>(outputs[0].data.data()),
|
|
num_boxes, num_classes_,
|
|
model_input_h_, model_input_w_,
|
|
boxes, obj_probs, class_ids, conf_thresh_,
|
|
layout.channels_first, v8_box_format_, apply_sigmoid,
|
|
debug_det_, &debug_decode_left);
|
|
} else if (outputs[0].type == RKNN_TENSOR_FLOAT16) {
|
|
// Convert FP16 to FP32
|
|
size_t num_elements = outputs[0].data.size() / sizeof(uint16_t);
|
|
fp32_buffer_.resize(num_elements);
|
|
const uint16_t* fp16_data = reinterpret_cast<const uint16_t*>(outputs[0].data.data());
|
|
for (size_t i = 0; i < num_elements; ++i) {
|
|
fp32_buffer_[i] = Fp16ToFp32(fp16_data[i]);
|
|
}
|
|
const bool apply_sigmoid = ResolveV8ApplySigmoid(
|
|
fp32_buffer_.data(), num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
|
|
if (debug_det_ && processed_ < 5) {
|
|
LogInfo("[ai_yolo] v8 cls activation(copy)=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
|
|
}
|
|
valid_count = ProcessOutputV8(fp32_buffer_.data(),
|
|
num_boxes, num_classes_,
|
|
model_input_h_, model_input_w_,
|
|
boxes, obj_probs, class_ids, conf_thresh_,
|
|
layout.channels_first, v8_box_format_, apply_sigmoid,
|
|
debug_det_, &debug_decode_left);
|
|
} else {
|
|
valid_count = ProcessOutputV8Int8(reinterpret_cast<int8_t*>(outputs[0].data.data()),
|
|
num_boxes, num_classes_,
|
|
model_input_h_, model_input_w_,
|
|
boxes, obj_probs, class_ids, conf_thresh_,
|
|
outputs[0].zp, outputs[0].scale,
|
|
layout.channels_first, v8_box_format_,
|
|
debug_det_, &debug_decode_left);
|
|
}
|
|
}
|
|
|
|
if (valid_count <= 0) return;
|
|
|
|
std::vector<int> indices(valid_count);
|
|
for (int i = 0; i < valid_count; ++i) indices[i] = i;
|
|
|
|
QuickSortDescending(obj_probs, 0, valid_count - 1, indices);
|
|
|
|
std::set<int> class_set(class_ids.begin(), class_ids.end());
|
|
for (int c : class_set) {
|
|
NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
|
|
}
|
|
|
|
const DetCoordContext coord_ctx = BuildDetCoordContext(*frame, model_input_w_, model_input_h_);
|
|
|
|
auto det_result = std::make_shared<DetectionResult>();
|
|
det_result->img_w = coord_ctx.out_w;
|
|
det_result->img_h = coord_ctx.out_h;
|
|
det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8";
|
|
|
|
for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) {
|
|
if (indices[i] == -1) continue;
|
|
int n = indices[i];
|
|
int cls_id = class_ids[n];
|
|
|
|
if (!class_filter_.empty() && class_filter_.find(cls_id) == class_filter_.end()) {
|
|
continue;
|
|
}
|
|
|
|
float x1 = boxes[n * 4 + 0];
|
|
float y1 = boxes[n * 4 + 1];
|
|
float w = boxes[n * 4 + 2];
|
|
float h = boxes[n * 4 + 3];
|
|
|
|
Detection det;
|
|
det.cls_id = cls_id;
|
|
det.score = obj_probs[i];
|
|
det.bbox = DecodeToOutputRect(x1, y1, w, h, coord_ctx);
|
|
if (bbox_expand_.enable && det.cls_id == bbox_expand_.class_id) {
|
|
det.bbox = ExpandRect(det.bbox, coord_ctx.out_w, coord_ctx.out_h, bbox_expand_);
|
|
}
|
|
det.track_id = -1;
|
|
|
|
if (debug_det_ && det_result->items.size() < 5 && processed_ < 20) {
|
|
LogInfo("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
|
|
std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" +
|
|
std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," +
|
|
std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" +
|
|
std::to_string(cls_id) + " score=" + std::to_string(det.score));
|
|
}
|
|
|
|
det_result->items.push_back(det);
|
|
}
|
|
if (debug_det_ && processed_ < 20) {
|
|
LogInfo("[ai_yolo] det summary(copy): valid_count=" + std::to_string(valid_count) +
|
|
" final=" + std::to_string(det_result->items.size()));
|
|
}
|
|
|
|
frame->det = det_result;
|
|
}
|
|
#endif
|
|
|
|
// ---- Configuration ----
// NOTE(review): id_ and model_path_ are not referenced by the methods visible here;
// presumably the node identifier and the model file location — confirm at their call sites.
std::string id_;
|
|
std::string model_path_;
|
|
// Confidence threshold handed to the V5/V8 decoders.
float conf_thresh_ = 0.25f;
|
|
// Threshold handed to NMS().
float nms_thresh_ = 0.45f;
|
|
// Model input size: frames of any other size are resized to it, and the V5 grid
// sizes are derived from it.
int model_input_w_ = 640;
|
|
int model_input_h_ = 640;
|
|
// Class count passed to the V8 layout resolver and decoders.
int num_classes_ = 80;
|
|
// Box encoding and class-score activation passed to the V8 decoders.
V8BoxFormat v8_box_format_ = V8BoxFormat::CxCyWh;
|
|
V8ClsActivation v8_cls_activation_ = V8ClsActivation::Auto;
|
|
// Selects the V5 or V8 post-processing branch and the reported model name.
YoloVersion yolo_version_ = YoloVersion::V8;
|
|
// NOTE(review): not referenced by the methods visible here — confirm where it is consumed.
bool auto_detect_version_ = false;
|
|
// When non-empty, only detections whose class id is in this set are emitted.
std::set<int> class_filter_;
|
|
|
|
// ---- Pipeline wiring ----
// NOTE(review): the queues are not touched by the methods visible here.
std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
|
|
std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;
|
|
// Backend used by RunInference() via InferBorrowed().
std::shared_ptr<IInferBackend> infer_backend_;
|
|
// Counter compared against small limits (5 / 20) to restrict debug logging;
// it is updated outside the methods visible here.
uint64_t processed_ = 0;
|
|
|
|
// ---- Diagnostics ----
// NOTE(review): stats_log_ / stats_interval_ are not referenced by the methods visible here.
bool stats_log_ = false;
|
|
uint64_t stats_interval_ = 100;
|
|
// Enables the "[ai_yolo] ..." debug log lines in post-processing.
bool debug_det_ = false;
|
|
// When enabled, boxes of bbox_expand_.class_id are passed through ExpandRect().
BboxExpandConfig bbox_expand_{};
|
|
|
|
// NOTE(review): presumably inference throttling state (interval and last inferred
// timestamp, in milliseconds per the names) — not referenced by the methods visible here.
int64_t infer_interval_ms_ = 0;
|
|
int64_t last_infer_pts_ms_ = 0;
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
// Handle passed to the backend on every InferBorrowed() call.
ModelHandle model_handle_ = kInvalidModelHandle;
|
|
// NOTE(review): not referenced by the methods visible here; presumably the model's output count.
uint32_t n_output_ = 0;
|
|
// Staging buffer for the repacked/resized model input when the frame cannot be passed through as-is.
std::vector<uint8_t> resized_input_;
|
|
std::vector<float> fp32_buffer_; // Scratch buffer: FP16 outputs widened to FP32 before V8 decoding
|
|
#endif
|
|
};
|
|
|
|
// Registers AiYoloNode under the type name "ai_yolo".
// NOTE(review): REGISTER_NODE is defined outside this file; presumably it adds the class
// to a node factory so pipelines can create it by name — confirm at the macro definition.
REGISTER_NODE(AiYoloNode, "ai_yolo");
|
|
|
|
} // namespace rk3588
|