#include <atomic>
|
|
#include <chrono>
|
|
#include <cmath>
|
|
#include <cstring>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <memory>
|
|
#include <set>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
#include "node.h"
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
#include "rknn_api.h"
|
|
#endif
|
|
|
|
namespace rk3588 {
|
|
|
|
namespace {
|
|
|
|
// Number of object classes in the COCO detection set (YOLOv5 default head).
constexpr int kObjClassNum = 80;
// Per-anchor prediction width: 4 box coords + 1 objectness + one score per class.
constexpr int kPropBoxSize = 5 + kObjClassNum;
// Hard cap on detections attached to a frame after NMS.
constexpr int kMaxDetections = 64;

// YOLOv5 anchor (w, h) pairs, three per stride level: kAnchor0 for stride 8,
// kAnchor1 for stride 16, kAnchor2 for stride 32.
const int kAnchor0[6] = {10, 13, 16, 30, 33, 23};
const int kAnchor1[6] = {30, 61, 62, 45, 59, 119};
const int kAnchor2[6] = {116, 90, 156, 198, 373, 326};

// COCO label table; index == class id produced by the model.
const char* kCocoLabels[kObjClassNum] = {
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
    "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
    "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
    "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
};
|
|
|
|
// Clamps `val` into [min_val, max_val] and truncates the in-range case to int.
// Comparisons are written so a NaN input falls through to min_val, matching
// the behavior of a plain ternary chain.
inline int Clamp(float val, int min_val, int max_val) {
  if (!(val > min_val)) return min_val;
  if (!(val < max_val)) return max_val;
  return static_cast<int>(val);
}
|
|
|
|
// Saturates `val` to [min_val, max_val], then converts to int32_t.
// The boundary values themselves are returned (inclusive clip).
inline int32_t ClipFloat(float val, float min_val, float max_val) {
  if (val <= min_val) {
    return static_cast<int32_t>(min_val);
  }
  if (val >= max_val) {
    return static_cast<int32_t>(max_val);
  }
  return static_cast<int32_t>(val);
}
|
|
|
|
// Converts a float value into the int8 affine-quantized domain described by
// zero-point `zp` and `scale`, saturating to the int8 range [-128, 127].
inline int8_t QuantizeF32ToAffine(float f32, int32_t zp, float scale) {
  const float q = f32 / scale + static_cast<float>(zp);
  if (q <= -128.0f) {
    return static_cast<int8_t>(-128);
  }
  if (q >= 127.0f) {
    return static_cast<int8_t>(127);
  }
  return static_cast<int8_t>(static_cast<int32_t>(q));
}
|
|
|
|
// Maps an int8 affine-quantized value back to float: (q - zp) * scale.
inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) {
  const float centered = static_cast<float>(qnt) - static_cast<float>(zp);
  return centered * scale;
}
|
|
|
|
// Intersection-over-union of two axis-aligned boxes given as corner
// coordinates. Uses the legacy +1 pixel convention for widths/heights
// (inclusive pixel spans). Returns 0 when the union is degenerate.
float CalculateIoU(float x1_min, float y1_min, float x1_max, float y1_max,
                   float x2_min, float y2_min, float x2_max, float y2_max) {
  float w = std::fmax(0.f, std::fmin(x1_max, x2_max) - std::fmax(x1_min, x2_min) + 1.0f);
  float h = std::fmax(0.f, std::fmin(y1_max, y2_max) - std::fmax(y1_min, y2_min) + 1.0f);
  float inter = w * h;
  float area1 = (x1_max - x1_min + 1.0f) * (y1_max - y1_min + 1.0f);
  float area2 = (x2_max - x2_min + 1.0f) * (y2_max - y2_min + 1.0f);
  float uni = area1 + area2 - inter;
  return uni <= 0.f ? 0.f : (inter / uni);
}

// In-place quicksort of `values` into descending order over [left, right],
// applying the same permutation to `indices` so that after the call
// indices[rank] is the original position of the rank-th largest value.
void QuickSortDescending(std::vector<float>& values, int left, int right, std::vector<int>& indices) {
  if (left >= right) return;
  float pivot = values[left];
  int pivot_idx = indices[left];
  int low = left, high = right;
  while (low < high) {
    // Scan from the right for a value that belongs before the pivot.
    while (low < high && values[high] <= pivot) high--;
    values[low] = values[high];
    indices[low] = indices[high];
    // Scan from the left for a value that belongs after the pivot.
    while (low < high && values[low] >= pivot) low++;
    values[high] = values[low];
    indices[high] = indices[low];
  }
  values[low] = pivot;
  indices[low] = pivot_idx;
  QuickSortDescending(values, left, low - 1, indices);
  QuickSortDescending(values, low + 1, right, indices);
}

// Greedy per-class non-maximum suppression.
//
// `order` holds original box indices sorted by descending score (produced by
// QuickSortDescending); `boxes` is a flat (x, y, w, h) array and `class_ids`
// is indexed by ORIGINAL box index, not by sorted rank. Suppressed entries
// are marked by writing -1 into `order`. Only boxes whose class equals
// `filter_id` participate.
//
// BUGFIX: the class checks previously read class_ids[i] / class_ids[j]
// (sorted rank) instead of class_ids[order[i]] / class_ids[order[j]]
// (original index), so after sorting the class filter was applied to the
// wrong boxes and cross-class suppression could occur. Now indexed via n/m,
// matching how PostProcess reads class_ids[indices[i]].
void NMS(int valid_count, std::vector<float>& boxes, std::vector<int>& class_ids,
         std::vector<int>& order, int filter_id, float threshold) {
  for (int i = 0; i < valid_count; ++i) {
    int n = order[i];
    if (n == -1 || class_ids[n] != filter_id) continue;
    for (int j = i + 1; j < valid_count; ++j) {
      int m = order[j];
      if (m == -1 || class_ids[m] != filter_id) continue;
      // Boxes are stored as (x, y, w, h); convert to corner form for IoU.
      float x1_min = boxes[n * 4 + 0];
      float y1_min = boxes[n * 4 + 1];
      float x1_max = x1_min + boxes[n * 4 + 2];
      float y1_max = y1_min + boxes[n * 4 + 3];
      float x2_min = boxes[m * 4 + 0];
      float y2_min = boxes[m * 4 + 1];
      float x2_max = x2_min + boxes[m * 4 + 2];
      float y2_max = y2_min + boxes[m * 4 + 3];
      // The lower-scored box is dropped when it overlaps the kept box too much.
      if (CalculateIoU(x1_min, y1_min, x1_max, y1_max, x2_min, y2_min, x2_max, y2_max) > threshold) {
        order[j] = -1;
      }
    }
  }
}
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
// Decodes one YOLOv5 output tensor (one stride level) directly in its int8
// affine-quantized form, avoiding a full-tensor dequantization pass.
//
// `input` is laid out channel-major as [3 * kPropBoxSize, grid_h, grid_w]
// (per anchor: x, y, w, h, objectness, then kObjClassNum class scores).
// `anchor` holds this level's three (w, h) anchor pairs; `zp`/`scale` are the
// tensor's quantization parameters. For each cell whose objectness passes
// `conf_thresh`, a candidate (x, y, w, h) box in model-input pixels is
// appended to `boxes`, its combined score to `obj_probs`, and its arg-max
// class to `class_ids`. Returns the number of candidates appended.
//
// NOTE(review): `model_h` / `model_w` are unused in this implementation.
int ProcessFeatureMap(int8_t* input, const int* anchor, int grid_h, int grid_w,
                      int model_h, int model_w, int stride,
                      std::vector<float>& boxes, std::vector<float>& obj_probs,
                      std::vector<int>& class_ids, float conf_thresh, int32_t zp, float scale) {
  int valid_count = 0;
  int grid_len = grid_h * grid_w;
  // Quantize the threshold once so per-cell rejection is a plain int8 compare.
  int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh, zp, scale);

  for (int a = 0; a < 3; ++a) {
    for (int i = 0; i < grid_h; ++i) {
      for (int j = 0; j < grid_w; ++j) {
        // Channel 4 of this anchor's slab is the objectness score.
        int8_t box_conf = input[(kPropBoxSize * a + 4) * grid_len + i * grid_w + j];
        if (box_conf >= thresh_i8) {
          // `ptr` points at channel 0 (x) for this anchor/cell; the remaining
          // channels are reached by stepping in multiples of grid_len.
          int offset = (kPropBoxSize * a) * grid_len + i * grid_w + j;
          int8_t* ptr = input + offset;

          // YOLOv5 box decode (sigmoid assumed already applied inside the
          // model): center = (2*v - 0.5 + cell) * stride, size = (2*v)^2 * anchor.
          float bx = DequantizeAffineToF32(*ptr, zp, scale) * 2.0f - 0.5f;
          float by = DequantizeAffineToF32(ptr[grid_len], zp, scale) * 2.0f - 0.5f;
          float bw = DequantizeAffineToF32(ptr[2 * grid_len], zp, scale) * 2.0f;
          float bh = DequantizeAffineToF32(ptr[3 * grid_len], zp, scale) * 2.0f;

          bx = (bx + j) * stride;
          by = (by + i) * stride;
          bw = bw * bw * anchor[a * 2];
          bh = bh * bh * anchor[a * 2 + 1];
          // Convert center coordinates to top-left corner.
          bx -= bw / 2.0f;
          by -= bh / 2.0f;

          // Arg-max over class scores, entirely in the quantized domain
          // (monotonic, so comparing int8 values is equivalent).
          int8_t max_cls_prob = ptr[5 * grid_len];
          int max_cls_id = 0;
          for (int k = 1; k < kObjClassNum; ++k) {
            int8_t prob = ptr[(5 + k) * grid_len];
            if (prob > max_cls_prob) {
              max_cls_id = k;
              max_cls_prob = prob;
            }
          }

          // Keep only cells whose best class also clears the threshold.
          // (Note the strict `>` here vs `>=` for objectness above.)
          if (max_cls_prob > thresh_i8) {
            // Final score = class probability * objectness.
            float score = DequantizeAffineToF32(max_cls_prob, zp, scale) *
                          DequantizeAffineToF32(box_conf, zp, scale);
            obj_probs.push_back(score);
            class_ids.push_back(max_cls_id);
            boxes.push_back(bx);
            boxes.push_back(by);
            boxes.push_back(bw);
            boxes.push_back(bh);
            ++valid_count;
          }
        }
      }
    }
  }
  return valid_count;
}
|
|
#endif
|
|
|
|
} // namespace
|
|
|
|
// Pipeline node that runs a YOLOv5 detector on incoming frames via the RKNN
// runtime (RK3588 NPU) and attaches a DetectionResult to each frame before
// forwarding it to all downstream queues. When RK3588_ENABLE_RKNN is not
// defined, the node becomes a pure passthrough.
//
// Threading: a single worker thread owns all RKNN calls and `processed_`;
// Init/Start/Stop are expected to be called from the controlling thread.
class AiYoloNode : public INode {
public:
  std::string Id() const override { return id_; }
  std::string Type() const override { return "ai_yolo"; }

  // Reads configuration, validates queue wiring, and (RKNN builds only)
  // loads the model from `model_path`. Returns false on any missing
  // prerequisite; errors are logged to stderr.
  bool Init(const SimpleJson& config, const NodeContext& ctx) override {
    id_ = config.ValueOr<std::string>("id", "ai_yolo");
    model_path_ = config.ValueOr<std::string>("model_path", "");
    conf_thresh_ = config.ValueOr<float>("conf", 0.25f);
    nms_thresh_ = config.ValueOr<float>("nms", 0.45f);
    model_input_w_ = config.ValueOr<int>("model_w", 640);
    model_input_h_ = config.ValueOr<int>("model_h", 640);

    // Optional whitelist of class ids; an empty set means "keep all classes".
    // NOTE(review): a non-integer entry becomes -1 here and will simply never
    // match any detection — confirm silently ignoring bad entries is intended.
    if (const SimpleJson* filter = config.Find("class_filter")) {
      for (const auto& item : filter->AsArray()) {
        class_filter_.insert(item.AsInt(-1));
      }
    }

    input_queue_ = ctx.input_queue;
    if (!input_queue_) {
      std::cerr << "[ai_yolo] no input queue for node " << id_ << "\n";
      return false;
    }
    if (ctx.output_queues.empty()) {
      std::cerr << "[ai_yolo] no output queue for node " << id_ << "\n";
      return false;
    }
    output_queues_ = ctx.output_queues;

#if defined(RK3588_ENABLE_RKNN)
    if (model_path_.empty()) {
      std::cerr << "[ai_yolo] model_path is required\n";
      return false;
    }
    if (!LoadModel()) {
      std::cerr << "[ai_yolo] failed to load model: " << model_path_ << "\n";
      return false;
    }
    std::cout << "[ai_yolo] model loaded: " << model_path_ << "\n";
#else
    std::cout << "[ai_yolo] RKNN disabled, will passthrough frames\n";
#endif
    return true;
  }

  // Spawns the worker thread. Must be called after a successful Init.
  bool Start() override {
    if (!input_queue_) return false;
    running_.store(true);
    worker_ = std::thread(&AiYoloNode::WorkerLoop, this);
    std::cout << "[ai_yolo] started, conf=" << conf_thresh_ << " nms=" << nms_thresh_ << "\n";
    return true;
  }

  // Stops the worker and releases the RKNN context. The queues are stopped
  // before join() so the worker's blocking Pop() wakes up promptly.
  void Stop() override {
    running_.store(false);
    if (input_queue_) input_queue_->Stop();
    for (auto& q : output_queues_) q->Stop();
    if (worker_.joinable()) worker_.join();

#if defined(RK3588_ENABLE_RKNN)
    // Safe to destroy here: the worker (sole user of the context) has joined.
    if (rknn_ctx_) {
      rknn_destroy(rknn_ctx_);
      rknn_ctx_ = 0;
    }
#endif
    std::cout << "[ai_yolo] stopped\n";
  }

private:
#if defined(RK3588_ENABLE_RKNN)
  // Reads the .rknn model file into memory, initializes the RKNN context, and
  // queries I/O tensor attributes. Overrides model_input_w_/h_ with the
  // model's actual input dimensions (NCHW or NHWC layout).
  // `model_data_` must stay alive while the context exists.
  bool LoadModel() {
    // Open at end (ios::ate) so tellg() yields the file size directly.
    std::ifstream file(model_path_, std::ios::binary | std::ios::ate);
    if (!file.is_open()) return false;

    size_t model_size = file.tellg();
    file.seekg(0, std::ios::beg);
    model_data_.resize(model_size);
    if (!file.read(reinterpret_cast<char*>(model_data_.data()), model_size)) {
      return false;
    }

    int ret = rknn_init(&rknn_ctx_, model_data_.data(), model_size, 0, nullptr);
    if (ret < 0) {
      std::cerr << "[ai_yolo] rknn_init failed: " << ret << "\n";
      return false;
    }

    rknn_input_output_num io_num;
    ret = rknn_query(rknn_ctx_, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
    if (ret < 0) {
      std::cerr << "[ai_yolo] rknn_query IO num failed\n";
      return false;
    }
    n_input_ = io_num.n_input;
    n_output_ = io_num.n_output;

    // NOTE(review): the per-tensor rknn_query return codes below are ignored;
    // a failure would leave attrs zero-initialized only by chance — consider
    // checking them.
    input_attrs_.resize(n_input_);
    for (uint32_t i = 0; i < n_input_; ++i) {
      input_attrs_[i].index = i;
      rknn_query(rknn_ctx_, RKNN_QUERY_INPUT_ATTR, &input_attrs_[i], sizeof(rknn_tensor_attr));
    }

    output_attrs_.resize(n_output_);
    for (uint32_t i = 0; i < n_output_; ++i) {
      output_attrs_[i].index = i;
      rknn_query(rknn_ctx_, RKNN_QUERY_OUTPUT_ATTR, &output_attrs_[i], sizeof(rknn_tensor_attr));
    }

    // dims layout: NCHW = [N, C, H, W], otherwise assumed NHWC = [N, H, W, C].
    if (input_attrs_[0].fmt == RKNN_TENSOR_NCHW) {
      model_input_h_ = input_attrs_[0].dims[2];
      model_input_w_ = input_attrs_[0].dims[3];
    } else {
      model_input_h_ = input_attrs_[0].dims[1];
      model_input_w_ = input_attrs_[0].dims[2];
    }

    std::cout << "[ai_yolo] model input: " << model_input_w_ << "x" << model_input_h_
              << ", outputs: " << n_output_ << "\n";
    return true;
  }
#endif

  // Fans the frame out to every downstream queue (each queue receives its own
  // shared_ptr copy; the underlying frame is shared, not duplicated).
  void PushToDownstream(FramePtr frame) {
    for (auto& q : output_queues_) {
      q->Push(frame);
    }
  }

  // Worker thread body: pop -> (infer) -> forward, until Stop() flips
  // `running_`. The 200 ms Pop timeout bounds shutdown latency.
  void WorkerLoop() {
    using namespace std::chrono;
    FramePtr frame;

    while (running_.load()) {
      if (!input_queue_->Pop(frame, milliseconds(200))) continue;
      if (!frame) continue;

#if defined(RK3588_ENABLE_RKNN)
      RunInference(frame);
#endif
      // Frames are forwarded even if inference failed/was skipped, so the
      // pipeline never drops frames at this node.
      PushToDownstream(frame);
      ++processed_;

      if (processed_ % 100 == 0) {
        std::cout << "[ai_yolo] processed " << processed_ << " frames\n";
      }
    }
  }

#if defined(RK3588_ENABLE_RKNN)
  // Runs one RKNN inference on `frame` and, on success, attaches detections
  // via PostProcess. Failures are logged and leave the frame untouched.
  void RunInference(FramePtr frame) {
    if (!frame->data || frame->data_size == 0) return;

    bool is_rgb = (frame->format == PixelFormat::RGB || frame->format == PixelFormat::BGR);
    if (!is_rgb) {
      std::cerr << "[ai_yolo] input must be RGB/BGR, got other format\n";
      return;
    }

    // NOTE(review): input size is computed from the FRAME, not from the model
    // attrs — this assumes an upstream stage already resized the frame to
    // model_input_w_ x model_input_h_. Confirm, or validate here.
    rknn_input inputs[1];
    memset(inputs, 0, sizeof(inputs));
    inputs[0].index = 0;
    inputs[0].type = RKNN_TENSOR_UINT8;
    inputs[0].size = frame->width * frame->height * 3;
    inputs[0].fmt = RKNN_TENSOR_NHWC;
    inputs[0].buf = frame->data;
    inputs[0].pass_through = 0;

    int ret = rknn_inputs_set(rknn_ctx_, n_input_, inputs);
    if (ret < 0) {
      std::cerr << "[ai_yolo] rknn_inputs_set failed: " << ret << "\n";
      return;
    }

    ret = rknn_run(rknn_ctx_, nullptr);
    if (ret < 0) {
      std::cerr << "[ai_yolo] rknn_run failed: " << ret << "\n";
      return;
    }

    // want_float = 0: keep outputs in int8 quantized form; PostProcess
    // dequantizes on the fly using each tensor's zp/scale.
    std::vector<rknn_output> outputs(n_output_);
    memset(outputs.data(), 0, sizeof(rknn_output) * n_output_);
    for (uint32_t i = 0; i < n_output_; ++i) {
      outputs[i].want_float = 0;
    }

    ret = rknn_outputs_get(rknn_ctx_, n_output_, outputs.data(), nullptr);
    if (ret < 0) {
      std::cerr << "[ai_yolo] rknn_outputs_get failed: " << ret << "\n";
      return;
    }

    PostProcess(outputs, frame);
    // Buffers in `outputs` belong to the runtime; release after PostProcess
    // has finished reading them.
    rknn_outputs_release(rknn_ctx_, n_output_, outputs.data());
  }

  // Decodes the three YOLOv5 output tensors (strides 8/16/32), applies
  // per-class NMS and the optional class filter, rescales boxes from
  // model-input space to frame space, and stores the result on the frame.
  void PostProcess(std::vector<rknn_output>& outputs, FramePtr frame) {
    if (n_output_ < 3) return;

    // Candidates accumulated across all three levels, all indexed by the same
    // "original box index": boxes is flat (x, y, w, h) in model-input pixels.
    std::vector<float> boxes;
    std::vector<float> obj_probs;
    std::vector<int> class_ids;

    // Per-output quantization parameters, queried at load time.
    std::vector<int32_t> zps;
    std::vector<float> scales;
    for (uint32_t i = 0; i < n_output_; ++i) {
      zps.push_back(output_attrs_[i].zp);
      scales.push_back(output_attrs_[i].scale);
    }

    int stride0 = 8, stride1 = 16, stride2 = 32;
    int grid_h0 = model_input_h_ / stride0, grid_w0 = model_input_w_ / stride0;
    int grid_h1 = model_input_h_ / stride1, grid_w1 = model_input_w_ / stride1;
    int grid_h2 = model_input_h_ / stride2, grid_w2 = model_input_w_ / stride2;

    // NOTE(review): assumes outputs[0..2] correspond to strides 8/16/32 in
    // that order — this matches the standard YOLOv5 export layout.
    int cnt0 = ProcessFeatureMap(reinterpret_cast<int8_t*>(outputs[0].buf), kAnchor0,
                                 grid_h0, grid_w0, model_input_h_, model_input_w_, stride0,
                                 boxes, obj_probs, class_ids, conf_thresh_, zps[0], scales[0]);
    int cnt1 = ProcessFeatureMap(reinterpret_cast<int8_t*>(outputs[1].buf), kAnchor1,
                                 grid_h1, grid_w1, model_input_h_, model_input_w_, stride1,
                                 boxes, obj_probs, class_ids, conf_thresh_, zps[1], scales[1]);
    int cnt2 = ProcessFeatureMap(reinterpret_cast<int8_t*>(outputs[2].buf), kAnchor2,
                                 grid_h2, grid_w2, model_input_h_, model_input_w_, stride2,
                                 boxes, obj_probs, class_ids, conf_thresh_, zps[2], scales[2]);

    int valid_count = cnt0 + cnt1 + cnt2;
    if (valid_count <= 0) return;

    std::vector<int> indices(valid_count);
    for (int i = 0; i < valid_count; ++i) indices[i] = i;

    // Sorts obj_probs descending IN PLACE and permutes `indices` to match;
    // boxes/class_ids stay in original-index order, so indices[rank] maps a
    // sorted rank back to its original box index.
    QuickSortDescending(obj_probs, 0, valid_count - 1, indices);

    // Standard per-class NMS: suppress within each class independently.
    std::set<int> class_set(class_ids.begin(), class_ids.end());
    for (int c : class_set) {
      NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
    }

    // Model-input -> frame coordinate mapping (divide by scale below).
    float scale_w = static_cast<float>(model_input_w_) / frame->width;
    float scale_h = static_cast<float>(model_input_h_) / frame->height;

    auto det_result = std::make_shared<DetectionResult>();
    det_result->img_w = frame->width;
    det_result->img_h = frame->height;
    det_result->model_name = "yolov5";

    for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) {
      if (indices[i] == -1) continue;  // suppressed by NMS
      int n = indices[i];
      int cls_id = class_ids[n];

      if (!class_filter_.empty() && class_filter_.find(cls_id) == class_filter_.end()) {
        continue;
      }

      float x1 = boxes[n * 4 + 0];
      float y1 = boxes[n * 4 + 1];
      float w = boxes[n * 4 + 2];
      float h = boxes[n * 4 + 3];

      Detection det;
      det.cls_id = cls_id;
      // obj_probs is indexed by sorted rank (it was sorted in place above),
      // while boxes/class_ids are indexed by original index n.
      det.score = obj_probs[i];
      // Clamp so the box lies fully inside the frame.
      det.bbox.x = Clamp(x1 / scale_w, 0, frame->width);
      det.bbox.y = Clamp(y1 / scale_h, 0, frame->height);
      det.bbox.w = Clamp(w / scale_w, 0, frame->width - det.bbox.x);
      det.bbox.h = Clamp(h / scale_h, 0, frame->height - det.bbox.y);
      det.track_id = -1;  // tracking is assigned by a downstream node, if any

      det_result->items.push_back(det);
    }

    frame->det = det_result;
  }
#endif

  std::string id_;
  std::string model_path_;
  float conf_thresh_ = 0.25f;   // minimum objectness/class confidence
  float nms_thresh_ = 0.45f;    // IoU threshold for NMS suppression
  int model_input_w_ = 640;     // overwritten from model attrs when RKNN loads
  int model_input_h_ = 640;
  std::set<int> class_filter_;  // empty = accept all classes

  std::atomic<bool> running_{false};
  std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
  std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;
  std::thread worker_;
  uint64_t processed_ = 0;      // worker-thread only; no synchronization needed

#if defined(RK3588_ENABLE_RKNN)
  rknn_context rknn_ctx_ = 0;
  std::vector<uint8_t> model_data_;  // backing storage for the loaded model
  uint32_t n_input_ = 0;
  uint32_t n_output_ = 0;
  std::vector<rknn_tensor_attr> input_attrs_;
  std::vector<rknn_tensor_attr> output_attrs_;
#endif
};
|
|
|
|
REGISTER_NODE(AiYoloNode, "ai_yolo");
|
|
|
|
} // namespace rk3588
|