// ai_yolo node: YOLOv5/YOLOv8 object detection on RK3588 via the RKNN runtime.
#include <algorithm>
|
|
#include <atomic>
|
|
#include <chrono>
|
|
#include <cmath>
|
|
#include <cstring>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <memory>
|
|
#include <set>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
#include "node.h"
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
#include "rknn_api.h"
|
|
#endif
|
|
|
|
namespace rk3588 {
|
|
|
|
namespace {
|
|
|
|
// Number of object classes in the COCO dataset (and in the bundled label table).
constexpr int kObjClassNum = 80;
constexpr int kPropBoxSizeV5 = 5 + kObjClassNum;  // YOLOv5: x,y,w,h,conf + 80 classes
constexpr int kPropBoxSizeV8 = 4 + kObjClassNum;  // YOLOv8: x,y,w,h + 80 classes (no conf)
// Hard cap on detections attached to a single frame after NMS.
constexpr int kMaxDetections = 64;

// YOLOv5 anchors: (w,h) pairs, three anchors per detection head,
// for the stride-8, stride-16 and stride-32 feature maps respectively.
const int kAnchor0[6] = {10, 13, 16, 30, 33, 23};
const int kAnchor1[6] = {30, 61, 62, 45, 59, 119};
const int kAnchor2[6] = {116, 90, 156, 198, 373, 326};

// Supported model families: V5 is anchor-based (3 output maps),
// V8 is anchor-free (single [1, 4+classes, boxes] output).
enum class YoloVersion { V5, V8 };

// COCO class-id -> human-readable label, index-aligned with model outputs.
const char* kCocoLabels[kObjClassNum] = {
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
    "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
    "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
    "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
};
|
|
|
|
// Clamps a float to [min_val, max_val] and truncates it to int.
// The clamp is done in the float domain on purpose: casting an out-of-range
// float to int first would be undefined behavior. NaN maps to min_val,
// matching the original comparison chain (NaN > min_val is false).
inline int Clamp(float val, int min_val, int max_val) {
  if (std::isnan(val)) return min_val;
  const float lo = static_cast<float>(min_val);
  const float hi = static_cast<float>(max_val);
  return static_cast<int>(std::clamp(val, lo, hi));
}
|
|
|
|
// Saturates a float into [min_val, max_val] and truncates to int32_t.
// Used to clip values into a quantized range (e.g. [-128, 127]) before
// narrowing to int8. std::clamp replaces the hand-rolled ternary chain.
inline int32_t ClipFloat(float val, float min_val, float max_val) {
  return static_cast<int32_t>(std::clamp(val, min_val, max_val));
}
|
|
|
|
// Affine-quantizes a float: q = f32 / scale + zp, saturated into the int8
// range [-128, 127] (truncating toward zero inside the range).
inline int8_t QuantizeF32ToAffine(float f32, int32_t zp, float scale) {
  const float q = (f32 / scale) + zp;
  if (q <= -128.0f) return static_cast<int8_t>(-128);
  if (q >= 127.0f) return static_cast<int8_t>(127);
  return static_cast<int8_t>(static_cast<int32_t>(q));
}
|
|
|
|
// Inverse of the affine quantization: f32 = (q - zp) * scale.
inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) {
  const float centered = static_cast<float>(qnt) - static_cast<float>(zp);
  return centered * scale;
}
|
|
|
|
// Intersection-over-union of two axis-aligned boxes given as corner pairs.
// Uses the inclusive-pixel convention (+1 on each extent), so a box whose
// min == max still covers one pixel. Returns 0 when the union is empty.
float CalculateIoU(float x1_min, float y1_min, float x1_max, float y1_max,
                   float x2_min, float y2_min, float x2_max, float y2_max) {
  const float ix_lo = std::fmax(x1_min, x2_min);
  const float iy_lo = std::fmax(y1_min, y2_min);
  const float ix_hi = std::fmin(x1_max, x2_max);
  const float iy_hi = std::fmin(y1_max, y2_max);

  const float inter_w = std::fmax(0.f, ix_hi - ix_lo + 1.0f);
  const float inter_h = std::fmax(0.f, iy_hi - iy_lo + 1.0f);
  const float intersection = inter_w * inter_h;

  const float area_a = (x1_max - x1_min + 1.0f) * (y1_max - y1_min + 1.0f);
  const float area_b = (x2_max - x2_min + 1.0f) * (y2_max - y2_min + 1.0f);
  const float union_area = area_a + area_b - intersection;

  if (union_area <= 0.f) return 0.f;
  return intersection / union_area;
}
|
|
|
|
// Sorts values[left..right] in descending order and applies the identical
// permutation to indices[left..right], keeping the two arrays paired.
//
// Reimplemented on std::stable_sort: the previous hand-rolled quicksort
// recursed once per partition, giving O(n) stack depth and O(n^2) time on
// adversarial (e.g. already-sorted) score distributions — a stack-overflow
// risk when a frame yields many candidates. Ties are now kept in their
// original relative order (stable), which the contract permits.
void QuickSortDescending(std::vector<float>& values, int left, int right, std::vector<int>& indices) {
  if (left >= right) return;
  const int count = right - left + 1;

  // Sort a permutation of positions by value, descending.
  std::vector<int> perm(count);
  for (int k = 0; k < count; ++k) perm[k] = left + k;
  std::stable_sort(perm.begin(), perm.end(),
                   [&values](int a, int b) { return values[a] > values[b]; });

  // Materialize the permuted ranges, then copy them back in place.
  std::vector<float> sorted_values(count);
  std::vector<int> sorted_indices(count);
  for (int k = 0; k < count; ++k) {
    sorted_values[k] = values[perm[k]];
    sorted_indices[k] = indices[perm[k]];
  }
  for (int k = 0; k < count; ++k) {
    values[left + k] = sorted_values[k];
    indices[left + k] = sorted_indices[k];
  }
}
|
|
|
|
void NMS(int valid_count, std::vector<float>& boxes, std::vector<int>& class_ids,
|
|
std::vector<int>& order, int filter_id, float threshold) {
|
|
for (int i = 0; i < valid_count; ++i) {
|
|
if (order[i] == -1 || class_ids[i] != filter_id) continue;
|
|
int n = order[i];
|
|
for (int j = i + 1; j < valid_count; ++j) {
|
|
int m = order[j];
|
|
if (m == -1 || class_ids[j] != filter_id) continue;
|
|
float x1_min = boxes[n * 4 + 0];
|
|
float y1_min = boxes[n * 4 + 1];
|
|
float x1_max = x1_min + boxes[n * 4 + 2];
|
|
float y1_max = y1_min + boxes[n * 4 + 3];
|
|
float x2_min = boxes[m * 4 + 0];
|
|
float y2_min = boxes[m * 4 + 1];
|
|
float x2_max = x2_min + boxes[m * 4 + 2];
|
|
float y2_max = y2_min + boxes[m * 4 + 3];
|
|
if (CalculateIoU(x1_min, y1_min, x1_max, y1_max, x2_min, y2_min, x2_max, y2_max) > threshold) {
|
|
order[j] = -1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
// YOLOv5 feature map processing (anchor-based)
|
|
// Decodes one quantized YOLOv5 feature map (anchor-based head) into candidate
// boxes. `input` is an int8 tensor laid out channel-major per anchor:
// [3 anchors][kPropBoxSizeV5 channels][grid_h][grid_w], with channels
// 0..3 = box regressors, 4 = objectness, 5.. = class scores, all quantized
// with (zp, scale). Appends [x, y, w, h] (top-left, model-input pixels) to
// `boxes`, conf*class score to `obj_probs`, and class ids to `class_ids`.
// Returns the number of candidates appended.
//
// NOTE(review): model_h/model_w are unused here; decode relies on `stride`
// and `anchor` only. The (v*2-0.5)/(v*2)^2 decode assumes the exported model
// already applies sigmoid to its outputs — confirm against the export recipe.
int ProcessFeatureMapV5(int8_t* input, const int* anchor, int grid_h, int grid_w,
                        int model_h, int model_w, int stride,
                        std::vector<float>& boxes, std::vector<float>& obj_probs,
                        std::vector<int>& class_ids, float conf_thresh, int32_t zp, float scale) {
  int valid_count = 0;
  int grid_len = grid_h * grid_w;
  // Quantize the threshold once so the objectness test can stay in int8
  // (avoids dequantizing every cell).
  int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh, zp, scale);

  for (int a = 0; a < 3; ++a) {            // anchor index within this head
    for (int i = 0; i < grid_h; ++i) {     // grid row
      for (int j = 0; j < grid_w; ++j) {   // grid column
        // Channel 4 of this anchor's slab is the objectness score.
        int8_t box_conf = input[(kPropBoxSizeV5 * a + 4) * grid_len + i * grid_w + j];
        if (box_conf >= thresh_i8) {
          // ptr addresses channel 0 of this anchor at cell (i, j); sibling
          // channels are grid_len elements apart (channel-major layout).
          int offset = (kPropBoxSizeV5 * a) * grid_len + i * grid_w + j;
          int8_t* ptr = input + offset;

          // YOLOv5 box decode: center offsets in (-0.5, 1.5), size factors
          // squared against the anchor dimensions.
          float bx = DequantizeAffineToF32(*ptr, zp, scale) * 2.0f - 0.5f;
          float by = DequantizeAffineToF32(ptr[grid_len], zp, scale) * 2.0f - 0.5f;
          float bw = DequantizeAffineToF32(ptr[2 * grid_len], zp, scale) * 2.0f;
          float bh = DequantizeAffineToF32(ptr[3 * grid_len], zp, scale) * 2.0f;

          bx = (bx + j) * stride;              // cell-relative -> model pixels
          by = (by + i) * stride;
          bw = bw * bw * anchor[a * 2];        // anchor-relative -> model pixels
          bh = bh * bh * anchor[a * 2 + 1];
          bx -= bw / 2.0f;                     // center -> top-left corner
          by -= bh / 2.0f;

          // Argmax over the class channels, still in the quantized domain.
          int8_t max_cls_prob = ptr[5 * grid_len];
          int max_cls_id = 0;
          for (int k = 1; k < kObjClassNum; ++k) {
            int8_t prob = ptr[(5 + k) * grid_len];
            if (prob > max_cls_prob) {
              max_cls_id = k;
              max_cls_prob = prob;
            }
          }

          // Keep only cells whose best class also clears the threshold;
          // final score is class probability * objectness (dequantized).
          if (max_cls_prob > thresh_i8) {
            float score = DequantizeAffineToF32(max_cls_prob, zp, scale) *
                          DequantizeAffineToF32(box_conf, zp, scale);
            obj_probs.push_back(score);
            class_ids.push_back(max_cls_id);
            boxes.push_back(bx);
            boxes.push_back(by);
            boxes.push_back(bw);
            boxes.push_back(bh);
            ++valid_count;
          }
        }
      }
    }
  }
  return valid_count;
}
|
|
|
|
// YOLOv8 output processing (anchor-free, single output tensor)
|
|
// Output format: [1, 84, 8400] where 84 = 4 (bbox) + 80 (classes)
|
|
// bbox format: cx, cy, w, h (center-based)
|
|
// Decodes a float YOLOv8 output tensor (anchor-free, single head).
// Layout is channel-major: output[channel * num_boxes + box], where channels
// 0..3 are cx, cy, w, h (center-based, model-input pixels) and channels
// 4..4+num_classes-1 are per-class scores. Every candidate whose best class
// score reaches conf_thresh is appended as [x, y, w, h] (top-left corner)
// with its score and class id. Returns the number of candidates appended.
// NOTE(review): model_h/model_w are unused; kept for signature parity with
// the v5 path.
int ProcessOutputV8(float* output, int num_boxes, int num_classes,
                    int model_h, int model_w,
                    std::vector<float>& boxes, std::vector<float>& obj_probs,
                    std::vector<int>& class_ids, float conf_thresh) {
  int kept = 0;
  for (int box = 0; box < num_boxes; ++box) {
    // Pick the highest-scoring class for this candidate.
    int best_cls = 0;
    float best_score = 0.0f;
    for (int cls = 0; cls < num_classes; ++cls) {
      const float s = output[(4 + cls) * num_boxes + box];
      if (s > best_score) {
        best_score = s;
        best_cls = cls;
      }
    }
    if (best_score < conf_thresh) continue;

    const float cx = output[box];
    const float cy = output[num_boxes + box];
    const float bw = output[2 * num_boxes + box];
    const float bh = output[3 * num_boxes + box];

    // Convert the center-based box to a top-left corner.
    boxes.push_back(cx - bw / 2.0f);
    boxes.push_back(cy - bh / 2.0f);
    boxes.push_back(bw);
    boxes.push_back(bh);
    obj_probs.push_back(best_score);
    class_ids.push_back(best_cls);
    ++kept;
  }
  return kept;
}
|
|
|
|
// YOLOv8 INT8 output processing
|
|
int ProcessOutputV8Int8(int8_t* output, int num_boxes, int num_classes,
|
|
int model_h, int model_w,
|
|
std::vector<float>& boxes, std::vector<float>& obj_probs,
|
|
std::vector<int>& class_ids, float conf_thresh,
|
|
int32_t zp, float scale) {
|
|
int valid_count = 0;
|
|
int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh, zp, scale);
|
|
|
|
for (int i = 0; i < num_boxes; ++i) {
|
|
// Find max class score
|
|
int8_t max_score_i8 = -128;
|
|
int max_cls_id = 0;
|
|
for (int c = 0; c < num_classes; ++c) {
|
|
int8_t score = output[(4 + c) * num_boxes + i];
|
|
if (score > max_score_i8) {
|
|
max_score_i8 = score;
|
|
max_cls_id = c;
|
|
}
|
|
}
|
|
|
|
if (max_score_i8 >= thresh_i8) {
|
|
float cx = DequantizeAffineToF32(output[0 * num_boxes + i], zp, scale);
|
|
float cy = DequantizeAffineToF32(output[1 * num_boxes + i], zp, scale);
|
|
float w = DequantizeAffineToF32(output[2 * num_boxes + i], zp, scale);
|
|
float h = DequantizeAffineToF32(output[3 * num_boxes + i], zp, scale);
|
|
float max_score = DequantizeAffineToF32(max_score_i8, zp, scale);
|
|
|
|
// Convert from center to top-left
|
|
float x1 = cx - w / 2.0f;
|
|
float y1 = cy - h / 2.0f;
|
|
|
|
boxes.push_back(x1);
|
|
boxes.push_back(y1);
|
|
boxes.push_back(w);
|
|
boxes.push_back(h);
|
|
obj_probs.push_back(max_score);
|
|
class_ids.push_back(max_cls_id);
|
|
++valid_count;
|
|
}
|
|
}
|
|
return valid_count;
|
|
}
|
|
#endif
|
|
|
|
} // namespace
|
|
|
|
// Pipeline node that runs YOLO (v5 or v8) object detection on incoming frames
// through the Rockchip RKNN runtime, attaches a DetectionResult to each
// frame, and forwards the frame to every downstream queue. When built
// without RK3588_ENABLE_RKNN the node degrades to a pure passthrough.
//
// Threading: a single worker thread (started in Start) pops from
// input_queue_ and pushes to output_queues_; Stop() flips running_, stops
// the queues to unblock the worker, and joins it.
class AiYoloNode : public INode {
 public:
  std::string Id() const override { return id_; }
  std::string Type() const override { return "ai_yolo"; }

  // Reads configuration, validates the input/output queues, and (with RKNN
  // enabled) loads the model. Returns false on any missing prerequisite.
  bool Init(const SimpleJson& config, const NodeContext& ctx) override {
    id_ = config.ValueOr<std::string>("id", "ai_yolo");
    model_path_ = config.ValueOr<std::string>("model_path", "");
    conf_thresh_ = config.ValueOr<float>("conf", 0.25f);
    nms_thresh_ = config.ValueOr<float>("nms", 0.45f);
    model_input_w_ = config.ValueOr<int>("model_w", 640);
    model_input_h_ = config.ValueOr<int>("model_h", 640);
    num_classes_ = config.ValueOr<int>("num_classes", 80);

    // Model version: "v5", "v8", or "auto" (default).
    std::string ver = config.ValueOr<std::string>("model_version", "auto");
    if (ver == "v5") {
      yolo_version_ = YoloVersion::V5;
    } else if (ver == "v8") {
      yolo_version_ = YoloVersion::V8;
    } else {
      yolo_version_ = YoloVersion::V8;  // Default to v8, will auto-detect in LoadModel
      auto_detect_version_ = true;
    }

    // Optional whitelist of class ids; empty set means keep every class.
    // NOTE(review): a non-integer entry inserts AsInt's fallback (-1).
    if (const SimpleJson* filter = config.Find("class_filter")) {
      for (const auto& item : filter->AsArray()) {
        class_filter_.insert(item.AsInt(-1));
      }
    }

    input_queue_ = ctx.input_queue;
    if (!input_queue_) {
      std::cerr << "[ai_yolo] no input queue for node " << id_ << "\n";
      return false;
    }
    if (ctx.output_queues.empty()) {
      std::cerr << "[ai_yolo] no output queue for node " << id_ << "\n";
      return false;
    }
    output_queues_ = ctx.output_queues;

#if defined(RK3588_ENABLE_RKNN)
    if (model_path_.empty()) {
      std::cerr << "[ai_yolo] model_path is required\n";
      return false;
    }
    if (!LoadModel()) {
      std::cerr << "[ai_yolo] failed to load model: " << model_path_ << "\n";
      return false;
    }
    std::cout << "[ai_yolo] model loaded: " << model_path_ << "\n";
#else
    std::cout << "[ai_yolo] RKNN disabled, will passthrough frames\n";
#endif
    return true;
  }

  // Spawns the worker thread. Safe to call only after a successful Init.
  bool Start() override {
    if (!input_queue_) return false;
    running_.store(true);
    worker_ = std::thread(&AiYoloNode::WorkerLoop, this);
    std::cout << "[ai_yolo] started, conf=" << conf_thresh_ << " nms=" << nms_thresh_ << "\n";
    return true;
  }

  // Signals the worker, stops the queues so blocking Pops return, joins the
  // worker, then releases the RKNN context.
  void Stop() override {
    running_.store(false);
    if (input_queue_) input_queue_->Stop();
    for (auto& q : output_queues_) q->Stop();
    if (worker_.joinable()) worker_.join();

#if defined(RK3588_ENABLE_RKNN)
    if (rknn_ctx_) {
      rknn_destroy(rknn_ctx_);
      rknn_ctx_ = 0;
    }
#endif
    std::cout << "[ai_yolo] stopped\n";
  }

 private:
#if defined(RK3588_ENABLE_RKNN)
  // Reads the .rknn blob from disk, initializes the RKNN context, queries
  // tensor attributes, derives the model input size, and (in "auto" mode)
  // infers the YOLO version from the output tensor structure.
  bool LoadModel() {
    std::ifstream file(model_path_, std::ios::binary | std::ios::ate);
    if (!file.is_open()) return false;

    size_t model_size = file.tellg();
    file.seekg(0, std::ios::beg);
    model_data_.resize(model_size);
    if (!file.read(reinterpret_cast<char*>(model_data_.data()), model_size)) {
      return false;
    }

    // model_data_ is kept alive for the lifetime of the context.
    int ret = rknn_init(&rknn_ctx_, model_data_.data(), model_size, 0, nullptr);
    if (ret < 0) {
      std::cerr << "[ai_yolo] rknn_init failed: " << ret << "\n";
      return false;
    }

    rknn_input_output_num io_num;
    ret = rknn_query(rknn_ctx_, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
    if (ret < 0) {
      std::cerr << "[ai_yolo] rknn_query IO num failed\n";
      return false;
    }
    n_input_ = io_num.n_input;
    n_output_ = io_num.n_output;

    // NOTE(review): per-tensor rknn_query return codes are not checked here.
    input_attrs_.resize(n_input_);
    for (uint32_t i = 0; i < n_input_; ++i) {
      input_attrs_[i].index = i;
      rknn_query(rknn_ctx_, RKNN_QUERY_INPUT_ATTR, &input_attrs_[i], sizeof(rknn_tensor_attr));
    }

    output_attrs_.resize(n_output_);
    for (uint32_t i = 0; i < n_output_; ++i) {
      output_attrs_[i].index = i;
      rknn_query(rknn_ctx_, RKNN_QUERY_OUTPUT_ATTR, &output_attrs_[i], sizeof(rknn_tensor_attr));
    }

    // The model's real input size overrides the configured model_w/model_h.
    if (input_attrs_[0].fmt == RKNN_TENSOR_NCHW) {
      model_input_h_ = input_attrs_[0].dims[2];
      model_input_w_ = input_attrs_[0].dims[3];
    } else {
      model_input_h_ = input_attrs_[0].dims[1];
      model_input_w_ = input_attrs_[0].dims[2];
    }

    // Auto-detect YOLO version based on output structure.
    if (auto_detect_version_) {
      if (n_output_ == 1) {
        // Single output tensor: YOLOv8 style.
        yolo_version_ = YoloVersion::V8;
      } else if (n_output_ >= 3) {
        // Multiple outputs: likely YOLOv5 style.
        // Check output dimensions to confirm:
        // YOLOv5: 3 outputs with shape [1, 255, H, W] (255 = 3*(5+80));
        // YOLOv8 can also have multiple outputs but a different structure.
        uint32_t out0_elems = output_attrs_[0].n_elems;
        int grid_h = model_input_h_ / 8;  // first head runs at stride 8
        int grid_w = model_input_w_ / 8;
        int expected_v5 = 3 * kPropBoxSizeV5 * grid_h * grid_w;
        if (out0_elems == static_cast<uint32_t>(expected_v5)) {
          yolo_version_ = YoloVersion::V5;
        } else {
          yolo_version_ = YoloVersion::V8;
        }
      }
    }

    const char* ver_str = (yolo_version_ == YoloVersion::V5) ? "v5" : "v8";
    std::cout << "[ai_yolo] model input: " << model_input_w_ << "x" << model_input_h_
              << ", outputs: " << n_output_ << ", version: " << ver_str << "\n";
    return true;
  }
#endif

  // Fan-out: hands the (shared) frame pointer to every downstream queue.
  void PushToDownstream(FramePtr frame) {
    for (auto& q : output_queues_) {
      q->Push(frame);
    }
  }

  // Worker thread body: pop -> (infer) -> forward, until Stop() clears
  // running_. The 200 ms Pop timeout bounds shutdown latency.
  void WorkerLoop() {
    using namespace std::chrono;
    FramePtr frame;

    while (running_.load()) {
      if (!input_queue_->Pop(frame, milliseconds(200))) continue;
      if (!frame) continue;

#if defined(RK3588_ENABLE_RKNN)
      RunInference(frame);
#endif
      PushToDownstream(frame);
      ++processed_;

      if (processed_ % 100 == 0) {
        std::cout << "[ai_yolo] processed " << processed_ << " frames\n";
      }
    }
  }

#if defined(RK3588_ENABLE_RKNN)
  // Feeds one frame through the NPU and post-processes the outputs. On any
  // RKNN error the frame is forwarded without detections (caller continues).
  void RunInference(FramePtr frame) {
    if (!frame->data || frame->data_size == 0) return;

    bool is_rgb = (frame->format == PixelFormat::RGB || frame->format == PixelFormat::BGR);
    if (!is_rgb) {
      std::cerr << "[ai_yolo] input must be RGB/BGR, got other format\n";
      return;
    }

    // NOTE(review): input size is taken from the frame, not the model —
    // assumes an upstream node already resized the frame to
    // model_input_w_ x model_input_h_; confirm the pipeline guarantees this.
    rknn_input inputs[1];
    memset(inputs, 0, sizeof(inputs));
    inputs[0].index = 0;
    inputs[0].type = RKNN_TENSOR_UINT8;
    inputs[0].size = frame->width * frame->height * 3;
    inputs[0].fmt = RKNN_TENSOR_NHWC;
    inputs[0].buf = frame->data;
    inputs[0].pass_through = 0;

    int ret = rknn_inputs_set(rknn_ctx_, n_input_, inputs);
    if (ret < 0) {
      std::cerr << "[ai_yolo] rknn_inputs_set failed: " << ret << "\n";
      return;
    }

    ret = rknn_run(rknn_ctx_, nullptr);
    if (ret < 0) {
      std::cerr << "[ai_yolo] rknn_run failed: " << ret << "\n";
      return;
    }

    // want_float = 0: keep outputs quantized; PostProcess dequantizes
    // selectively using each tensor's (zp, scale).
    std::vector<rknn_output> outputs(n_output_);
    memset(outputs.data(), 0, sizeof(rknn_output) * n_output_);
    for (uint32_t i = 0; i < n_output_; ++i) {
      outputs[i].want_float = 0;
    }

    ret = rknn_outputs_get(rknn_ctx_, n_output_, outputs.data(), nullptr);
    if (ret < 0) {
      std::cerr << "[ai_yolo] rknn_outputs_get failed: " << ret << "\n";
      return;
    }

    PostProcess(outputs, frame);
    rknn_outputs_release(rknn_ctx_, n_output_, outputs.data());
  }

  // Decodes raw network outputs into a DetectionResult attached to the
  // frame: decode candidates (v5 or v8 path), sort by score, per-class NMS,
  // map boxes back to frame coordinates, and apply the class filter.
  void PostProcess(std::vector<rknn_output>& outputs, FramePtr frame) {
    std::vector<float> boxes;      // [x, y, w, h] per candidate
    std::vector<float> obj_probs;  // score per candidate
    std::vector<int> class_ids;    // class id per candidate
    int valid_count = 0;

    // Per-output quantization parameters for dequantizing int8 tensors.
    std::vector<int32_t> zps;
    std::vector<float> scales;
    for (uint32_t i = 0; i < n_output_; ++i) {
      zps.push_back(output_attrs_[i].zp);
      scales.push_back(output_attrs_[i].scale);
    }

    if (yolo_version_ == YoloVersion::V5) {
      // YOLOv5: 3 feature maps with anchors at strides 8/16/32.
      if (n_output_ < 3) return;

      int stride0 = 8, stride1 = 16, stride2 = 32;
      int grid_h0 = model_input_h_ / stride0, grid_w0 = model_input_w_ / stride0;
      int grid_h1 = model_input_h_ / stride1, grid_w1 = model_input_w_ / stride1;
      int grid_h2 = model_input_h_ / stride2, grid_w2 = model_input_w_ / stride2;

      int cnt0 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[0].buf), kAnchor0,
                                     grid_h0, grid_w0, model_input_h_, model_input_w_, stride0,
                                     boxes, obj_probs, class_ids, conf_thresh_, zps[0], scales[0]);
      int cnt1 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[1].buf), kAnchor1,
                                     grid_h1, grid_w1, model_input_h_, model_input_w_, stride1,
                                     boxes, obj_probs, class_ids, conf_thresh_, zps[1], scales[1]);
      int cnt2 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[2].buf), kAnchor2,
                                     grid_h2, grid_w2, model_input_h_, model_input_w_, stride2,
                                     boxes, obj_probs, class_ids, conf_thresh_, zps[2], scales[2]);
      valid_count = cnt0 + cnt1 + cnt2;
    } else {
      // YOLOv8: single output or different structure.
      // Output shape: [1, 84, 8400] or [1, num_classes+4, num_boxes].
      if (n_output_ < 1) return;

      // Determine number of boxes from output dimensions.
      // Typical YOLOv8 output: [1, 84, 8400] where 84 = 4 + 80 classes.
      int num_boxes = 0;
      int num_channels = 0;

      // Check output format: channels may be on dim 1 or dim 2.
      if (output_attrs_[0].n_dims >= 3) {
        // Shape: [batch, channels, boxes] or [batch, boxes, channels].
        if (output_attrs_[0].dims[1] == static_cast<uint32_t>(4 + num_classes_)) {
          num_channels = output_attrs_[0].dims[1];
          num_boxes = output_attrs_[0].dims[2];
        } else if (output_attrs_[0].dims[2] == static_cast<uint32_t>(4 + num_classes_)) {
          num_boxes = output_attrs_[0].dims[1];
          num_channels = output_attrs_[0].dims[2];
        } else {
          // Fallback: assume standard 8400 boxes.
          num_boxes = 8400;
          num_channels = 4 + num_classes_;
        }
      } else {
        // Flat output, calculate from total elements.
        num_channels = 4 + num_classes_;
        num_boxes = output_attrs_[0].n_elems / num_channels;
      }

      // Pick float or int8 decode based on the output tensor's dtype.
      if (output_attrs_[0].type == RKNN_TENSOR_FLOAT32 ||
          output_attrs_[0].type == RKNN_TENSOR_FLOAT16) {
        valid_count = ProcessOutputV8(reinterpret_cast<float*>(outputs[0].buf),
                                      num_boxes, num_classes_,
                                      model_input_h_, model_input_w_,
                                      boxes, obj_probs, class_ids, conf_thresh_);
      } else {
        valid_count = ProcessOutputV8Int8(reinterpret_cast<int8_t*>(outputs[0].buf),
                                          num_boxes, num_classes_,
                                          model_input_h_, model_input_w_,
                                          boxes, obj_probs, class_ids, conf_thresh_,
                                          zps[0], scales[0]);
      }
    }

    if (valid_count <= 0) return;

    // Sort scores descending; indices is permuted in lockstep, so after the
    // sort indices[i] is the ORIGINAL candidate index of the i-th best score.
    std::vector<int> indices(valid_count);
    for (int i = 0; i < valid_count; ++i) indices[i] = i;

    QuickSortDescending(obj_probs, 0, valid_count - 1, indices);

    // Class-wise NMS: suppressed entries become -1 in indices.
    std::set<int> class_set(class_ids.begin(), class_ids.end());
    for (int c : class_set) {
      NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
    }

    // Model-input -> frame coordinate scale factors.
    float scale_w = static_cast<float>(model_input_w_) / frame->width;
    float scale_h = static_cast<float>(model_input_h_) / frame->height;

    auto det_result = std::make_shared<DetectionResult>();
    det_result->img_w = frame->width;
    det_result->img_h = frame->height;
    det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8";

    for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) {
      if (indices[i] == -1) continue;  // suppressed by NMS
      int n = indices[i];
      int cls_id = class_ids[n];

      if (!class_filter_.empty() && class_filter_.find(cls_id) == class_filter_.end()) {
        continue;
      }

      float x1 = boxes[n * 4 + 0];
      float y1 = boxes[n * 4 + 1];
      float w = boxes[n * 4 + 2];
      float h = boxes[n * 4 + 3];

      Detection det;
      det.cls_id = cls_id;
      // obj_probs was sorted together with indices, so position i is the
      // score matching candidate indices[i].
      det.score = obj_probs[i];
      // Scale back to frame pixels and clamp the box inside the frame.
      det.bbox.x = static_cast<float>(Clamp(static_cast<int>(x1 / scale_w), 0, frame->width));
      det.bbox.y = static_cast<float>(Clamp(static_cast<int>(y1 / scale_h), 0, frame->height));
      det.bbox.w = static_cast<float>(Clamp(static_cast<int>(w / scale_w), 0, frame->width - static_cast<int>(det.bbox.x)));
      det.bbox.h = static_cast<float>(Clamp(static_cast<int>(h / scale_h), 0, frame->height - static_cast<int>(det.bbox.y)));
      det.track_id = -1;  // tracking is assigned by a downstream node, if any

      det_result->items.push_back(det);
    }

    frame->det = det_result;
  }
#endif

  // --- configuration ---
  std::string id_;
  std::string model_path_;
  float conf_thresh_ = 0.25f;   // minimum detection confidence
  float nms_thresh_ = 0.45f;    // IoU threshold for NMS
  int model_input_w_ = 640;     // overwritten from the model in LoadModel
  int model_input_h_ = 640;
  int num_classes_ = 80;
  YoloVersion yolo_version_ = YoloVersion::V8;
  bool auto_detect_version_ = false;
  std::set<int> class_filter_;  // empty = keep all classes

  // --- runtime state ---
  std::atomic<bool> running_{false};
  std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
  std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;
  std::thread worker_;
  uint64_t processed_ = 0;  // frames handled; touched only by the worker thread

#if defined(RK3588_ENABLE_RKNN)
  rknn_context rknn_ctx_ = 0;
  std::vector<uint8_t> model_data_;  // backing storage for rknn_init
  uint32_t n_input_ = 0;
  uint32_t n_output_ = 0;
  std::vector<rknn_tensor_attr> input_attrs_;
  std::vector<rknn_tensor_attr> output_attrs_;
#endif
};
|
|
|
|
// Registers AiYoloNode with the node factory under the type key "ai_yolo".
REGISTER_NODE(AiYoloNode, "ai_yolo");
|
|
|
|
} // namespace rk3588
|