// OrangePi3588Media/plugins/ai_yolo/ai_yolo_node.cpp
//
// YOLOv5 detection plugin node for the RK3588 media pipeline.
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <set>
#include <thread>
#include <vector>
#include "node.h"
#if defined(RK3588_ENABLE_RKNN)
#include "rknn_api.h"
#endif
namespace rk3588 {
namespace {
constexpr int kObjClassNum = 80;
constexpr int kPropBoxSize = 5 + kObjClassNum;
constexpr int kMaxDetections = 64;
const int kAnchor0[6] = {10, 13, 16, 30, 33, 23};
const int kAnchor1[6] = {30, 61, 62, 45, 59, 119};
const int kAnchor2[6] = {116, 90, 156, 198, 373, 326};
const char* kCocoLabels[kObjClassNum] = {
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
"dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
"umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
"kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
"bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
"couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
"remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
"book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
};
inline int Clamp(float val, int min_val, int max_val) {
return val > min_val ? (val < max_val ? static_cast<int>(val) : max_val) : min_val;
}
inline int32_t ClipFloat(float val, float min_val, float max_val) {
return static_cast<int32_t>(val <= min_val ? min_val : (val >= max_val ? max_val : val));
}
inline int8_t QuantizeF32ToAffine(float f32, int32_t zp, float scale) {
float dst_val = (f32 / scale) + zp;
return static_cast<int8_t>(ClipFloat(dst_val, -128, 127));
}
inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) {
return (static_cast<float>(qnt) - static_cast<float>(zp)) * scale;
}
float CalculateIoU(float x1_min, float y1_min, float x1_max, float y1_max,
float x2_min, float y2_min, float x2_max, float y2_max) {
float w = std::fmax(0.f, std::fmin(x1_max, x2_max) - std::fmax(x1_min, x2_min) + 1.0f);
float h = std::fmax(0.f, std::fmin(y1_max, y2_max) - std::fmax(y1_min, y2_min) + 1.0f);
float inter = w * h;
float area1 = (x1_max - x1_min + 1.0f) * (y1_max - y1_min + 1.0f);
float area2 = (x2_max - x2_min + 1.0f) * (y2_max - y2_min + 1.0f);
float uni = area1 + area2 - inter;
return uni <= 0.f ? 0.f : (inter / uni);
}
void QuickSortDescending(std::vector<float>& values, int left, int right, std::vector<int>& indices) {
if (left >= right) return;
float pivot = values[left];
int pivot_idx = indices[left];
int low = left, high = right;
while (low < high) {
while (low < high && values[high] <= pivot) high--;
values[low] = values[high];
indices[low] = indices[high];
while (low < high && values[low] >= pivot) low++;
values[high] = values[low];
indices[high] = indices[low];
}
values[low] = pivot;
indices[low] = pivot_idx;
QuickSortDescending(values, left, low - 1, indices);
QuickSortDescending(values, low + 1, right, indices);
}
void NMS(int valid_count, std::vector<float>& boxes, std::vector<int>& class_ids,
std::vector<int>& order, int filter_id, float threshold) {
for (int i = 0; i < valid_count; ++i) {
if (order[i] == -1 || class_ids[i] != filter_id) continue;
int n = order[i];
for (int j = i + 1; j < valid_count; ++j) {
int m = order[j];
if (m == -1 || class_ids[j] != filter_id) continue;
float x1_min = boxes[n * 4 + 0];
float y1_min = boxes[n * 4 + 1];
float x1_max = x1_min + boxes[n * 4 + 2];
float y1_max = y1_min + boxes[n * 4 + 3];
float x2_min = boxes[m * 4 + 0];
float y2_min = boxes[m * 4 + 1];
float x2_max = x2_min + boxes[m * 4 + 2];
float y2_max = y2_min + boxes[m * 4 + 3];
if (CalculateIoU(x1_min, y1_min, x1_max, y1_max, x2_min, y2_min, x2_max, y2_max) > threshold) {
order[j] = -1;
}
}
}
}
#if defined(RK3588_ENABLE_RKNN)
int ProcessFeatureMap(int8_t* input, const int* anchor, int grid_h, int grid_w,
int model_h, int model_w, int stride,
std::vector<float>& boxes, std::vector<float>& obj_probs,
std::vector<int>& class_ids, float conf_thresh, int32_t zp, float scale) {
int valid_count = 0;
int grid_len = grid_h * grid_w;
int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh, zp, scale);
for (int a = 0; a < 3; ++a) {
for (int i = 0; i < grid_h; ++i) {
for (int j = 0; j < grid_w; ++j) {
int8_t box_conf = input[(kPropBoxSize * a + 4) * grid_len + i * grid_w + j];
if (box_conf >= thresh_i8) {
int offset = (kPropBoxSize * a) * grid_len + i * grid_w + j;
int8_t* ptr = input + offset;
float bx = DequantizeAffineToF32(*ptr, zp, scale) * 2.0f - 0.5f;
float by = DequantizeAffineToF32(ptr[grid_len], zp, scale) * 2.0f - 0.5f;
float bw = DequantizeAffineToF32(ptr[2 * grid_len], zp, scale) * 2.0f;
float bh = DequantizeAffineToF32(ptr[3 * grid_len], zp, scale) * 2.0f;
bx = (bx + j) * stride;
by = (by + i) * stride;
bw = bw * bw * anchor[a * 2];
bh = bh * bh * anchor[a * 2 + 1];
bx -= bw / 2.0f;
by -= bh / 2.0f;
int8_t max_cls_prob = ptr[5 * grid_len];
int max_cls_id = 0;
for (int k = 1; k < kObjClassNum; ++k) {
int8_t prob = ptr[(5 + k) * grid_len];
if (prob > max_cls_prob) {
max_cls_id = k;
max_cls_prob = prob;
}
}
if (max_cls_prob > thresh_i8) {
float score = DequantizeAffineToF32(max_cls_prob, zp, scale) *
DequantizeAffineToF32(box_conf, zp, scale);
obj_probs.push_back(score);
class_ids.push_back(max_cls_id);
boxes.push_back(bx);
boxes.push_back(by);
boxes.push_back(bw);
boxes.push_back(bh);
++valid_count;
}
}
}
}
}
return valid_count;
}
#endif
} // namespace
class AiYoloNode : public INode {
public:
std::string Id() const override { return id_; }
std::string Type() const override { return "ai_yolo"; }
bool Init(const SimpleJson& config, const NodeContext& ctx) override {
id_ = config.ValueOr<std::string>("id", "ai_yolo");
model_path_ = config.ValueOr<std::string>("model_path", "");
conf_thresh_ = config.ValueOr<float>("conf", 0.25f);
nms_thresh_ = config.ValueOr<float>("nms", 0.45f);
model_input_w_ = config.ValueOr<int>("model_w", 640);
model_input_h_ = config.ValueOr<int>("model_h", 640);
if (const SimpleJson* filter = config.Find("class_filter")) {
for (const auto& item : filter->AsArray()) {
class_filter_.insert(item.AsInt(-1));
}
}
input_queue_ = ctx.input_queue;
if (!input_queue_) {
std::cerr << "[ai_yolo] no input queue for node " << id_ << "\n";
return false;
}
if (ctx.output_queues.empty()) {
std::cerr << "[ai_yolo] no output queue for node " << id_ << "\n";
return false;
}
output_queues_ = ctx.output_queues;
#if defined(RK3588_ENABLE_RKNN)
if (model_path_.empty()) {
std::cerr << "[ai_yolo] model_path is required\n";
return false;
}
if (!LoadModel()) {
std::cerr << "[ai_yolo] failed to load model: " << model_path_ << "\n";
return false;
}
std::cout << "[ai_yolo] model loaded: " << model_path_ << "\n";
#else
std::cout << "[ai_yolo] RKNN disabled, will passthrough frames\n";
#endif
return true;
}
bool Start() override {
if (!input_queue_) return false;
running_.store(true);
worker_ = std::thread(&AiYoloNode::WorkerLoop, this);
std::cout << "[ai_yolo] started, conf=" << conf_thresh_ << " nms=" << nms_thresh_ << "\n";
return true;
}
void Stop() override {
running_.store(false);
if (input_queue_) input_queue_->Stop();
for (auto& q : output_queues_) q->Stop();
if (worker_.joinable()) worker_.join();
#if defined(RK3588_ENABLE_RKNN)
if (rknn_ctx_) {
rknn_destroy(rknn_ctx_);
rknn_ctx_ = 0;
}
#endif
std::cout << "[ai_yolo] stopped\n";
}
private:
#if defined(RK3588_ENABLE_RKNN)
bool LoadModel() {
std::ifstream file(model_path_, std::ios::binary | std::ios::ate);
if (!file.is_open()) return false;
size_t model_size = file.tellg();
file.seekg(0, std::ios::beg);
model_data_.resize(model_size);
if (!file.read(reinterpret_cast<char*>(model_data_.data()), model_size)) {
return false;
}
int ret = rknn_init(&rknn_ctx_, model_data_.data(), model_size, 0, nullptr);
if (ret < 0) {
std::cerr << "[ai_yolo] rknn_init failed: " << ret << "\n";
return false;
}
rknn_input_output_num io_num;
ret = rknn_query(rknn_ctx_, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
if (ret < 0) {
std::cerr << "[ai_yolo] rknn_query IO num failed\n";
return false;
}
n_input_ = io_num.n_input;
n_output_ = io_num.n_output;
input_attrs_.resize(n_input_);
for (uint32_t i = 0; i < n_input_; ++i) {
input_attrs_[i].index = i;
rknn_query(rknn_ctx_, RKNN_QUERY_INPUT_ATTR, &input_attrs_[i], sizeof(rknn_tensor_attr));
}
output_attrs_.resize(n_output_);
for (uint32_t i = 0; i < n_output_; ++i) {
output_attrs_[i].index = i;
rknn_query(rknn_ctx_, RKNN_QUERY_OUTPUT_ATTR, &output_attrs_[i], sizeof(rknn_tensor_attr));
}
if (input_attrs_[0].fmt == RKNN_TENSOR_NCHW) {
model_input_h_ = input_attrs_[0].dims[2];
model_input_w_ = input_attrs_[0].dims[3];
} else {
model_input_h_ = input_attrs_[0].dims[1];
model_input_w_ = input_attrs_[0].dims[2];
}
std::cout << "[ai_yolo] model input: " << model_input_w_ << "x" << model_input_h_
<< ", outputs: " << n_output_ << "\n";
return true;
}
#endif
void PushToDownstream(FramePtr frame) {
for (auto& q : output_queues_) {
q->Push(frame);
}
}
void WorkerLoop() {
using namespace std::chrono;
FramePtr frame;
while (running_.load()) {
if (!input_queue_->Pop(frame, milliseconds(200))) continue;
if (!frame) continue;
#if defined(RK3588_ENABLE_RKNN)
RunInference(frame);
#endif
PushToDownstream(frame);
++processed_;
if (processed_ % 100 == 0) {
std::cout << "[ai_yolo] processed " << processed_ << " frames\n";
}
}
}
#if defined(RK3588_ENABLE_RKNN)
void RunInference(FramePtr frame) {
if (!frame->data || frame->data_size == 0) return;
bool is_rgb = (frame->format == PixelFormat::RGB || frame->format == PixelFormat::BGR);
if (!is_rgb) {
std::cerr << "[ai_yolo] input must be RGB/BGR, got other format\n";
return;
}
rknn_input inputs[1];
memset(inputs, 0, sizeof(inputs));
inputs[0].index = 0;
inputs[0].type = RKNN_TENSOR_UINT8;
inputs[0].size = frame->width * frame->height * 3;
inputs[0].fmt = RKNN_TENSOR_NHWC;
inputs[0].buf = frame->data;
inputs[0].pass_through = 0;
int ret = rknn_inputs_set(rknn_ctx_, n_input_, inputs);
if (ret < 0) {
std::cerr << "[ai_yolo] rknn_inputs_set failed: " << ret << "\n";
return;
}
ret = rknn_run(rknn_ctx_, nullptr);
if (ret < 0) {
std::cerr << "[ai_yolo] rknn_run failed: " << ret << "\n";
return;
}
std::vector<rknn_output> outputs(n_output_);
memset(outputs.data(), 0, sizeof(rknn_output) * n_output_);
for (uint32_t i = 0; i < n_output_; ++i) {
outputs[i].want_float = 0;
}
ret = rknn_outputs_get(rknn_ctx_, n_output_, outputs.data(), nullptr);
if (ret < 0) {
std::cerr << "[ai_yolo] rknn_outputs_get failed: " << ret << "\n";
return;
}
PostProcess(outputs, frame);
rknn_outputs_release(rknn_ctx_, n_output_, outputs.data());
}
void PostProcess(std::vector<rknn_output>& outputs, FramePtr frame) {
if (n_output_ < 3) return;
std::vector<float> boxes;
std::vector<float> obj_probs;
std::vector<int> class_ids;
std::vector<int32_t> zps;
std::vector<float> scales;
for (uint32_t i = 0; i < n_output_; ++i) {
zps.push_back(output_attrs_[i].zp);
scales.push_back(output_attrs_[i].scale);
}
int stride0 = 8, stride1 = 16, stride2 = 32;
int grid_h0 = model_input_h_ / stride0, grid_w0 = model_input_w_ / stride0;
int grid_h1 = model_input_h_ / stride1, grid_w1 = model_input_w_ / stride1;
int grid_h2 = model_input_h_ / stride2, grid_w2 = model_input_w_ / stride2;
int cnt0 = ProcessFeatureMap(reinterpret_cast<int8_t*>(outputs[0].buf), kAnchor0,
grid_h0, grid_w0, model_input_h_, model_input_w_, stride0,
boxes, obj_probs, class_ids, conf_thresh_, zps[0], scales[0]);
int cnt1 = ProcessFeatureMap(reinterpret_cast<int8_t*>(outputs[1].buf), kAnchor1,
grid_h1, grid_w1, model_input_h_, model_input_w_, stride1,
boxes, obj_probs, class_ids, conf_thresh_, zps[1], scales[1]);
int cnt2 = ProcessFeatureMap(reinterpret_cast<int8_t*>(outputs[2].buf), kAnchor2,
grid_h2, grid_w2, model_input_h_, model_input_w_, stride2,
boxes, obj_probs, class_ids, conf_thresh_, zps[2], scales[2]);
int valid_count = cnt0 + cnt1 + cnt2;
if (valid_count <= 0) return;
std::vector<int> indices(valid_count);
for (int i = 0; i < valid_count; ++i) indices[i] = i;
QuickSortDescending(obj_probs, 0, valid_count - 1, indices);
std::set<int> class_set(class_ids.begin(), class_ids.end());
for (int c : class_set) {
NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
}
float scale_w = static_cast<float>(model_input_w_) / frame->width;
float scale_h = static_cast<float>(model_input_h_) / frame->height;
auto det_result = std::make_shared<DetectionResult>();
det_result->img_w = frame->width;
det_result->img_h = frame->height;
det_result->model_name = "yolov5";
for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) {
if (indices[i] == -1) continue;
int n = indices[i];
int cls_id = class_ids[n];
if (!class_filter_.empty() && class_filter_.find(cls_id) == class_filter_.end()) {
continue;
}
float x1 = boxes[n * 4 + 0];
float y1 = boxes[n * 4 + 1];
float w = boxes[n * 4 + 2];
float h = boxes[n * 4 + 3];
Detection det;
det.cls_id = cls_id;
det.score = obj_probs[i];
det.bbox.x = Clamp(x1 / scale_w, 0, frame->width);
det.bbox.y = Clamp(y1 / scale_h, 0, frame->height);
det.bbox.w = Clamp(w / scale_w, 0, frame->width - det.bbox.x);
det.bbox.h = Clamp(h / scale_h, 0, frame->height - det.bbox.y);
det.track_id = -1;
det_result->items.push_back(det);
}
frame->det = det_result;
}
#endif
std::string id_;
std::string model_path_;
float conf_thresh_ = 0.25f;
float nms_thresh_ = 0.45f;
int model_input_w_ = 640;
int model_input_h_ = 640;
std::set<int> class_filter_;
std::atomic<bool> running_{false};
std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;
std::thread worker_;
uint64_t processed_ = 0;
#if defined(RK3588_ENABLE_RKNN)
rknn_context rknn_ctx_ = 0;
std::vector<uint8_t> model_data_;
uint32_t n_input_ = 0;
uint32_t n_output_ = 0;
std::vector<rknn_tensor_attr> input_attrs_;
std::vector<rknn_tensor_attr> output_attrs_;
#endif
};
REGISTER_NODE(AiYoloNode, "ai_yolo");
} // namespace rk3588