// (extraction metadata, preserved as a comment: 713 lines, 29 KiB, C++)
#include <algorithm>
|
|
#include <atomic>
|
|
#include <chrono>
|
|
#include <cmath>
|
|
#include <cstring>
|
|
#include <iostream>
|
|
#include <memory>
|
|
#include <set>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
#include "ai_scheduler.h"
|
|
#include "node.h"
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
#include "rknn_api.h"
|
|
#endif
|
|
|
|
namespace rk3588 {
|
|
|
|
namespace {
|
|
|
|
// Number of object classes the post-processing assumes (COCO-80).
constexpr int kObjClassNum = 80;
// Per-anchor channel count for each head layout.
constexpr int kPropBoxSizeV5 = 5 + kObjClassNum; // YOLOv5: x,y,w,h,conf + 80 classes
constexpr int kPropBoxSizeV8 = 4 + kObjClassNum; // YOLOv8: x,y,w,h + 80 classes (no conf)
// Hard cap on detections emitted per frame after NMS.
constexpr int kMaxDetections = 64;

// YOLOv5 anchors, one (w,h) pair per anchor, three anchors per head.
// kAnchor0 pairs with stride 8, kAnchor1 with stride 16, kAnchor2 with stride 32.
const int kAnchor0[6] = {10, 13, 16, 30, 33, 23};
const int kAnchor1[6] = {30, 61, 62, 45, 59, 119};
const int kAnchor2[6] = {116, 90, 156, 198, 373, 326};

// Which decode path to use for the model outputs.
enum class YoloVersion { V5, V8 };

// COCO class-id -> human-readable label.
// NOTE(review): not referenced by the code visible in this file — presumably
// used by downstream overlay/reporting nodes; verify before removing.
const char* kCocoLabels[kObjClassNum] = {
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
    "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
    "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
    "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
};
|
|
|
|
// Clamp a float to the integer range [min_val, max_val] and truncate to int.
// In-range values truncate toward zero; out-of-range values saturate.
// NaN input falls through the first comparison and yields min_val, matching
// the original ternary form.
inline int Clamp(float val, int min_val, int max_val) {
  if (!(val > min_val)) return min_val;  // val <= min_val, or NaN
  if (!(val < max_val)) return max_val;  // val >= max_val
  return static_cast<int>(val);
}
|
|
|
|
// Saturate a float into [min_val, max_val] and return it truncated to int32_t.
// Bounds are taken as floats so callers can pass exact quantization limits.
inline int32_t ClipFloat(float val, float min_val, float max_val) {
  if (val <= min_val) return static_cast<int32_t>(min_val);
  if (val >= max_val) return static_cast<int32_t>(max_val);
  return static_cast<int32_t>(val);
}
|
|
|
|
// Affine-quantize a float to int8: q = f32/scale + zp, saturated to [-128, 127].
// Inverse of DequantizeAffineToF32 for in-range values.
inline int8_t QuantizeF32ToAffine(float f32, int32_t zp, float scale) {
  const float shifted = f32 / scale + static_cast<float>(zp);
  const float clipped = std::clamp(shifted, -128.0f, 127.0f);
  return static_cast<int8_t>(static_cast<int32_t>(clipped));
}
|
|
|
|
// Affine-dequantize an int8 value back to float: f = (q - zp) * scale.
inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) {
  const float centered = static_cast<float>(qnt) - static_cast<float>(zp);
  return centered * scale;
}
|
|
|
|
// Intersection-over-union of two axis-aligned boxes given as corner pairs.
// Uses the inclusive-pixel convention (+1 on widths/heights). Returns 0 when
// the boxes do not overlap or the union is degenerate.
float CalculateIoU(float x1_min, float y1_min, float x1_max, float y1_max,
                   float x2_min, float y2_min, float x2_max, float y2_max) {
  const float overlap_w = std::fmin(x1_max, x2_max) - std::fmax(x1_min, x2_min) + 1.0f;
  const float overlap_h = std::fmin(y1_max, y2_max) - std::fmax(y1_min, y2_min) + 1.0f;
  const float inter = std::fmax(0.f, overlap_w) * std::fmax(0.f, overlap_h);
  const float area_a = (x1_max - x1_min + 1.0f) * (y1_max - y1_min + 1.0f);
  const float area_b = (x2_max - x2_min + 1.0f) * (y2_max - y2_min + 1.0f);
  const float uni = area_a + area_b - inter;
  if (uni <= 0.f) return 0.f;
  return inter / uni;
}
|
|
|
|
// In-place quicksort of `values[left..right]` in descending order, permuting
// `indices` in lockstep so indices[i] still names the original slot of
// values[i]. Hoare-style partition with values[left] as the pivot.
// Recursion depth is O(n) worst case; fine for the small candidate lists here.
void QuickSortDescending(std::vector<float>& values, int left, int right, std::vector<int>& indices) {
  if (left >= right) return;

  const float pivot_val = values[left];
  const int pivot_slot = indices[left];
  int lo = left;
  int hi = right;

  while (lo < hi) {
    // Scan from the right for an element that belongs before the pivot.
    while (lo < hi && values[hi] <= pivot_val) --hi;
    values[lo] = values[hi];
    indices[lo] = indices[hi];
    // Scan from the left for an element that belongs after the pivot.
    while (lo < hi && values[lo] >= pivot_val) ++lo;
    values[hi] = values[lo];
    indices[hi] = indices[lo];
  }

  values[lo] = pivot_val;
  indices[lo] = pivot_slot;

  QuickSortDescending(values, left, lo - 1, indices);
  QuickSortDescending(values, lo + 1, right, indices);
}
|
|
|
|
void NMS(int valid_count, std::vector<float>& boxes, std::vector<int>& class_ids,
|
|
std::vector<int>& order, int filter_id, float threshold) {
|
|
for (int i = 0; i < valid_count; ++i) {
|
|
int n = order[i];
|
|
if (n < 0 || n >= valid_count) continue;
|
|
if (class_ids[n] != filter_id) continue;
|
|
for (int j = i + 1; j < valid_count; ++j) {
|
|
int m = order[j];
|
|
if (m < 0 || m >= valid_count) continue;
|
|
if (class_ids[m] != filter_id) continue;
|
|
float x1_min = boxes[n * 4 + 0];
|
|
float y1_min = boxes[n * 4 + 1];
|
|
float x1_max = x1_min + boxes[n * 4 + 2];
|
|
float y1_max = y1_min + boxes[n * 4 + 3];
|
|
float x2_min = boxes[m * 4 + 0];
|
|
float y2_min = boxes[m * 4 + 1];
|
|
float x2_max = x2_min + boxes[m * 4 + 2];
|
|
float y2_max = y2_min + boxes[m * 4 + 3];
|
|
if (CalculateIoU(x1_min, y1_min, x1_max, y1_max, x2_min, y2_min, x2_max, y2_max) > threshold) {
|
|
order[j] = -1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
// YOLOv5 feature map processing (anchor-based)
|
|
// YOLOv5 feature map processing (anchor-based).
//
// Decodes one INT8 head tensor laid out channel-major as
// [3 anchors][kPropBoxSizeV5 channels][grid_h][grid_w] and appends every
// candidate that passes the confidence threshold to `boxes` (x,y,w,h
// quadruples in model-input pixels), `obj_probs` (class_prob * box_conf) and
// `class_ids`. Returns the number of candidates appended.
//
// `model_h`/`model_w` are currently unused; `zp`/`scale` are the tensor's
// affine quantization parameters.
// NOTE(review): the *2-0.5 / (*2)^2 decode assumes the model graph already
// applies sigmoid to the raw outputs (standard RKNN YOLOv5 export) — confirm
// against the model being deployed.
int ProcessFeatureMapV5(int8_t* input, const int* anchor, int grid_h, int grid_w,
                        int model_h, int model_w, int stride,
                        std::vector<float>& boxes, std::vector<float>& obj_probs,
                        std::vector<int>& class_ids, float conf_thresh, int32_t zp, float scale) {
  int valid_count = 0;
  int grid_len = grid_h * grid_w;
  // Quantize the threshold once so the hot loop compares raw int8 values
  // instead of dequantizing every cell.
  int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh, zp, scale);

  for (int a = 0; a < 3; ++a) {
    for (int i = 0; i < grid_h; ++i) {
      for (int j = 0; j < grid_w; ++j) {
        // Channel 4 within each anchor block is the objectness score.
        int8_t box_conf = input[(kPropBoxSizeV5 * a + 4) * grid_len + i * grid_w + j];
        if (box_conf >= thresh_i8) {
          // `ptr` points at channel 0 (x) of this anchor/cell; the other
          // channels are `grid_len` elements apart.
          int offset = (kPropBoxSizeV5 * a) * grid_len + i * grid_w + j;
          int8_t* ptr = input + offset;

          float bx = DequantizeAffineToF32(*ptr, zp, scale) * 2.0f - 0.5f;
          float by = DequantizeAffineToF32(ptr[grid_len], zp, scale) * 2.0f - 0.5f;
          float bw = DequantizeAffineToF32(ptr[2 * grid_len], zp, scale) * 2.0f;
          float bh = DequantizeAffineToF32(ptr[3 * grid_len], zp, scale) * 2.0f;

          // Map from grid-cell offsets to model-input pixels, size the box by
          // its anchor, then convert center coordinates to top-left.
          bx = (bx + j) * stride;
          by = (by + i) * stride;
          bw = bw * bw * anchor[a * 2];
          bh = bh * bh * anchor[a * 2 + 1];
          bx -= bw / 2.0f;
          by -= bh / 2.0f;

          // Arg-max over the 80 class channels, still in the int8 domain
          // (valid because all channels share this tensor's zp/scale).
          int8_t max_cls_prob = ptr[5 * grid_len];
          int max_cls_id = 0;
          for (int k = 1; k < kObjClassNum; ++k) {
            int8_t prob = ptr[(5 + k) * grid_len];
            if (prob > max_cls_prob) {
              max_cls_id = k;
              max_cls_prob = prob;
            }
          }

          // Second gate on the class probability; final score is the usual
          // class_prob * objectness product, computed in float.
          if (max_cls_prob > thresh_i8) {
            float score = DequantizeAffineToF32(max_cls_prob, zp, scale) *
                          DequantizeAffineToF32(box_conf, zp, scale);
            obj_probs.push_back(score);
            class_ids.push_back(max_cls_id);
            boxes.push_back(bx);
            boxes.push_back(by);
            boxes.push_back(bw);
            boxes.push_back(bh);
            ++valid_count;
          }
        }
      }
    }
  }
  return valid_count;
}
|
|
|
|
// YOLOv8 output processing (anchor-free, single output tensor)
|
|
// YOLOv8 output processing (anchor-free, single output tensor).
//
// `output` is channel-major: channels 0..3 are cx,cy,w,h and channels
// 4..4+num_classes-1 are per-class scores, each channel `num_boxes` long.
// Appends boxes as top-left x,y,w,h (model-input pixels) for every candidate
// whose best class score reaches `conf_thresh`. Returns the number appended.
// `model_h`/`model_w` are currently unused.
int ProcessOutputV8(float* output, int num_boxes, int num_classes,
                    int model_h, int model_w,
                    std::vector<float>& boxes, std::vector<float>& obj_probs,
                    std::vector<int>& class_ids, float conf_thresh) {
  int kept = 0;

  for (int i = 0; i < num_boxes; ++i) {
    // Arg-max over the class channels for this candidate.
    float best_score = 0.0f;
    int best_cls = 0;
    for (int c = 0; c < num_classes; ++c) {
      const float s = output[(4 + c) * num_boxes + i];
      if (s > best_score) {
        best_score = s;
        best_cls = c;
      }
    }

    if (best_score < conf_thresh) continue;

    const float cx = output[i];
    const float cy = output[num_boxes + i];
    const float bw = output[2 * num_boxes + i];
    const float bh = output[3 * num_boxes + i];

    // Center -> top-left corner.
    boxes.push_back(cx - bw / 2.0f);
    boxes.push_back(cy - bh / 2.0f);
    boxes.push_back(bw);
    boxes.push_back(bh);
    obj_probs.push_back(best_score);
    class_ids.push_back(best_cls);
    ++kept;
  }
  return kept;
}
|
|
|
|
// YOLOv8 INT8 output processing
|
|
int ProcessOutputV8Int8(int8_t* output, int num_boxes, int num_classes,
|
|
int model_h, int model_w,
|
|
std::vector<float>& boxes, std::vector<float>& obj_probs,
|
|
std::vector<int>& class_ids, float conf_thresh,
|
|
int32_t zp, float scale) {
|
|
int valid_count = 0;
|
|
int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh, zp, scale);
|
|
|
|
for (int i = 0; i < num_boxes; ++i) {
|
|
int8_t max_score_i8 = -128;
|
|
int max_cls_id = 0;
|
|
for (int c = 0; c < num_classes; ++c) {
|
|
int8_t score = output[(4 + c) * num_boxes + i];
|
|
if (score > max_score_i8) {
|
|
max_score_i8 = score;
|
|
max_cls_id = c;
|
|
}
|
|
}
|
|
|
|
if (max_score_i8 >= thresh_i8) {
|
|
float cx = DequantizeAffineToF32(output[0 * num_boxes + i], zp, scale);
|
|
float cy = DequantizeAffineToF32(output[1 * num_boxes + i], zp, scale);
|
|
float w = DequantizeAffineToF32(output[2 * num_boxes + i], zp, scale);
|
|
float h = DequantizeAffineToF32(output[3 * num_boxes + i], zp, scale);
|
|
float max_score = DequantizeAffineToF32(max_score_i8, zp, scale);
|
|
|
|
float x1 = cx - w / 2.0f;
|
|
float y1 = cy - h / 2.0f;
|
|
|
|
boxes.push_back(x1);
|
|
boxes.push_back(y1);
|
|
boxes.push_back(w);
|
|
boxes.push_back(h);
|
|
obj_probs.push_back(max_score);
|
|
class_ids.push_back(max_cls_id);
|
|
++valid_count;
|
|
}
|
|
}
|
|
return valid_count;
|
|
}
|
|
#endif
|
|
|
|
} // namespace
|
|
|
|
// Pipeline node that runs YOLOv5/YOLOv8 object detection on incoming RGB/BGR
// frames via the shared AiScheduler, attaches a DetectionResult to each frame,
// and forwards frames downstream. When RK3588_ENABLE_RKNN is not defined the
// node degrades to a pure passthrough.
class AiYoloNode : public INode {
 public:
  std::string Id() const override { return id_; }
  std::string Type() const override { return "ai_yolo"; }

  // Reads configuration, wires input/output queues and (when RKNN is enabled)
  // loads the model through AiScheduler. Returns false on any missing
  // prerequisite; logs go to stderr/stdout.
  bool Init(const SimpleJson& config, const NodeContext& ctx) override {
    id_ = config.ValueOr<std::string>("id", "ai_yolo");
    model_path_ = config.ValueOr<std::string>("model_path", "");
    conf_thresh_ = config.ValueOr<float>("conf", 0.25f);
    nms_thresh_ = config.ValueOr<float>("nms", 0.45f);
    model_input_w_ = config.ValueOr<int>("model_w", 640);
    model_input_h_ = config.ValueOr<int>("model_h", 640);
    num_classes_ = config.ValueOr<int>("num_classes", 80);

    // Optional inference throttle. 0 = run every frame.
    // "infer_interval_ms" wins; otherwise "infer_fps" is converted to an
    // interval (floored at 1 ms).
    infer_interval_ms_ = std::max<int64_t>(0, static_cast<int64_t>(config.ValueOr<int>("infer_interval_ms", 0)));
    if (infer_interval_ms_ <= 0) {
      const double infer_fps = config.ValueOr<double>("infer_fps", 0.0);
      if (infer_fps > 0.0) {
        infer_interval_ms_ = static_cast<int64_t>(1000.0 / infer_fps);
        if (infer_interval_ms_ < 1) infer_interval_ms_ = 1;
      }
    }

    // "v5"/"v8" force a decode path; anything else means auto-detect from the
    // model's output count once the model is loaded (default v8 until then).
    std::string ver = config.ValueOr<std::string>("model_version", "auto");
    if (ver == "v5") {
      yolo_version_ = YoloVersion::V5;
    } else if (ver == "v8") {
      yolo_version_ = YoloVersion::V8;
    } else {
      yolo_version_ = YoloVersion::V8;
      auto_detect_version_ = true;
    }

    // Optional allow-list of class ids; empty set means "keep all classes".
    if (const SimpleJson* filter = config.Find("class_filter")) {
      for (const auto& item : filter->AsArray()) {
        class_filter_.insert(item.AsInt(-1));
      }
    }

    input_queue_ = ctx.input_queue;
    if (!input_queue_) {
      std::cerr << "[ai_yolo] no input queue for node " << id_ << "\n";
      return false;
    }
    if (ctx.output_queues.empty()) {
      std::cerr << "[ai_yolo] no output queue for node " << id_ << "\n";
      return false;
    }
    output_queues_ = ctx.output_queues;

#if defined(RK3588_ENABLE_RKNN)
    if (model_path_.empty()) {
      std::cerr << "[ai_yolo] model_path is required\n";
      return false;
    }

    std::string err;
    model_handle_ = AiScheduler::Instance().LoadModel(model_path_, err);
    if (model_handle_ == kInvalidModelHandle) {
      std::cerr << "[ai_yolo] failed to load model: " << err << "\n";
      return false;
    }

    // Prefer the model's own input geometry over the configured one, and
    // resolve the auto-detected YOLO version: 1 output tensor -> v8,
    // 3+ heads -> v5. (2 outputs leaves the default v8 in place.)
    ModelInfo info;
    if (AiScheduler::Instance().GetModelInfo(model_handle_, info)) {
      model_input_w_ = info.input_width;
      model_input_h_ = info.input_height;
      n_output_ = info.n_output;

      if (auto_detect_version_) {
        if (n_output_ == 1) {
          yolo_version_ = YoloVersion::V8;
        } else if (n_output_ >= 3) {
          yolo_version_ = YoloVersion::V5;
        }
      }
    }

    std::cout << "[ai_yolo] model loaded via AiScheduler: " << model_path_
              << " (handle=" << model_handle_ << ", version="
              << (yolo_version_ == YoloVersion::V5 ? "v5" : "v8") << ")\n";
#else
    std::cout << "[ai_yolo] RKNN disabled, will passthrough frames\n";
#endif
    return true;
  }

  bool Start() override {
    std::cout << "[ai_yolo] started, conf=" << conf_thresh_ << " nms=" << nms_thresh_ << "\n";
    return true;
  }

  // Releases the model handle (scheduler owns the actual rknn context).
  void Stop() override {
#if defined(RK3588_ENABLE_RKNN)
    if (model_handle_ != kInvalidModelHandle) {
      AiScheduler::Instance().UnloadModel(model_handle_);
      model_handle_ = kInvalidModelHandle;
    }
#endif
    std::cout << "[ai_yolo] stopped\n";
  }

  // Per-frame entry point. Applies the optional pts-based throttle (throttled
  // frames are forwarded untouched, without detections), runs inference when
  // RKNN is enabled, then forwards the frame to all downstream queues.
  // NOTE(review): the throttle assumes frame->pts is monotonically
  // non-decreasing (microseconds, given the /1000 to ms) — confirm upstream.
  NodeStatus Process(FramePtr frame) override {
    if (!frame) return NodeStatus::DROP;

    if (infer_interval_ms_ > 0 && frame->pts > 0) {
      const int64_t pts_ms = static_cast<int64_t>(frame->pts / 1000ULL);
      if (last_infer_pts_ms_ > 0 && (pts_ms - last_infer_pts_ms_) < infer_interval_ms_) {
        PushToDownstream(frame);
        ++processed_;
        return NodeStatus::OK;
      }
      last_infer_pts_ms_ = pts_ms;
    }

#if defined(RK3588_ENABLE_RKNN)
    RunInference(frame);
#endif
    PushToDownstream(frame);
    ++processed_;

    if (processed_ % 100 == 0) {
      std::cout << "[ai_yolo] processed " << processed_ << " frames\n";
    }
    return NodeStatus::OK;
  }

 private:
  // Fan the (shared) frame out to every downstream queue.
  void PushToDownstream(FramePtr frame) {
    for (auto& q : output_queues_) {
      q->Push(frame);
    }
  }

#if defined(RK3588_ENABLE_RKNN)
  // Validates the frame buffer, repacks strided rows into a tight RGB buffer
  // when necessary, submits the image to AiScheduler and hands the borrowed
  // outputs to PostProcessBorrowed. Errors are logged and the frame is left
  // without detections.
  // NOTE(review): the frame is fed at its native size; any resize/letterbox to
  // model_input_w_/h_ must happen inside AiScheduler — verify, since the
  // post-process scaling assumes a plain stretch (no letterbox padding).
  void RunInference(FramePtr frame) {
    if (!frame->data || frame->data_size == 0) return;

    bool is_rgb = (frame->format == PixelFormat::RGB || frame->format == PixelFormat::BGR);
    if (!is_rgb) {
      std::cerr << "[ai_yolo] input must be RGB/BGR, got other format\n";
      return;
    }

    const int w = frame->width;
    const int h = frame->height;
    const size_t packed_row = static_cast<size_t>(w) * 3;   // tightly packed row bytes
    const size_t packed_size = packed_row * static_cast<size_t>(h);

    // Prefer the plane descriptor; fall back to the flat buffer/stride.
    const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data;
    int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride
                                                 : (frame->stride > 0 ? frame->stride : static_cast<int>(packed_row));
    if (!src || src_stride <= 0) return;

    InferInput input;
    if (static_cast<size_t>(src_stride) == packed_row && frame->data_size >= packed_size) {
      // Already tightly packed: feed the frame buffer directly (zero copy).
      input.data = src;
      input.size = packed_size;
    } else {
      if (frame->data_size < static_cast<size_t>(src_stride) * static_cast<size_t>(h)) {
        std::cerr << "[ai_yolo] invalid RGB buffer size/stride (data_size=" << frame->data_size
                  << ", stride=" << src_stride << ", h=" << h << ")\n";
        return;
      }
      // Strided source: copy row by row into the reusable packed buffer.
      rgb_tmp_.resize(packed_size);
      for (int y = 0; y < h; ++y) {
        memcpy(rgb_tmp_.data() + static_cast<size_t>(y) * packed_row,
               src + static_cast<size_t>(y) * static_cast<size_t>(src_stride),
               packed_row);
      }
      input.data = rgb_tmp_.data();
      input.size = packed_size;
    }
    input.width = w;
    input.height = h;
    input.is_nhwc = true;

    auto result = AiScheduler::Instance().InferBorrowed(model_handle_, input);
    if (!result.success) {
      std::cerr << "[ai_yolo] inference failed: " << result.error << "\n";
      return;
    }

    PostProcessBorrowed(result.outputs, frame);
  }

  // Decodes borrowed (zero-copy) scheduler outputs into detections, runs
  // score sort + per-class NMS, rescales boxes from model-input coordinates
  // to frame coordinates, applies the class filter and the kMaxDetections
  // cap, and attaches the result to the frame.
  // TODO(review): this is a near-verbatim duplicate of PostProcess below —
  // fold the shared tail (sort/NMS/scale/emit) into one helper.
  void PostProcessBorrowed(const std::vector<AiScheduler::BorrowedOutput>& outputs, FramePtr frame) {
    std::vector<float> boxes;
    std::vector<float> obj_probs;
    std::vector<int> class_ids;
    int valid_count = 0;

    if (yolo_version_ == YoloVersion::V5) {
      // v5: three int8 heads at strides 8/16/32.
      if (outputs.size() < 3) return;
      if (!outputs[0].data || !outputs[1].data || !outputs[2].data) return;

      int stride0 = 8, stride1 = 16, stride2 = 32;
      int grid_h0 = model_input_h_ / stride0, grid_w0 = model_input_w_ / stride0;
      int grid_h1 = model_input_h_ / stride1, grid_w1 = model_input_w_ / stride1;
      int grid_h2 = model_input_h_ / stride2, grid_w2 = model_input_w_ / stride2;

      int cnt0 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(const_cast<uint8_t*>(outputs[0].data)), kAnchor0,
                                     grid_h0, grid_w0, model_input_h_, model_input_w_, stride0,
                                     boxes, obj_probs, class_ids, conf_thresh_,
                                     outputs[0].zp, outputs[0].scale);
      int cnt1 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(const_cast<uint8_t*>(outputs[1].data)), kAnchor1,
                                     grid_h1, grid_w1, model_input_h_, model_input_w_, stride1,
                                     boxes, obj_probs, class_ids, conf_thresh_,
                                     outputs[1].zp, outputs[1].scale);
      int cnt2 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(const_cast<uint8_t*>(outputs[2].data)), kAnchor2,
                                     grid_h2, grid_w2, model_input_h_, model_input_w_, stride2,
                                     boxes, obj_probs, class_ids, conf_thresh_,
                                     outputs[2].zp, outputs[2].scale);
      valid_count = cnt0 + cnt1 + cnt2;
    } else {
      // v8: one tensor, either [1, C, N] or [1, N, C]; infer N from whichever
      // dim matches 4+num_classes, defaulting to 8400 (640x640 standard).
      if (outputs.empty()) return;
      if (!outputs[0].data || outputs[0].size == 0) return;

      int num_boxes = 0;
      int num_channels = 4 + num_classes_;

      if (outputs[0].dims.size() >= 3) {
        if (outputs[0].dims[1] == static_cast<uint32_t>(num_channels)) {
          num_boxes = static_cast<int>(outputs[0].dims[2]);
        } else if (outputs[0].dims[2] == static_cast<uint32_t>(num_channels)) {
          num_boxes = static_cast<int>(outputs[0].dims[1]);
        } else {
          num_boxes = 8400;
        }
      } else {
        num_boxes = static_cast<int>(outputs[0].size) / num_channels;
      }

      // NOTE(review): FLOAT16 data is reinterpreted as float32 here — this is
      // only correct if AiScheduler converts FP16 outputs to FP32 before
      // lending them out; verify.
      if (outputs[0].type == RKNN_TENSOR_FLOAT32 ||
          outputs[0].type == RKNN_TENSOR_FLOAT16) {
        valid_count = ProcessOutputV8(reinterpret_cast<float*>(const_cast<uint8_t*>(outputs[0].data)),
                                      num_boxes, num_classes_,
                                      model_input_h_, model_input_w_,
                                      boxes, obj_probs, class_ids, conf_thresh_);
      } else {
        valid_count = ProcessOutputV8Int8(reinterpret_cast<int8_t*>(const_cast<uint8_t*>(outputs[0].data)),
                                          num_boxes, num_classes_,
                                          model_input_h_, model_input_w_,
                                          boxes, obj_probs, class_ids, conf_thresh_,
                                          outputs[0].zp, outputs[0].scale);
      }
    }

    if (valid_count <= 0) return;

    // Sort candidates by descending score; `indices` tracks original slots.
    std::vector<int> indices(valid_count);
    for (int i = 0; i < valid_count; ++i) indices[i] = i;

    QuickSortDescending(obj_probs, 0, valid_count - 1, indices);

    // Per-class NMS: suppressed entries become -1 in `indices`.
    std::set<int> class_set(class_ids.begin(), class_ids.end());
    for (int c : class_set) {
      NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
    }

    // Model-input -> frame coordinate scale (plain stretch, no letterbox).
    float scale_w = static_cast<float>(model_input_w_) / frame->width;
    float scale_h = static_cast<float>(model_input_h_) / frame->height;

    auto det_result = std::make_shared<DetectionResult>();
    det_result->img_w = frame->width;
    det_result->img_h = frame->height;
    det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8";

    for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) {
      if (indices[i] == -1) continue;  // suppressed by NMS
      int n = indices[i];
      int cls_id = class_ids[n];

      if (!class_filter_.empty() && class_filter_.find(cls_id) == class_filter_.end()) {
        continue;
      }

      float x1 = boxes[n * 4 + 0];
      float y1 = boxes[n * 4 + 1];
      float w = boxes[n * 4 + 2];
      float h = boxes[n * 4 + 3];

      // Scale to frame pixels and clamp so the box stays inside the image.
      Detection det;
      det.cls_id = cls_id;
      det.score = obj_probs[i];  // obj_probs is sorted in lockstep with indices
      det.bbox.x = static_cast<float>(Clamp(static_cast<int>(x1 / scale_w), 0, frame->width));
      det.bbox.y = static_cast<float>(Clamp(static_cast<int>(y1 / scale_h), 0, frame->height));
      det.bbox.w = static_cast<float>(Clamp(static_cast<int>(w / scale_w), 0, frame->width - static_cast<int>(det.bbox.x)));
      det.bbox.h = static_cast<float>(Clamp(static_cast<int>(h / scale_h), 0, frame->height - static_cast<int>(det.bbox.y)));
      det.track_id = -1;  // tracking is assigned by a downstream node, if any

      // Debug output for the first few detections of the first few frames.
      if (det_result->items.size() < 3 && processed_ < 10) {
        std::cout << "[ai_yolo] det: raw(" << x1 << "," << y1 << "," << w << "," << h
                  << ") -> bbox(" << det.bbox.x << "," << det.bbox.y << ","
                  << det.bbox.w << "," << det.bbox.h << ") cls=" << cls_id
                  << " score=" << det.score << "\n";
      }

      det_result->items.push_back(det);
    }

    frame->det = det_result;
  }

  // Same pipeline as PostProcessBorrowed but for owned/copied InferOutput
  // buffers (non-borrowed inference path).
  // NOTE(review): appears to be unreferenced in this file — presumably kept
  // for a copy-based AiScheduler::Infer path; verify before deleting.
  void PostProcess(std::vector<InferOutput>& outputs, FramePtr frame) {
    std::vector<float> boxes;
    std::vector<float> obj_probs;
    std::vector<int> class_ids;
    int valid_count = 0;

    if (yolo_version_ == YoloVersion::V5) {
      if (outputs.size() < 3) return;

      int stride0 = 8, stride1 = 16, stride2 = 32;
      int grid_h0 = model_input_h_ / stride0, grid_w0 = model_input_w_ / stride0;
      int grid_h1 = model_input_h_ / stride1, grid_w1 = model_input_w_ / stride1;
      int grid_h2 = model_input_h_ / stride2, grid_w2 = model_input_w_ / stride2;

      int cnt0 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[0].data.data()), kAnchor0,
                                     grid_h0, grid_w0, model_input_h_, model_input_w_, stride0,
                                     boxes, obj_probs, class_ids, conf_thresh_,
                                     outputs[0].zp, outputs[0].scale);
      int cnt1 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[1].data.data()), kAnchor1,
                                     grid_h1, grid_w1, model_input_h_, model_input_w_, stride1,
                                     boxes, obj_probs, class_ids, conf_thresh_,
                                     outputs[1].zp, outputs[1].scale);
      int cnt2 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[2].data.data()), kAnchor2,
                                     grid_h2, grid_w2, model_input_h_, model_input_w_, stride2,
                                     boxes, obj_probs, class_ids, conf_thresh_,
                                     outputs[2].zp, outputs[2].scale);
      valid_count = cnt0 + cnt1 + cnt2;
    } else {
      if (outputs.empty()) return;

      int num_boxes = 0;
      int num_channels = 4 + num_classes_;

      if (outputs[0].dims.size() >= 3) {
        if (outputs[0].dims[1] == static_cast<uint32_t>(num_channels)) {
          num_boxes = outputs[0].dims[2];
        } else if (outputs[0].dims[2] == static_cast<uint32_t>(num_channels)) {
          num_boxes = outputs[0].dims[1];
        } else {
          num_boxes = 8400;
        }
      } else {
        num_boxes = outputs[0].data.size() / num_channels;
      }

      // NOTE(review): same FP16-as-FP32 reinterpret caveat as in
      // PostProcessBorrowed — verify the scheduler converts to FP32.
      if (outputs[0].type == RKNN_TENSOR_FLOAT32 ||
          outputs[0].type == RKNN_TENSOR_FLOAT16) {
        valid_count = ProcessOutputV8(reinterpret_cast<float*>(outputs[0].data.data()),
                                      num_boxes, num_classes_,
                                      model_input_h_, model_input_w_,
                                      boxes, obj_probs, class_ids, conf_thresh_);
      } else {
        valid_count = ProcessOutputV8Int8(reinterpret_cast<int8_t*>(outputs[0].data.data()),
                                          num_boxes, num_classes_,
                                          model_input_h_, model_input_w_,
                                          boxes, obj_probs, class_ids, conf_thresh_,
                                          outputs[0].zp, outputs[0].scale);
      }
    }

    if (valid_count <= 0) return;

    std::vector<int> indices(valid_count);
    for (int i = 0; i < valid_count; ++i) indices[i] = i;

    QuickSortDescending(obj_probs, 0, valid_count - 1, indices);

    std::set<int> class_set(class_ids.begin(), class_ids.end());
    for (int c : class_set) {
      NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
    }

    float scale_w = static_cast<float>(model_input_w_) / frame->width;
    float scale_h = static_cast<float>(model_input_h_) / frame->height;

    auto det_result = std::make_shared<DetectionResult>();
    det_result->img_w = frame->width;
    det_result->img_h = frame->height;
    det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8";

    for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) {
      if (indices[i] == -1) continue;
      int n = indices[i];
      int cls_id = class_ids[n];

      if (!class_filter_.empty() && class_filter_.find(cls_id) == class_filter_.end()) {
        continue;
      }

      float x1 = boxes[n * 4 + 0];
      float y1 = boxes[n * 4 + 1];
      float w = boxes[n * 4 + 2];
      float h = boxes[n * 4 + 3];

      Detection det;
      det.cls_id = cls_id;
      det.score = obj_probs[i];
      det.bbox.x = static_cast<float>(Clamp(static_cast<int>(x1 / scale_w), 0, frame->width));
      det.bbox.y = static_cast<float>(Clamp(static_cast<int>(y1 / scale_h), 0, frame->height));
      det.bbox.w = static_cast<float>(Clamp(static_cast<int>(w / scale_w), 0, frame->width - static_cast<int>(det.bbox.x)));
      det.bbox.h = static_cast<float>(Clamp(static_cast<int>(h / scale_h), 0, frame->height - static_cast<int>(det.bbox.y)));
      det.track_id = -1;

      // Debug output for first few detections
      if (det_result->items.size() < 3 && processed_ < 10) {
        std::cout << "[ai_yolo] det: raw(" << x1 << "," << y1 << "," << w << "," << h
                  << ") -> bbox(" << det.bbox.x << "," << det.bbox.y << ","
                  << det.bbox.w << "," << det.bbox.h << ") cls=" << cls_id
                  << " score=" << det.score << "\n";
      }

      det_result->items.push_back(det);
    }

    frame->det = det_result;
  }
#endif

  std::string id_;                     // node id from config ("id")
  std::string model_path_;             // .rknn model path ("model_path")
  float conf_thresh_ = 0.25f;          // confidence threshold ("conf")
  float nms_thresh_ = 0.45f;           // NMS IoU threshold ("nms")
  int model_input_w_ = 640;            // model input width (config or model info)
  int model_input_h_ = 640;            // model input height (config or model info)
  int num_classes_ = 80;               // class count ("num_classes")
  YoloVersion yolo_version_ = YoloVersion::V8;  // decode path in use
  bool auto_detect_version_ = false;   // true when version comes from output count
  std::set<int> class_filter_;         // allow-listed class ids; empty = all

  std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
  std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;
  uint64_t processed_ = 0;             // total frames seen (incl. throttled)

  int64_t infer_interval_ms_ = 0;      // min ms between inferences; 0 = every frame
  int64_t last_infer_pts_ms_ = 0;      // pts (ms) of the last inferred frame

#if defined(RK3588_ENABLE_RKNN)
  ModelHandle model_handle_ = kInvalidModelHandle;  // AiScheduler model handle
  uint32_t n_output_ = 0;              // model output tensor count
  std::vector<uint8_t> rgb_tmp_;       // reusable repack buffer for strided RGB
#endif
};
|
|
|
|
REGISTER_NODE(AiYoloNode, "ai_yolo");
|
|
|
|
} // namespace rk3588
|