OrangePi3588Media/plugins/ai_yolo/ai_yolo_node.cpp

#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstring>
#include <limits>
#include <memory>
#include <set>
#include <thread>
#include <vector>

#include "hw/i_infer_backend.h"
#include "node.h"
#include "utils/dma_alloc.h"
#include "utils/logger.h"

#if defined(RK3588_ENABLE_RKNN)
#include "rknn_api.h"
#endif

namespace rk3588 {

namespace {

// Number of object classes assumed by the built-in decoders; also the size of kCocoLabels.
constexpr int kObjClassNum = 80;
// Channels stored per candidate box for each supported head layout.
constexpr int kPropBoxSizeV5 = 5 + kObjClassNum;  // YOLOv5: x,y,w,h,conf + 80 classes
constexpr int kPropBoxSizeV8 = 4 + kObjClassNum;  // YOLOv8: x,y,w,h + 80 classes (no conf)
// NOTE(review): presumably the cap on detections reported per frame; its use is not
// visible in this part of the file - confirm.
constexpr int kMaxDetections = 64;

// YOLOv5 anchors
// Each table stores three (width, height) pairs back to back; ProcessFeatureMapV5 reads
// them as anchor[a * 2] and anchor[a * 2 + 1] for anchor slot a.
// NOTE(review): presumably one table per output scale (small/medium/large objects) - confirm.
const int kAnchor0[6] = {10, 13, 16, 30, 33, 23};
const int kAnchor1[6] = {30, 61, 62, 45, 59, 119};
const int kAnchor2[6] = {116, 90, 156, 198, 373, 326};

// Decoder family used for the loaded model.
enum class YoloVersion { V5, V8 };
// Interpretation of the four raw box channels of a YOLOv8 head:
//   CxCyWh - box centre plus width/height, XyXy - two corners,
//   XyWh   - top-left corner plus width/height, Auto - decided per box by DecodeV8Box.
enum class V8BoxFormat { Auto, CxCyWh, XyXy, XyWh };
// Whether YOLOv8 class scores need a sigmoid: Sigmoid forces it, None disables it and
// Auto lets ResolveV8ApplySigmoid decide from the value range of the scores.
enum class V8ClsActivation { Auto, None, Sigmoid };

// Human-readable names for the kObjClassNum classes.
// NOTE(review): presumably indexed by the class id produced by the decoders; the lookup
// site is not visible in this part of the file - confirm.
const char* kCocoLabels[kObjClassNum] = {
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
    "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
    "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
    "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
};

// Truncates `val` to an int while keeping the result inside [min_val, max_val].
// Anything not strictly above min_val (including NaN) yields min_val; anything not
// strictly below max_val yields max_val.
inline int Clamp(float val, int min_val, int max_val) {
    if (!(val > min_val)) {
        return min_val;
    }
    if (!(val < max_val)) {
        return max_val;
    }
    return static_cast<int>(val);
}

// Parameters needed to map a box from model-input coordinates to output coordinates.
struct DetCoordContext {
    // True when the frame carried usable transform metadata; selects scale/pad below
    // instead of the fallback scales.
    bool has_transform{false};
    // Size of the coordinate space that decoded boxes are clamped to.
    int out_w{0};
    int out_h{0};
    // Divisors applied to box coordinates when has_transform is set.
    float scale_x{1.0f};
    float scale_y{1.0f};
    // Offsets subtracted from x / y before dividing when has_transform is set.
    float pad_x{0.0f};
    float pad_y{0.0f};
    // Model-input size divided by frame size; used as divisors when has_transform is false.
    float fallback_scale_w{1.0f};
    float fallback_scale_h{1.0f};
};

// Builds the coordinate-mapping context for one frame.
//
// The fallback scales (model input size / frame size) and the frame size are always
// filled in. The frame's transform metadata is adopted only when the frame already has
// the model's input size and the metadata is flagged valid with positive source size
// and scales; in that case the output space becomes the metadata's source size.
DetCoordContext BuildDetCoordContext(const Frame& frame, int model_input_w, int model_input_h) {
    DetCoordContext result{};
    result.out_w = frame.width;
    result.out_h = frame.height;
    if (frame.width > 0) {
        result.fallback_scale_w = static_cast<float>(model_input_w) / frame.width;
    } else {
        result.fallback_scale_w = 1.0f;
    }
    if (frame.height > 0) {
        result.fallback_scale_h = static_cast<float>(model_input_h) / frame.height;
    } else {
        result.fallback_scale_h = 1.0f;
    }

    const bool frame_is_model_sized =
        (frame.width == model_input_w) && (frame.height == model_input_h);
    if (!frame_is_model_sized || !frame.transform_meta) {
        return result;
    }

    const auto& meta = *frame.transform_meta;
    const bool meta_usable = meta.valid &&
                             meta.src_w > 0 && meta.src_h > 0 &&
                             meta.scale_x > 1e-6f && meta.scale_y > 1e-6f;
    if (!meta_usable) {
        return result;
    }

    result.has_transform = true;
    result.out_w = meta.src_w;
    result.out_h = meta.src_h;
    result.scale_x = meta.scale_x;
    result.scale_y = meta.scale_y;
    result.pad_x = meta.pad_x;
    result.pad_y = meta.pad_y;
    return result;
}

// Settings for growing a detection box (consumed by ExpandRect).
struct BboxExpandConfig {
    // ExpandRect returns its input unchanged while this is false.
    bool enable{false};
    // NOTE(review): presumably the class the expansion applies to; the check is not
    // visible in this part of the file - confirm.
    int class_id{0};
    // Growth per side, expressed as a fraction of the box width (left/right) or
    // height (top/bottom).
    float left{0.05f};
    float right{0.05f};
    float top{0.05f};
    float bottom{0.12f};
};

// Maps a box from model-input coordinates into the output coordinate space described
// by `ctx` and clips it to [0, out_w] x [0, out_h].
//
// x, y  : top-left corner in model-input pixels.
// w, h  : box size in model-input pixels.
// ctx   : mapping built by BuildDetCoordContext.
// Returns a Rect whose fields hold whole-pixel values (truncated).
//
// Two details differ from a naive clamp:
//  * The float values are handed to Clamp() directly. Converting an unbounded float
//    (huge, infinite or NaN, e.g. after dividing by a zero scale) to int first is
//    undefined behaviour; Clamp() compares in float and only converts in-range values.
//  * A box that starts left of / above the image is trimmed, not shifted: the part
//    hanging over the border is removed from the width/height so the far edge stays
//    where the detector put it.
Rect DecodeToOutputRect(float x, float y, float w, float h, const DetCoordContext& ctx) {
    float ox = x;
    float oy = y;
    float ow = w;
    float oh = h;

    if (ctx.has_transform) {
        ox = (x - ctx.pad_x) / ctx.scale_x;
        oy = (y - ctx.pad_y) / ctx.scale_y;
        ow = w / ctx.scale_x;
        oh = h / ctx.scale_y;
    } else {
        ox = x / ctx.fallback_scale_w;
        oy = y / ctx.fallback_scale_h;
        ow = w / ctx.fallback_scale_w;
        oh = h / ctx.fallback_scale_h;
    }

    // Remove the overhang on the left/top side before clamping the origin.
    if (ox < 0.0f) {
        ow += ox;
        ox = 0.0f;
    }
    if (oy < 0.0f) {
        oh += oy;
        oy = 0.0f;
    }

    Rect r{};
    const int out_w = std::max(1, ctx.out_w);
    const int out_h = std::max(1, ctx.out_h);
    r.x = static_cast<float>(Clamp(ox, 0, out_w));
    r.y = static_cast<float>(Clamp(oy, 0, out_h));
    // Width/height may not reach past the right/bottom border either.
    r.w = static_cast<float>(Clamp(ow, 0, out_w - static_cast<int>(r.x)));
    r.h = static_cast<float>(Clamp(oh, 0, out_h - static_cast<int>(r.y)));
    return r;
}

// Grows a box by the per-side fractions in `cfg` and clips it to the image.
//
// in           : box to expand (x, y, w, h).
// img_w, img_h : image size used for clipping.
// cfg          : expansion fractions; left/right scale with the box width, top/bottom
//                with the box height.
// Returns `in` unchanged when expansion is disabled or the image size is not positive.
//
// When the expanded left/top edge would fall outside the image it is clipped to 0 and
// the clipped amount is taken off the width/height, so the right/bottom edge stays at
// its intended position instead of drifting outwards.
Rect ExpandRect(const Rect& in, int img_w, int img_h, const BboxExpandConfig& cfg) {
    if (!cfg.enable || img_w <= 0 || img_h <= 0) return in;

    const float ex = in.w * cfg.left;
    const float ey = in.h * cfg.top;
    const float ew = in.w * (cfg.left + cfg.right);
    const float eh = in.h * (cfg.top + cfg.bottom);

    Rect out{};
    out.x = std::max(0.0f, in.x - ex);
    out.y = std::max(0.0f, in.y - ey);

    // Amount cut off by clipping the origin at 0 (exactly 0 when nothing was clipped).
    const float trimmed_x = out.x - (in.x - ex);
    const float trimmed_y = out.y - (in.y - ey);

    out.w = std::min(static_cast<float>(img_w) - out.x, in.w + ew - trimmed_x);
    out.h = std::min(static_cast<float>(img_h) - out.y, in.h + eh - trimmed_y);
    out.w = std::max(0.0f, out.w);
    out.h = std::max(0.0f, out.h);
    return out;
}

// Saturates `val` to [min_val, max_val] and truncates the result to int32_t.
// NaN is mapped to min_val: it compares false against both bounds, and converting a
// NaN to an integer is undefined behaviour.
inline int32_t ClipFloat(float val, float min_val, float max_val) {
    if (!(val > min_val)) {
        return static_cast<int32_t>(min_val);  // below range, or NaN
    }
    if (val >= max_val) {
        return static_cast<int32_t>(max_val);
    }
    return static_cast<int32_t>(val);
}

// Quantizes a float to int8 with an affine mapping: q = f32 / scale + zp, saturated
// to the int8 range [-128, 127].
inline int8_t QuantizeF32ToAffine(float f32, int32_t zp, float scale) {
    const float shifted = (f32 / scale) + zp;
    const int32_t saturated = ClipFloat(shifted, -128, 127);
    return static_cast<int8_t>(saturated);
}

// Inverse of the affine int8 quantization: f = (q - zp) * scale.
inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) {
    const float centered = static_cast<float>(qnt) - static_cast<float>(zp);
    return centered * scale;
}

// Logistic function 1 / (1 + e^-x).
inline float Sigmoid(float x) {
    const float decay = std::exp(-x);
    return 1.0f / (1.0f + decay);
}

// Converts an IEEE 754 half-precision bit pattern to float.
// The value is rebuilt arithmetically from sign, exponent and mantissa:
//   zero       -> signed zero
//   subnormal  -> mantissa * 2^-24
//   normal     -> (mantissa + 1024) * 2^(exponent - 25)
//   all-ones exponent -> signed infinity, or a quiet NaN when the mantissa is non-zero
inline float Fp16ToFp32(uint16_t h) {
    const bool negative = (h & 0x8000) != 0;
    const int exponent_bits = (h >> 10) & 0x1F;
    const int mantissa_bits = h & 0x03FF;

    float magnitude = 0.0f;
    if (exponent_bits == 0x1F) {
        if (mantissa_bits != 0) {
            return std::numeric_limits<float>::quiet_NaN();
        }
        magnitude = std::numeric_limits<float>::infinity();
    } else if (exponent_bits == 0) {
        if (mantissa_bits != 0) {
            magnitude = std::ldexp(static_cast<float>(mantissa_bits), -24);
        }
    } else {
        magnitude = std::ldexp(static_cast<float>(mantissa_bits + 1024), exponent_bits - 25);
    }
    return negative ? -magnitude : magnitude;
}

// Intersection-over-union of two axis-aligned boxes given as min/max corners.
// Extents use the inclusive-pixel convention (max - min + 1). Returns 0 when the
// union area is not positive.
float CalculateIoU(float x1_min, float y1_min, float x1_max, float y1_max,
                   float x2_min, float y2_min, float x2_max, float y2_max) {
    const auto extent = [](float lo, float hi) { return hi - lo + 1.0f; };

    const float overlap_w = std::fmax(0.f, extent(std::fmax(x1_min, x2_min), std::fmin(x1_max, x2_max)));
    const float overlap_h = std::fmax(0.f, extent(std::fmax(y1_min, y2_min), std::fmin(y1_max, y2_max)));
    const float intersection = overlap_w * overlap_h;

    const float first_area = extent(x1_min, x1_max) * extent(y1_min, y1_max);
    const float second_area = extent(x2_min, x2_max) * extent(y2_min, y2_max);
    const float union_area = first_area + second_area - intersection;

    if (union_area <= 0.f) {
        return 0.f;
    }
    return intersection / union_area;
}

// Sorts values[left..right] (inclusive) in descending order and applies the same
// permutation to indices[left..right].
//
// values  : scores to sort; only the requested range is touched.
// left    : first position of the range.
// right   : last position of the range (inclusive).
// indices : companion array that is reordered alongside `values`.
//
// The call is a no-op when the range holds fewer than two elements or does not fit
// inside both vectors.
//
// Implemented with std::stable_sort rather than a recursive quicksort: the latter
// degrades to quadratic time and linear recursion depth on runs of equal or already
// sorted scores. Equal scores keep their original relative order. NaN scores are
// ordered last so the comparison stays a valid strict weak ordering.
void QuickSortDescending(std::vector<float>& values, int left, int right, std::vector<int>& indices) {
    if (left < 0 || left >= right) return;

    const size_t first = static_cast<size_t>(left);
    const size_t last = static_cast<size_t>(right);
    if (last >= values.size() || last >= indices.size()) return;
    const size_t count = last - first + 1;

    const auto sort_key = [](float v) {
        return std::isnan(v) ? -std::numeric_limits<float>::infinity() : v;
    };

    // Sort positions instead of the data so both arrays can be permuted identically.
    std::vector<size_t> slots(count);
    for (size_t k = 0; k < count; ++k) {
        slots[k] = first + k;
    }
    std::stable_sort(slots.begin(), slots.end(), [&](size_t lhs, size_t rhs) {
        return sort_key(values[lhs]) > sort_key(values[rhs]);
    });

    std::vector<float> sorted_values(count);
    std::vector<int> sorted_indices(count);
    for (size_t k = 0; k < count; ++k) {
        sorted_values[k] = values[slots[k]];
        sorted_indices[k] = indices[slots[k]];
    }
    std::copy(sorted_values.begin(), sorted_values.end(), values.begin() + left);
    std::copy(sorted_indices.begin(), sorted_indices.end(), indices.begin() + left);
}

// Non-maximum suppression for a single class.
//
// valid_count : number of candidate boxes.
// boxes       : flat x, y, w, h quadruples, one per candidate.
// class_ids   : class of each candidate.
// order       : candidate indices in the order they should be visited; an entry is
//               overwritten with -1 when its box is suppressed.
// filter_id   : only candidates of this class take part.
// threshold   : a later box is suppressed when its IoU with an earlier surviving box
//               of the same class exceeds this value.
void NMS(int valid_count, std::vector<float>& boxes, std::vector<int>& class_ids,
         std::vector<int>& order, int filter_id, float threshold) {
    // Entries already suppressed (-1), out of range, or of another class are skipped.
    const auto participates = [&](int box_idx) {
        return box_idx >= 0 && box_idx < valid_count && class_ids[box_idx] == filter_id;
    };

    for (int i = 0; i < valid_count; ++i) {
        const int keeper = order[i];
        if (!participates(keeper)) continue;

        const float keep_x_min = boxes[keeper * 4 + 0];
        const float keep_y_min = boxes[keeper * 4 + 1];
        const float keep_x_max = keep_x_min + boxes[keeper * 4 + 2];
        const float keep_y_max = keep_y_min + boxes[keeper * 4 + 3];

        for (int j = i + 1; j < valid_count; ++j) {
            const int rival = order[j];
            if (!participates(rival)) continue;

            const float rival_x_min = boxes[rival * 4 + 0];
            const float rival_y_min = boxes[rival * 4 + 1];
            const float rival_x_max = rival_x_min + boxes[rival * 4 + 2];
            const float rival_y_max = rival_y_min + boxes[rival * 4 + 3];

            const float overlap = CalculateIoU(keep_x_min, keep_y_min, keep_x_max, keep_y_max,
                                               rival_x_min, rival_y_min, rival_x_max, rival_y_max);
            if (overlap > threshold) {
                order[j] = -1;
            }
        }
    }
}

#if defined(RK3588_ENABLE_RKNN)
// Decodes one anchor-based YOLOv5 output tensor (int8, affine-quantized).
//
// input        : planar tensor; for each of the 3 anchors there are kPropBoxSizeV5
//                channel planes of grid_h * grid_w values (x, y, w, h, objectness,
//                then one plane per class).
// anchor       : three (width, height) pairs for this tensor.
// grid_h/w     : grid size of this tensor.
// model_h/w    : not used by the decoding below.
// stride       : multiplier converting grid cells to model-input pixels.
// boxes        : receives x, y, w, h (top-left corner + size) per accepted box.
// obj_probs    : receives class score * objectness per accepted box.
// class_ids    : receives the best class per accepted box.
// conf_thresh  : threshold applied (in quantized form) to both the objectness and the
//                best class score.
// zp, scale    : affine quantization parameters of the tensor.
// Returns the number of boxes appended.
int ProcessFeatureMapV5(int8_t* input, const int* anchor, int grid_h, int grid_w,
                        int model_h, int model_w, int stride,
                        std::vector<float>& boxes, std::vector<float>& obj_probs,
                        std::vector<int>& class_ids, float conf_thresh, int32_t zp, float scale) {
    const int plane = grid_h * grid_w;
    const int8_t quant_thresh = QuantizeF32ToAffine(conf_thresh, zp, scale);
    int accepted = 0;

    for (int a = 0; a < 3; ++a) {
        for (int row = 0; row < grid_h; ++row) {
            for (int col = 0; col < grid_w; ++col) {
                const int cell = row * grid_w + col;

                // Objectness is compared in the quantized domain to skip most cells cheaply.
                const int8_t objectness = input[(kPropBoxSizeV5 * a + 4) * plane + cell];
                if (objectness < quant_thresh) continue;

                int8_t* cell_ptr = input + (kPropBoxSizeV5 * a) * plane + cell;

                float bx = DequantizeAffineToF32(cell_ptr[0], zp, scale) * 2.0f - 0.5f;
                float by = DequantizeAffineToF32(cell_ptr[plane], zp, scale) * 2.0f - 0.5f;
                float bw = DequantizeAffineToF32(cell_ptr[2 * plane], zp, scale) * 2.0f;
                float bh = DequantizeAffineToF32(cell_ptr[3 * plane], zp, scale) * 2.0f;

                bx = (bx + col) * stride;
                by = (by + row) * stride;
                bw = bw * bw * anchor[a * 2];
                bh = bh * bh * anchor[a * 2 + 1];
                // Convert the box centre to its top-left corner.
                bx -= bw / 2.0f;
                by -= bh / 2.0f;

                int best_class = 0;
                int8_t best_prob = cell_ptr[5 * plane];
                for (int k = 1; k < kObjClassNum; ++k) {
                    const int8_t candidate = cell_ptr[(5 + k) * plane];
                    if (candidate > best_prob) {
                        best_class = k;
                        best_prob = candidate;
                    }
                }
                if (best_prob <= quant_thresh) continue;

                const float score = DequantizeAffineToF32(best_prob, zp, scale) *
                                    DequantizeAffineToF32(objectness, zp, scale);
                obj_probs.push_back(score);
                class_ids.push_back(best_class);
                boxes.push_back(bx);
                boxes.push_back(by);
                boxes.push_back(bw);
                boxes.push_back(bh);
                ++accepted;
            }
        }
    }
    return accepted;
}

// Size in bytes of one element of the given tensor type.
// FLOAT32 is 4 bytes and FLOAT16 is 2; INT8, UINT8 and every other type are reported
// as 1 byte.
uint32_t TensorTypeSizeBytes(rknn_tensor_type t) {
    if (t == RKNN_TENSOR_FLOAT32) {
        return 4;
    }
    if (t == RKNN_TENSOR_FLOAT16) {
        return 2;
    }
    return 1;
}

// Number of grid cells summed over the strides 8, 16 and 32 for the given model input
// size (8400 for 640x640). Returns 0 when either dimension is not positive.
int DefaultV8NumBoxes(int model_h, int model_w) {
    if (model_h <= 0 || model_w <= 0) {
        return 0;
    }
    constexpr int kStrides[] = {8, 16, 32};
    int total_cells = 0;
    for (const int stride : kStrides) {
        total_cells += (model_h / stride) * (model_w / stride);
    }
    return total_cells;
}

// Memory layout of a YOLOv8 output tensor as resolved by ResolveV8Layout.
struct V8LayoutInfo {
    // Number of candidate boxes stored in the tensor.
    int num_boxes{0};
    // true: CxN (each channel is a contiguous run of boxes), false: NxC.
    bool channels_first{true};
};

// Plausibility score for a decoded box (top-left x/y plus size) against the model input
// size. A positive width and height are worth 3 points; each of six loose bounds
// checks (size and far edge within 120% of the input, origin no further than 10%
// outside) adds 1 point. The maximum is 9.
float ScoreBoxCandidate(float x, float y, float w, float h, int model_w, int model_h) {
    const bool bounds_checks[] = {
        w <= model_w * 1.2f,
        h <= model_h * 1.2f,
        x >= -model_w * 0.1f,
        y >= -model_h * 0.1f,
        (x + w) <= model_w * 1.2f,
        (y + h) <= model_h * 1.2f,
    };

    float score = (w > 0.0f && h > 0.0f) ? 3.0f : 0.0f;
    for (const bool passed : bounds_checks) {
        if (passed) {
            score += 1.0f;
        }
    }
    return score;
}

// Heuristic: true when all four values lie in [-0.05, 2.5], i.e. they look like
// normalized coordinates rather than pixels. NaN never qualifies.
bool SeemsNormalized(float a, float b, float c, float d) {
    const float values[] = {a, b, c, d};
    for (const float v : values) {
        const bool within = (v >= -0.05f) && (v <= 2.5f);
        if (!within) {
            return false;
        }
    }
    return true;
}

// Lower-case name of a box format for log output; anything that is not one of the
// three explicit formats is reported as "auto".
const char* V8BoxFormatName(V8BoxFormat fmt) {
    if (fmt == V8BoxFormat::CxCyWh) {
        return "cxcywh";
    }
    if (fmt == V8BoxFormat::XyXy) {
        return "xyxy";
    }
    if (fmt == V8BoxFormat::XyWh) {
        return "xywh";
    }
    return "auto";
}

void DecodeV8Box(float a, float b, float c, float d, int model_w, int model_h, V8BoxFormat fmt,
                 float& out_x, float& out_y, float& out_w, float& out_h, V8BoxFormat* used_fmt = nullptr) {
    if (SeemsNormalized(a, b, c, d)) {
        a *= static_cast<float>(model_w);
        b *= static_cast<float>(model_h);
        c *= static_cast<float>(model_w);
        d *= static_cast<float>(model_h);
    }

    auto decode_cxcywh = [&](float& x, float& y, float& w, float& h) {
        x = a - c / 2.0f;
        y = b - d / 2.0f;
        w = c;
        h = d;
    };
    auto decode_xyxy = [&](float& x, float& y, float& w, float& h) {
        x = a;
        y = b;
        w = c - a;
        h = d - b;
    };
    auto decode_xywh = [&](float& x, float& y, float& w, float& h) {
        x = a;
        y = b;
        w = c;
        h = d;
    };

    if (fmt == V8BoxFormat::CxCyWh) {
        decode_cxcywh(out_x, out_y, out_w, out_h);
        if (used_fmt) *used_fmt = V8BoxFormat::CxCyWh;
        return;
    }
    if (fmt == V8BoxFormat::XyXy) {
        decode_xyxy(out_x, out_y, out_w, out_h);
        if (used_fmt) *used_fmt = V8BoxFormat::XyXy;
        return;
    }
    if (fmt == V8BoxFormat::XyWh) {
        decode_xywh(out_x, out_y, out_w, out_h);
        if (used_fmt) *used_fmt = V8BoxFormat::XyWh;
        return;
    }

    float x1 = 0.0f, y1 = 0.0f, w1 = 0.0f, h1 = 0.0f;
    float x2 = 0.0f, y2 = 0.0f, w2 = 0.0f, h2 = 0.0f;
    float x3 = 0.0f, y3 = 0.0f, w3 = 0.0f, h3 = 0.0f;
    decode_cxcywh(x1, y1, w1, h1);
    decode_xyxy(x2, y2, w2, h2);
    decode_xywh(x3, y3, w3, h3);

    const float s1 = ScoreBoxCandidate(x1, y1, w1, h1, model_w, model_h);
    const float s2 = ScoreBoxCandidate(x2, y2, w2, h2, model_w, model_h);
    const float s3 = ScoreBoxCandidate(x3, y3, w3, h3, model_w, model_h);
    if (s2 >= s1 && s2 >= s3) {
        out_x = x2; out_y = y2; out_w = w2; out_h = h2;
        if (used_fmt) *used_fmt = V8BoxFormat::XyXy;
    } else if (s3 >= s1 && s3 >= s2) {
        out_x = x3; out_y = y3; out_w = w3; out_h = h3;
        if (used_fmt) *used_fmt = V8BoxFormat::XyWh;
    } else {
        out_x = x1; out_y = y1; out_w = w1; out_h = h1;
        if (used_fmt) *used_fmt = V8BoxFormat::CxCyWh;
    }
}

// Decides whether YOLOv8 class scores still need a sigmoid.
//
// Explicit modes answer directly (None -> false, Sigmoid -> true). In automatic mode
// the class scores of the first 64 boxes (or fewer) are scanned; values below -0.1 or
// above 1.5 are taken as evidence of raw logits. Returns false when there is nothing
// to inspect.
bool ResolveV8ApplySigmoid(const float* output, int num_boxes, int num_classes, bool channels_first,
                           V8ClsActivation act_mode) {
    switch (act_mode) {
        case V8ClsActivation::None:
            return false;
        case V8ClsActivation::Sigmoid:
            return true;
        default:
            break;
    }
    if (!output || num_boxes <= 0 || num_classes <= 0) {
        return false;
    }

    const int num_channels = 4 + num_classes;
    const int probe_count = std::min(num_boxes, 64);
    float lowest = 1e9f;
    float highest = -1e9f;
    for (int box = 0; box < probe_count; ++box) {
        for (int cls = 0; cls < num_classes; ++cls) {
            const int channel = 4 + cls;
            const float score = channels_first ? output[channel * num_boxes + box]
                                               : output[box * num_channels + channel];
            lowest = std::min(lowest, score);
            highest = std::max(highest, score);
        }
    }
    // Probabilities stay within [0, 1]; anything clearly outside looks like logits.
    return lowest < -0.1f || highest > 1.5f;
}

// Works out how many boxes a YOLOv8 output tensor holds and whether it is stored
// channels-first (CxN) or boxes-first (NxC).
//
// dims        : tensor dimensions.
// byte_size   : size of the tensor buffer in bytes.
// type        : element type, used to turn byte_size into an element count.
// num_classes : number of classes; the tensor is expected to carry 4 + num_classes
//               channels.
// model_h/w   : model input size, used only for the default box count.
//
// Resolution order:
//  1. If some dimension equals the channel count and the buffer holds at least one
//     full set of channels, the box count comes from the buffer size and the layout
//     from the nearest non-unit dimensions around the channel dimension.
//  2. Otherwise, for rank >= 3 tensors, dims[1] / dims[2] are checked directly.
//  3. A still-unknown box count falls back to the buffer size, then to
//     DefaultV8NumBoxes(), then to 8400.
// The result never exceeds what the buffer can hold.
V8LayoutInfo ResolveV8Layout(const std::vector<uint32_t>& dims, size_t byte_size,
                             rknn_tensor_type type, int num_classes,
                             int model_h, int model_w) {
    V8LayoutInfo layout;
    const int channel_count = 4 + num_classes;
    if (channel_count <= 0) {
        return layout;
    }
    const uint32_t channel_dim = static_cast<uint32_t>(channel_count);
    const size_t channels = static_cast<size_t>(channel_count);

    const uint32_t bytes_per_elem = TensorTypeSizeBytes(type);
    const size_t elem_count = bytes_per_elem > 0 ? (byte_size / bytes_per_elem) : 0;
    const size_t boxes_in_buffer = elem_count / channels;

    // Position of the first dimension that matches the channel count, if any.
    const auto channel_it = std::find(dims.begin(), dims.end(), channel_dim);
    const bool has_channel_dim = channel_it != dims.end();

    if (has_channel_dim && elem_count >= channels) {
        const size_t channel_pos = static_cast<size_t>(channel_it - dims.begin());
        layout.num_boxes = static_cast<int>(boxes_in_buffer);

        // Nearest dimension larger than 1 on each side of the channel dimension.
        int before = 1;
        for (size_t i = channel_pos; i-- > 0;) {
            if (dims[i] > 1U) {
                before = static_cast<int>(dims[i]);
                break;
            }
        }
        int after = 1;
        for (size_t i = channel_pos + 1; i < dims.size(); ++i) {
            if (dims[i] > 1U) {
                after = static_cast<int>(dims[i]);
                break;
            }
        }

        // Boxes-first only when a real dimension precedes the channels and the one
        // following them is either absent or smaller.
        layout.channels_first = (before <= 1) || (after > 1 && after >= before);
    } else if (dims.size() >= 3) {
        // Compatibility with old rank-3 assumptions.
        if (dims[1] == channel_dim) {
            layout.num_boxes = static_cast<int>(dims[2]);
            layout.channels_first = true;
        } else if (dims[2] == channel_dim) {
            layout.num_boxes = static_cast<int>(dims[1]);
            layout.channels_first = false;
        }
    }

    if (layout.num_boxes <= 0 && boxes_in_buffer > 0) {
        layout.num_boxes = static_cast<int>(boxes_in_buffer);
    }
    if (layout.num_boxes <= 0) {
        layout.num_boxes = DefaultV8NumBoxes(model_h, model_w);
    }
    if (layout.num_boxes <= 0) {
        layout.num_boxes = 8400;
    }

    // Never claim more boxes than the buffer can actually hold.
    if (boxes_in_buffer > 0 && static_cast<size_t>(layout.num_boxes) > boxes_in_buffer) {
        layout.num_boxes = static_cast<int>(boxes_in_buffer);
    }
    if (layout.num_boxes < 0) {
        layout.num_boxes = 0;
    }
    return layout;
}

// Decodes a float YOLOv8 output tensor (anchor-free, single tensor).
//
// output         : 4 box channels followed by num_classes score channels per box.
// num_boxes      : number of candidate boxes in the tensor.
// channels_first : true when element (channel, box) lives at channel * num_boxes + box,
//                  false when it lives at box * (4 + num_classes) + channel.
// box_format     : interpretation of the box channels, forwarded to DecodeV8Box.
// apply_sigmoid  : run each class score through Sigmoid() before comparing.
// conf_thresh    : minimum best-class score for a box to be kept.
// boxes / obj_probs / class_ids : receive x, y, w, h, the score and the class of every
//                  kept box.
// debug_decode, debug_left : when enabled, logs decoded boxes while *debug_left is
//                  positive, decrementing it per logged box.
// Boxes with non-finite values or a width/height of at most 1e-3 are dropped.
// Returns the number of boxes appended.
int ProcessOutputV8(float* output, int num_boxes, int num_classes,
                    int model_h, int model_w,
                    std::vector<float>& boxes, std::vector<float>& obj_probs,
                    std::vector<int>& class_ids, float conf_thresh,
                    bool channels_first, V8BoxFormat box_format, bool apply_sigmoid,
                    bool debug_decode, int* debug_left) {
    const int num_channels = 4 + num_classes;
    const auto element = [&](int channel, int box) -> float {
        return channels_first ? output[channel * num_boxes + box]
                              : output[box * num_channels + channel];
    };
    const auto all_finite = [](float p, float q, float r, float s) {
        return std::isfinite(p) && std::isfinite(q) && std::isfinite(r) && std::isfinite(s);
    };

    int kept = 0;
    for (int box = 0; box < num_boxes; ++box) {
        float best_score = 0.0f;
        int best_class = 0;
        for (int cls = 0; cls < num_classes; ++cls) {
            const float raw_score = element(4 + cls, box);
            const float score = apply_sigmoid ? Sigmoid(raw_score) : raw_score;
            if (score > best_score) {
                best_score = score;
                best_class = cls;
            }
        }
        if (!(best_score >= conf_thresh)) continue;

        const float a = element(0, box);
        const float b = element(1, box);
        const float c = element(2, box);
        const float d = element(3, box);
        if (!all_finite(a, b, c, d)) continue;

        float x1 = 0.0f, y1 = 0.0f, w = 0.0f, h = 0.0f;
        V8BoxFormat used_fmt = box_format;
        DecodeV8Box(a, b, c, d, model_w, model_h, box_format, x1, y1, w, h, &used_fmt);
        if (!all_finite(x1, y1, w, h)) continue;
        if (w <= 1e-3f || h <= 1e-3f) continue;

        if (debug_decode && debug_left && *debug_left > 0) {
            --(*debug_left);
            LogInfo("[ai_yolo] v8 decode f32: raw4(" + std::to_string(a) + "," + std::to_string(b) + "," +
                    std::to_string(c) + "," + std::to_string(d) + ") fmt=" + V8BoxFormatName(used_fmt) +
                    " -> xywh(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
                    std::to_string(w) + "," + std::to_string(h) + ") cls=" +
                    std::to_string(best_class) + " score=" + std::to_string(best_score));
        }

        boxes.push_back(x1);
        boxes.push_back(y1);
        boxes.push_back(w);
        boxes.push_back(h);
        obj_probs.push_back(best_score);
        class_ids.push_back(best_class);
        ++kept;
    }
    return kept;
}

// Decodes an int8 (affine-quantized) YOLOv8 output tensor.
//
// output         : 4 box channels followed by num_classes score channels per box.
// num_boxes      : number of candidate boxes in the tensor.
// channels_first : true when element (channel, box) lives at channel * num_boxes + box,
//                  false when it lives at box * (4 + num_classes) + channel.
// zp, scale      : quantization parameters applied to both box and score channels.
// conf_thresh    : minimum best-class score; compared in the quantized domain.
// box_format     : interpretation of the box channels, forwarded to DecodeV8Box.
// boxes / obj_probs / class_ids : receive x, y, w, h, the dequantized score and the
//                  class of every kept box.
// debug_decode, debug_left : when enabled, logs decoded boxes while *debug_left is
//                  positive, decrementing it per logged box.
// Boxes with non-finite values or a width/height of at most 1e-3 are dropped.
// Returns the number of boxes appended.
int ProcessOutputV8Int8(int8_t* output, int num_boxes, int num_classes,
                        int model_h, int model_w,
                        std::vector<float>& boxes, std::vector<float>& obj_probs,
                        std::vector<int>& class_ids, float conf_thresh,
                        int32_t zp, float scale, bool channels_first, V8BoxFormat box_format,
                        bool debug_decode, int* debug_left) {
    const int num_channels = 4 + num_classes;
    const int8_t quant_thresh = QuantizeF32ToAffine(conf_thresh, zp, scale);

    const auto element = [&](int channel, int box) -> int8_t {
        return channels_first ? output[channel * num_boxes + box]
                              : output[box * num_channels + channel];
    };
    const auto dequantized = [&](int channel, int box) {
        return DequantizeAffineToF32(element(channel, box), zp, scale);
    };
    const auto all_finite = [](float p, float q, float r, float s) {
        return std::isfinite(p) && std::isfinite(q) && std::isfinite(r) && std::isfinite(s);
    };

    int kept = 0;
    for (int box = 0; box < num_boxes; ++box) {
        int8_t best_quant = -128;
        int best_class = 0;
        for (int cls = 0; cls < num_classes; ++cls) {
            const int8_t quant_score = element(4 + cls, box);
            if (quant_score > best_quant) {
                best_quant = quant_score;
                best_class = cls;
            }
        }
        if (best_quant < quant_thresh) continue;

        const float a = dequantized(0, box);
        const float b = dequantized(1, box);
        const float c = dequantized(2, box);
        const float d = dequantized(3, box);
        const float best_score = DequantizeAffineToF32(best_quant, zp, scale);
        if (!all_finite(a, b, c, d)) continue;

        float x1 = 0.0f, y1 = 0.0f, w = 0.0f, h = 0.0f;
        V8BoxFormat used_fmt = box_format;
        DecodeV8Box(a, b, c, d, model_w, model_h, box_format, x1, y1, w, h, &used_fmt);
        if (!all_finite(x1, y1, w, h)) continue;
        if (w <= 1e-3f || h <= 1e-3f) continue;

        if (debug_decode && debug_left && *debug_left > 0) {
            --(*debug_left);
            LogInfo("[ai_yolo] v8 decode int8: raw4(" + std::to_string(a) + "," + std::to_string(b) + "," +
                    std::to_string(c) + "," + std::to_string(d) + ") fmt=" + V8BoxFormatName(used_fmt) +
                    " -> xywh(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
                    std::to_string(w) + "," + std::to_string(h) + ") cls=" +
                    std::to_string(best_class) + " score=" + std::to_string(best_score));
        }

        boxes.push_back(x1);
        boxes.push_back(y1);
        boxes.push_back(w);
        boxes.push_back(h);
        obj_probs.push_back(best_score);
        class_ids.push_back(best_class);
        ++kept;
    }
    return kept;
}
#endif

}  // namespace

class AiYoloNode : public INode {
public:
    // Identifier of this node instance, as read from the "id" config entry in Init().
    std::string Id() const override {
        return id_;
    }
    // Type name of this node; always "ai_yolo".
    std::string Type() const override {
        return "ai_yolo";
    }

    // Reads the node configuration, wires up the queues and inference backend from
    // `ctx`, and (when RKNN support is compiled in) loads the model.
    //
    // config : node settings; every ValueOr() call below names the key it reads and the
    //          fallback value it passes.
    // ctx    : supplies the input queue, the output queues and the inference backend.
    // Returns false (after logging the reason) when the input queue, output queues or
    // backend are missing, or - with RKNN enabled - when model_path is empty or the
    // model cannot be loaded. Returns true otherwise.
    bool Init(const SimpleJson& config, const NodeContext& ctx) override {
        id_ = config.ValueOr<std::string>("id", "ai_yolo");
        model_path_ = config.ValueOr<std::string>("model_path", "");
        conf_thresh_ = config.ValueOr<float>("conf", 0.25f);
        nms_thresh_ = config.ValueOr<float>("nms", 0.45f);
        model_input_w_ = config.ValueOr<int>("model_w", 640);
        model_input_h_ = config.ValueOr<int>("model_h", 640);
        num_classes_ = config.ValueOr<int>("num_classes", 80);
        {
            // Box channel interpretation for YOLOv8; any unrecognised string selects Auto.
            const std::string bf = config.ValueOr<std::string>("v8_box_format", "cxcywh");
            if (bf == "xyxy") {
                v8_box_format_ = V8BoxFormat::XyXy;
            } else if (bf == "xywh") {
                v8_box_format_ = V8BoxFormat::XyWh;
            } else if (bf == "cxcywh") {
                v8_box_format_ = V8BoxFormat::CxCyWh;
            } else {
                v8_box_format_ = V8BoxFormat::Auto;
            }
        }
        {
            // Class-score activation for YOLOv8; anything but "sigmoid"/"none" selects Auto.
            const std::string act = config.ValueOr<std::string>("v8_cls_activation", "auto");
            if (act == "sigmoid") {
                v8_cls_activation_ = V8ClsActivation::Sigmoid;
            } else if (act == "none") {
                v8_cls_activation_ = V8ClsActivation::None;
            } else {
                v8_cls_activation_ = V8ClsActivation::Auto;
            }
        }

        // Optional "debug" object; the current member values act as the fallbacks.
        if (const SimpleJson* dbg = config.Find("debug"); dbg && dbg->IsObject()) {
            stats_log_ = dbg->ValueOr<bool>("stats", stats_log_);
            // The interval is kept at 1 or more.
            stats_interval_ = std::max<uint64_t>(
                1, static_cast<uint64_t>(dbg->ValueOr<int>("stats_interval", static_cast<int>(stats_interval_))));
            debug_det_ = dbg->ValueOr<bool>("detections", debug_det_);
        }

        // Optional inference throttle. 0 = run every frame.
        // "infer_interval_ms" takes precedence; "infer_fps" is consulted only when the
        // interval is not positive and is converted to a whole-millisecond interval of
        // at least 1.
        infer_interval_ms_ = std::max<int64_t>(0, static_cast<int64_t>(config.ValueOr<int>("infer_interval_ms", 0)));
        if (infer_interval_ms_ <= 0) {
            const double infer_fps = config.ValueOr<double>("infer_fps", 0.0);
            if (infer_fps > 0.0) {
                infer_interval_ms_ = static_cast<int64_t>(1000.0 / infer_fps);
                if (infer_interval_ms_ < 1) infer_interval_ms_ = 1;
            }
        }

        // "v5" and "v8" pin the decoder; any other value starts from V8 and enables
        // auto-detection from the model's output count further below.
        std::string ver = config.ValueOr<std::string>("model_version", "auto");
        if (ver == "v5") {
            yolo_version_ = YoloVersion::V5;
        } else if (ver == "v8") {
            yolo_version_ = YoloVersion::V8;
        } else {
            yolo_version_ = YoloVersion::V8;
            auto_detect_version_ = true;
        }

        // Optional list of class ids collected into class_filter_.
        // NOTE(review): AsInt(-1) presumably yields -1 for non-integer entries - confirm.
        if (const SimpleJson* filter = config.Find("class_filter")) {
            for (const auto& item : filter->AsArray()) {
                class_filter_.insert(item.AsInt(-1));
            }
        }

        // Optional "bbox_expand" object; unspecified fields keep their current values,
        // except "enable", which falls back to false.
        if (const SimpleJson* expand = config.Find("bbox_expand"); expand && expand->IsObject()) {
            bbox_expand_.enable = expand->ValueOr<bool>("enable", false);
            bbox_expand_.class_id = expand->ValueOr<int>("class_id", bbox_expand_.class_id);
            bbox_expand_.left = expand->ValueOr<float>("left", bbox_expand_.left);
            bbox_expand_.right = expand->ValueOr<float>("right", bbox_expand_.right);
            bbox_expand_.top = expand->ValueOr<float>("top", bbox_expand_.top);
            bbox_expand_.bottom = expand->ValueOr<float>("bottom", bbox_expand_.bottom);
        }

        // The node needs an input queue, at least one output queue and a backend.
        input_queue_ = ctx.input_queue;
        if (!input_queue_) {
            LogError("[ai_yolo] no input queue for node " + id_);
            return false;
        }
        if (ctx.output_queues.empty()) {
            LogError("[ai_yolo] no output queue for node " + id_);
            return false;
        }
        output_queues_ = ctx.output_queues;

        infer_backend_ = ctx.infer_backend;
        if (!infer_backend_) {
            LogError("[ai_yolo] no infer backend for node " + id_);
            return false;
        }

#if defined(RK3588_ENABLE_RKNN)
        if (model_path_.empty()) {
            LogError("[ai_yolo] model_path is required");
            return false;
        }

        std::string err;
        model_handle_ = infer_backend_->LoadModel(model_path_, err);
        if (model_handle_ == kInvalidModelHandle) {
            LogError("[ai_yolo] failed to load model: " + err);
            return false;
        }

        // When the backend can describe the model, its input size replaces the
        // configured model_w / model_h.
        ModelInfo info;
        if (infer_backend_->GetModelInfo(model_handle_, info)) {
            model_input_w_ = info.input_width;
            model_input_h_ = info.input_height;
            n_output_ = info.n_output;

            // Auto-detection: one output tensor -> V8, three or more -> V5; any other
            // count leaves the V8 default in place.
            if (auto_detect_version_) {
                if (n_output_ == 1) {
                    yolo_version_ = YoloVersion::V8;
                } else if (n_output_ >= 3) {
                    yolo_version_ = YoloVersion::V5;
                }
            }
        }

        LogInfo("[ai_yolo] model loaded via InferBackend: " + model_path_ +
                " (handle=" + std::to_string(model_handle_) + ", version=" +
                (yolo_version_ == YoloVersion::V5 ? "v5" : "v8") + ")");
#else
        // Without RKNN no model is loaded; Process() only forwards frames.
        LogWarn("[ai_yolo] RKNN disabled, will passthrough frames");
#endif
        return true;
    }

    // Logs the node id together with the confidence and NMS thresholds; always succeeds.
    bool Start() override {
        const std::string conf_text = std::to_string(conf_thresh_);
        const std::string nms_text = std::to_string(nms_thresh_);
        LogInfo("[ai_yolo] start id=" + id_ + " conf=" + conf_text + " nms=" + nms_text);
        return true;
    }

    // Unloads the model (RKNN builds only, and only if one is loaded) and logs the stop.
    // The handle is reset afterwards, so a repeated call does not unload again.
    void Stop() override {
#if defined(RK3588_ENABLE_RKNN)
        const bool model_loaded = (model_handle_ != kInvalidModelHandle);
        if (model_loaded) {
            infer_backend_->UnloadModel(model_handle_);
            model_handle_ = kInvalidModelHandle;
        }
#endif
        LogInfo("[ai_yolo] stop id=" + id_);
    }

    // Handles one frame: optionally runs inference on it, then forwards it to every
    // output queue.
    //
    // A null frame is dropped. With throttling enabled (infer_interval_ms_ > 0) and a
    // positive pts, inference is skipped while less than the interval has elapsed
    // since the last inferred frame; the frame is still forwarded. Time is derived as
    // pts / 1000.
    // NOTE(review): this treats pts as microseconds - confirm against the producer.
    NodeStatus Process(FramePtr frame) override {
        if (!frame) {
            return NodeStatus::DROP;
        }

        bool run_inference = true;
        if (infer_interval_ms_ > 0 && frame->pts > 0) {
            const int64_t now_ms = static_cast<int64_t>(frame->pts / 1000ULL);
            const int64_t elapsed_ms = now_ms - last_infer_pts_ms_;
            const bool throttled =
                last_infer_pts_ms_ > 0 && elapsed_ms > 0 && elapsed_ms < infer_interval_ms_;
            if (throttled) {
                run_inference = false;
            } else {
                last_infer_pts_ms_ = now_ms;
            }
        }

#if defined(RK3588_ENABLE_RKNN)
        if (run_inference) {
            RunInference(frame);
        }
#endif
        (void)run_inference;
        PushToDownstream(frame);
        ++processed_;

        // Stats logging disabled
        (void)stats_log_;
        (void)stats_interval_;
        return NodeStatus::OK;
    }

private:
    // Hands the same frame to every configured output queue.
    void PushToDownstream(FramePtr frame) {
        for (auto it = output_queues_.begin(); it != output_queues_.end(); ++it) {
            (*it)->Push(frame);
        }
    }

#if defined(RK3588_ENABLE_RKNN)
    // Bilinear resize of a packed 3-channel, 8-bit image. All three channels are
    // treated alike, so the routine works for RGB and BGR.
    //
    //   src, dst       first byte of the source / destination image
    //   src_w, src_h   source size in pixels
    //   src_stride     bytes between consecutive source rows
    //   dst_w, dst_h   destination size in pixels
    //   dst_stride     bytes between consecutive destination rows
    //
    // Destination pixel (x, y) samples source coordinate (x * src_w / dst_w,
    // y * src_h / dst_h). Neighbours past the right/bottom edge are clamped to the
    // last column/row. The interpolated value is truncated, not rounded, on store.
    //
    // Degenerate input (null buffer or any non-positive dimension) is ignored and
    // dst is left untouched; previously this could read out of bounds.
    void ResizeRgbBilinear(const uint8_t* src, int src_w, int src_h, int src_stride,
                           uint8_t* dst, int dst_w, int dst_h, int dst_stride) {
        if (!src || !dst || src_w <= 0 || src_h <= 0 || dst_w <= 0 || dst_h <= 0) {
            return;
        }

        const float scale_x = static_cast<float>(src_w) / static_cast<float>(dst_w);
        const float scale_y = static_cast<float>(src_h) / static_cast<float>(dst_h);

        for (int y = 0; y < dst_h; ++y) {
            const float fy = static_cast<float>(y) * scale_y;
            // Defensive clamp: keeps the row index in range whatever the float result.
            const int y0 = std::min(static_cast<int>(fy), src_h - 1);
            const int y1 = std::min(y0 + 1, src_h - 1);
            const float dy = fy - static_cast<float>(y0);

            // Row bases are computed in ptrdiff_t so large images cannot overflow int.
            const uint8_t* row0 = src + static_cast<std::ptrdiff_t>(y0) * src_stride;
            const uint8_t* row1 = src + static_cast<std::ptrdiff_t>(y1) * src_stride;
            uint8_t* out_row = dst + static_cast<std::ptrdiff_t>(y) * dst_stride;

            for (int x = 0; x < dst_w; ++x) {
                const float fx = static_cast<float>(x) * scale_x;
                const int x0 = std::min(static_cast<int>(fx), src_w - 1);
                const int x1 = std::min(x0 + 1, src_w - 1);
                const float dx = fx - static_cast<float>(x0);

                const uint8_t* p00 = row0 + static_cast<std::ptrdiff_t>(x0) * 3;
                const uint8_t* p01 = row0 + static_cast<std::ptrdiff_t>(x1) * 3;
                const uint8_t* p10 = row1 + static_cast<std::ptrdiff_t>(x0) * 3;
                const uint8_t* p11 = row1 + static_cast<std::ptrdiff_t>(x1) * 3;
                uint8_t* out_px = out_row + static_cast<std::ptrdiff_t>(x) * 3;

                for (int c = 0; c < 3; ++c) {
                    const float v00 = p00[c];
                    const float v01 = p01[c];
                    const float v10 = p10[c];
                    const float v11 = p11[c];
                    // Expression form kept as-is so results stay bit-identical.
                    const float v = v00 * (1.0f - dx) * (1.0f - dy) +
                                    v01 * dx * (1.0f - dy) +
                                    v10 * (1.0f - dx) * dy +
                                    v11 * dx * dy;
                    out_px[c] = static_cast<uint8_t>(v);
                }
            }
        }
    }

    // Runs one inference on `frame` and hands the backend outputs to
    // PostProcessBorrowed(), which attaches detections to the frame.
    //
    // Input handling:
    //   * Only RGB/BGR frames are accepted; other formats are logged and skipped.
    //   * Fast path: the frame already has the model's input size and is tightly
    //     packed (stride == width * 3). The pixel pointer is passed through, and when
    //     the frame exposes a DMA fd the fd/offset are passed as well.
    //   * Slow path: the frame is repacked row by row (same size, padded stride) or
    //     bilinearly resized into resized_input_. CPU access is bracketed by
    //     SyncStart()/SyncEnd() when the frame has a DMA fd.
    //
    // Every failure (bad geometry, short buffer, backend error) returns without
    // touching frame->det.
    void RunInference(FramePtr frame) {
        if (!frame->data || frame->data_size == 0) return;

        bool is_rgb = (frame->format == PixelFormat::RGB || frame->format == PixelFormat::BGR);
        if (!is_rgb) {
            LogWarn("[ai_yolo] input must be RGB/BGR, got other format");
            return;
        }

        const int w = frame->width;
        const int h = frame->height;
        // Reject non-positive sizes before the size_t casts below can wrap.
        if (w <= 0 || h <= 0) {
            LogWarn("[ai_yolo] invalid frame size (w=" + std::to_string(w) +
                    ", h=" + std::to_string(h) + ")");
            return;
        }
        const size_t packed_row = static_cast<size_t>(w) * 3;
        const size_t packed_size = packed_row * static_cast<size_t>(h);

        // Prefer plane 0 when it is populated; otherwise fall back to the frame buffer.
        const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data;
        int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride
                         : (frame->stride > 0 ? frame->stride : static_cast<int>(packed_row));
        if (!src || src_stride <= 0) return;

        InferInput input;
        const bool exact_model_input = (w == model_input_w_ && h == model_input_h_);
        if (exact_model_input && static_cast<size_t>(src_stride) == packed_row && frame->data_size >= packed_size) {
            input.data = src;
            input.size = packed_size;
            input.width = w;
            input.height = h;

            // Best-effort RKNN DMA-BUF zero-copy path.
            if (frame->DmaFd() >= 0 && frame->data) {
                const ptrdiff_t off = src - frame->data;
                if (off >= 0 && static_cast<size_t>(off) + packed_size <= frame->data_size) {
                    input.dma_fd = frame->DmaFd();
                    input.dma_offset = static_cast<int>(off);
                }
            }
        } else {
            const size_t stride_bytes = static_cast<size_t>(src_stride);
            // A stride shorter than one packed row would make the row copy / resize
            // below read past the end of each row (and of the buffer on the last row).
            if (stride_bytes < packed_row) {
                LogWarn("[ai_yolo] RGB stride smaller than packed row (stride=" + std::to_string(src_stride) +
                        ", w=" + std::to_string(w) + ")");
                return;
            }
            if (frame->data_size < stride_bytes * static_cast<size_t>(h)) {
                LogWarn("[ai_yolo] invalid RGB buffer size/stride (data_size=" + std::to_string(frame->data_size) +
                        ", stride=" + std::to_string(src_stride) +
                        ", h=" + std::to_string(h) + ")");
                return;
            }

            if (frame->DmaFd() >= 0) frame->SyncStart();
            const size_t input_row = static_cast<size_t>(model_input_w_) * 3;
            const size_t input_size = input_row * static_cast<size_t>(model_input_h_);
            resized_input_.resize(input_size);

            if (exact_model_input) {
                // Same size, padded stride: strip the padding row by row.
                for (int y = 0; y < h; ++y) {
                    memcpy(resized_input_.data() + static_cast<size_t>(y) * input_row,
                           src + static_cast<size_t>(y) * stride_bytes,
                           input_row);
                }
            } else {
                ResizeRgbBilinear(src, w, h, src_stride,
                                  resized_input_.data(), model_input_w_, model_input_h_,
                                  static_cast<int>(input_row));
            }
            if (frame->DmaFd() >= 0) frame->SyncEnd();
            input.data = resized_input_.data();
            input.size = input_size;
            input.width = model_input_w_;
            input.height = model_input_h_;
        }
        input.is_nhwc = true;

        auto result = infer_backend_->InferBorrowed(model_handle_, input);
        if (!result.success) {
            LogWarn("[ai_yolo] inference failed: " + result.error);
            return;
        }

        PostProcessBorrowed(result.outputs, frame);
    }

    // Decodes backend-owned ("borrowed") output tensors into detections and stores
    // them in frame->det.
    //
    //   outputs  tensors returned by InferBorrowed(); V5 needs three heads, V8 uses
    //            outputs[0] only
    //   frame    frame that receives the DetectionResult
    //
    // Steps: decode candidates (V5 anchor heads, or V8 FP32/FP16/quantized) ->
    // sort by score -> per-class NMS -> optional class filter -> map boxes to
    // output coordinates (plus optional per-class expansion) -> keep at most
    // kMaxDetections.
    //
    // Returns early, leaving frame->det untouched, when the outputs are missing,
    // null or empty, when the V8 layout cannot be resolved, or when nothing passes
    // the confidence threshold.
    void PostProcessBorrowed(const std::vector<AiScheduler::BorrowedOutput>& outputs, FramePtr frame) {
        std::vector<float> boxes;
        std::vector<float> obj_probs;
        std::vector<int> class_ids;
        int valid_count = 0;

        if (yolo_version_ == YoloVersion::V5) {
            if (outputs.size() < 3) return;
            // Check size as well as the pointer, matching the V8 branch below.
            for (size_t head = 0; head < 3; ++head) {
                if (!outputs[head].data || outputs[head].size == 0) return;
            }

            // Decodes one detection head; grid size follows from the head's stride.
            auto decode_head = [&](size_t head, const int (&anchors)[6], int stride) -> int {
                int grid_h = model_input_h_ / stride;
                int grid_w = model_input_w_ / stride;
                return ProcessFeatureMapV5(
                    reinterpret_cast<int8_t*>(const_cast<uint8_t*>(outputs[head].data)), anchors,
                    grid_h, grid_w, model_input_h_, model_input_w_, stride,
                    boxes, obj_probs, class_ids, conf_thresh_,
                    outputs[head].zp, outputs[head].scale);
            };

            // Heads are decoded in order 0,1,2 because each call appends to the
            // shared candidate vectors.
            valid_count = decode_head(0, kAnchor0, 8);
            valid_count += decode_head(1, kAnchor1, 16);
            valid_count += decode_head(2, kAnchor2, 32);
        } else {
            if (outputs.empty()) return;
            const auto& out = outputs[0];
            if (!out.data || out.size == 0) return;

            const V8LayoutInfo layout = ResolveV8Layout(out.dims, out.size,
                                                        out.type, num_classes_,
                                                        model_input_h_, model_input_w_);
            const int num_boxes = layout.num_boxes;
            int debug_decode_left = (debug_det_ && processed_ < 20) ? 5 : 0;
            if (num_boxes <= 0) return;
            if (debug_det_ && processed_ < 5) {
                std::string dims_s;
                for (size_t di = 0; di < out.dims.size(); ++di) {
                    dims_s += (di == 0 ? "[" : ",");
                    dims_s += std::to_string(out.dims[di]);
                }
                dims_s += "]";
                LogInfo("[ai_yolo] v8 out type=" + std::to_string(static_cast<int>(out.type)) +
                        " size=" + std::to_string(out.size) +
                        " dims=" + dims_s +
                        " num_boxes=" + std::to_string(num_boxes) +
                        " layout=" + std::string(layout.channels_first ? "CxN" : "NxC"));
            }

            // Shared FP32 decode, used directly for FP32 and after conversion for FP16.
            auto decode_float = [&](float* values) -> int {
                const bool apply_sigmoid = ResolveV8ApplySigmoid(
                    values, num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
                if (debug_det_ && processed_ < 5) {
                    LogInfo("[ai_yolo] v8 cls activation=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
                }
                return ProcessOutputV8(values,
                                       num_boxes, num_classes_,
                                       model_input_h_, model_input_w_,
                                       boxes, obj_probs, class_ids, conf_thresh_,
                                       layout.channels_first, v8_box_format_, apply_sigmoid,
                                       debug_det_, &debug_decode_left);
            };

            if (out.type == RKNN_TENSOR_FLOAT32) {
                valid_count = decode_float(reinterpret_cast<float*>(const_cast<uint8_t*>(out.data)));
            } else if (out.type == RKNN_TENSOR_FLOAT16) {
                // Convert FP16 to FP32 into the reusable scratch buffer.
                const size_t num_elements = out.size / sizeof(uint16_t);
                fp32_buffer_.resize(num_elements);
                const uint16_t* fp16_data = reinterpret_cast<const uint16_t*>(out.data);
                for (size_t i = 0; i < num_elements; ++i) {
                    fp32_buffer_[i] = Fp16ToFp32(fp16_data[i]);
                }
                valid_count = decode_float(fp32_buffer_.data());
            } else {
                // Every other tensor type is decoded as quantized INT8 using zp/scale.
                valid_count = ProcessOutputV8Int8(reinterpret_cast<int8_t*>(const_cast<uint8_t*>(out.data)),
                                                  num_boxes, num_classes_,
                                                  model_input_h_, model_input_w_,
                                                  boxes, obj_probs, class_ids, conf_thresh_,
                                                  out.zp, out.scale,
                                                  layout.channels_first, v8_box_format_,
                                                  debug_det_, &debug_decode_left);
            }
        }

        if (valid_count <= 0) return;

        std::vector<int> indices(valid_count);
        for (int i = 0; i < valid_count; ++i) indices[i] = i;

        QuickSortDescending(obj_probs, 0, valid_count - 1, indices);

        // NMS runs once per class that actually occurs; suppressed entries in
        // `indices` are expected to be set to -1 (checked below).
        const std::set<int> class_set(class_ids.begin(), class_ids.end());
        for (int c : class_set) {
            NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
        }

        const DetCoordContext coord_ctx = BuildDetCoordContext(*frame, model_input_w_, model_input_h_);

        auto det_result = std::make_shared<DetectionResult>();
        det_result->img_w = coord_ctx.out_w;
        det_result->img_h = coord_ctx.out_h;
        det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8";

        const size_t max_items = static_cast<size_t>(kMaxDetections);
        for (int i = 0; i < valid_count && det_result->items.size() < max_items; ++i) {
            if (indices[i] == -1) continue;
            const int n = indices[i];
            const int cls_id = class_ids[n];

            if (!class_filter_.empty() && class_filter_.find(cls_id) == class_filter_.end()) {
                continue;
            }

            const float x1 = boxes[n * 4 + 0];
            const float y1 = boxes[n * 4 + 1];
            const float w = boxes[n * 4 + 2];
            const float h = boxes[n * 4 + 3];

            Detection det;
            det.cls_id = cls_id;
            // NOTE(review): score is read at sorted rank `i`, not at `n`. This assumes
            // QuickSortDescending reorders obj_probs together with indices; confirm.
            det.score = obj_probs[i];
            det.bbox = DecodeToOutputRect(x1, y1, w, h, coord_ctx);
            if (bbox_expand_.enable && det.cls_id == bbox_expand_.class_id) {
                det.bbox = ExpandRect(det.bbox, coord_ctx.out_w, coord_ctx.out_h, bbox_expand_);
            }
            det.track_id = -1;

            if (debug_det_ && det_result->items.size() < 5 && processed_ < 20) {
                LogInfo("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
                        std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" +
                        std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," +
                        std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" +
                        std::to_string(cls_id) + " score=" + std::to_string(det.score));
            }

            det_result->items.push_back(det);
        }
        if (debug_det_ && processed_ < 20) {
            LogInfo("[ai_yolo] det summary: valid_count=" + std::to_string(valid_count) +
                    " final=" + std::to_string(det_result->items.size()));
        }

        frame->det = det_result;
    }

    void PostProcess(std::vector<InferOutput>& outputs, FramePtr frame) {
        std::vector<float> boxes;
        std::vector<float> obj_probs;
        std::vector<int> class_ids;
        int valid_count = 0;

        if (yolo_version_ == YoloVersion::V5) {
            if (outputs.size() < 3) return;

            int stride0 = 8, stride1 = 16, stride2 = 32;
            int grid_h0 = model_input_h_ / stride0, grid_w0 = model_input_w_ / stride0;
            int grid_h1 = model_input_h_ / stride1, grid_w1 = model_input_w_ / stride1;
            int grid_h2 = model_input_h_ / stride2, grid_w2 = model_input_w_ / stride2;

            int cnt0 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[0].data.data()), kAnchor0,
                                           grid_h0, grid_w0, model_input_h_, model_input_w_, stride0,
                                           boxes, obj_probs, class_ids, conf_thresh_,
                                           outputs[0].zp, outputs[0].scale);
            int cnt1 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[1].data.data()), kAnchor1,
                                           grid_h1, grid_w1, model_input_h_, model_input_w_, stride1,
                                           boxes, obj_probs, class_ids, conf_thresh_,
                                           outputs[1].zp, outputs[1].scale);
            int cnt2 = ProcessFeatureMapV5(reinterpret_cast<int8_t*>(outputs[2].data.data()), kAnchor2,
                                           grid_h2, grid_w2, model_input_h_, model_input_w_, stride2,
                                           boxes, obj_probs, class_ids, conf_thresh_,
                                           outputs[2].zp, outputs[2].scale);
            valid_count = cnt0 + cnt1 + cnt2;
        } else {
            if (outputs.empty()) return;

            const V8LayoutInfo layout = ResolveV8Layout(outputs[0].dims, outputs[0].data.size(),
                                                        outputs[0].type, num_classes_,
                                                        model_input_h_, model_input_w_);
            const int num_boxes = layout.num_boxes;
            int debug_decode_left = (debug_det_ && processed_ < 20) ? 5 : 0;
            if (num_boxes <= 0) return;
            if (debug_det_ && processed_ < 5) {
                std::string dims_s;
                for (size_t di = 0; di < outputs[0].dims.size(); ++di) {
                    dims_s += (di == 0 ? "[" : ",");
                    dims_s += std::to_string(outputs[0].dims[di]);
                }
                dims_s += "]";
                LogInfo("[ai_yolo] v8 out(type copy) type=" + std::to_string(static_cast<int>(outputs[0].type)) +
                        " size=" + std::to_string(outputs[0].data.size()) +
                        " dims=" + dims_s +
                        " num_boxes=" + std::to_string(num_boxes) +
                        " layout=" + std::string(layout.channels_first ? "CxN" : "NxC"));
            }

            if (outputs[0].type == RKNN_TENSOR_FLOAT32) {
                const bool apply_sigmoid = ResolveV8ApplySigmoid(
                    reinterpret_cast<float*>(outputs[0].data.data()),
                    num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
                if (debug_det_ && processed_ < 5) {
                    LogInfo("[ai_yolo] v8 cls activation(copy)=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
                }
                valid_count = ProcessOutputV8(reinterpret_cast<float*>(outputs[0].data.data()),
                                              num_boxes, num_classes_,
                                              model_input_h_, model_input_w_,
                                              boxes, obj_probs, class_ids, conf_thresh_,
                                              layout.channels_first, v8_box_format_, apply_sigmoid,
                                              debug_det_, &debug_decode_left);
            } else if (outputs[0].type == RKNN_TENSOR_FLOAT16) {
                // Convert FP16 to FP32
                size_t num_elements = outputs[0].data.size() / sizeof(uint16_t);
                fp32_buffer_.resize(num_elements);
                const uint16_t* fp16_data = reinterpret_cast<const uint16_t*>(outputs[0].data.data());
                for (size_t i = 0; i < num_elements; ++i) {
                    fp32_buffer_[i] = Fp16ToFp32(fp16_data[i]);
                }
                const bool apply_sigmoid = ResolveV8ApplySigmoid(
                    fp32_buffer_.data(), num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
                if (debug_det_ && processed_ < 5) {
                    LogInfo("[ai_yolo] v8 cls activation(copy)=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
                }
                valid_count = ProcessOutputV8(fp32_buffer_.data(),
                                              num_boxes, num_classes_,
                                              model_input_h_, model_input_w_,
                                              boxes, obj_probs, class_ids, conf_thresh_,
                                              layout.channels_first, v8_box_format_, apply_sigmoid,
                                              debug_det_, &debug_decode_left);
            } else {
                valid_count = ProcessOutputV8Int8(reinterpret_cast<int8_t*>(outputs[0].data.data()),
                                                  num_boxes, num_classes_,
                                                  model_input_h_, model_input_w_,
                                                  boxes, obj_probs, class_ids, conf_thresh_,
                                                  outputs[0].zp, outputs[0].scale,
                                                  layout.channels_first, v8_box_format_,
                                                  debug_det_, &debug_decode_left);
            }
        }

        if (valid_count <= 0) return;

        std::vector<int> indices(valid_count);
        for (int i = 0; i < valid_count; ++i) indices[i] = i;

        QuickSortDescending(obj_probs, 0, valid_count - 1, indices);

        std::set<int> class_set(class_ids.begin(), class_ids.end());
        for (int c : class_set) {
            NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
        }

        const DetCoordContext coord_ctx = BuildDetCoordContext(*frame, model_input_w_, model_input_h_);

        auto det_result = std::make_shared<DetectionResult>();
        det_result->img_w = coord_ctx.out_w;
        det_result->img_h = coord_ctx.out_h;
        det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8";

        for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) {
            if (indices[i] == -1) continue;
            int n = indices[i];
            int cls_id = class_ids[n];

            if (!class_filter_.empty() && class_filter_.find(cls_id) == class_filter_.end()) {
                continue;
            }

            float x1 = boxes[n * 4 + 0];
            float y1 = boxes[n * 4 + 1];
            float w = boxes[n * 4 + 2];
            float h = boxes[n * 4 + 3];

            Detection det;
            det.cls_id = cls_id;
            det.score = obj_probs[i];
            det.bbox = DecodeToOutputRect(x1, y1, w, h, coord_ctx);
            if (bbox_expand_.enable && det.cls_id == bbox_expand_.class_id) {
                det.bbox = ExpandRect(det.bbox, coord_ctx.out_w, coord_ctx.out_h, bbox_expand_);
            }
            det.track_id = -1;

            if (debug_det_ && det_result->items.size() < 5 && processed_ < 20) {
                LogInfo("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
                        std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" +
                        std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," +
                        std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" +
                        std::to_string(cls_id) + " score=" + std::to_string(det.score));
            }

            det_result->items.push_back(det);
        }
        if (debug_det_ && processed_ < 20) {
            LogInfo("[ai_yolo] det summary(copy): valid_count=" + std::to_string(valid_count) +
                    " final=" + std::to_string(det_result->items.size()));
        }

        frame->det = det_result;
    }
#endif

    // --- Identity / model configuration ---
    std::string id_;          // Node id, printed in the start/stop log lines.
    std::string model_path_;  // Model file path, printed in the "model loaded" log line.
    float conf_thresh_ = 0.25f;  // Confidence threshold passed to the V5/V8 decoders.
    float nms_thresh_ = 0.45f;   // Threshold passed to NMS().
    int model_input_w_ = 640;    // Model input width in pixels (resize target, grid size basis).
    int model_input_h_ = 640;    // Model input height in pixels.
    int num_classes_ = 80;       // Class count passed to the V8 layout/decoding helpers.
    V8BoxFormat v8_box_format_ = V8BoxFormat::CxCyWh;            // Box encoding passed to the V8 decoders.
    V8ClsActivation v8_cls_activation_ = V8ClsActivation::Auto;  // Input to ResolveV8ApplySigmoid().
    YoloVersion yolo_version_ = YoloVersion::V8;  // Selects the V5 (three heads) or V8 (single output) decode path.
    // NOTE(review): not referenced in this part of the file; presumably gates the
    // output-count based version detection done at model load. Confirm.
    bool auto_detect_version_ = false;
    std::set<int> class_filter_;  // When non-empty, only detections with these class ids are kept.

    // --- Pipeline wiring ---
    // NOTE(review): input_queue_ is not referenced in this part of the file; confirm its use.
    std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
    std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;  // Every processed frame is pushed to each of these.
    std::shared_ptr<IInferBackend> infer_backend_;  // Backend used for InferBorrowed() and UnloadModel().
    uint64_t processed_ = 0;  // Frames forwarded by Process(); also limits debug logging to the first frames.

    // --- Diagnostics ---
    bool stats_log_ = false;         // Currently unused: stats logging is disabled in Process().
    uint64_t stats_interval_ = 100;  // Currently unused, see stats_log_.
    bool debug_det_ = false;         // Enables verbose decode/detection logs for the first few frames.
    BboxExpandConfig bbox_expand_{};  // Optional box expansion, applied to detections of bbox_expand_.class_id.

    // --- Inference throttling (see Process()) ---
    int64_t infer_interval_ms_ = 0;  // Minimum pts gap between two inferences; 0 disables throttling.
    int64_t last_infer_pts_ms_ = 0;  // pts (ms) of the last frame that went to inference; 0 = none yet.

#if defined(RK3588_ENABLE_RKNN)
    ModelHandle model_handle_ = kInvalidModelHandle;  // Loaded model handle; reset to invalid in Stop().
    uint32_t n_output_ = 0;               // Model output count; 1 selects V8, >= 3 selects V5 at load time.
    std::vector<uint8_t> resized_input_;  // Scratch buffer for the repacked/resized model input.
    std::vector<float> fp32_buffer_;  // For FP16 to FP32 conversion
#endif
};

// Registers AiYoloNode under the type name "ai_yolo".
// NOTE(review): REGISTER_NODE is defined outside this file (presumably node.h); confirm the factory semantics there.
REGISTER_NODE(AiYoloNode, "ai_yolo");

}  // namespace rk3588