// OrangePi3588Media/plugins/ai_shoe_det/ai_shoe_det_node.cpp

/**
 * ai_shoe_det - shoe detection node (supports sliding windows)
 *
 * Based on the ai_yolo implementation, with multi-window detection support added.
 */
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <vector>
#include "ai_scheduler.h"
#include "frame/frame.h"
#include "hw/i_image_processor.h"
#include "hw/i_infer_backend.h"
#include "node.h"
#include "utils/logger.h"
#include "utils/person_shoe_roi.h"
#if defined(RK3588_ENABLE_RKNN)
#include "rknn_api.h"
#endif
namespace rk3588 {
// A rectangular detection window in source-image pixel coordinates.
// A width/height of 0 means "whole frame" (see DetectWindow()).
struct DetWindow {
  int x, y, w, h;
};
// One raw detection in source-image coordinates.
struct DetBox {
  float x, y, w, h;  // top-left corner and size, in pixels
  float conf;        // confidence score
  int class_id;      // class index assigned to the detection
};
// Configuration for dynamic ROI mode, where detection windows are derived
// from upstream person detections instead of a fixed window list.
struct DynamicRoiConfig {
  bool enable = false;              // turn dynamic ROI mode on/off
  int person_class_id = 0;          // class id of upstream person detections
  int shoe_class_id = 0;            // class id assigned to emitted shoe boxes
  int debug_roi_class_id = -1;      // if >= 0, ROIs are also emitted as debug detections
  int max_rois = 8;                 // max ROIs per frame (round-robin over persons)
  int min_person_height = 0;        // skip persons whose bbox height is below this
  float max_box_area_ratio = 0.0f;  // if > 0, drop boxes exceeding this area ratio (model space)
  float y_offset = 0.72f;           // foot-region parameters forwarded to BuildPersonFootRegion
  float width_scale = 1.30f;
  float height_scale = 0.38f;
};
// How the four raw box values of the model output are interpreted
// (Auto picks the most plausible interpretation per box — see DecodeV8Box).
enum class V8BoxFormat { Auto, CxCyWh, XyXy, XyWh };
// Whether a sigmoid still needs to be applied to class scores
// (Auto decides by sampling raw values — see ResolveV8ApplySigmoid).
enum class V8ClsActivation { Auto, None, Sigmoid };
// Size in bytes of a single element of the given RKNN tensor type.
// Unlisted/unknown types fall back to 1 byte.
uint32_t TensorTypeSizeBytes(rknn_tensor_type t) {
  if (t == RKNN_TENSOR_FLOAT32) return 4;
  if (t == RKNN_TENSOR_FLOAT16) return 2;
  // INT8/UINT8 and anything unrecognized are treated as single-byte.
  return 1;
}
// Logistic function: maps any real value into (0, 1).
inline float Sigmoid(float x) {
  const float e = std::exp(-x);
  return 1.0f / (1.0f + e);
}
// Converts a raw IEEE 754 half-precision bit pattern to float.
// Zeros, subnormals, infinities and NaN are handled explicitly.
inline float Fp16ToFp32(uint16_t h) {
  const bool negative = (h & 0x8000) != 0;
  const int exponent = (h >> 10) & 0x1F;
  const int mantissa = h & 0x03FF;
  float magnitude;
  if (exponent == 0) {
    // Zero or subnormal: value = mantissa * 2^-24.
    if (mantissa == 0) return negative ? -0.0f : 0.0f;
    magnitude = std::ldexp(static_cast<float>(mantissa), -24);
  } else if (exponent == 0x1F) {
    // Infinity or NaN (NaN sign is not preserved, matching prior behavior).
    if (mantissa != 0) return std::numeric_limits<float>::quiet_NaN();
    magnitude = INFINITY;
  } else {
    // Normalized: the +1024 restores the implicit leading mantissa bit.
    magnitude = std::ldexp(static_cast<float>(mantissa + 1024), exponent - 25);
  }
  return negative ? -magnitude : magnitude;
}
// Clamps `val` to [min_val, max_val], then truncates to int32.
inline int32_t ClipFloat(float val, float min_val, float max_val) {
  float clamped = val;
  if (clamped <= min_val) {
    clamped = min_val;
  } else if (clamped >= max_val) {
    clamped = max_val;
  }
  return static_cast<int32_t>(clamped);
}
// Quantizes a float to int8 with an affine scheme: q = f / scale + zp,
// saturated to the int8 range [-128, 127] (truncating toward zero).
inline int8_t QuantizeF32ToAffine(float f32, int32_t zp, float scale) {
  const float q = f32 / scale + static_cast<float>(zp);
  if (q <= -128.0f) return static_cast<int8_t>(-128);
  if (q >= 127.0f) return static_cast<int8_t>(127);
  return static_cast<int8_t>(q);
}
// Inverse of QuantizeF32ToAffine: f = (q - zp) * scale.
inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) {
  const float centered = static_cast<float>(qnt) - static_cast<float>(zp);
  return centered * scale;
}
int DefaultV8NumBoxes(int model_h, int model_w) {
if (model_h <= 0 || model_w <= 0) return 0;
return (model_h / 8) * (model_w / 8) +
(model_h / 16) * (model_w / 16) +
(model_h / 32) * (model_w / 32);
}
// Resolved layout of the model's output tensor.
struct V8LayoutInfo {
  int num_boxes = 0;           // number of candidate boxes in the tensor
  bool channels_first = true;  // true: [channels, boxes]; false: [boxes, channels]
};
// Heuristic plausibility score for a decoded box: +3 for a positive size,
// plus one point for each coordinate that stays within (a 20%-padded copy
// of) the model input area. Higher means more plausible.
float ScoreBoxCandidate(float x, float y, float w, float h, int model_w, int model_h) {
  const float pad_w = model_w * 1.2f;
  const float pad_h = model_h * 1.2f;
  float score = (w > 0.0f && h > 0.0f) ? 3.0f : 0.0f;
  if (w <= pad_w) score += 1.0f;
  if (h <= pad_h) score += 1.0f;
  if (x >= -model_w * 0.1f) score += 1.0f;
  if (y >= -model_h * 0.1f) score += 1.0f;
  if ((x + w) <= pad_w) score += 1.0f;
  if ((y + h) <= pad_h) score += 1.0f;
  return score;
}
// True when all four values fall inside [-0.05, 2.5], i.e. they look like
// normalized (roughly [0, 1]) coordinates rather than pixel values.
bool SeemsNormalized(float a, float b, float c, float d) {
  const float values[4] = {a, b, c, d};
  for (float v : values) {
    if (v < -0.05f || v > 2.5f) return false;
  }
  return true;
}
// Decodes one raw YOLOv8 box (a, b, c, d) into a top-left x/y plus w/h in
// model pixels. Values that all look normalized are first scaled up by the
// model size. With fmt == Auto, all three interpretations are decoded and
// the one scoring highest under ScoreBoxCandidate wins.
void DecodeV8Box(float a, float b, float c, float d, int model_w, int model_h, V8BoxFormat fmt,
                 float& out_x, float& out_y, float& out_w, float& out_h) {
  if (SeemsNormalized(a, b, c, d)) {
    a *= static_cast<float>(model_w);
    b *= static_cast<float>(model_h);
    c *= static_cast<float>(model_w);
    d *= static_cast<float>(model_h);
  }
  struct Candidate { float x, y, w, h; };
  const Candidate as_cxcywh{a - c / 2.0f, b - d / 2.0f, c, d};
  const Candidate as_xyxy{a, b, c - a, d - b};
  const Candidate as_xywh{a, b, c, d};
  auto emit = [&](const Candidate& box) {
    out_x = box.x;
    out_y = box.y;
    out_w = box.w;
    out_h = box.h;
  };
  switch (fmt) {
    case V8BoxFormat::CxCyWh: emit(as_cxcywh); return;
    case V8BoxFormat::XyXy:   emit(as_xyxy);   return;
    case V8BoxFormat::XyWh:   emit(as_xywh);   return;
    case V8BoxFormat::Auto:   break;
  }
  // Auto: score each interpretation and take the most plausible one.
  // Ties favour xyxy, then xywh, then cxcywh (original precedence).
  const float s_cxcywh =
      ScoreBoxCandidate(as_cxcywh.x, as_cxcywh.y, as_cxcywh.w, as_cxcywh.h, model_w, model_h);
  const float s_xyxy =
      ScoreBoxCandidate(as_xyxy.x, as_xyxy.y, as_xyxy.w, as_xyxy.h, model_w, model_h);
  const float s_xywh =
      ScoreBoxCandidate(as_xywh.x, as_xywh.y, as_xywh.w, as_xywh.h, model_w, model_h);
  if (s_xyxy >= s_cxcywh && s_xyxy >= s_xywh) {
    emit(as_xyxy);
  } else if (s_xywh >= s_cxcywh && s_xywh >= s_xyxy) {
    emit(as_xywh);
  } else {
    emit(as_cxcywh);
  }
}
// Decides whether class scores still need a sigmoid. Explicit modes win;
// in Auto mode a sample of up to 64 boxes' raw scores is scanned and the
// sigmoid is applied iff any value strays outside the plausible
// probability range (raw logits do, probabilities do not).
bool ResolveV8ApplySigmoid(const float* output, int num_boxes, int num_classes, bool channels_first,
                           V8ClsActivation act_mode) {
  switch (act_mode) {
    case V8ClsActivation::None: return false;
    case V8ClsActivation::Sigmoid: return true;
    case V8ClsActivation::Auto: break;
  }
  if (output == nullptr || num_boxes <= 0 || num_classes <= 0) return false;
  const int stride = 4 + num_classes;  // per-box channel count (box + scores)
  const int sample = std::min(num_boxes, 64);
  float lo = 1e9f;
  float hi = -1e9f;
  for (int box = 0; box < sample; ++box) {
    for (int cls = 0; cls < num_classes; ++cls) {
      const float v = channels_first ? output[(4 + cls) * num_boxes + box]
                                     : output[box * stride + (4 + cls)];
      lo = std::min(lo, v);
      hi = std::max(hi, v);
    }
  }
  return lo < -0.1f || hi > 1.5f;
}
// Determines the candidate-box count and the memory layout (channels-first
// [C, N] vs boxes-first [N, C]) of a YOLOv8 output tensor from its reported
// dims and byte size, with progressively weaker fallbacks when the dims are
// unhelpful.
V8LayoutInfo ResolveV8Layout(const std::vector<uint32_t>& dims, size_t byte_size,
                             rknn_tensor_type type, int num_classes,
                             int model_h, int model_w) {
  V8LayoutInfo info;
  const int num_channels = 4 + num_classes;  // 4 box coords + class scores
  const uint32_t elem_bytes = TensorTypeSizeBytes(type);
  const size_t total_elems = elem_bytes > 0 ? (byte_size / elem_bytes) : 0;
  // Hard upper bound on the box count implied by the actual data size.
  const size_t max_boxes_from_data = static_cast<size_t>(num_channels) > 0
      ? (total_elems / static_cast<size_t>(num_channels))
      : 0;
  // Locate the dimension equal to the channel count.
  int ch_idx = -1;
  for (size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] == static_cast<uint32_t>(num_channels)) {
      ch_idx = static_cast<int>(i);
      break;
    }
  }
  if (ch_idx >= 0 && total_elems >= static_cast<size_t>(num_channels)) {
    info.num_boxes = static_cast<int>(max_boxes_from_data);
    // The nearest non-1 dims on either side of the channel axis indicate
    // whether the box axis comes before or after the channels.
    int prev_non1 = 1;
    for (int i = ch_idx - 1; i >= 0; --i) {
      if (dims[static_cast<size_t>(i)] > 1U) {
        prev_non1 = static_cast<int>(dims[static_cast<size_t>(i)]);
        break;
      }
    }
    int next_non1 = 1;
    for (size_t i = static_cast<size_t>(ch_idx + 1); i < dims.size(); ++i) {
      if (dims[i] > 1U) {
        next_non1 = static_cast<int>(dims[i]);
        break;
      }
    }
    if (next_non1 > 1 && prev_non1 <= 1) {
      info.channels_first = true;
    } else if (prev_non1 > 1 && next_non1 <= 1) {
      info.channels_first = false;
    } else if (next_non1 > 1 && prev_non1 > 1) {
      // Ambiguous: assume the larger neighbouring dim is the box axis.
      info.channels_first = next_non1 >= prev_non1;
    }
  }
  // Fallback chain: data-derived count, stride-derived count, then the
  // canonical YOLOv8 count for a 640x640 input.
  if (info.num_boxes <= 0 && max_boxes_from_data > 0) {
    info.num_boxes = static_cast<int>(max_boxes_from_data);
  }
  if (info.num_boxes <= 0) {
    info.num_boxes = DefaultV8NumBoxes(model_h, model_w);
  }
  if (info.num_boxes <= 0) {
    info.num_boxes = 8400;
  }
  // Never claim more boxes than the buffer actually holds.
  if (max_boxes_from_data > 0 && static_cast<size_t>(info.num_boxes) > max_boxes_from_data) {
    info.num_boxes = static_cast<int>(max_boxes_from_data);
  }
  return info;
}
// Shoe-detection node (single-class YOLOv8-style model). Runs detection
// either over a fixed list of windows or over dynamic ROIs derived from
// upstream person detections, then merges results into frame->det.
class AiShoeDetNode : public INode {
 public:
  std::string Id() const override { return id_; }
  std::string Type() const override { return "ai_shoe_det"; }
  // Parses node configuration (model, thresholds, windows, dynamic ROI,
  // inference rate limiting), wires up queues/backends from the context
  // and loads the model when RKNN support is compiled in.
  bool Init(const SimpleJson& config, const NodeContext& ctx) override {
    id_ = config.ValueOr<std::string>("id", "shoe_det");
    model_path_ = config.ValueOr<std::string>("model_path",
                                              "./models/shoe_detector.rknn");
    model_w_ = config.ValueOr<int>("model_w", 640);
    model_h_ = config.ValueOr<int>("model_h", 640);
    conf_thresh_ = config.ValueOr<float>("conf", 0.25f);
    nms_thresh_ = config.ValueOr<float>("nms", 0.45f);
    append_detections_ = config.ValueOr<bool>("append_detections", false);
    {
      // Box decode format; anything unrecognized falls back to Auto.
      const std::string bf = config.ValueOr<std::string>("v8_box_format", "auto");
      if (bf == "xyxy") {
        v8_box_format_ = V8BoxFormat::XyXy;
      } else if (bf == "xywh") {
        v8_box_format_ = V8BoxFormat::XyWh;
      } else if (bf == "cxcywh") {
        v8_box_format_ = V8BoxFormat::CxCyWh;
      } else {
        v8_box_format_ = V8BoxFormat::Auto;
      }
    }
    {
      // Class-score activation; anything unrecognized falls back to Auto.
      const std::string act = config.ValueOr<std::string>("v8_cls_activation", "auto");
      if (act == "sigmoid") {
        v8_cls_activation_ = V8ClsActivation::Sigmoid;
      } else if (act == "none") {
        v8_cls_activation_ = V8ClsActivation::None;
      } else {
        v8_cls_activation_ = V8ClsActivation::Auto;
      }
    }
    // Inference rate limit: an explicit interval wins; otherwise derive it
    // from "infer_fps" when given.
    infer_interval_ms_ = std::max<int64_t>(
        0, static_cast<int64_t>(config.ValueOr<int>("infer_interval_ms", 0)));
    if (infer_interval_ms_ <= 0) {
      const double infer_fps = config.ValueOr<double>("infer_fps", 0.0);
      if (infer_fps > 0.0) {
        infer_interval_ms_ = static_cast<int64_t>(1000.0 / infer_fps);
        if (infer_interval_ms_ < 1) infer_interval_ms_ = 1;
      }
    }
    // Phase offset shifts this node's inference slots in time; it is folded
    // into the interval so it never exceeds one period.
    infer_phase_ms_ = std::max<int64_t>(
        0, static_cast<int64_t>(config.ValueOr<int>("infer_phase_ms", 0)));
    if (infer_interval_ms_ > 0 && infer_phase_ms_ >= infer_interval_ms_) {
      infer_phase_ms_ %= infer_interval_ms_;
    }
    use_dma_input_ = config.ValueOr<bool>("use_dma_input", use_dma_input_);
    if (const SimpleJson* dyn = config.Find("dynamic_roi"); dyn && dyn->IsObject()) {
      dynamic_roi_.enable = dyn->ValueOr<bool>("enable", false);
      dynamic_roi_.person_class_id =
          dyn->ValueOr<int>("person_class_id", dynamic_roi_.person_class_id);
      dynamic_roi_.shoe_class_id =
          dyn->ValueOr<int>("shoe_class_id", dynamic_roi_.shoe_class_id);
      dynamic_roi_.debug_roi_class_id =
          dyn->ValueOr<int>("debug_roi_class_id", dynamic_roi_.debug_roi_class_id);
      dynamic_roi_.max_rois = std::max(1, dyn->ValueOr<int>("max_rois", dynamic_roi_.max_rois));
      dynamic_roi_.min_person_height =
          std::max(0, dyn->ValueOr<int>("min_person_height", dynamic_roi_.min_person_height));
      dynamic_roi_.max_box_area_ratio =
          std::max(0.0f, dyn->ValueOr<float>("max_box_area_ratio", dynamic_roi_.max_box_area_ratio));
      dynamic_roi_.y_offset = dyn->ValueOr<float>("y_offset", dynamic_roi_.y_offset);
      dynamic_roi_.width_scale = dyn->ValueOr<float>("width_scale", dynamic_roi_.width_scale);
      dynamic_roi_.height_scale = dyn->ValueOr<float>("height_scale", dynamic_roi_.height_scale);
    }
    // Parse the window configuration.
    windows_.clear();
    if (const SimpleJson* win_arr = config.Find("windows"); win_arr && win_arr->IsArray()) {
      for (const auto& w : win_arr->AsArray()) {
        if (w.IsObject()) {
          DetWindow win;
          win.x = w.ValueOr<int>("x", 0);
          win.y = w.ValueOr<int>("y", 0);
          win.w = w.ValueOr<int>("w", 640);
          win.h = w.ValueOr<int>("h", 640);
          windows_.push_back(win);
        }
      }
    }
    // Default to a single window (whole frame).
    if (!dynamic_roi_.enable && windows_.empty()) {
      windows_.push_back({0, 0, 0, 0});  // 0 means whole frame
    }
    input_queue_ = ctx.input_queue;
    output_queues_ = ctx.output_queues;
    if (!input_queue_) {
      LogError("[ai_shoe_det] no input queue");
      return false;
    }
    // NOTE(review): infer_backend_ is declared only under RK3588_ENABLE_RKNN
    // (see the member list) but is assigned and checked here unconditionally;
    // a build without RKNN likely fails to compile — confirm guard coverage.
    infer_backend_ = ctx.infer_backend;
    if (!infer_backend_) {
      LogError("[ai_shoe_det] no infer backend");
      return false;
    }
    image_processor_ = ctx.image_processor;
#if defined(RK3588_ENABLE_RKNN)
    std::string err;
    model_handle_ = infer_backend_->LoadModel(model_path_, err);
    if (model_handle_ == kInvalidModelHandle) {
      LogError("[ai_shoe_det] failed to load model: " + err);
      return false;
    }
    input_buf_.resize(model_w_ * model_h_ * 3);  // packed RGB staging buffer
    LogInfo("[ai_shoe_det] model loaded: " + model_path_);
#else
    LogWarn("[ai_shoe_det] RKNN disabled");
#endif
    return true;
  }
  bool Start() override {
    LogInfo("[ai_shoe_det] start mode=" + std::string(dynamic_roi_.enable ? "dynamic_roi" : "windows") +
            " windows=" + std::to_string(windows_.size()) +
            " append=" + std::string(append_detections_ ? "true" : "false") +
            " infer_interval_ms=" + std::to_string(infer_interval_ms_));
    return true;
  }
  void Stop() override {
#if defined(RK3588_ENABLE_RKNN)
    if (model_handle_ != kInvalidModelHandle) {
      infer_backend_->UnloadModel(model_handle_);
      model_handle_ = kInvalidModelHandle;
    }
#endif
    LogInfo("[ai_shoe_det] stop");
  }
  // Per-frame entry point: optionally rate-limits inference by frame PTS,
  // runs detection, then forwards the frame downstream unconditionally.
  NodeStatus Process(FramePtr frame) override {
    if (!frame) return NodeStatus::DROP;
#if defined(RK3588_ENABLE_RKNN)
    if (infer_interval_ms_ > 0 && frame->pts > 0) {
      // pts / 1000 — presumably microseconds to milliseconds; confirm units.
      const int64_t pts_ms = static_cast<int64_t>(frame->pts / 1000ULL);
      const int64_t effective_pts_ms = pts_ms + infer_phase_ms_;
      const int64_t delta_ms = effective_pts_ms - last_infer_pts_ms_;
      if (last_infer_pts_ms_ > 0 && delta_ms > 0 && delta_ms < infer_interval_ms_) {
        // Still within the minimum interval: skip inference, pass frame on.
        Push(frame);
        return NodeStatus::OK;
      }
      last_infer_pts_ms_ = effective_pts_ms;
    }
    RunDetection(frame);
#endif
    Push(frame);
    return NodeStatus::OK;
  }
 private:
  // Fans the frame out to every output queue.
  void Push(FramePtr frame) {
    for (auto& q : output_queues_) q->Push(frame);
  }
#if defined(RK3588_ENABLE_RKNN)
  // One full detection pass: build the active window list, detect per
  // window, merge via NMS and write the results into frame->det.
  void RunDetection(FramePtr frame) {
    if (!frame->data || frame->data_size == 0) return;
    const int src_w = frame->width;
    const int src_h = frame->height;
    std::vector<DetBox> all_dets;
    std::vector<DetWindow> active_windows = windows_;
    if (dynamic_roi_.enable) {
      active_windows = BuildDynamicWindows(frame, src_w, src_h);
      if (active_windows.empty()) {
        // No person-derived ROIs this frame; frame->det is left untouched.
        return;
      }
    }
    // Run detection on each window.
    for (const auto& win : active_windows) {
      auto dets = DetectWindow(frame, src_w, src_h, win);
      all_dets.insert(all_dets.end(), dets.begin(), dets.end());
    }
    // NMS across all windows (overlapping windows can duplicate boxes).
    all_dets = ApplyNMS(all_dets, nms_thresh_);
    // Fill in the result.
    if (!frame->det) {
      frame->det = std::make_shared<DetectionResult>();
    }
    if (!append_detections_) {
      frame->det->items.clear();
    }
    frame->det->img_w = src_w;
    frame->det->img_h = src_h;
    if (dynamic_roi_.enable && dynamic_roi_.debug_roi_class_id >= 0) {
      AddDebugRoiDetections(frame, active_windows);
    }
    for (const auto& d : all_dets) {
      Detection item;
      item.bbox = {d.x, d.y, d.w, d.h};
      item.score = d.conf;
      item.cls_id = d.class_id;
      frame->det->items.push_back(item);
    }
  }
  // Builds foot-region ROIs from upstream person detections. Persons are
  // sorted by score and consumed round-robin across frames (state kept in
  // roi_round_robin_offset_) so that every person eventually gets a turn
  // when there are more persons than max_rois.
  std::vector<DetWindow> BuildDynamicWindows(FramePtr frame, int src_w, int src_h) {
    std::vector<DetWindow> rois;
    if (!frame->det) return rois;
    std::vector<Detection> persons;
    persons.reserve(frame->det->items.size());
    for (const auto& det : frame->det->items) {
      if (det.cls_id != dynamic_roi_.person_class_id) continue;
      if (det.bbox.h < dynamic_roi_.min_person_height) continue;  // too small
      persons.push_back(det);
    }
    if (persons.empty()) return rois;
    std::sort(persons.begin(), persons.end(),
              [](const Detection& a, const Detection& b) { return a.score > b.score; });
    const size_t max_rois = std::min(
        persons.size(), static_cast<size_t>(std::max(1, dynamic_roi_.max_rois)));
    rois.reserve(max_rois);
    const size_t start_index = persons.empty() ? 0 : (roi_round_robin_offset_ % persons.size());
    for (size_t i = 0; i < max_rois; ++i) {
      const Detection& person = persons[(start_index + i) % persons.size()];
      const Rect& bbox = person.bbox;
      const Rect roi = BuildPersonFootRegion(bbox,
                                             src_w,
                                             src_h,
                                             dynamic_roi_.y_offset,
                                             dynamic_roi_.width_scale,
                                             dynamic_roi_.height_scale);
      // Clip the ROI to the frame and drop degenerate rectangles.
      const int x0 = std::max(0, static_cast<int>(roi.x));
      const int y0 = std::max(0, static_cast<int>(roi.y));
      const int x1 = std::min(src_w, static_cast<int>(roi.x + roi.w));
      const int y1 = std::min(src_h, static_cast<int>(roi.y + roi.h));
      if (x1 <= x0 || y1 <= y0) continue;
      rois.push_back({x0, y0, x1 - x0, y1 - y0});
    }
    if (!persons.empty()) {
      roi_round_robin_offset_ = (start_index + max_rois) % persons.size();
    }
    return rois;
  }
  // Emits each ROI as a pseudo-detection carrying debug_roi_class_id so the
  // ROIs can be visualized downstream.
  void AddDebugRoiDetections(FramePtr frame, const std::vector<DetWindow>& rois) const {
    if (!frame || !frame->det) return;
    for (const auto& roi : rois) {
      Detection item;
      item.cls_id = dynamic_roi_.debug_roi_class_id;
      item.score = 1.0f;
      item.track_id = -1;
      item.bbox = {
          static_cast<float>(roi.x),
          static_cast<float>(roi.y),
          static_cast<float>(roi.w),
          static_cast<float>(roi.h)
      };
      frame->det->items.push_back(item);
    }
  }
  // Detects shoes inside one window: crops by pointer arithmetic, resizes
  // to the model input size (hardware path via the image processor when
  // available, CPU bilinear fallback otherwise), runs inference and parses
  // the output back into source-image coordinates.
  std::vector<DetBox> DetectWindow(FramePtr frame, int src_w, int src_h, const DetWindow& win) {
    std::vector<DetBox> dets;
    // Determine the crop region.
    int win_x, win_y, win_w, win_h;
    if (win.w == 0 || win.h == 0) {
      win_x = 0; win_y = 0; win_w = src_w; win_h = src_h;  // whole frame
    } else {
      win_x = std::max(0, std::min(win.x, src_w - 1));
      win_y = std::max(0, std::min(win.y, src_h - 1));
      win_w = std::min(win.w, src_w - win_x);
      win_h = std::min(win.h, src_h - win_y);
    }
    if (win_w <= 0 || win_h <= 0) return dets;
    // Fetch source pixel data (the * 3 arithmetic assumes packed RGB).
    const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data;
    int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride
                                                 : (frame->stride > 0 ? frame->stride : src_w * 3);
    if (!src || src_stride <= 0) return dets;
    const uint8_t* roi_src = src + static_cast<size_t>(win_y) * static_cast<size_t>(src_stride) +
                             static_cast<size_t>(win_x) * 3;
    const uint8_t* model_input_data = nullptr;
    size_t model_input_size = 0;
    int model_input_dma_fd = -1;
    Frame resized_frame;
    if (image_processor_ && win_w > 0 && win_h > 0) {
      // Describe the crop as a borrowed single-plane frame (no copy).
      Frame crop_frame;
      crop_frame.width = win_w;
      crop_frame.height = win_h;
      crop_frame.format = PixelFormat::RGB;
      crop_frame.stride = src_stride;
      crop_frame.data = const_cast<uint8_t*>(roi_src);
      crop_frame.data_size = frame->data_size;
      crop_frame.plane_count = 1;
      crop_frame.planes[0] = {crop_frame.data, crop_frame.stride, static_cast<int>(frame->data_size), 0};
      resized_frame.width = model_w_;
      resized_frame.height = model_h_;
      resized_frame.format = PixelFormat::RGB;
      Status st = image_processor_->Resize(crop_frame, resized_frame);
      if (st.Ok()) {
        const uint8_t* resized_data = resized_frame.planes[0].data ? resized_frame.planes[0].data : resized_frame.data;
        const int resized_stride = resized_frame.planes[0].stride > 0
                                       ? resized_frame.planes[0].stride
                                       : (resized_frame.stride > 0 ? resized_frame.stride : model_w_ * 3);
        const size_t packed_size = static_cast<size_t>(model_w_) * static_cast<size_t>(model_h_) * 3;
        if (use_dma_input_ && resized_frame.DmaFd() >= 0 && resized_data) {
          // Zero-copy: hand the DMA buffer straight to the inference backend.
          model_input_data = resized_data;
          model_input_size = resized_frame.data_size > 0 ? resized_frame.data_size : packed_size;
          model_input_dma_fd = resized_frame.DmaFd();
        } else if (resized_data && resized_stride > 0) {
          // Repack row by row to strip any stride padding.
          if (packed_size > input_buf_.size()) input_buf_.resize(packed_size);
          for (int row = 0; row < model_h_; ++row) {
            memcpy(input_buf_.data() + static_cast<size_t>(row) * static_cast<size_t>(model_w_) * 3,
                   resized_data + static_cast<size_t>(row) * static_cast<size_t>(resized_stride),
                   static_cast<size_t>(model_w_) * 3);
          }
          model_input_data = input_buf_.data();
          model_input_size = packed_size;
        }
      }
    }
    if (!model_input_data) {
      // CPU fallback: no image processor, or the hardware resize failed.
      ResizeRgbBilinear(roi_src, win_w, win_h, src_stride,
                        input_buf_.data(), model_w_, model_h_, model_w_ * 3);
      model_input_data = input_buf_.data();
      model_input_size = input_buf_.size();
    }
    // Inference.
    InferInput input;
    input.width = model_w_;
    input.height = model_h_;
    input.is_nhwc = true;
    input.data = model_input_data;
    input.size = model_input_size;
    input.dma_fd = model_input_dma_fd;
    input.type = RKNN_TENSOR_UINT8;
    auto r = infer_backend_->InferBorrowed(model_handle_, input);
    if (!r.success || r.outputs.empty() || !r.outputs[0].data) {
      LogWarn("[ai_shoe_det] inference failed");
      return dets;
    }
    // Parse output.
    dets = ParseOutput(r.outputs, win_x, win_y, win_w, win_h);
    return dets;
  }
  // Decodes the model's raw output tensor (fp32/fp16/int8 supported) into
  // DetBoxes mapped back to source-image coordinates via the window offset
  // and the window/model scale factors.
  std::vector<DetBox> ParseOutput(const std::vector<AiScheduler::BorrowedOutput>& outputs,
                                  int win_x, int win_y, int win_w, int win_h) {
    std::vector<DetBox> dets;
    if (outputs.empty() || !outputs[0].data) return dets;
    constexpr int kNumClasses = 1;  // single-class (shoe) model
    const V8LayoutInfo layout = ResolveV8Layout(
        outputs[0].dims, outputs[0].size, outputs[0].type, kNumClasses, model_h_, model_w_);
    const int num_boxes = layout.num_boxes;
    if (num_boxes <= 0) return dets;
    // Model space -> window space scale factors.
    float scale_x = static_cast<float>(win_w) / model_w_;
    float scale_y = static_cast<float>(win_h) / model_h_;
    const int num_channels = 4 + kNumClasses;
    // Decodes one raw box, filters implausible ones, maps to source coords.
    auto append_box = [&](float a, float b, float c, float d, float conf) {
      if (!std::isfinite(a) || !std::isfinite(b) || !std::isfinite(c) || !std::isfinite(d)) return;
      float x = 0.0f, y = 0.0f, w = 0.0f, h = 0.0f;
      DecodeV8Box(a, b, c, d, model_w_, model_h_, v8_box_format_, x, y, w, h);
      if (!std::isfinite(x) || !std::isfinite(y) || !std::isfinite(w) || !std::isfinite(h)) return;
      if (w <= 1e-3f || h <= 1e-3f) return;  // degenerate box
      if (w > model_w_ * 1.2f || h > model_h_ * 1.2f) return;  // implausibly large
      if (dynamic_roi_.enable && dynamic_roi_.max_box_area_ratio > 0.0f && win_w > 0 && win_h > 0) {
        // Area filter, evaluated in model space.
        const float mapped_area_ratio =
            (w * h) / (static_cast<float>(model_w_) * static_cast<float>(model_h_));
        if (mapped_area_ratio > dynamic_roi_.max_box_area_ratio) return;
      }
      DetBox box;
      box.x = win_x + x * scale_x;
      box.y = win_y + y * scale_y;
      box.w = w * scale_x;
      box.h = h * scale_y;
      box.conf = conf;
      box.class_id = dynamic_roi_.shoe_class_id;
      dets.push_back(box);
    };
    if (outputs[0].type == RKNN_TENSOR_FLOAT32) {
      const float* data = reinterpret_cast<const float*>(outputs[0].data);
      const bool apply_sigmoid = ResolveV8ApplySigmoid(
          data, num_boxes, kNumClasses, layout.channels_first, v8_cls_activation_);
      for (int i = 0; i < num_boxes; ++i) {
        const float score = layout.channels_first ? data[4 * num_boxes + i]
                                                  : data[i * num_channels + 4];
        const float conf = apply_sigmoid ? Sigmoid(score) : score;
        if (conf < conf_thresh_) continue;
        const float a = layout.channels_first ? data[0 * num_boxes + i] : data[i * num_channels + 0];
        const float b = layout.channels_first ? data[1 * num_boxes + i] : data[i * num_channels + 1];
        const float c = layout.channels_first ? data[2 * num_boxes + i] : data[i * num_channels + 2];
        const float d = layout.channels_first ? data[3 * num_boxes + i] : data[i * num_channels + 3];
        append_box(a, b, c, d, conf);
      }
    } else if (outputs[0].type == RKNN_TENSOR_FLOAT16) {
      // fp16: bulk-convert the whole tensor to fp32 first.
      const uint16_t* data = reinterpret_cast<const uint16_t*>(outputs[0].data);
      fp32_decode_buf_.resize(outputs[0].size / sizeof(uint16_t));
      for (size_t i = 0; i < fp32_decode_buf_.size(); ++i) {
        fp32_decode_buf_[i] = Fp16ToFp32(data[i]);
      }
      const bool apply_sigmoid = ResolveV8ApplySigmoid(
          fp32_decode_buf_.data(), num_boxes, kNumClasses, layout.channels_first, v8_cls_activation_);
      for (int i = 0; i < num_boxes; ++i) {
        const float score = layout.channels_first ? fp32_decode_buf_[4 * num_boxes + i]
                                                  : fp32_decode_buf_[i * num_channels + 4];
        const float conf = apply_sigmoid ? Sigmoid(score) : score;
        if (conf < conf_thresh_) continue;
        const float a = layout.channels_first ? fp32_decode_buf_[0 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 0];
        const float b = layout.channels_first ? fp32_decode_buf_[1 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 1];
        const float c = layout.channels_first ? fp32_decode_buf_[2 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 2];
        const float d = layout.channels_first ? fp32_decode_buf_[3 * num_boxes + i] : fp32_decode_buf_[i * num_channels + 3];
        append_box(a, b, c, d, conf);
      }
    } else if (outputs[0].type == RKNN_TENSOR_INT8) {
      const int8_t* data = reinterpret_cast<const int8_t*>(outputs[0].data);
      // Threshold in quantized space, so most boxes skip dequantization.
      const int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh_, outputs[0].zp, outputs[0].scale);
      for (int i = 0; i < num_boxes; ++i) {
        const int8_t score_i8 = layout.channels_first ? data[4 * num_boxes + i]
                                                      : data[i * num_channels + 4];
        if (score_i8 < thresh_i8) continue;
        const float conf = DequantizeAffineToF32(score_i8, outputs[0].zp, outputs[0].scale);
        const float a = DequantizeAffineToF32(layout.channels_first ? data[0 * num_boxes + i] : data[i * num_channels + 0], outputs[0].zp, outputs[0].scale);
        const float b = DequantizeAffineToF32(layout.channels_first ? data[1 * num_boxes + i] : data[i * num_channels + 1], outputs[0].zp, outputs[0].scale);
        const float c = DequantizeAffineToF32(layout.channels_first ? data[2 * num_boxes + i] : data[i * num_channels + 2], outputs[0].zp, outputs[0].scale);
        const float d = DequantizeAffineToF32(layout.channels_first ? data[3 * num_boxes + i] : data[i * num_channels + 3], outputs[0].zp, outputs[0].scale);
        append_box(a, b, c, d, conf);
      }
    } else {
      LogWarn("[ai_shoe_det] unsupported output tensor type");
    }
    return dets;
  }
  // Greedy IoU-based non-maximum suppression. Sorts `dets` in place by
  // descending confidence, then keeps each box that is not suppressed by a
  // higher-confidence overlapping one.
  std::vector<DetBox> ApplyNMS(std::vector<DetBox>& dets, float thresh) {
    if (dets.empty()) return dets;
    std::sort(dets.begin(), dets.end(),
              [](const DetBox& a, const DetBox& b) { return a.conf > b.conf; });
    std::vector<DetBox> keep;
    std::vector<bool> suppressed(dets.size(), false);
    for (size_t i = 0; i < dets.size(); ++i) {
      if (suppressed[i]) continue;
      keep.push_back(dets[i]);
      for (size_t j = i + 1; j < dets.size(); ++j) {
        if (suppressed[j]) continue;
        float iou = ComputeIoU(dets[i], dets[j]);
        if (iou > thresh) {
          suppressed[j] = true;
        }
      }
    }
    return keep;
  }
  // Intersection-over-union of two boxes; 0 when the union is empty.
  float ComputeIoU(const DetBox& a, const DetBox& b) {
    float x1 = std::max(a.x, b.x);
    float y1 = std::max(a.y, b.y);
    float x2 = std::min(a.x + a.w, b.x + b.w);
    float y2 = std::min(a.y + a.h, b.y + b.h);
    float inter = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
    float area_a = a.w * a.h;
    float area_b = b.w * b.h;
    float uni = area_a + area_b - inter;
    return uni > 0 ? inter / uni : 0;
  }
  // CPU bilinear resize for packed RGB data — fallback path when no
  // hardware image processor is available.
  void ResizeRgbBilinear(const uint8_t* src, int src_w, int src_h, int src_stride,
                         uint8_t* dst, int dst_w, int dst_h, int dst_stride) {
    float scale_x = static_cast<float>(src_w) / dst_w;
    float scale_y = static_cast<float>(src_h) / dst_h;
    for (int y = 0; y < dst_h; ++y) {
      float fy = y * scale_y;
      int y0 = static_cast<int>(fy);
      int y1 = std::min(y0 + 1, src_h - 1);
      float dy = fy - y0;
      for (int x = 0; x < dst_w; ++x) {
        float fx = x * scale_x;
        int x0 = static_cast<int>(fx);
        int x1 = std::min(x0 + 1, src_w - 1);
        float dx = fx - x0;
        for (int c = 0; c < 3; ++c) {
          // Weighted blend of the 2x2 source neighbourhood.
          float v00 = src[y0 * src_stride + x0 * 3 + c];
          float v01 = src[y0 * src_stride + x1 * 3 + c];
          float v10 = src[y1 * src_stride + x0 * 3 + c];
          float v11 = src[y1 * src_stride + x1 * 3 + c];
          float v = v00 * (1-dx) * (1-dy) +
                    v01 * dx * (1-dy) +
                    v10 * (1-dx) * dy +
                    v11 * dx * dy;
          dst[y * dst_stride + x * 3 + c] = static_cast<uint8_t>(v);
        }
      }
    }
  }
#endif
  std::string id_;
  std::string model_path_;
  int model_w_ = 640;               // model input width
  int model_h_ = 640;               // model input height
  float conf_thresh_ = 0.25f;       // confidence threshold
  float nms_thresh_ = 0.45f;        // NMS IoU threshold
  bool append_detections_ = false;  // append to vs. replace frame->det->items
  V8BoxFormat v8_box_format_ = V8BoxFormat::Auto;
  V8ClsActivation v8_cls_activation_ = V8ClsActivation::Auto;
  int64_t infer_interval_ms_ = 0;   // min interval between inferences (0 = every frame)
  int64_t infer_phase_ms_ = 0;      // phase offset applied to frame PTS
  int64_t last_infer_pts_ms_ = 0;   // effective PTS of the last inference
  bool use_dma_input_ = true;       // feed DMA buffers directly when possible
  DynamicRoiConfig dynamic_roi_;
  std::vector<DetWindow> windows_;  // static windows (unused in dynamic ROI mode)
  std::vector<uint8_t> input_buf_;  // packed RGB staging buffer for the model
  std::vector<float> fp32_decode_buf_;  // scratch for fp16 -> fp32 conversion
  size_t roi_round_robin_offset_ = 0;   // rotation state for dynamic ROI fairness
  std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
  std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;
  std::shared_ptr<IImageProcessor> image_processor_;
#if defined(RK3588_ENABLE_RKNN)
  std::shared_ptr<IInferBackend> infer_backend_;
  ModelHandle model_handle_ = kInvalidModelHandle;
#endif
};
REGISTER_NODE(AiShoeDetNode, "ai_shoe_det");
} // namespace rk3588