// OrangePi3588Media/plugins/ai_scrfd/ai_scrfd_node.cpp
/**
* ai_scrfd - SCRFD 640x640 face detection node for RK3588
*
* Reference: https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/ort/cv/scrfd.cpp
* BBox format: [left, top, right, bottom] offsets from grid center
*/
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <vector>
#include "face/face_result.h"
#include "hw/i_infer_backend.h"
#include "node.h"
#include "utils/dma_alloc.h"
#include "utils/logger.h"
namespace rk3588 {
// Tunable detection parameters, populated from the node's JSON config in Init().
struct ScrfdConfig {
// Minimum classification score for a candidate box to be kept (see ParseOutputs).
float conf_thresh = 0.5f;
// IoU threshold for greedy NMS suppression of overlapping boxes (see ApplyNMS).
float nms_thresh = 0.4f;
// Hard cap on the number of faces reported per frame (applied after NMS).
int max_faces = 50;
// When true, also decode the 5-point landmark head into each detection.
bool output_landmarks = true;
// Channel order the model expects: "rgb" or "bgr"; frames in the other
// order get their R/B channels swapped in PrepareInput.
std::string input_format = "rgb";
};
// Precomputed anchor center: grid-cell coordinates plus the stride that maps
// them into 640x640 model pixel space during output decoding.
struct CenterPoint {
float cx, cy; // grid coordinates (0,0), (1,0), ...
float stride;
};
/**
 * SCRFD 640x640 face detection node.
 *
 * Per frame: bilinear-resize to 640x640, optionally swap R/B channels,
 * run inference through the infer backend, decode the 9 output tensors
 * ([left, top, right, bottom] offsets from precomputed grid centers —
 * NOT [dx, dy, dw, dh]), apply greedy NMS, and attach a FaceDetResult
 * to the frame before forwarding it downstream.
 */
class AiScrfdNode : public INode {
public:
    std::string Id() const override { return id_; }
    std::string Type() const override { return "ai_scrfd"; }

    /**
     * Reads configuration, precomputes the anchor center points and, when
     * RKNN support is compiled in, loads the model via the infer backend.
     *
     * @return false when model_path is missing, the input queue or infer
     *         backend is absent, or the model fails to load.
     */
    bool Init(const SimpleJson& config, const NodeContext& ctx) override {
        id_ = config.ValueOr<std::string>("id", "scrfd");
        model_path_ = config.ValueOr<std::string>("model_path", "");
        if (model_path_.empty()) {
            LogError("[ai_scrfd] model_path is required");
            return false;
        }
        cfg_.conf_thresh = config.ValueOr<float>("conf_thresh", 0.5f);
        cfg_.nms_thresh = config.ValueOr<float>("nms_thresh", 0.4f);
        cfg_.max_faces = config.ValueOr<int>("max_faces", 50);
        cfg_.output_landmarks = config.ValueOr<bool>("output_landmarks", true);
        cfg_.input_format = config.ValueOr<std::string>("input_format", "rgb");
        model_w_ = 640;
        model_h_ = 640;
        // Center points are consumed by output decoding; generated once here.
        GenerateCenterPoints();
        input_queue_ = ctx.input_queue;
        output_queues_ = ctx.output_queues;
        if (!input_queue_) {
            LogError("[ai_scrfd] no input queue");
            return false;
        }
        infer_backend_ = ctx.infer_backend;
        if (!infer_backend_) {
            LogError("[ai_scrfd] no infer backend");
            return false;
        }
#if defined(RK3588_ENABLE_RKNN)
        std::string err;
        model_handle_ = infer_backend_->LoadModel(model_path_, err);
        if (model_handle_ == kInvalidModelHandle) {
            LogError("[ai_scrfd] failed to load model: " + err);
            return false;
        }
        // Packed HWC uint8 staging buffer for the resized model input.
        input_buf_.resize(static_cast<size_t>(model_w_) * model_h_ * 3);
        LogInfo("[ai_scrfd] model loaded: " + model_path_);
#else
        LogWarn("[ai_scrfd] RKNN disabled");
#endif
        return true;
    }

    bool Start() override {
        LogInfo("[ai_scrfd] start");
        return true;
    }

    // Releases the loaded model (RKNN builds only).
    void Stop() override {
#if defined(RK3588_ENABLE_RKNN)
        if (model_handle_ != kInvalidModelHandle) {
            infer_backend_->UnloadModel(model_handle_);
            model_handle_ = kInvalidModelHandle;
        }
#endif
        LogInfo("[ai_scrfd] stop");
    }

    /**
     * Runs detection (when RKNN is enabled) and always forwards the frame.
     * @return DROP for null frames, OK otherwise — inference failures are
     *         logged but never block the pipeline.
     */
    NodeStatus Process(FramePtr frame) override {
        if (!frame) return NodeStatus::DROP;
#if defined(RK3588_ENABLE_RKNN)
        RunDetection(frame);
#endif
        Push(frame);
        return NodeStatus::OK;
    }

private:
    // Fans the frame out to every downstream queue.
    void Push(FramePtr frame) {
        for (auto& q : output_queues_) q->Push(frame);
    }

    /**
     * Precomputes one CenterPoint per output anchor, in the exact order the
     * model emits scores: stride 8, then 16, then 32; row-major grid scan;
     * 2 anchors per grid cell. For 640x640 this yields 12800 + 3200 + 800
     * anchors, matching the counts decoded in ParseOutputs.
     *
     * BUGFIX: this method is deliberately OUTSIDE the RK3588_ENABLE_RKNN
     * guard — Init() calls it unconditionally, so defining it only in RKNN
     * builds broke compilation when RKNN was disabled.
     */
    void GenerateCenterPoints() {
        const int strides[] = {8, 16, 32};
        for (int stride : strides) {
            const int num_grid = model_w_ / stride;
            for (int y = 0; y < num_grid; ++y) {
                for (int x = 0; x < num_grid; ++x) {
                    for (int a = 0; a < 2; ++a) {  // 2 anchors per location
                        CenterPoint pt;
                        pt.cx = static_cast<float>(x);  // grid x
                        pt.cy = static_cast<float>(y);  // grid y
                        pt.stride = static_cast<float>(stride);
                        center_points_.push_back(pt);
                    }
                }
            }
        }
        LogInfo("[ai_scrfd] Generated " + std::to_string(center_points_.size()) +
                " center points");
    }

#if defined(RK3588_ENABLE_RKNN)
    /**
     * Full detection pass for one frame. On success attaches
     * frame->face_det; on inference failure logs a warning and returns
     * without touching the frame.
     */
    void RunDetection(FramePtr frame) {
        if (!frame->data || frame->data_size == 0) return;
        const int src_w = frame->width;
        const int src_h = frame->height;
        // NOTE(review): SyncStart() has no matching SyncEnd() on any path —
        // confirm whether the DMA buffer must be released after CPU access.
        if (frame->DmaFd() >= 0) frame->SyncStart();
        PrepareInput(frame, model_w_, model_h_);
        InferInput input;
        input.width = model_w_;
        input.height = model_h_;
        input.is_nhwc = true;  // packed HWC uint8, matching input_buf_ layout
        input.data = input_buf_.data();
        input.size = input_buf_.size();
        input.type = RKNN_TENSOR_UINT8;
        auto r = infer_backend_->InferBorrowed(model_handle_, input);
        if (!r.success) {
            LogWarn("[ai_scrfd] inference failed: " + r.error);
            return;
        }
        std::vector<FaceDetItem> detections = ParseOutputs(r.outputs, src_w, src_h);
        detections = ApplyNMS(detections, cfg_.nms_thresh);
        // ApplyNMS returns score-sorted results, so resize keeps the best.
        if (detections.size() > static_cast<size_t>(cfg_.max_faces)) {
            detections.resize(cfg_.max_faces);
        }
        FaceDetResult result;
        result.img_w = src_w;
        result.img_h = src_h;
        result.model_name = "scrfd_640";
        result.faces = std::move(detections);
        frame->face_det = std::make_shared<FaceDetResult>(std::move(result));
    }

    /**
     * Resizes the frame into input_buf_ (dst_w x dst_h, packed 3-channel)
     * and swaps R/B in place when the frame's channel order differs from
     * cfg_.input_format.
     */
    void PrepareInput(FramePtr frame, int dst_w, int dst_h) {
        const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data;
        const int src_stride = frame->planes[0].stride > 0
                                   ? frame->planes[0].stride
                                   : (frame->stride > 0 ? frame->stride : frame->width * 3);
        // Simple bilinear resize into the staging buffer.
        ResizeBilinear(src, frame->width, frame->height, src_stride,
                       input_buf_.data(), dst_w, dst_h, dst_w * 3);
        // RGB<->BGR conversion if the frame order differs from the model's.
        const bool need_swap =
            (frame->format == PixelFormat::BGR && cfg_.input_format == "rgb") ||
            (frame->format == PixelFormat::RGB && cfg_.input_format == "bgr");
        if (need_swap) {
            for (int i = 0; i < dst_w * dst_h * 3; i += 3) {
                std::swap(input_buf_[i], input_buf_[i + 2]);
            }
        }
    }

    /**
     * CPU bilinear resize for packed 3-channel 8-bit images. Sample centers
     * are aligned via the +/-0.5 convention; edge taps are clamped (x0 may
     * clamp onto x1, in which case the interpolation degenerates safely).
     */
    void ResizeBilinear(const uint8_t* src, int src_w, int src_h, int src_stride,
                        uint8_t* dst, int dst_w, int dst_h, int dst_stride) {
        const float x_ratio = static_cast<float>(src_w) / dst_w;
        const float y_ratio = static_cast<float>(src_h) / dst_h;
        for (int y = 0; y < dst_h; ++y) {
            for (int x = 0; x < dst_w; ++x) {
                const float sx = (x + 0.5f) * x_ratio - 0.5f;
                const float sy = (y + 0.5f) * y_ratio - 0.5f;
                int x0 = static_cast<int>(std::floor(sx));
                int y0 = static_cast<int>(std::floor(sy));
                const int x1 = std::min(x0 + 1, src_w - 1);
                const int y1 = std::min(y0 + 1, src_h - 1);
                x0 = std::max(0, x0);
                y0 = std::max(0, y0);
                const float fx = sx - x0;
                const float fy = sy - y0;
                for (int c = 0; c < 3; ++c) {
                    const float v00 = src[y0 * src_stride + x0 * 3 + c];
                    const float v01 = src[y0 * src_stride + x1 * 3 + c];
                    const float v10 = src[y1 * src_stride + x0 * 3 + c];
                    const float v11 = src[y1 * src_stride + x1 * 3 + c];
                    const float v = v00 * (1 - fx) * (1 - fy) + v01 * fx * (1 - fy) +
                                    v10 * (1 - fx) * fy + v11 * fx * fy;
                    dst[y * dst_stride + x * 3 + c] = static_cast<uint8_t>(v);
                }
            }
        }
    }

    /**
     * Parses the 9 SCRFD output tensors into image-space detections.
     * Reference decode: lite.ai.toolkit scrfd.cpp.
     *
     * BBox format per anchor is [left, top, right, bottom] — distances from
     * the grid center in grid units (multiplied by stride for pixels) —
     * NOT [dx, dy, dw, dh].
     *
     * @param outputs  expected order: score_8, score_16, score_32,
     *                 bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32.
     * @param src_w/src_h  original frame size; results are scaled to it.
     */
    std::vector<FaceDetItem> ParseOutputs(
            const std::vector<AiScheduler::BorrowedOutput>& outputs,
            int src_w, int src_h) {
        std::vector<FaceDetItem> detections;
        if (outputs.size() != 9) return detections;
        const int anchor_counts[] = {12800, 3200, 800};  // (640/stride)^2 * 2
        const int strides[] = {8, 16, 32};
        // Model space (640x640) -> source image scale, invariant per frame.
        const float scale_x = static_cast<float>(src_w) / model_w_;
        const float scale_y = static_cast<float>(src_h) / model_h_;
        size_t anchor_idx = 0;
        for (int s = 0; s < 3; ++s) {
            const int stride = strides[s];
            const int count = anchor_counts[s];
            const auto& score_out = outputs[s];
            const auto& bbox_out = outputs[s + 3];
            const auto& kps_out = outputs[s + 6];
            const float* scores = reinterpret_cast<const float*>(score_out.data);
            const float* bboxes = reinterpret_cast<const float*>(bbox_out.data);
            const float* kps = reinterpret_cast<const float*>(kps_out.data);
            if (score_out.dims.size() < 3 || !scores || !bboxes || !kps) {
                // BUGFIX: a skipped stride must still consume its anchors,
                // otherwise later strides decode against the wrong centers.
                anchor_idx += static_cast<size_t>(count);
                continue;
            }
            for (int i = 0; i < count; ++i, ++anchor_idx) {
                if (anchor_idx >= center_points_.size()) break;
                const float score = scores[i];
                if (score < cfg_.conf_thresh) continue;
                const CenterPoint& pt = center_points_[anchor_idx];
                // Distances from the grid center: [left, top, right, bottom].
                const float left = bboxes[i * 4 + 0];
                const float top = bboxes[i * 4 + 1];
                const float right = bboxes[i * 4 + 2];
                const float bottom = bboxes[i * 4 + 3];
                // Decode to 640x640 model-space coordinates.
                const float x1_640 = (pt.cx - left) * stride;
                const float y1_640 = (pt.cy - top) * stride;
                const float x2_640 = (pt.cx + right) * stride;
                const float y2_640 = (pt.cy + bottom) * stride;
                FaceDetItem det;
                det.bbox.x = x1_640 * scale_x;
                det.bbox.y = y1_640 * scale_y;
                det.bbox.w = (x2_640 - x1_640) * scale_x;
                det.bbox.h = (y2_640 - y1_640) * scale_y;
                det.score = score;
                det.has_landmarks = cfg_.output_landmarks;
                if (cfg_.output_landmarks) {
                    // 5 landmarks, each an (x, y) offset from the grid center.
                    for (int p = 0; p < 5; ++p) {
                        const float kps_x = kps[i * 10 + p * 2 + 0];
                        const float kps_y = kps[i * 10 + p * 2 + 1];
                        det.landmarks[p].x = (pt.cx + kps_x) * stride * scale_x;
                        det.landmarks[p].y = (pt.cy + kps_y) * stride * scale_y;
                    }
                }
                detections.push_back(det);
            }
        }
        return detections;
    }

    // Intersection-over-union of two axis-aligned rects; 0 when the union
    // area is non-positive.
    float IoU(const Rect& a, const Rect& b) {
        const float x1 = std::max(a.x, b.x);
        const float y1 = std::max(a.y, b.y);
        const float x2 = std::min(a.x + a.w, b.x + b.w);
        const float y2 = std::min(a.y + a.h, b.y + b.h);
        const float inter = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1);
        const float union_area = a.w * a.h + b.w * b.h - inter;
        return union_area > 0 ? inter / union_area : 0;
    }

    /**
     * Greedy NMS: sorts by descending score, then suppresses any box whose
     * IoU with an already-kept box exceeds thresh. Sorts `dets` in place;
     * the returned vector remains score-sorted.
     */
    std::vector<FaceDetItem> ApplyNMS(std::vector<FaceDetItem>& dets, float thresh) {
        if (dets.empty()) return dets;
        std::sort(dets.begin(), dets.end(),
                  [](const FaceDetItem& a, const FaceDetItem& b) {
                      return a.score > b.score;
                  });
        std::vector<FaceDetItem> keep;
        std::vector<bool> suppressed(dets.size(), false);
        for (size_t i = 0; i < dets.size(); ++i) {
            if (suppressed[i]) continue;
            keep.push_back(dets[i]);
            for (size_t j = i + 1; j < dets.size(); ++j) {
                if (suppressed[j]) continue;
                if (IoU(dets[i].bbox, dets[j].bbox) > thresh) {
                    suppressed[j] = true;
                }
            }
        }
        return keep;
    }
#endif

    std::string id_;
    std::string model_path_;
    ScrfdConfig cfg_;
    int model_w_ = 640;
    int model_h_ = 640;
    std::vector<CenterPoint> center_points_;
    std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
    std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;
    std::shared_ptr<IInferBackend> infer_backend_;
#if defined(RK3588_ENABLE_RKNN)
    ModelHandle model_handle_ = kInvalidModelHandle;
    std::vector<uint8_t> input_buf_;
#endif
};
// Registers AiScrfdNode under the type id "ai_scrfd" (matches Type()).
// NOTE(review): macro presumably declared in node.h — registration side
// effects (factory vs static table) are not visible from this file.
REGISTER_NODE(AiScrfdNode, "ai_scrfd");
} // namespace rk3588