// OrangePi3588Media/include/face/face_detection_utils.h
#pragma once
/**
 * Shared face-detection utility functions.
 * Reused by the ai_face_det and ai_face_det_zoned nodes.
 */
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <numeric>
#include <vector>
#include "face/face_result.h"
// Fallback RKNN type declaration to avoid a hard dependency on rknn_api.h.
#if defined(RK3588_ENABLE_RKNN)
#include "rknn_api.h"
#else
typedef enum _rknn_tensor_type {
RKNN_TENSOR_UINT8 = 0,
RKNN_TENSOR_INT8,
RKNN_TENSOR_FLOAT16,
RKNN_TENSOR_FLOAT32,
} rknn_tensor_type;
#endif
namespace rk3588 {
namespace face_detection {
// ============================================================================
// 基础工具函数
// ============================================================================
/// Clamp v into [lo, hi]: values below lo map to lo, above hi map to hi.
inline int ClampInt(int v, int lo, int hi) {
  if (v < lo) return lo;
  if (v > hi) return hi;
  return v;
}
/// Logistic sigmoid: maps any real x into the open interval (0, 1).
inline float Sigmoid(float x) {
  const float e = std::exp(-x);
  return 1.0f / (1.0f + e);
}
/// Two-class softmax: probability of logit b against logit a.
/// The max logit is subtracted first so large inputs do not overflow exp().
inline float Softmax2(float a, float b) {
  const float peak = std::max(a, b);
  const float ea = std::exp(a - peak);
  const float eb = std::exp(b - peak);
  const float denom = ea + eb;
  return eb / denom;
}
// ============================================================================
// 几何计算
// ============================================================================
/// Intersection-over-union of two axis-aligned boxes stored as (x, y, w, h).
/// Returns 0 when the union area is degenerate (<= 0).
inline float IoU(const Rect& a, const Rect& b) {
  const float inter_x1 = std::max(a.x, b.x);
  const float inter_y1 = std::max(a.y, b.y);
  const float inter_x2 = std::min(a.x + a.w, b.x + b.w);
  const float inter_y2 = std::min(a.y + a.h, b.y + b.h);
  const float inter_w = std::max(0.0f, inter_x2 - inter_x1);
  const float inter_h = std::max(0.0f, inter_y2 - inter_y1);
  const float inter = inter_w * inter_h;
  const float uni = a.w * a.h + b.w * b.h - inter;
  if (uni <= 0.0f) return 0.0f;
  return inter / uni;
}
/// Greedy non-maximum suppression.
/// Candidates are visited in descending score order; a box survives unless it
/// overlaps an already-kept box with IoU above nms_thresh. Indices of the
/// surviving boxes are written to `keep`, best score first.
inline void NmsSorted(const std::vector<Rect>& boxes,
                      const std::vector<float>& scores,
                      float nms_thresh,
                      std::vector<int>& keep) {
  keep.clear();
  std::vector<int> order(scores.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(),
            [&scores](int lhs, int rhs) { return scores[lhs] > scores[rhs]; });
  for (int candidate : order) {
    const bool overlaps_kept =
        std::any_of(keep.begin(), keep.end(), [&](int kept) {
          return IoU(boxes[candidate], boxes[kept]) > nms_thresh;
        });
    if (!overlaps_kept) keep.push_back(candidate);
  }
}
// ============================================================================
// RetinaFace 数据结构
// ============================================================================
// One RetinaFace prior (anchor) box in normalized [0,1] model-input
// coordinates: (cx, cy) is the box center, (w, h) its width and height.
struct Prior {
float cx = 0.0f;
float cy = 0.0f;
float w = 0.0f;
float h = 0.0f;
};
// Tunable parameters for RetinaFace decoding.
struct DetectionConfig {
// Minimum face confidence required to keep a detection.
float conf_thresh = 0.6f;
// IoU threshold used by NMS.
float nms_thresh = 0.4f;
// Upper bound on the number of faces returned.
int max_faces = 10;
// Whether to decode the 5-point landmarks.
bool output_landmarks = true;
// RetinaFace default parameters: feature-map strides, per-level anchor
// sizes, and the two box-decoding variances.
std::vector<int> steps{8, 16, 32};
std::vector<std::vector<int>> min_sizes{{16, 32}, {64, 128}, {256, 512}};
float variance0 = 0.1f;
float variance1 = 0.2f;
};
// Lightweight, non-owning tensor view decoupled from the RKNN API
// (`data` points to memory owned elsewhere and must outlive the view).
struct TensorView {
// Raw tensor bytes.
const uint8_t* data = nullptr;
// Buffer size in bytes.
size_t size = 0;
// Quantization zero point (used for INT8/UINT8 tensors).
int32_t zp = 0;
// Quantization scale (used for INT8/UINT8 tensors).
float scale = 1.0f;
// Tensor dimensions as reported by the runtime.
std::vector<uint32_t> dims;
// Element type; defaults to UINT8.
rknn_tensor_type type = RKNN_TENSOR_UINT8;
};
// A dense [N, C] float tensor stored row-major: element (ni, ci) lives at
// data[ni * c + ci].
struct NcTensor {
int n = 0;
int c = 0;
std::vector<float> data; // N*C row-major
};
// ============================================================================
// 图像预处理
// ============================================================================
/// Bilinear resize of a packed RGB (3 bytes per pixel) image using the
/// half-pixel-center convention (source and destination pixel centers are
/// aligned). When swap_rb is true, channels 0 and 2 are exchanged on output.
/// src_stride is in bytes; dst is tightly packed at dst_w * 3 bytes per row.
inline void ResizeRgbBilinear(const uint8_t* src, int src_w, int src_h, int src_stride,
                              uint8_t* dst, int dst_w, int dst_h, bool swap_rb) {
  const float ratio_x = static_cast<float>(src_w) / static_cast<float>(dst_w);
  const float ratio_y = static_cast<float>(src_h) / static_cast<float>(dst_h);
  for (int dy = 0; dy < dst_h; ++dy) {
    // Source row coordinate under the half-pixel-center convention.
    const float sy = (static_cast<float>(dy) + 0.5f) * ratio_y - 0.5f;
    int top = static_cast<int>(std::floor(sy));
    int bottom = top + 1;
    const float w_bottom = sy - static_cast<float>(top);
    const float w_top = 1.0f - w_bottom;
    top = std::min(std::max(top, 0), src_h - 1);
    bottom = std::min(std::max(bottom, 0), src_h - 1);
    const uint8_t* row_t = src + static_cast<std::size_t>(top) * static_cast<std::size_t>(src_stride);
    const uint8_t* row_b = src + static_cast<std::size_t>(bottom) * static_cast<std::size_t>(src_stride);
    uint8_t* out_px = dst + static_cast<std::size_t>(dy) * static_cast<std::size_t>(dst_w) * 3;
    for (int dx = 0; dx < dst_w; ++dx) {
      const float sx = (static_cast<float>(dx) + 0.5f) * ratio_x - 0.5f;
      int left = static_cast<int>(std::floor(sx));
      int right = left + 1;
      const float w_right = sx - static_cast<float>(left);
      const float w_left = 1.0f - w_right;
      left = std::min(std::max(left, 0), src_w - 1);
      right = std::min(std::max(right, 0), src_w - 1);
      const uint8_t* tl = row_t + left * 3;
      const uint8_t* tr = row_t + right * 3;
      const uint8_t* bl = row_b + left * 3;
      const uint8_t* br = row_b + right * 3;
      for (int ch = 0; ch < 3; ++ch) {
        // Blend horizontally within each row, then vertically between rows.
        const float mixed =
            (static_cast<float>(tl[ch]) * w_left + static_cast<float>(tr[ch]) * w_right) * w_top +
            (static_cast<float>(bl[ch]) * w_left + static_cast<float>(br[ch]) * w_right) * w_bottom;
        const int rounded = static_cast<int>(mixed + 0.5f);
        out_px[ch] = static_cast<uint8_t>(std::min(std::max(rounded, 0), 255));
      }
      if (swap_rb) std::swap(out_px[0], out_px[2]);
      out_px += 3;
    }
  }
}
// ============================================================================
// 张量解析从RKNN输出提取
// ============================================================================
/// Convert an IEEE-754 binary16 value (raw bits) to a float.
/// Handles signed zero, subnormals, infinities and NaN (payload preserved).
inline float HalfToFloat(uint16_t h) {
  const uint32_t sign_bit = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exponent = (h & 0x7C00u) >> 10;
  uint32_t mantissa = h & 0x03FFu;
  uint32_t bits;
  if (exponent == 31) {
    // Inf / NaN: force the float exponent to all-ones, keep the payload.
    bits = sign_bit | 0x7F800000u | (mantissa << 13);
  } else if (exponent != 0) {
    // Normal number: rebias the exponent from 15 to 127 (add 112).
    bits = sign_bit | ((exponent + 112u) << 23) | (mantissa << 13);
  } else if (mantissa == 0) {
    // Signed zero.
    bits = sign_bit;
  } else {
    // Subnormal: renormalize the mantissa. The exponent decrements may wrap
    // the unsigned value, but the rebias addition below wraps it back
    // (arithmetic is mod 2^32), yielding the correct small exponent.
    exponent = 1;
    while ((mantissa & 0x0400u) == 0) {
      mantissa <<= 1;
      --exponent;
    }
    mantissa &= 0x03FFu;
    bits = sign_bit | ((exponent + 112u) << 23) | (mantissa << 13);
  }
  float result;
  memcpy(&result, &bits, sizeof(result));
  return result;
}
/// Dequantize a quantized value q with zero point zp and scale:
/// result = (q - zp) * scale.
template <typename T>
inline float Dequant(T q, int32_t zp, float scale) {
  const float centered = static_cast<float>(q) - static_cast<float>(zp);
  return centered * scale;
}
// Extract a dense [N, C] float tensor from a TensorView.
// Dequantizes INT8/UINT8 using (zp, scale) and converts FLOAT16 to float.
// The channel count `c` is supplied by the caller; the layout is inferred
// from t.dims: a trailing dimension equal to c means the data is already
// row-major [N, C], while a leading (post-batch) dimension equal to c means
// the data is stored channel-major [C, N] and is transposed during the copy.
// When dims are unusable, N is derived from the element count and row-major
// layout is assumed. Returns false on empty data or when the element count
// does not factor into [N, C].
inline bool ExtractNcTensor(const TensorView& t, int c, NcTensor& out) {
out = {};
out.c = c;
if (!t.data || t.size == 0) return false;
// Per-element byte size; quantized INT8/UINT8 tensors use 1 byte.
size_t elem_size = 1;
bool is_float32 = false;
bool is_float16 = false;
if (t.type == RKNN_TENSOR_FLOAT16) {
elem_size = 2;
is_float16 = true;
} else if (t.type == RKNN_TENSOR_FLOAT32) {
elem_size = 4;
is_float32 = true;
}
const size_t elem_cnt = elem_size > 0 ? (t.size / elem_size) : 0;
if (elem_cnt == 0) return false;
int n = 0;
// transposed == true means the source buffer is channel-major [C, N].
bool transposed = false;
if (t.dims.size() == 3) {
// 3-D case: dims[0] is taken as the batch; match c against the remaining
// two dimensions to decide the layout.
const uint32_t d1 = t.dims[1];
const uint32_t d2 = t.dims[2];
if (static_cast<int>(d2) == c) {
n = static_cast<int>(d1);
transposed = false;
} else if (static_cast<int>(d1) == c) {
n = static_cast<int>(d2);
transposed = true;
} else {
return false;
}
} else if (t.dims.size() == 2) {
const uint32_t d0 = t.dims[0];
const uint32_t d1 = t.dims[1];
if (static_cast<int>(d1) == c) {
n = static_cast<int>(d0);
transposed = false;
} else if (static_cast<int>(d0) == c) {
n = static_cast<int>(d1);
transposed = true;
}
}
if (n <= 0) {
// No usable dims: derive N from the element count, assume row-major.
if (elem_cnt % static_cast<size_t>(c) != 0) return false;
n = static_cast<int>(elem_cnt / static_cast<size_t>(c));
transposed = false;
}
// Reject buffers whose size disagrees with the inferred [N, C] shape.
if (static_cast<size_t>(n) * static_cast<size_t>(c) != elem_cnt) {
return false;
}
out.n = n;
out.data.resize(static_cast<size_t>(n) * static_cast<size_t>(c));
// Read one element as float, converting or dequantizing per t.type.
auto ReadElem = [&](size_t idx) -> float {
if (is_float32) {
const float* fp = reinterpret_cast<const float*>(t.data);
return fp[idx];
}
if (is_float16) {
const uint16_t* hp = reinterpret_cast<const uint16_t*>(t.data);
return HalfToFloat(hp[idx]);
}
if (t.type == RKNN_TENSOR_INT8) {
const int8_t* p = reinterpret_cast<const int8_t*>(t.data);
return Dequant(p[idx], t.zp, t.scale);
}
// Any other type is treated as UINT8.
const uint8_t* p = reinterpret_cast<const uint8_t*>(t.data);
return Dequant(p[idx], t.zp, t.scale);
};
if (!transposed) {
// Already row-major: straight element-wise copy/convert.
for (size_t i = 0; i < out.data.size(); ++i) {
out.data[i] = ReadElem(i);
}
} else {
// Channel-major source: transpose [C, N] -> [N, C] while copying.
for (int ci = 0; ci < c; ++ci) {
for (int ni = 0; ni < n; ++ni) {
const size_t src_idx = static_cast<size_t>(ci) * static_cast<size_t>(n) + static_cast<size_t>(ni);
const size_t dst_idx = static_cast<size_t>(ni) * static_cast<size_t>(c) + static_cast<size_t>(ci);
out.data[dst_idx] = ReadElem(src_idx);
}
}
}
return true;
}
// ============================================================================
// RetinaFace 核心:先验框生成
// ============================================================================
inline std::vector<Prior> GeneratePriors(int in_w, int in_h,
const std::vector<int>& steps,
const std::vector<std::vector<int>>& min_sizes) {
std::vector<Prior> priors;
if (steps.empty() || steps.size() != min_sizes.size()) return priors;
priors.reserve(5000);
for (size_t s = 0; s < steps.size(); ++s) {
const int step = steps[s];
const int fm_w = in_w / step;
const int fm_h = in_h / step;
for (int i = 0; i < fm_h; ++i) {
for (int j = 0; j < fm_w; ++j) {
for (int ms : min_sizes[s]) {
const float s_kx = static_cast<float>(ms) / static_cast<float>(in_w);
const float s_ky = static_cast<float>(ms) / static_cast<float>(in_h);
const float cx = (static_cast<float>(j) + 0.5f) * static_cast<float>(step) /
static_cast<float>(in_w);
const float cy = (static_cast<float>(i) + 0.5f) * static_cast<float>(step) /
static_cast<float>(in_h);
priors.push_back(Prior{cx, cy, s_kx, s_ky});
}
}
}
}
return priors;
}
// ============================================================================
// RetinaFace 核心:检测结果解码
// ============================================================================
/**
* 解码RetinaFace检测结果
*
* @param loc_tensor 位置回归张量 [N, 4]
* @param conf_tensor 置信度张量 [N, 2]
* @param landm_tensor 关键点张量 [N, 10] (可选,可以为空)
* @param priors 先验框
* @param src_w 原始图像宽度
* @param src_h 原始图像高度
* @param model_w 模型输入宽度
* @param model_h 模型输入高度
* @param cfg 检测配置
* @param out 输出结果
*/
inline void DecodeRetinaFace(const NcTensor& loc_tensor,
const NcTensor& conf_tensor,
const NcTensor& landm_tensor,
const std::vector<Prior>& priors,
int src_w, int src_h,
int model_w, int model_h,
const DetectionConfig& cfg,
FaceDetResult& out) {
if (loc_tensor.n <= 0 || conf_tensor.n != loc_tensor.n) return;
const int n = loc_tensor.n;
const bool has_landmarks = cfg.output_landmarks && !landm_tensor.data.empty() && landm_tensor.n == n;
if (!priors.empty() && static_cast<int>(priors.size()) != n) {
return; // prior mismatch
}
const float sx = static_cast<float>(src_w) / static_cast<float>(model_w);
const float sy = static_cast<float>(src_h) / static_cast<float>(model_h);
std::vector<Rect> boxes;
std::vector<float> scores;
std::vector<std::array<Point2f, 5>> lmks;
boxes.reserve(static_cast<size_t>(n));
scores.reserve(static_cast<size_t>(n));
if (has_landmarks) lmks.reserve(static_cast<size_t>(n));
const float var0 = cfg.variance0;
const float var1 = cfg.variance1;
for (int i = 0; i < n; ++i) {
// 解析置信度
const float s0 = conf_tensor.data[static_cast<size_t>(i) * 2 + 0];
const float s1 = conf_tensor.data[static_cast<size_t>(i) * 2 + 1];
float score;
if (s0 >= 0.0f && s0 <= 1.0f && s1 >= 0.0f && s1 <= 1.0f && std::fabs((s0 + s1) - 1.0f) < 0.1f) {
score = s1;
} else {
score = Softmax2(s0, s1);
}
if (score < cfg.conf_thresh) continue;
// 解析位置
const Prior p = priors.empty() ? Prior{0, 0, 0, 0} : priors[static_cast<size_t>(i)];
const float dx = loc_tensor.data[static_cast<size_t>(i) * 4 + 0];
const float dy = loc_tensor.data[static_cast<size_t>(i) * 4 + 1];
const float dw = loc_tensor.data[static_cast<size_t>(i) * 4 + 2];
const float dh = loc_tensor.data[static_cast<size_t>(i) * 4 + 3];
const float cx = p.cx + dx * var0 * p.w;
const float cy = p.cy + dy * var0 * p.h;
const float ww = p.w * std::exp(dw * var1);
const float hh = p.h * std::exp(dh * var1);
float x1 = (cx - ww * 0.5f) * static_cast<float>(model_w);
float y1 = (cy - hh * 0.5f) * static_cast<float>(model_h);
float x2 = (cx + ww * 0.5f) * static_cast<float>(model_w);
float y2 = (cy + hh * 0.5f) * static_cast<float>(model_h);
// 映射到原始图像
x1 *= sx;
x2 *= sx;
y1 *= sy;
y2 *= sy;
Rect bb;
bb.x = static_cast<float>(ClampInt(static_cast<int>(x1), 0, src_w - 1));
bb.y = static_cast<float>(ClampInt(static_cast<int>(y1), 0, src_h - 1));
const float rx2 = static_cast<float>(ClampInt(static_cast<int>(x2), 0, src_w - 1));
const float ry2 = static_cast<float>(ClampInt(static_cast<int>(y2), 0, src_h - 1));
bb.w = std::max(0.0f, rx2 - bb.x);
bb.h = std::max(0.0f, ry2 - bb.y);
if (bb.w <= 1.0f || bb.h <= 1.0f) continue;
boxes.push_back(bb);
scores.push_back(score);
// 解析关键点
if (has_landmarks) {
std::array<Point2f, 5> pts{};
for (int k = 0; k < 5; ++k) {
const float lx = landm_tensor.data[static_cast<size_t>(i) * 10 + k * 2 + 0];
const float ly = landm_tensor.data[static_cast<size_t>(i) * 10 + k * 2 + 1];
const float px = (p.cx + lx * var0 * p.w) * static_cast<float>(model_w) * sx;
const float py = (p.cy + ly * var0 * p.h) * static_cast<float>(model_h) * sy;
pts[k].x = static_cast<float>(ClampInt(static_cast<int>(px), 0, src_w - 1));
pts[k].y = static_cast<float>(ClampInt(static_cast<int>(py), 0, src_h - 1));
}
lmks.push_back(pts);
}
}
if (boxes.empty()) return;
// NMS
std::vector<int> keep;
NmsSorted(boxes, scores, cfg.nms_thresh, keep);
if (keep.empty()) return;
// 构建输出
const int out_n = std::min<int>(cfg.max_faces, static_cast<int>(keep.size()));
out.faces.reserve(static_cast<size_t>(out_n));
for (int i = 0; i < out_n; ++i) {
const int k = keep[static_cast<size_t>(i)];
FaceDetItem item;
item.bbox = boxes[static_cast<size_t>(k)];
item.score = scores[static_cast<size_t>(k)];
item.track_id = -1;
if (has_landmarks && k < static_cast<int>(lmks.size())) {
item.has_landmarks = true;
item.landmarks = lmks[static_cast<size_t>(k)];
}
out.faces.push_back(std::move(item));
}
}
// ============================================================================
// SCRFD 核心:检测结果解码
// SCRFD输出格式9个张量 (3个尺度 × 3种类型score, bbox, kps)
// ============================================================================
// One SCRFD anchor point: the cell center in model-input pixels plus the
// stride of the pyramid level the cell belongs to.
struct ScrfdAnchor {
float cx = 0.0f;
float cy = 0.0f;
int stride = 0;
};
/// Generate SCRFD anchor centers for every stride level, in scan order
/// (row-major within each level). Centers are in model-input pixels:
/// (cell + 0.5) * stride. SCRFD emits two anchors per feature-map cell,
/// so each center is pushed twice.
inline std::vector<ScrfdAnchor> GenerateScrfdAnchors(int in_w, int in_h,
                                                     const std::vector<int>& strides) {
  std::vector<ScrfdAnchor> anchors;
  anchors.reserve(20000);
  for (int stride : strides) {
    const int cells_x = in_w / stride;
    const int cells_y = in_h / stride;
    for (int row = 0; row < cells_y; ++row) {
      for (int col = 0; col < cells_x; ++col) {
        ScrfdAnchor a;
        a.cx = (static_cast<float>(col) + 0.5f) * static_cast<float>(stride);
        a.cy = (static_cast<float>(row) + 0.5f) * static_cast<float>(stride);
        a.stride = stride;
        // Two anchors per location.
        anchors.push_back(a);
        anchors.push_back(a);
      }
    }
  }
  return anchors;
}
// 从NCHW格式提取值
inline float ExtractNCHW(const TensorView& t, int c, int h, int w, int C, int H, int W) {
if (c < 0 || c >= C || h < 0 || h >= H || w < 0 || w >= W) return 0.0f;
size_t idx = (static_cast<size_t>(c) * H + h) * W + w;
if (t.type == RKNN_TENSOR_FLOAT32) {
const float* p = reinterpret_cast<const float*>(t.data);
return p[idx];
} else if (t.type == RKNN_TENSOR_INT8) {
const int8_t* p = reinterpret_cast<const int8_t*>(t.data);
return Dequant(p[idx], t.zp, t.scale);
} else {
const uint8_t* p = reinterpret_cast<const uint8_t*>(t.data);
return Dequant(p[idx], t.zp, t.scale);
}
}
/**
 * Decode SCRFD detection outputs - mirrors the ai_scrfd node's logic.
 *
 * Expects 9 FLOAT32 output tensors ordered
 * [score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32]
 * and anchors produced by GenerateScrfdAnchors (centers in model-input
 * pixels, two per cell, levels in stride order 8/16/32). Decoded faces are
 * appended to out.faces; no NMS is applied here.
 *
 * Fixes vs the previous version:
 *  - anchor_idx is advanced by a whole level's count when that level is
 *    skipped, keeping later levels aligned with the anchors vector;
 *  - per-level anchor counts are derived from model_w/model_h instead of
 *    being hard-coded for 640x640 (12800/3200/800);
 *  - only the regressed distances are multiplied by the stride; the anchor
 *    centers are already in pixels (previously the center was scaled by the
 *    stride a second time, producing coordinates far outside the input).
 *
 * @param outputs     The 9 RKNN output tensors (FLOAT32 expected).
 * @param anchors     Pre-generated anchors (cx, cy in pixels, stride).
 * @param src_w       Original image width.
 * @param src_h       Original image height.
 * @param model_w     Model input width.
 * @param model_h     Model input height.
 * @param conf_thresh Score threshold.
 * @param output_lm   Whether to decode the 5 landmarks.
 * @param out         Output result; detections are appended.
 */
inline void DecodeScrfd(const std::vector<TensorView>& outputs,
                        const std::vector<ScrfdAnchor>& anchors,
                        int src_w, int src_h,
                        int model_w, int model_h,
                        float conf_thresh,
                        bool output_lm,
                        FaceDetResult& out) {
  if (outputs.size() != 9) return;
  if (model_w <= 0 || model_h <= 0) return;
  // Output order: score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32
  const int strides[] = {8, 16, 32};
  const float scale_x = static_cast<float>(src_w) / static_cast<float>(model_w);
  const float scale_y = static_cast<float>(src_h) / static_cast<float>(model_h);
  size_t anchor_idx = 0;
  for (int s = 0; s < 3; ++s) {
    const int stride = strides[s];
    // Two anchors per feature-map cell, derived from the model input size.
    const int count = (model_w / stride) * (model_h / stride) * 2;
    // Skip levels whose tensors are missing or not FLOAT32, but keep
    // anchor_idx aligned with the anchors of the remaining levels.
    if (outputs[s].type != RKNN_TENSOR_FLOAT32 ||
        outputs[s + 3].type != RKNN_TENSOR_FLOAT32 ||
        outputs[s + 6].type != RKNN_TENSOR_FLOAT32) {
      anchor_idx += static_cast<size_t>(count);
      continue;
    }
    const float* scores = reinterpret_cast<const float*>(outputs[s].data);
    const float* bboxes = reinterpret_cast<const float*>(outputs[s + 3].data);
    const float* kps = reinterpret_cast<const float*>(outputs[s + 6].data);
    if (!scores || !bboxes || !kps) {
      anchor_idx += static_cast<size_t>(count);
      continue;
    }
    for (int i = 0; i < count; ++i) {
      if (anchor_idx >= anchors.size()) break;
      const float score = scores[i];
      if (score < conf_thresh) {
        anchor_idx++;
        continue;
      }
      const ScrfdAnchor& pt = anchors[anchor_idx];
      // BBox regression: [left, top, right, bottom] distances from the
      // anchor center, expressed in stride units.
      const float left = bboxes[i * 4 + 0];
      const float top = bboxes[i * 4 + 1];
      const float right = bboxes[i * 4 + 2];
      const float bottom = bboxes[i * 4 + 3];
      // Anchor centers are already in model-input pixels, so only the
      // distances are scaled by the stride (insightface distance2bbox).
      const float x1 = pt.cx - left * stride;
      const float y1 = pt.cy - top * stride;
      const float x2 = pt.cx + right * stride;
      const float y2 = pt.cy + bottom * stride;
      FaceDetItem det;
      det.bbox.x = x1 * scale_x;
      det.bbox.y = y1 * scale_y;
      det.bbox.w = (x2 - x1) * scale_x;
      det.bbox.h = (y2 - y1) * scale_y;
      det.score = score;
      det.has_landmarks = output_lm;
      // Keypoints: offsets from the anchor center, also in stride units.
      if (output_lm) {
        for (int p = 0; p < 5; ++p) {
          const float off_x = kps[i * 10 + p * 2 + 0];
          const float off_y = kps[i * 10 + p * 2 + 1];
          det.landmarks[p].x = (pt.cx + off_x * stride) * scale_x;
          det.landmarks[p].y = (pt.cy + off_y * stride) * scale_y;
        }
      }
      out.faces.push_back(det);
      anchor_idx++;
    }
  }
}
} // namespace face_detection
} // namespace rk3588