// OrangePi3588Media/include/face/face_detection_utils.h
#pragma once
/**
 * Shared face-detection utility functions.
 * Reused by the ai_face_det and ai_face_det_zoned nodes.
 */
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <numeric>
#include <vector>
#include "face/face_result.h"
// Fallback RKNN type declaration to avoid a hard dependency on rknn_api.h.
#if defined(RK3588_ENABLE_RKNN)
#include "rknn_api.h"
#else
typedef enum _rknn_tensor_type {
RKNN_TENSOR_UINT8 = 0,
RKNN_TENSOR_INT8,
RKNN_TENSOR_FLOAT16,
RKNN_TENSOR_FLOAT32,
} rknn_tensor_type;
#endif
namespace rk3588 {
namespace face_detection {
// ============================================================================
// 基础工具函数
// ============================================================================
/// Clamp v into [lo, hi]: values below lo map to lo, above hi map to hi.
inline int ClampInt(int v, int lo, int hi) {
  if (v < lo) return lo;
  if (v > hi) return hi;
  return v;
}
/// Logistic sigmoid: maps any real x into the open interval (0, 1).
inline float Sigmoid(float x) {
  const float e = std::exp(-x);
  return 1.0f / (1.0f + e);
}
/// Two-class softmax: probability of logit b against logit a.
/// The max logit is subtracted first so large inputs do not overflow exp().
inline float Softmax2(float a, float b) {
  const float peak = std::max(a, b);
  const float ea = std::exp(a - peak);
  const float eb = std::exp(b - peak);
  const float denom = ea + eb;
  return eb / denom;
}
// ============================================================================
// 几何计算
// ============================================================================
/// Intersection-over-union of two axis-aligned boxes stored as (x, y, w, h).
/// Returns 0 when the union area is degenerate (<= 0).
inline float IoU(const Rect& a, const Rect& b) {
  const float inter_x1 = std::max(a.x, b.x);
  const float inter_y1 = std::max(a.y, b.y);
  const float inter_x2 = std::min(a.x + a.w, b.x + b.w);
  const float inter_y2 = std::min(a.y + a.h, b.y + b.h);
  const float inter_w = std::max(0.0f, inter_x2 - inter_x1);
  const float inter_h = std::max(0.0f, inter_y2 - inter_y1);
  const float inter = inter_w * inter_h;
  const float uni = a.w * a.h + b.w * b.h - inter;
  if (uni <= 0.0f) return 0.0f;
  return inter / uni;
}
/// Greedy non-maximum suppression.
/// Candidates are visited in descending score order; a box survives unless it
/// overlaps an already-kept box with IoU above nms_thresh. Indices of the
/// surviving boxes are written to `keep`, best score first.
inline void NmsSorted(const std::vector<Rect>& boxes,
                      const std::vector<float>& scores,
                      float nms_thresh,
                      std::vector<int>& keep) {
  keep.clear();
  std::vector<int> order(scores.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(),
            [&scores](int lhs, int rhs) { return scores[lhs] > scores[rhs]; });
  for (int candidate : order) {
    const bool overlaps_kept =
        std::any_of(keep.begin(), keep.end(), [&](int kept) {
          return IoU(boxes[candidate], boxes[kept]) > nms_thresh;
        });
    if (!overlaps_kept) keep.push_back(candidate);
  }
}
// ============================================================================
// RetinaFace 数据结构
// ============================================================================
// One RetinaFace prior (anchor) box in normalized [0,1] model-input
// coordinates: (cx, cy) is the box center, (w, h) its width and height.
struct Prior {
float cx = 0.0f;
float cy = 0.0f;
float w = 0.0f;
float h = 0.0f;
};
// Tunable parameters for RetinaFace decoding.
struct DetectionConfig {
// Minimum face confidence required to keep a detection.
float conf_thresh = 0.6f;
// IoU threshold used by NMS.
float nms_thresh = 0.4f;
// Upper bound on the number of faces returned.
int max_faces = 10;
// Whether to decode the 5-point landmarks.
bool output_landmarks = true;
// RetinaFace default parameters: feature-map strides, per-level anchor
// sizes, and the two box-decoding variances.
std::vector<int> steps{8, 16, 32};
std::vector<std::vector<int>> min_sizes{{16, 32}, {64, 128}, {256, 512}};
float variance0 = 0.1f;
float variance1 = 0.2f;
};
// Lightweight, non-owning tensor view decoupled from the RKNN API
// (`data` points to memory owned elsewhere and must outlive the view).
struct TensorView {
// Raw tensor bytes.
const uint8_t* data = nullptr;
// Buffer size in bytes.
size_t size = 0;
// Quantization zero point (used for INT8/UINT8 tensors).
int32_t zp = 0;
// Quantization scale (used for INT8/UINT8 tensors).
float scale = 1.0f;
// Tensor dimensions as reported by the runtime.
std::vector<uint32_t> dims;
// Element type; defaults to UINT8.
rknn_tensor_type type = RKNN_TENSOR_UINT8;
};
// A dense [N, C] float tensor stored row-major: element (ni, ci) lives at
// data[ni * c + ci].
struct NcTensor {
int n = 0;
int c = 0;
std::vector<float> data; // N*C row-major
};
// ============================================================================
// 图像预处理
// ============================================================================
/// Bilinear resize of a packed RGB (3 bytes per pixel) image using the
/// half-pixel-center convention (source and destination pixel centers are
/// aligned). When swap_rb is true, channels 0 and 2 are exchanged on output.
/// src_stride is in bytes; dst is tightly packed at dst_w * 3 bytes per row.
inline void ResizeRgbBilinear(const uint8_t* src, int src_w, int src_h, int src_stride,
                              uint8_t* dst, int dst_w, int dst_h, bool swap_rb) {
  const float ratio_x = static_cast<float>(src_w) / static_cast<float>(dst_w);
  const float ratio_y = static_cast<float>(src_h) / static_cast<float>(dst_h);
  for (int dy = 0; dy < dst_h; ++dy) {
    // Source row coordinate under the half-pixel-center convention.
    const float sy = (static_cast<float>(dy) + 0.5f) * ratio_y - 0.5f;
    int top = static_cast<int>(std::floor(sy));
    int bottom = top + 1;
    const float w_bottom = sy - static_cast<float>(top);
    const float w_top = 1.0f - w_bottom;
    top = std::min(std::max(top, 0), src_h - 1);
    bottom = std::min(std::max(bottom, 0), src_h - 1);
    const uint8_t* row_t = src + static_cast<std::size_t>(top) * static_cast<std::size_t>(src_stride);
    const uint8_t* row_b = src + static_cast<std::size_t>(bottom) * static_cast<std::size_t>(src_stride);
    uint8_t* out_px = dst + static_cast<std::size_t>(dy) * static_cast<std::size_t>(dst_w) * 3;
    for (int dx = 0; dx < dst_w; ++dx) {
      const float sx = (static_cast<float>(dx) + 0.5f) * ratio_x - 0.5f;
      int left = static_cast<int>(std::floor(sx));
      int right = left + 1;
      const float w_right = sx - static_cast<float>(left);
      const float w_left = 1.0f - w_right;
      left = std::min(std::max(left, 0), src_w - 1);
      right = std::min(std::max(right, 0), src_w - 1);
      const uint8_t* tl = row_t + left * 3;
      const uint8_t* tr = row_t + right * 3;
      const uint8_t* bl = row_b + left * 3;
      const uint8_t* br = row_b + right * 3;
      for (int ch = 0; ch < 3; ++ch) {
        // Blend horizontally within each row, then vertically between rows.
        const float mixed =
            (static_cast<float>(tl[ch]) * w_left + static_cast<float>(tr[ch]) * w_right) * w_top +
            (static_cast<float>(bl[ch]) * w_left + static_cast<float>(br[ch]) * w_right) * w_bottom;
        const int rounded = static_cast<int>(mixed + 0.5f);
        out_px[ch] = static_cast<uint8_t>(std::min(std::max(rounded, 0), 255));
      }
      if (swap_rb) std::swap(out_px[0], out_px[2]);
      out_px += 3;
    }
  }
}
// ============================================================================
// 张量解析从RKNN输出提取
// ============================================================================
/// Convert an IEEE-754 binary16 value (raw bits) to a float.
/// Handles signed zero, subnormals, infinities and NaN (payload preserved).
inline float HalfToFloat(uint16_t h) {
  const uint32_t sign_bit = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exponent = (h & 0x7C00u) >> 10;
  uint32_t mantissa = h & 0x03FFu;
  uint32_t bits;
  if (exponent == 31) {
    // Inf / NaN: force the float exponent to all-ones, keep the payload.
    bits = sign_bit | 0x7F800000u | (mantissa << 13);
  } else if (exponent != 0) {
    // Normal number: rebias the exponent from 15 to 127 (add 112).
    bits = sign_bit | ((exponent + 112u) << 23) | (mantissa << 13);
  } else if (mantissa == 0) {
    // Signed zero.
    bits = sign_bit;
  } else {
    // Subnormal: renormalize the mantissa. The exponent decrements may wrap
    // the unsigned value, but the rebias addition below wraps it back
    // (arithmetic is mod 2^32), yielding the correct small exponent.
    exponent = 1;
    while ((mantissa & 0x0400u) == 0) {
      mantissa <<= 1;
      --exponent;
    }
    mantissa &= 0x03FFu;
    bits = sign_bit | ((exponent + 112u) << 23) | (mantissa << 13);
  }
  float result;
  memcpy(&result, &bits, sizeof(result));
  return result;
}
/// Dequantize a quantized value q with zero point zp and scale:
/// result = (q - zp) * scale.
template <typename T>
inline float Dequant(T q, int32_t zp, float scale) {
  const float centered = static_cast<float>(q) - static_cast<float>(zp);
  return centered * scale;
}
// Extract a dense [N, C] float tensor from a TensorView.
// Dequantizes INT8/UINT8 using (zp, scale) and converts FLOAT16 to float.
// The channel count `c` is supplied by the caller; the layout is inferred
// from t.dims: a trailing dimension equal to c means the data is already
// row-major [N, C], while a leading (post-batch) dimension equal to c means
// the data is stored channel-major [C, N] and is transposed during the copy.
// When dims are unusable, N is derived from the element count and row-major
// layout is assumed. Returns false on empty data or when the element count
// does not factor into [N, C].
inline bool ExtractNcTensor(const TensorView& t, int c, NcTensor& out) {
out = {};
out.c = c;
if (!t.data || t.size == 0) return false;
// Per-element byte size; quantized INT8/UINT8 tensors use 1 byte.
size_t elem_size = 1;
bool is_float32 = false;
bool is_float16 = false;
if (t.type == RKNN_TENSOR_FLOAT16) {
elem_size = 2;
is_float16 = true;
} else if (t.type == RKNN_TENSOR_FLOAT32) {
elem_size = 4;
is_float32 = true;
}
const size_t elem_cnt = elem_size > 0 ? (t.size / elem_size) : 0;
if (elem_cnt == 0) return false;
int n = 0;
// transposed == true means the source buffer is channel-major [C, N].
bool transposed = false;
if (t.dims.size() == 3) {
// 3-D case: dims[0] is taken as the batch; match c against the remaining
// two dimensions to decide the layout.
const uint32_t d1 = t.dims[1];
const uint32_t d2 = t.dims[2];
if (static_cast<int>(d2) == c) {
n = static_cast<int>(d1);
transposed = false;
} else if (static_cast<int>(d1) == c) {
n = static_cast<int>(d2);
transposed = true;
} else {
return false;
}
} else if (t.dims.size() == 2) {
const uint32_t d0 = t.dims[0];
const uint32_t d1 = t.dims[1];
if (static_cast<int>(d1) == c) {
n = static_cast<int>(d0);
transposed = false;
} else if (static_cast<int>(d0) == c) {
n = static_cast<int>(d1);
transposed = true;
}
}
if (n <= 0) {
// No usable dims: derive N from the element count, assume row-major.
if (elem_cnt % static_cast<size_t>(c) != 0) return false;
n = static_cast<int>(elem_cnt / static_cast<size_t>(c));
transposed = false;
}
// Reject buffers whose size disagrees with the inferred [N, C] shape.
if (static_cast<size_t>(n) * static_cast<size_t>(c) != elem_cnt) {
return false;
}
out.n = n;
out.data.resize(static_cast<size_t>(n) * static_cast<size_t>(c));
// Read one element as float, converting or dequantizing per t.type.
auto ReadElem = [&](size_t idx) -> float {
if (is_float32) {
const float* fp = reinterpret_cast<const float*>(t.data);
return fp[idx];
}
if (is_float16) {
const uint16_t* hp = reinterpret_cast<const uint16_t*>(t.data);
return HalfToFloat(hp[idx]);
}
if (t.type == RKNN_TENSOR_INT8) {
const int8_t* p = reinterpret_cast<const int8_t*>(t.data);
return Dequant(p[idx], t.zp, t.scale);
}
// Any other type is treated as UINT8.
const uint8_t* p = reinterpret_cast<const uint8_t*>(t.data);
return Dequant(p[idx], t.zp, t.scale);
};
if (!transposed) {
// Already row-major: straight element-wise copy/convert.
for (size_t i = 0; i < out.data.size(); ++i) {
out.data[i] = ReadElem(i);
}
} else {
// Channel-major source: transpose [C, N] -> [N, C] while copying.
for (int ci = 0; ci < c; ++ci) {
for (int ni = 0; ni < n; ++ni) {
const size_t src_idx = static_cast<size_t>(ci) * static_cast<size_t>(n) + static_cast<size_t>(ni);
const size_t dst_idx = static_cast<size_t>(ni) * static_cast<size_t>(c) + static_cast<size_t>(ci);
out.data[dst_idx] = ReadElem(src_idx);
}
}
}
return true;
}
// ============================================================================
// RetinaFace 核心:先验框生成
// ============================================================================
inline std::vector<Prior> GeneratePriors(int in_w, int in_h,
const std::vector<int>& steps,
const std::vector<std::vector<int>>& min_sizes) {
std::vector<Prior> priors;
if (steps.empty() || steps.size() != min_sizes.size()) return priors;
priors.reserve(5000);
for (size_t s = 0; s < steps.size(); ++s) {
const int step = steps[s];
const int fm_w = in_w / step;
const int fm_h = in_h / step;
for (int i = 0; i < fm_h; ++i) {
for (int j = 0; j < fm_w; ++j) {
for (int ms : min_sizes[s]) {
const float s_kx = static_cast<float>(ms) / static_cast<float>(in_w);
const float s_ky = static_cast<float>(ms) / static_cast<float>(in_h);
const float cx = (static_cast<float>(j) + 0.5f) * static_cast<float>(step) /
static_cast<float>(in_w);
const float cy = (static_cast<float>(i) + 0.5f) * static_cast<float>(step) /
static_cast<float>(in_h);
priors.push_back(Prior{cx, cy, s_kx, s_ky});
}
}
}
}
return priors;
}
// ============================================================================
// RetinaFace 核心:检测结果解码
// ============================================================================
/**
* 解码RetinaFace检测结果
*
* @param loc_tensor 位置回归张量 [N, 4]
* @param conf_tensor 置信度张量 [N, 2]
* @param landm_tensor 关键点张量 [N, 10] (可选,可以为空)
* @param priors 先验框
* @param src_w 原始图像宽度
* @param src_h 原始图像高度
* @param model_w 模型输入宽度
* @param model_h 模型输入高度
* @param cfg 检测配置
* @param out 输出结果
*/
inline void DecodeRetinaFace(const NcTensor& loc_tensor,
const NcTensor& conf_tensor,
const NcTensor& landm_tensor,
const std::vector<Prior>& priors,
int src_w, int src_h,
int model_w, int model_h,
const DetectionConfig& cfg,
FaceDetResult& out) {
if (loc_tensor.n <= 0 || conf_tensor.n != loc_tensor.n) return;
const int n = loc_tensor.n;
const bool has_landmarks = cfg.output_landmarks && !landm_tensor.data.empty() && landm_tensor.n == n;
if (!priors.empty() && static_cast<int>(priors.size()) != n) {
return; // prior mismatch
}
const float sx = static_cast<float>(src_w) / static_cast<float>(model_w);
const float sy = static_cast<float>(src_h) / static_cast<float>(model_h);
std::vector<Rect> boxes;
std::vector<float> scores;
std::vector<std::array<Point2f, 5>> lmks;
boxes.reserve(static_cast<size_t>(n));
scores.reserve(static_cast<size_t>(n));
if (has_landmarks) lmks.reserve(static_cast<size_t>(n));
const float var0 = cfg.variance0;
const float var1 = cfg.variance1;
for (int i = 0; i < n; ++i) {
// 解析置信度
const float s0 = conf_tensor.data[static_cast<size_t>(i) * 2 + 0];
const float s1 = conf_tensor.data[static_cast<size_t>(i) * 2 + 1];
float score;
if (s0 >= 0.0f && s0 <= 1.0f && s1 >= 0.0f && s1 <= 1.0f && std::fabs((s0 + s1) - 1.0f) < 0.1f) {
score = s1;
} else {
score = Softmax2(s0, s1);
}
if (score < cfg.conf_thresh) continue;
// 解析位置
const Prior p = priors.empty() ? Prior{0, 0, 0, 0} : priors[static_cast<size_t>(i)];
const float dx = loc_tensor.data[static_cast<size_t>(i) * 4 + 0];
const float dy = loc_tensor.data[static_cast<size_t>(i) * 4 + 1];
const float dw = loc_tensor.data[static_cast<size_t>(i) * 4 + 2];
const float dh = loc_tensor.data[static_cast<size_t>(i) * 4 + 3];
const float cx = p.cx + dx * var0 * p.w;
const float cy = p.cy + dy * var0 * p.h;
const float ww = p.w * std::exp(dw * var1);
const float hh = p.h * std::exp(dh * var1);
float x1 = (cx - ww * 0.5f) * static_cast<float>(model_w);
float y1 = (cy - hh * 0.5f) * static_cast<float>(model_h);
float x2 = (cx + ww * 0.5f) * static_cast<float>(model_w);
float y2 = (cy + hh * 0.5f) * static_cast<float>(model_h);
// 映射到原始图像
x1 *= sx;
x2 *= sx;
y1 *= sy;
y2 *= sy;
Rect bb;
bb.x = static_cast<float>(ClampInt(static_cast<int>(x1), 0, src_w - 1));
bb.y = static_cast<float>(ClampInt(static_cast<int>(y1), 0, src_h - 1));
const float rx2 = static_cast<float>(ClampInt(static_cast<int>(x2), 0, src_w - 1));
const float ry2 = static_cast<float>(ClampInt(static_cast<int>(y2), 0, src_h - 1));
bb.w = std::max(0.0f, rx2 - bb.x);
bb.h = std::max(0.0f, ry2 - bb.y);
if (bb.w <= 1.0f || bb.h <= 1.0f) continue;
boxes.push_back(bb);
scores.push_back(score);
// 解析关键点
if (has_landmarks) {
std::array<Point2f, 5> pts{};
for (int k = 0; k < 5; ++k) {
const float lx = landm_tensor.data[static_cast<size_t>(i) * 10 + k * 2 + 0];
const float ly = landm_tensor.data[static_cast<size_t>(i) * 10 + k * 2 + 1];
const float px = (p.cx + lx * var0 * p.w) * static_cast<float>(model_w) * sx;
const float py = (p.cy + ly * var0 * p.h) * static_cast<float>(model_h) * sy;
pts[k].x = static_cast<float>(ClampInt(static_cast<int>(px), 0, src_w - 1));
pts[k].y = static_cast<float>(ClampInt(static_cast<int>(py), 0, src_h - 1));
}
lmks.push_back(pts);
}
}
if (boxes.empty()) return;
// NMS
std::vector<int> keep;
NmsSorted(boxes, scores, cfg.nms_thresh, keep);
if (keep.empty()) return;
// 构建输出
const int out_n = std::min<int>(cfg.max_faces, static_cast<int>(keep.size()));
out.faces.reserve(static_cast<size_t>(out_n));
for (int i = 0; i < out_n; ++i) {
const int k = keep[static_cast<size_t>(i)];
FaceDetItem item;
item.bbox = boxes[static_cast<size_t>(k)];
item.score = scores[static_cast<size_t>(k)];
item.track_id = -1;
if (has_landmarks && k < static_cast<int>(lmks.size())) {
item.has_landmarks = true;
item.landmarks = lmks[static_cast<size_t>(k)];
}
out.faces.push_back(std::move(item));
}
}
// ============================================================================
// SCRFD 核心:检测结果解码
// SCRFD输出格式9个张量 (3个尺度 × 3种类型score, bbox, kps)
// ============================================================================
// One SCRFD anchor point: the cell center in model-input pixels plus the
// stride of the pyramid level the cell belongs to.
struct ScrfdAnchor {
float cx = 0.0f;
float cy = 0.0f;
int stride = 0;
};
/// Generate SCRFD anchor centers for every stride level, in scan order
/// (row-major within each level). Centers are in model-input pixels:
/// (cell + 0.5) * stride. SCRFD emits two anchors per feature-map cell,
/// so each center is pushed twice.
inline std::vector<ScrfdAnchor> GenerateScrfdAnchors(int in_w, int in_h,
                                                     const std::vector<int>& strides) {
  std::vector<ScrfdAnchor> anchors;
  anchors.reserve(20000);
  for (int stride : strides) {
    const int cells_x = in_w / stride;
    const int cells_y = in_h / stride;
    for (int row = 0; row < cells_y; ++row) {
      for (int col = 0; col < cells_x; ++col) {
        ScrfdAnchor a;
        a.cx = (static_cast<float>(col) + 0.5f) * static_cast<float>(stride);
        a.cy = (static_cast<float>(row) + 0.5f) * static_cast<float>(stride);
        a.stride = stride;
        // Two anchors per location.
        anchors.push_back(a);
        anchors.push_back(a);
      }
    }
  }
  return anchors;
}
// 从NCHW格式提取值
inline float ExtractNCHW(const TensorView& t, int c, int h, int w, int C, int H, int W) {
if (c < 0 || c >= C || h < 0 || h >= H || w < 0 || w >= W) return 0.0f;
size_t idx = (static_cast<size_t>(c) * H + h) * W + w;
if (t.type == RKNN_TENSOR_FLOAT32) {
const float* p = reinterpret_cast<const float*>(t.data);
return p[idx];
} else if (t.type == RKNN_TENSOR_INT8) {
const int8_t* p = reinterpret_cast<const int8_t*>(t.data);
return Dequant(p[idx], t.zp, t.scale);
} else {
const uint8_t* p = reinterpret_cast<const uint8_t*>(t.data);
return Dequant(p[idx], t.zp, t.scale);
}
}
/**
 * Decode SCRFD detection outputs - mirrors the ai_scrfd node's logic.
 *
 * Expects 9 FLOAT32 output tensors ordered
 * [score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32]
 * and anchors produced by GenerateScrfdAnchors (centers in model-input
 * pixels, two per cell, levels in stride order 8/16/32). Decoded faces are
 * appended to out.faces; no NMS is applied here.
 *
 * Fixes vs the previous version:
 *  - anchor_idx is advanced by a whole level's count when that level is
 *    skipped, keeping later levels aligned with the anchors vector;
 *  - per-level anchor counts are derived from model_w/model_h instead of
 *    being hard-coded for 640x640 (12800/3200/800);
 *  - only the regressed distances are multiplied by the stride; the anchor
 *    centers are already in pixels (previously the center was scaled by the
 *    stride a second time, producing coordinates far outside the input).
 *
 * @param outputs     The 9 RKNN output tensors (FLOAT32 expected).
 * @param anchors     Pre-generated anchors (cx, cy in pixels, stride).
 * @param src_w       Original image width.
 * @param src_h       Original image height.
 * @param model_w     Model input width.
 * @param model_h     Model input height.
 * @param conf_thresh Score threshold.
 * @param output_lm   Whether to decode the 5 landmarks.
 * @param out         Output result; detections are appended.
 */
inline void DecodeScrfd(const std::vector<TensorView>& outputs,
                        const std::vector<ScrfdAnchor>& anchors,
                        int src_w, int src_h,
                        int model_w, int model_h,
                        float conf_thresh,
                        bool output_lm,
                        FaceDetResult& out) {
  if (outputs.size() != 9) return;
  if (model_w <= 0 || model_h <= 0) return;
  // Output order: score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32
  const int strides[] = {8, 16, 32};
  const float scale_x = static_cast<float>(src_w) / static_cast<float>(model_w);
  const float scale_y = static_cast<float>(src_h) / static_cast<float>(model_h);
  size_t anchor_idx = 0;
  for (int s = 0; s < 3; ++s) {
    const int stride = strides[s];
    // Two anchors per feature-map cell, derived from the model input size.
    const int count = (model_w / stride) * (model_h / stride) * 2;
    // Skip levels whose tensors are missing or not FLOAT32, but keep
    // anchor_idx aligned with the anchors of the remaining levels.
    if (outputs[s].type != RKNN_TENSOR_FLOAT32 ||
        outputs[s + 3].type != RKNN_TENSOR_FLOAT32 ||
        outputs[s + 6].type != RKNN_TENSOR_FLOAT32) {
      anchor_idx += static_cast<size_t>(count);
      continue;
    }
    const float* scores = reinterpret_cast<const float*>(outputs[s].data);
    const float* bboxes = reinterpret_cast<const float*>(outputs[s + 3].data);
    const float* kps = reinterpret_cast<const float*>(outputs[s + 6].data);
    if (!scores || !bboxes || !kps) {
      anchor_idx += static_cast<size_t>(count);
      continue;
    }
    for (int i = 0; i < count; ++i) {
      if (anchor_idx >= anchors.size()) break;
      const float score = scores[i];
      if (score < conf_thresh) {
        anchor_idx++;
        continue;
      }
      const ScrfdAnchor& pt = anchors[anchor_idx];
      // BBox regression: [left, top, right, bottom] distances from the
      // anchor center, expressed in stride units.
      const float left = bboxes[i * 4 + 0];
      const float top = bboxes[i * 4 + 1];
      const float right = bboxes[i * 4 + 2];
      const float bottom = bboxes[i * 4 + 3];
      // Anchor centers are already in model-input pixels, so only the
      // distances are scaled by the stride (insightface distance2bbox).
      const float x1 = pt.cx - left * stride;
      const float y1 = pt.cy - top * stride;
      const float x2 = pt.cx + right * stride;
      const float y2 = pt.cy + bottom * stride;
      FaceDetItem det;
      det.bbox.x = x1 * scale_x;
      det.bbox.y = y1 * scale_y;
      det.bbox.w = (x2 - x1) * scale_x;
      det.bbox.h = (y2 - y1) * scale_y;
      det.score = score;
      det.has_landmarks = output_lm;
      // Keypoints: offsets from the anchor center, also in stride units.
      if (output_lm) {
        for (int p = 0; p < 5; ++p) {
          const float off_x = kps[i * 10 + p * 2 + 0];
          const float off_y = kps[i * 10 + p * 2 + 1];
          det.landmarks[p].x = (pt.cx + off_x * stride) * scale_x;
          det.landmarks[p].y = (pt.cy + off_y * stride) * scale_y;
        }
      }
      out.faces.push_back(det);
      anchor_idx++;
    }
  }
}
} // namespace face_detection
} // namespace rk3588