#pragma once /** * 人脸检测公共工具函数 * 供 ai_face_det 和 ai_face_det_zoned 节点复用 */ #include #include #include #include #include #include #include #include #include "face/face_result.h" // RKNN类型前向声明（避免直接依赖rknn_api.h） #if defined(RK3588_ENABLE_RKNN) #include "rknn_api.h" #else typedef enum _rknn_tensor_type { RKNN_TENSOR_UINT8 = 0, RKNN_TENSOR_INT8, RKNN_TENSOR_FLOAT16, RKNN_TENSOR_FLOAT32, } rknn_tensor_type; #endif namespace rk3588 { namespace face_detection { // ============================================================================ // 基础工具函数 // ============================================================================ inline int ClampInt(int v, int lo, int hi) { return v < lo ? lo : (v > hi ? hi : v); } inline float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); } inline float Softmax2(float a, float b) { const float m = std::max(a, b); const float ea = std::exp(a - m); const float eb = std::exp(b - m); return eb / (ea + eb); } // ============================================================================ // 几何计算 // ============================================================================ inline float IoU(const Rect& a, const Rect& b) { const float ax1 = a.x; const float ay1 = a.y; const float ax2 = a.x + a.w; const float ay2 = a.y + a.h; const float bx1 = b.x; const float by1 = b.y; const float bx2 = b.x + b.w; const float by2 = b.y + b.h; const float ix1 = std::max(ax1, bx1); const float iy1 = std::max(ay1, by1); const float ix2 = std::min(ax2, bx2); const float iy2 = std::min(ay2, by2); const float iw = std::max(0.0f, ix2 - ix1); const float ih = std::max(0.0f, iy2 - iy1); const float inter = iw * ih; const float ua = a.w * a.h + b.w * b.h - inter; return ua <= 0.0f ? 0.0f : (inter / ua); } inline void NmsSorted(const std::vector& boxes, const std::vector& scores, float nms_thresh, std::vector& keep) { keep.clear(); std::vector order(scores.size()); std::iota(order.begin(), order.end(), 0); std::sort(order.begin(), order.end(), [&](int a, int b) { return scores[a] > scores[b]; }); for (int idx : order) { bool suppressed = false; for (int kept : keep) { if (IoU(boxes[idx], boxes[kept]) > nms_thresh) { suppressed = true; break; } } if (!suppressed) keep.push_back(idx); } } // ============================================================================ // RetinaFace 数据结构 // ============================================================================ struct Prior { float cx = 0.0f; float cy = 0.0f; float w = 0.0f; float h = 0.0f; }; struct DetectionConfig { float conf_thresh = 0.6f; float nms_thresh = 0.4f; int max_faces = 10; bool output_landmarks = true; // RetinaFace默认参数 std::vector steps{8, 16, 32}; std::vector> min_sizes{{16, 32}, {64, 128}, {256, 512}}; float variance0 = 0.1f; float variance1 = 0.2f; }; // 张量结构（与RKNN解耦） struct TensorView { const uint8_t* data = nullptr; size_t size = 0; int32_t zp = 0; float scale = 1.0f; std::vector dims; rknn_tensor_type type = RKNN_TENSOR_UINT8; }; struct NcTensor { int n = 0; int c = 0; std::vector data; // N*C row-major }; // ============================================================================ // 图像预处理 // ============================================================================ inline void ResizeRgbBilinear(const uint8_t* src, int src_w, int src_h, int src_stride, uint8_t* dst, int dst_w, int dst_h, bool swap_rb) { const float scale_x = static_cast(src_w) / static_cast(dst_w); const float scale_y = static_cast(src_h) / static_cast(dst_h); for (int y = 0; y < dst_h; ++y) { const float fy = (static_cast(y) + 0.5f) * scale_y - 0.5f; int y0 = static_cast(std::floor(fy)); int y1 = y0 + 1; const float wy1 = fy - static_cast(y0); const float wy0 = 1.0f - wy1; y0 = ClampInt(y0, 0, src_h - 1); y1 = ClampInt(y1, 0, src_h - 1); const uint8_t* row0 = src + static_cast(y0) * static_cast(src_stride); const uint8_t* row1 = src + static_cast(y1) * static_cast(src_stride); uint8_t* out = dst + static_cast(y) * static_cast(dst_w) * 3; for (int x = 0; x < dst_w; ++x) { const float fx = (static_cast(x) + 0.5f) * scale_x - 0.5f; int x0 = static_cast(std::floor(fx)); int x1 = x0 + 1; const float wx1 = fx - static_cast(x0); const float wx0 = 1.0f - wx1; x0 = ClampInt(x0, 0, src_w - 1); x1 = ClampInt(x1, 0, src_w - 1); const uint8_t* p00 = row0 + x0 * 3; const uint8_t* p01 = row0 + x1 * 3; const uint8_t* p10 = row1 + x0 * 3; const uint8_t* p11 = row1 + x1 * 3; for (int c = 0; c < 3; ++c) { const float v = (static_cast(p00[c]) * wx0 + static_cast(p01[c]) * wx1) * wy0 + (static_cast(p10[c]) * wx0 + static_cast(p11[c]) * wx1) * wy1; out[c] = static_cast(ClampInt(static_cast(v + 0.5f), 0, 255)); } if (swap_rb) { std::swap(out[0], out[2]); } out += 3; } } } // ============================================================================ // 张量解析（从RKNN输出提取） // ============================================================================ inline float HalfToFloat(uint16_t h) { const uint32_t sign = (static_cast(h & 0x8000u)) << 16; uint32_t exp = (h & 0x7C00u) >> 10; uint32_t mant = (h & 0x03FFu); uint32_t f = 0; if (exp == 0) { if (mant == 0) { f = sign; } else { exp = 1; while ((mant & 0x0400u) == 0) { mant <<= 1; --exp; } mant &= 0x03FFu; exp = exp + (127 - 15); f = sign | (exp << 23) | (mant << 13); } } else if (exp == 31) { f = sign | 0x7F800000u | (mant << 13); } else { exp = exp + (127 - 15); f = sign | (exp << 23) | (mant << 13); } float out; memcpy(&out, &f, sizeof(out)); return out; } template inline float Dequant(T q, int32_t zp, float scale) { return (static_cast(q) - static_cast(zp)) * scale; } // 从TensorView提取NcTensor inline bool ExtractNcTensor(const TensorView& t, int c, NcTensor& out) { out = {}; out.c = c; if (!t.data || t.size == 0) return false; size_t elem_size = 1; bool is_float32 = false; bool is_float16 = false; if (t.type == RKNN_TENSOR_FLOAT16) { elem_size = 2; is_float16 = true; } else if (t.type == RKNN_TENSOR_FLOAT32) { elem_size = 4; is_float32 = true; } const size_t elem_cnt = elem_size > 0 ? (t.size / elem_size) : 0; if (elem_cnt == 0) return false; int n = 0; bool transposed = false; if (t.dims.size() == 3) { const uint32_t d1 = t.dims[1]; const uint32_t d2 = t.dims[2]; if (static_cast(d2) == c) { n = static_cast(d1); transposed = false; } else if (static_cast(d1) == c) { n = static_cast(d2); transposed = true; } else { return false; } } else if (t.dims.size() == 2) { const uint32_t d0 = t.dims[0]; const uint32_t d1 = t.dims[1]; if (static_cast(d1) == c) { n = static_cast(d0); transposed = false; } else if (static_cast(d0) == c) { n = static_cast(d1); transposed = true; } } if (n <= 0) { if (elem_cnt % static_cast(c) != 0) return false; n = static_cast(elem_cnt / static_cast(c)); transposed = false; } if (static_cast(n) * static_cast(c) != elem_cnt) { return false; } out.n = n; out.data.resize(static_cast(n) * static_cast(c)); auto ReadElem = [&](size_t idx) -> float { if (is_float32) { const float* fp = reinterpret_cast(t.data); return fp[idx]; } if (is_float16) { const uint16_t* hp = reinterpret_cast(t.data); return HalfToFloat(hp[idx]); } if (t.type == RKNN_TENSOR_INT8) { const int8_t* p = reinterpret_cast(t.data); return Dequant(p[idx], t.zp, t.scale); } const uint8_t* p = reinterpret_cast(t.data); return Dequant(p[idx], t.zp, t.scale); }; if (!transposed) { for (size_t i = 0; i < out.data.size(); ++i) { out.data[i] = ReadElem(i); } } else { for (int ci = 0; ci < c; ++ci) { for (int ni = 0; ni < n; ++ni) { const size_t src_idx = static_cast(ci) * static_cast(n) + static_cast(ni); const size_t dst_idx = static_cast(ni) * static_cast(c) + static_cast(ci); out.data[dst_idx] = ReadElem(src_idx); } } } return true; } // ============================================================================ // RetinaFace 核心：先验框生成 // ============================================================================ inline std::vector GeneratePriors(int in_w, int in_h, const std::vector& steps, const std::vector>& min_sizes) { std::vector priors; if (steps.empty() || steps.size() != min_sizes.size()) return priors; priors.reserve(5000); for (size_t s = 0; s < steps.size(); ++s) { const int step = steps[s]; const int fm_w = in_w / step; const int fm_h = in_h / step; for (int i = 0; i < fm_h; ++i) { for (int j = 0; j < fm_w; ++j) { for (int ms : min_sizes[s]) { const float s_kx = static_cast(ms) / static_cast(in_w); const float s_ky = static_cast(ms) / static_cast(in_h); const float cx = (static_cast(j) + 0.5f) * static_cast(step) / static_cast(in_w); const float cy = (static_cast(i) + 0.5f) * static_cast(step) / static_cast(in_h); priors.push_back(Prior{cx, cy, s_kx, s_ky}); } } } } return priors; } // ============================================================================ // RetinaFace 核心：检测结果解码 // ============================================================================ /** * 解码RetinaFace检测结果 * * @param loc_tensor 位置回归张量 [N, 4] * @param conf_tensor 置信度张量 [N, 2] * @param landm_tensor 关键点张量 [N, 10] (可选，可以为空) * @param priors 先验框 * @param src_w 原始图像宽度 * @param src_h 原始图像高度 * @param model_w 模型输入宽度 * @param model_h 模型输入高度 * @param cfg 检测配置 * @param out 输出结果 */ inline void DecodeRetinaFace(const NcTensor& loc_tensor, const NcTensor& conf_tensor, const NcTensor& landm_tensor, const std::vector& priors, int src_w, int src_h, int model_w, int model_h, const DetectionConfig& cfg, FaceDetResult& out) { if (loc_tensor.n <= 0 || conf_tensor.n != loc_tensor.n) return; const int n = loc_tensor.n; const bool has_landmarks = cfg.output_landmarks && !landm_tensor.data.empty() && landm_tensor.n == n; if (!priors.empty() && static_cast(priors.size()) != n) { return; // prior mismatch } const float sx = static_cast(src_w) / static_cast(model_w); const float sy = static_cast(src_h) / static_cast(model_h); std::vector boxes; std::vector scores; std::vector> lmks; boxes.reserve(static_cast(n)); scores.reserve(static_cast(n)); if (has_landmarks) lmks.reserve(static_cast(n)); const float var0 = cfg.variance0; const float var1 = cfg.variance1; for (int i = 0; i < n; ++i) { // 解析置信度 const float s0 = conf_tensor.data[static_cast(i) * 2 + 0]; const float s1 = conf_tensor.data[static_cast(i) * 2 + 1]; float score; if (s0 >= 0.0f && s0 <= 1.0f && s1 >= 0.0f && s1 <= 1.0f && std::fabs((s0 + s1) - 1.0f) < 0.1f) { score = s1; } else { score = Softmax2(s0, s1); } if (score < cfg.conf_thresh) continue; // 解析位置 const Prior p = priors.empty() ? Prior{0, 0, 0, 0} : priors[static_cast(i)]; const float dx = loc_tensor.data[static_cast(i) * 4 + 0]; const float dy = loc_tensor.data[static_cast(i) * 4 + 1]; const float dw = loc_tensor.data[static_cast(i) * 4 + 2]; const float dh = loc_tensor.data[static_cast(i) * 4 + 3]; const float cx = p.cx + dx * var0 * p.w; const float cy = p.cy + dy * var0 * p.h; const float ww = p.w * std::exp(dw * var1); const float hh = p.h * std::exp(dh * var1); float x1 = (cx - ww * 0.5f) * static_cast(model_w); float y1 = (cy - hh * 0.5f) * static_cast(model_h); float x2 = (cx + ww * 0.5f) * static_cast(model_w); float y2 = (cy + hh * 0.5f) * static_cast(model_h); // 映射到原始图像 x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy; Rect bb; bb.x = static_cast(ClampInt(static_cast(x1), 0, src_w - 1)); bb.y = static_cast(ClampInt(static_cast(y1), 0, src_h - 1)); const float rx2 = static_cast(ClampInt(static_cast(x2), 0, src_w - 1)); const float ry2 = static_cast(ClampInt(static_cast(y2), 0, src_h - 1)); bb.w = std::max(0.0f, rx2 - bb.x); bb.h = std::max(0.0f, ry2 - bb.y); if (bb.w <= 1.0f || bb.h <= 1.0f) continue; boxes.push_back(bb); scores.push_back(score); // 解析关键点 if (has_landmarks) { std::array pts{}; for (int k = 0; k < 5; ++k) { const float lx = landm_tensor.data[static_cast(i) * 10 + k * 2 + 0]; const float ly = landm_tensor.data[static_cast(i) * 10 + k * 2 + 1]; const float px = (p.cx + lx * var0 * p.w) * static_cast(model_w) * sx; const float py = (p.cy + ly * var0 * p.h) * static_cast(model_h) * sy; pts[k].x = static_cast(ClampInt(static_cast(px), 0, src_w - 1)); pts[k].y = static_cast(ClampInt(static_cast(py), 0, src_h - 1)); } lmks.push_back(pts); } } if (boxes.empty()) return; // NMS std::vector keep; NmsSorted(boxes, scores, cfg.nms_thresh, keep); if (keep.empty()) return; // 构建输出 const int out_n = std::min(cfg.max_faces, static_cast(keep.size())); out.faces.reserve(static_cast(out_n)); for (int i = 0; i < out_n; ++i) { const int k = keep[static_cast(i)]; FaceDetItem item; item.bbox = boxes[static_cast(k)]; item.score = scores[static_cast(k)]; item.track_id = -1; if (has_landmarks && k < static_cast(lmks.size())) { item.has_landmarks = true; item.landmarks = lmks[static_cast(k)]; } out.faces.push_back(std::move(item)); } } // ============================================================================ // SCRFD 核心：检测结果解码 // SCRFD输出格式：9个张量 (3个尺度 × 3种类型：score, bbox, kps) // ============================================================================ struct ScrfdAnchor { float cx = 0.0f; float cy = 0.0f; int stride = 0; }; inline std::vector GenerateScrfdAnchors(int in_w, int in_h, const std::vector& strides) { std::vector anchors; anchors.reserve(20000); for (int stride : strides) { int fm_w = in_w / stride; int fm_h = in_h / stride; for (int y = 0; y < fm_h; ++y) { for (int x = 0; x < fm_w; ++x) { // SCRFD使用2个anchor per location for (int a = 0; a < 2; ++a) { anchors.push_back(ScrfdAnchor{ (x + 0.5f) * stride, (y + 0.5f) * stride, stride }); } } } } return anchors; } // 从NCHW格式提取值 inline float ExtractNCHW(const TensorView& t, int c, int h, int w, int C, int H, int W) { if (c < 0 || c >= C || h < 0 || h >= H || w < 0 || w >= W) return 0.0f; size_t idx = (static_cast(c) * H + h) * W + w; if (t.type == RKNN_TENSOR_FLOAT32) { const float* p = reinterpret_cast(t.data); return p[idx]; } else if (t.type == RKNN_TENSOR_INT8) { const int8_t* p = reinterpret_cast(t.data); return Dequant(p[idx], t.zp, t.scale); } else { const uint8_t* p = reinterpret_cast(t.data); return Dequant(p[idx], t.zp, t.scale); } } /** * 解码SCRFD检测结果 - 与 ai_scrfd 节点使用相同的逻辑 * * @param outputs 9个输出张量 [score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32] * @param anchors 预生成的anchor (center_x, center_y, stride) * @param src_w 原始图像宽度 * @param src_h 原始图像高度 * @param model_w 模型输入宽度 * @param model_h 模型输入高度 * @param conf_thresh 置信度阈值 * @param output_lm 是否输出关键点 * @param out 输出结果 */ inline void DecodeScrfd(const std::vector& outputs, const std::vector& anchors, int src_w, int src_h, int model_w, int model_h, float conf_thresh, bool output_lm, FaceDetResult& out) { if (outputs.size() != 9) return; // Output order: score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32 const int anchor_counts[] = {12800, 3200, 800}; const int strides[] = {8, 16, 32}; size_t anchor_idx = 0; float scale_x = static_cast(src_w) / model_w; float scale_y = static_cast(src_h) / model_h; for (int s = 0; s < 3; ++s) { int stride = strides[s]; int count = anchor_counts[s]; // 检查输出数据是否有效 if (outputs[s].type != RKNN_TENSOR_FLOAT32 || outputs[s + 3].type != RKNN_TENSOR_FLOAT32 || outputs[s + 6].type != RKNN_TENSOR_FLOAT32) { continue; } const float* scores = reinterpret_cast(outputs[s].data); const float* bboxes = reinterpret_cast(outputs[s + 3].data); const float* kps = reinterpret_cast(outputs[s + 6].data); if (!scores || !bboxes || !kps) continue; for (int i = 0; i < count; ++i) { if (anchor_idx >= anchors.size()) break; float score = scores[i]; if (score < conf_thresh) { anchor_idx++; continue; } const ScrfdAnchor& pt = anchors[anchor_idx]; // BBox: [left, top, right, bottom] - distances from center float left = bboxes[i * 4 + 0]; float top = bboxes[i * 4 + 1]; float right = bboxes[i * 4 + 2]; float bottom = bboxes[i * 4 + 3]; // Decode to image coordinates (640x640) float x1_640 = (pt.cx - left) * stride; float y1_640 = (pt.cy - top) * stride; float x2_640 = (pt.cx + right) * stride; float y2_640 = (pt.cy + bottom) * stride; FaceDetItem det; det.bbox.x = x1_640 * scale_x; det.bbox.y = y1_640 * scale_y; det.bbox.w = (x2_640 - x1_640) * scale_x; det.bbox.h = (y2_640 - y1_640) * scale_y; det.score = score; det.has_landmarks = output_lm; // Keypoints if (output_lm) { for (int p = 0; p < 5; ++p) { float kps_x = kps[i * 10 + p * 2 + 0]; float kps_y = kps[i * 10 + p * 2 + 1]; float kx_640 = (pt.cx + kps_x) * stride; float ky_640 = (pt.cy + kps_y) * stride; det.landmarks[p].x = kx_640 * scale_x; det.landmarks[p].y = ky_640 * scale_y; } } out.faces.push_back(det); anchor_idx++; } } } } // namespace face_detection } // namespace rk3588