新增滑动窗口节点,提高了人脸检测和识别能力

This commit is contained in:
haotian 2026-03-12 20:12:00 +08:00
parent d8c492f9ab
commit 8ae1893f5f
14 changed files with 2008 additions and 647 deletions

View File

@ -0,0 +1,311 @@
{
"queue": {
"size": 8,
"strategy": "drop_oldest"
},
"graphs": [
{
"name": "cam1_full_pipeline",
"nodes": [
{
"id": "in_cam1",
"type": "input_rtsp",
"role": "source",
"enable": true,
"url": "rtsp://10.0.0.49:8554/cam",
"fps": 30,
"width": 1920,
"height": 1080,
"use_mpp": true,
"use_ffmpeg": false,
"force_tcp": true,
"reconnect_sec": 5,
"reconnect_backoff_max_sec": 30
},
{
"id": "pre_face",
"type": "preprocess",
"role": "filter",
"enable": true,
"dst_w": 1920,
"dst_h": 1080,
"dst_format": "rgb",
"dst_packed": true,
"resize_mode": "stretch",
"keep_ratio": false,
"rga_gate": "ppe_detection",
"use_rga": true
},
{
"id": "face_det",
"type": "ai_scrfd_sliding",
"role": "filter",
"enable": true,
"infer_fps": 5,
"model_path": "./models/scrfd_500m_640.rknn",
"model_w": 640,
"model_h": 640,
"windows": [
{"x": 0, "y": 0, "w": 960, "h": 1080},
{"x": 960, "y": 0, "w": 960, "h": 1080}
],
"conf_thresh": 0.5,
"nms_thresh": 0.4,
"max_faces": 50,
"debug": {
"stats": true,
"stats_interval": 30
}
},
{
"id": "face_recog",
"type": "ai_face_recog",
"role": "filter",
"enable": true,
"model_path": "./models/mobilefacenet_arcface.rknn",
"align": true,
"emit_embedding": false,
"max_faces": 50,
"input_format": "rgb",
"input_dtype": "uint8",
"threshold": {
"accept": 0.45,
"margin": 0.05
},
"gallery": {
"backend": "sqlite",
"path": "./models/face_gallery.db",
"load_on_start": true,
"expected_dim": 128,
"dtype": "auto"
}
},
{
"id": "pre_yolo",
"type": "preprocess",
"role": "filter",
"enable": true,
"dst_w": 768,
"dst_h": 768,
"dst_format": "rgb",
"dst_packed": true,
"resize_mode": "stretch",
"keep_ratio": false,
"rga_gate": "ppe_detection",
"use_rga": true
},
{
"id": "yolo_ppe",
"type": "ai_yolo",
"role": "filter",
"enable": true,
"infer_fps": 5,
"model_path": "./models/best-768.rknn",
"model_version": "v8",
"model_w": 768,
"model_h": 768,
"num_classes": 11,
"conf": 0.25,
"nms": 0.45,
"debug": {
"stats": true,
"stats_interval": 30,
"detections": false
},
"class_filter": [3, 6, 10]
},
{
"id": "tracker",
"type": "tracker",
"role": "filter",
"enable": true,
"mode": "bytetrack_lite",
"per_class": true,
"state_key": "ppe_detection",
"track_classes": [3, 6, 10],
"ignore_classes": [],
"allowed_models": ["yolov8"],
"high_th": 0.5,
"low_th": 0.1,
"iou_th": 0.3,
"max_age_ms": 1500,
"min_hits": 2,
"max_tracks": 128
},
{
"id": "logic_boots",
"type": "logic_gate",
"role": "filter",
"enable": true,
"mode": "ppe_boots_check",
"anchor_class": 6,
"boots_class": 3,
"color_check": {
"enable": true,
"method": "hsv",
"dark_threshold": 80,
"roi_expand": 1.0
},
"debug": false
},
{
"id": "pre_osd",
"type": "preprocess",
"role": "filter",
"enable": true,
"dst_w": 1920,
"dst_h": 1080,
"dst_format": "nv12",
"resize_mode": "stretch",
"rga_gate": "ppe_detection",
"use_rga": true
},
{
"id": "osd",
"type": "osd",
"role": "filter",
"enable": true,
"draw_bbox": true,
"draw_text": true,
"draw_face_det": true,
"draw_face_bbox": true,
"line_width": 2,
"font_scale": 1,
"use_rga_bbox": false,
"labels": ["helmet", "gloves", "vest", "boots", "goggles", "none", "Person", "no_helmet", "no_goggle", "no_gloves", "no_boots", "violation"]
},
{
"id": "publish",
"type": "publish",
"role": "filter",
"enable": true,
"queue": {"size": 2, "policy": "drop_oldest"},
"codec": "h264",
"fps": 30,
"gop": 60,
"bitrate_kbps": 4000,
"use_mpp": true,
"use_ffmpeg_mux": true,
"outputs": [
{
"proto": "hls",
"path": "./web/hls/cam1/index.m3u8",
"segment_sec": 2
},
{
"proto": "rtsp_server",
"port": 8555,
"path": "/live/cam1"
}
]
},
{
"id": "alarm",
"type": "alarm",
"role": "sink",
"enable": true,
"eval_fps": 10,
"labels": ["helmet", "gloves", "vest", "boots", "goggles", "none", "Person", "no_helmet", "no_goggle", "no_gloves", "no_boots", "violation"],
"rules": [
{
"name": "non_compliant_boots",
"class_ids": [10],
"roi": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
"min_score": 0.3,
"min_box_area_ratio": 0.01,
"require_track_id": true,
"min_duration_ms": 800,
"min_hits": 2,
"hit_window_ms": 1000,
"cooldown_ms": 5000,
"per_track_cooldown_ms": 5000
}
],
"face_rules": [
{
"name": "unknown_face",
"type": "unknown",
"cooldown_ms": 7000,
"min_sim": 0.35,
"min_hits": 2,
"hit_window_ms": 1500,
"min_face_area_ratio": 0.01,
"min_face_aspect": 0.6,
"max_face_aspect": 1.6
},
{
"name": "known_person",
"type": "person",
"cooldown_ms": 7000,
"min_sim": 0.6,
"min_hits": 2,
"hit_window_ms": 1500,
"min_face_area_ratio": 0.01,
"min_face_aspect": 0.6,
"max_face_aspect": 1.6
}
],
"actions": {
"log": {
"enable": true,
"level": "info"
},
"snapshot": {
"enable": true,
"format": "jpg",
"quality": 85,
"upload": {
"type": "minio",
"endpoint": "http://10.0.0.49:9000",
"bucket": "myminio",
"region": "us-east-1",
"access_key": "minioadmin",
"secret_key": "minioadmin"
}
},
"clip": {
"enable": true,
"pre_sec": 5,
"post_sec": 10,
"format": "mp4",
"fps": 30,
"upload": {
"type": "minio",
"endpoint": "http://10.0.0.49:9000",
"bucket": "myminio",
"region": "us-east-1",
"access_key": "minioadmin",
"secret_key": "minioadmin"
}
},
"external_api": {
"enable": true,
"getTokenUrl": "http://10.0.0.49:8080/api/getToken",
"putMessageUrl": "http://10.0.0.49:8080/api/putMessage",
"tenantCode": "32",
"channelNo": "cam1",
"timeout_ms": 3000,
"include_media_url": true,
"token_header": "X-Access-Token",
"token_json_path": "responseBody.token",
"token_cache_sec": 1200
}
}
}
],
"edges": [
["in_cam1", "pre_face"],
["pre_face", "face_det"],
["face_det", "face_recog"],
["face_recog", "pre_yolo"],
["pre_yolo", "yolo_ppe"],
["yolo_ppe", "tracker"],
["tracker", "logic_boots"],
["logic_boots", "osd"],
["osd", "pre_osd"],
["pre_osd", "publish"],
["publish", "alarm"]
]
}
]
}

View File

@ -14,8 +14,8 @@
"enable": true,
"url": "rtsp://10.0.0.49:8554/cam",
"fps": 30,
"width": 1280,
"height": 720,
"width": 1920,
"height": 1080,
"use_mpp": true,
"use_ffmpeg": false,
"force_tcp": true,
@ -33,7 +33,7 @@
"dst_packed": true,
"resize_mode": "stretch",
"keep_ratio": false,
"rga_gate": "scrfd_640_test",
"rga_gate": "scrfd_1080p",
"use_rga": true
},
{
@ -42,9 +42,9 @@
"role": "filter",
"enable": true,
"model_path": "./models/scrfd_500m_640.rknn",
"conf_thresh": 0.5,
"conf_thresh": 0.3,
"nms_thresh": 0.4,
"max_faces": 10,
"max_faces": 50,
"output_landmarks": true,
"input_format": "rgb"
},
@ -67,11 +67,11 @@
"type": "preprocess",
"role": "filter",
"enable": true,
"dst_w": 1280,
"dst_h": 720,
"dst_w": 1920,
"dst_h": 1080,
"dst_format": "nv12",
"resize_mode": "stretch",
"rga_gate": "scrfd_640_test",
"rga_gate": "scrfd_1080p",
"use_rga": true
},
{
@ -83,13 +83,13 @@
"codec": "h264",
"fps": 30,
"gop": 60,
"bitrate_kbps": 2000,
"bitrate_kbps": 4000,
"use_mpp": true,
"use_ffmpeg_mux": true,
"outputs": [
{
"proto": "hls",
"path": "./web/hls/scrfd/index.m3u8",
"path": "./web/hls/cam1/index.m3u8",
"segment_sec": 2
},
{

View File

@ -0,0 +1,136 @@
{
"queue": {
"size": 8,
"strategy": "drop_oldest"
},
"graphs": [
{
"name": "scrfd_640_recog_test",
"nodes": [
{
"id": "in_cam1",
"type": "input_rtsp",
"role": "source",
"enable": true,
"url": "rtsp://10.0.0.49:8554/cam",
"fps": 30,
"width": 1920,
"height": 1080,
"use_mpp": true,
"use_ffmpeg": false,
"force_tcp": true,
"reconnect_sec": 5,
"reconnect_backoff_max_sec": 30
},
{
"id": "pre_cam1",
"type": "preprocess",
"role": "filter",
"enable": true,
"dst_w": 640,
"dst_h": 640,
"dst_format": "rgb",
"dst_packed": true,
"resize_mode": "stretch",
"keep_ratio": false,
"rga_gate": "scrfd_1080p",
"use_rga": true
},
{
"id": "scrfd",
"type": "ai_scrfd",
"role": "filter",
"enable": true,
"model_path": "./models/scrfd_500m_640.rknn",
"conf_thresh": 0.3,
"nms_thresh": 0.4,
"max_faces": 50,
"output_landmarks": true,
"input_format": "rgb"
},
{
"id": "face_recog",
"type": "ai_face_recog",
"role": "filter",
"enable": true,
"model_path": "./models/mobilefacenet_arcface.rknn",
"align": true,
"emit_embedding": false,
"max_faces": 50,
"input_format": "rgb",
"input_dtype": "uint8",
"threshold": {
"accept": 0.45,
"margin": 0.05
},
"gallery": {
"backend": "sqlite",
"path": "./models/face_gallery.db",
"load_on_start": true,
"expected_dim": 512,
"dtype": "auto"
}
},
{
"id": "osd_cam1",
"type": "osd",
"role": "filter",
"enable": true,
"draw_bbox": true,
"draw_text": true,
"draw_face_det": true,
"draw_face_bbox": true,
"line_width": 2,
"font_scale": 1,
"use_rga_bbox": false,
"labels": ["face"]
},
{
"id": "post_cam1",
"type": "preprocess",
"role": "filter",
"enable": true,
"dst_w": 1920,
"dst_h": 1080,
"dst_format": "nv12",
"resize_mode": "stretch",
"rga_gate": "scrfd_1080p",
"use_rga": true
},
{
"id": "pub_cam1",
"type": "publish",
"role": "filter",
"enable": true,
"queue": {"size": 2, "policy": "drop_oldest"},
"codec": "h264",
"fps": 30,
"gop": 60,
"bitrate_kbps": 4000,
"use_mpp": true,
"use_ffmpeg_mux": true,
"outputs": [
{
"proto": "hls",
"path": "./web/hls/cam1/index.m3u8",
"segment_sec": 2
},
{
"proto": "rtsp_server",
"port": 8555,
"path": "/live/cam1"
}
]
}
],
"edges": [
["in_cam1", "pre_cam1"],
["pre_cam1", "scrfd"],
["scrfd", "face_recog"],
["face_recog", "osd_cam1"],
["osd_cam1", "post_cam1"],
["post_cam1", "pub_cam1"]
]
}
]
}

View File

@ -0,0 +1,139 @@
{
"queue": {
"size": 8,
"strategy": "drop_oldest"
},
"graphs": [
{
"name": "scrfd_sliding_1080p_recog",
"nodes": [
{
"id": "in_cam1",
"type": "input_rtsp",
"role": "source",
"enable": true,
"url": "rtsp://10.0.0.49:8554/cam",
"fps": 30,
"width": 1920,
"height": 1080,
"use_mpp": true,
"use_ffmpeg": false,
"force_tcp": true,
"reconnect_sec": 5,
"reconnect_backoff_max_sec": 30
},
{
"id": "pre_cam1",
"type": "preprocess",
"role": "filter",
"enable": true,
"dst_w": 1920,
"dst_h": 1080,
"dst_format": "rgb",
"dst_packed": true,
"resize_mode": "stretch",
"keep_ratio": false,
"rga_gate": "scrfd_sliding_1080p_recog",
"use_rga": true
},
{
"id": "scrfd_sliding",
"type": "ai_scrfd_sliding",
"role": "filter",
"enable": true,
"model_path": "./models/scrfd_500m_640.rknn",
"conf_thresh": 0.5,
"nms_thresh": 0.4,
"max_faces": 50,
"output_landmarks": true,
"windows": [
{"x": 0, "y": 0, "w": 960, "h": 1080},
{"x": 960, "y": 0, "w": 960, "h": 1080}
]
},
{
"id": "face_recog",
"type": "ai_face_recog",
"role": "filter",
"enable": true,
"model_path": "./models/mobilefacenet_arcface.rknn",
"align": true,
"emit_embedding": false,
"max_faces": 50,
"input_format": "rgb",
"input_dtype": "uint8",
"threshold": {
"accept": 0.45,
"margin": 0.05
},
"gallery": {
"backend": "sqlite",
"path": "./models/face_gallery.db",
"load_on_start": true,
"expected_dim": 512,
"dtype": "auto"
}
},
{
"id": "osd_cam1",
"type": "osd",
"role": "filter",
"enable": true,
"draw_bbox": true,
"draw_text": true,
"draw_face_det": true,
"draw_face_bbox": true,
"line_width": 2,
"font_scale": 1,
"use_rga_bbox": false,
"labels": ["face"]
},
{
"id": "post_cam1",
"type": "preprocess",
"role": "filter",
"enable": true,
"dst_w": 1920,
"dst_h": 1080,
"dst_format": "nv12",
"resize_mode": "stretch",
"rga_gate": "scrfd_sliding_1080p_recog",
"use_rga": true
},
{
"id": "pub_cam1",
"type": "publish",
"role": "filter",
"enable": true,
"queue": {"size": 2, "policy": "drop_oldest"},
"codec": "h264",
"fps": 30,
"gop": 60,
"bitrate_kbps": 4000,
"use_mpp": true,
"use_ffmpeg_mux": true,
"outputs": [
{
"proto": "hls",
"path": "./web/hls/cam1/index.m3u8",
"segment_sec": 2
},
{
"proto": "rtsp_server",
"port": 8555,
"path": "/live/cam1"
}
]
}
],
"edges": [
["in_cam1", "pre_cam1"],
["pre_cam1", "scrfd_sliding"],
["scrfd_sliding", "face_recog"],
["face_recog", "osd_cam1"],
["osd_cam1", "post_cam1"],
["post_cam1", "pub_cam1"]
]
}
]
}

View File

@ -0,0 +1,727 @@
# 人脸检测参数配置指南
本文档详细说明人脸检测节点(`ai_face_det`, `ai_scrfd`, `ai_scrfd_sliding`)中的核心后处理参数及其对检测结果的影响。
---
## 参数概览
| 参数名 | 类型 | 默认值 | 范围 | 说明 |
|--------|------|--------|------|------|
| `conf_thresh` | float | 0.5/0.6/0.7 | 0.0 ~ 1.0 | 置信度阈值,过滤低置信度候选框 |
| `nms_thresh` | float | 0.4 | 0.0 ~ 1.0 | NMS IoU 阈值,控制重复框去重力度 |
| `max_faces` | int | 10/50 | ≥ 1 | 单帧最大返回人脸数 |
---
## 1. conf_thresh (置信度阈值)
### 含义
模型对检测到的人脸的置信度confidence score阈值。神经网络在推理时会对每个候选框输出一个置信度分数表示该位置存在人脸的确定性程度。
### 对检测结果的影响
| 设置 | 效果 | 适用场景 |
|------|------|----------|
| **调高** (如 0.7) | 减少误检,只保留高置信度人脸 | 高精度需求场景(门禁、考勤) |
| **调低** (如 0.2) | 增加检出率,更多弱特征人脸被检测 | 弱光、远距离、小目标场景 |
### 注意事项
- 值过高可能导致**漏检**:模糊人脸、侧脸、小人脸可能被过滤
- 值过低可能导致**误检**:背景中的类似人脸的纹理可能被误判
### 代码实现
在 SCRFD 后处理中 (`ai_scrfd_node.cpp:282`)
```cpp
if (score < cfg_.conf_thresh) continue;
```
在 RetinaFace 后处理中 (`ai_face_det_node.cpp:784`)
```cpp
if (score < cfg.conf_thresh) continue;
```
---
## 2. nms_thresh (NMS IoU 阈值)
### 含义
非极大值抑制Non-Maximum Suppression的 IoUIntersection over Union交并比阈值。
**NMS 的作用**:同一个真实人脸可能被多个 anchor/候选框检测到NMS 用于去除重叠的重复检测框,只保留最优的一个。
**IoU 计算**
```
IoU = 两个框的交集面积 / 两个框的并集面积
```
### 对检测结果的影响
| 设置 | 效果 | 适用场景 |
|------|------|----------|
| **调高** (如 0.6) | 保留更多重叠框,对密集人脸友好 | 多人密集场景(会议室、教室) |
| **调低** (如 0.3) | 严格去重,只保留最优框 | 单人场景、需要精确框选 |
### 注意事项
- 值过高:同一人脸可能返回多个重叠框
- 值过低:密集人脸场景可能误删相邻的不同人脸
### 代码实现
在 SCRFD 后处理中 (`ai_scrfd_node.cpp:172`)
```cpp
detections = ApplyNMS(detections, cfg_.nms_thresh);
```
NMS 算法逻辑 (`ai_face_det_node.cpp:156-167`)
```cpp
void NmsSorted(const std::vector<Rect>& boxes, const std::vector<float>& scores,
float nms_thresh, std::vector<int>& keep) {
for (...) {
bool suppressed = false;
for (int kept : keep) {
if (IoU(boxes[idx], boxes[kept]) > nms_thresh) {
suppressed = true; // 被已保留的框抑制
break;
}
}
if (!suppressed) keep.push_back(idx);
}
}
```
---
## 3. max_faces (最大人脸数)
### 含义
单帧图像中最多返回的人脸检测数量限制。
### 对检测结果的影响
| 设置 | 效果 | 性能影响 |
|------|------|----------|
| **调高** (如 50) | 可检测更多人脸,不遗漏密集场景目标 | 增加后处理开销RGA/OSD 绘制负载增大 |
| **调低** (如 5) | 仅保留置信度最高的前几个人脸 | 减少计算量,提升实时性 |
### 注意事项
- 当画面中出现超过 `max_faces` 数量的人脸时,系统会按置信度排序,只保留前 N 个
- 设置过大可能导致 RGA 任务堆积,引起 OSD 绘制卡顿
### 代码实现
在 SCRFD 后处理中 (`ai_scrfd_node.cpp:174-176`)
```cpp
if (detections.size() > static_cast<size_t>(cfg_.max_faces)) {
detections.resize(cfg_.max_faces);
}
```
在 RetinaFace 后处理中 (`ai_face_det_node.cpp:840`)
```cpp
const int out_n = std::min<int>(cfg.max_faces, static_cast<int>(keep.size()));
```
---
## 推荐配置
### 按应用场景
| 场景 | conf_thresh | nms_thresh | max_faces | 说明 |
|------|-------------|------------|-----------|------|
| **高精度门禁/考勤** | 0.6 ~ 0.7 | 0.4 | 5 ~ 10 | 减少误识别,确保准确率 |
| **多人大场景** (会议室/教室) | 0.4 ~ 0.5 | 0.4 ~ 0.5 | 20 ~ 50 | 平衡检出率和去重效果 |
| **实时性优先** | 0.5 | 0.4 | 10 | 减少后处理开销 |
| **弱光/远距离/小目标** | 0.3 ~ 0.4 | 0.3 | 10 ~ 20 | 提高检出率,但需容忍一定误检 |
| **单人视频通话** | 0.6 | 0.4 | 1 ~ 3 | 最小化处理开销 |
### 按硬件性能
| 设备性能 | max_faces 建议 | 优化策略 |
|----------|----------------|----------|
| **RK3588 高性能模式** | 20 ~ 50 | 可同时处理多路高清视频 |
| **RK3588 平衡模式** | 10 ~ 20 | 适当降低分辨率和检测频率 |
| **RK3566/RK3568** | 5 ~ 10 | 降低输入分辨率,提高 conf_thresh 减少候选框 |
---
## 参数联动关系
这三个参数需要协同调整:
1. **提高 `conf_thresh`** → 候选框数量减少 → 可降低 `max_faces` → NMS 压力减小
2. **降低 `conf_thresh`** → 候选框数量增加 → 可能需要提高 `max_faces` → NMS 压力增大
3. **密集场景**:适当提高 `nms_thresh` 避免误删相邻人脸,同时确保 `max_faces` 足够大
---
## 配置示例
### SCRFD 配置 (`ai_scrfd` 节点)
```json
{
"type": "ai_scrfd",
"model_path": "./models/scrfd_500m_640.rknn",
"conf_thresh": 0.5,
"nms_thresh": 0.4,
"max_faces": 50,
"output_landmarks": true,
"input_format": "rgb"
}
```
### RetinaFace 配置 (`ai_face_det` 节点)
```json
{
"type": "ai_face_det",
"model_path": "./models/RetinaFace_mobile320.rknn",
"conf": 0.7,
"nms": 0.4,
"max_faces": 10,
"output_landmarks": true,
"input_format": "rgb"
}
```
### 分区域检测配置 (`ai_face_det_zoned` 节点)
```json
{
"type": "ai_face_det_zoned",
"model_path": "./models/RetinaFace_mobile320.rknn",
"conf": 0.6,
"nms": 0.4,
"max_faces": 10,
"output_landmarks": true
}
```
---
## 调试建议
1. **先调 conf_thresh**:从默认值开始,观察是否漏检或误检
2. **再调 nms_thresh**:在密集人脸场景测试,确保既不重复框选也不漏检
3. **最后调 max_faces**:根据实际场景人数和硬件性能调整
### 日志查看
启动时节点会打印当前参数:
```
[ai_face_det] start id=face_det conf=0.7 nms=0.4 max_faces=10
[ai_scrfd] start id=scrfd conf=0.5 nms=0.4 max_faces=50
```
---
## 常见问题
### Q1: 为什么检测到的人脸框会抖动/闪烁?
**可能原因**
- `conf_thresh` 设置过低,边缘候选框置信度波动导致时有时无
- `nms_thresh` 过低,相邻帧选择不同的 anchor
**解决方法**:适当提高 `conf_thresh` 或调整 `nms_thresh`
### Q2: 密集场景漏检严重怎么办?
**解决方法**
- 降低 `conf_thresh` 到 0.4 左右
- 提高 `max_faces` 到 30 以上
- 适当提高 `nms_thresh` 到 0.5,避免相邻人脸被抑制
### Q3: OSD 绘制卡顿RGA 任务堆积?
**解决方法**
- 降低 `max_faces` 减少绘制负载
- 提高 `conf_thresh` 减少检测数量
---
## 四、人脸识别参数 (`ai_face_recog`)
`ai_face_recog` 节点接收人脸检测结果,提取人脸特征向量并与特征库进行比对,完成人脸识别。
### 4.1 参数概览
| 参数名 | 类型 | 默认值 | 说明 |
|--------|------|--------|------|
| `align` | bool | true | 是否使用5点关键点进行人脸对齐 |
| `emit_embedding` | bool | false | 是否输出特征向量(用于调试) |
| `max_faces` | int | 10 | 单帧最大处理人脸数 |
| `input_format` | string | "rgb" | 输入图像格式rgb/bgr |
| `input_dtype` | string | "uint8" | 输入数据类型uint8/float |
| `threshold.accept` | float | 0.45 | 识别通过阈值,相似度超过此值才接受 |
| `threshold.margin` | float | 0.05 | 边距阈值,最佳与次佳匹配的差距要求 |
| `gallery.backend` | string | "sqlite" | 人脸库后端类型 |
| `gallery.path` | string | "./models/face_gallery.db" | 人脸库文件路径 |
---
### 4.2 align (人脸对齐)
#### 含义
是否使用检测到的5个面部关键点眼睛、鼻子、嘴角进行人脸对齐变换。
#### 对识别效果的影响
| 设置 | 效果 | 适用场景 |
|------|------|----------|
| **true** | 对齐后人脸姿态归一化,提高识别准确率 | 高位摄像头、角度倾斜、侧脸场景 |
| **false** | 直接裁剪人脸区域,计算量略小 | 正面、固定位置场景 |
#### 对齐原理
使用5点关键点与标准模板进行相似变换Similarity Transform
- 标准模板坐标112x112输入左眼(38.29,51.70)、右眼(73.53,51.50)、鼻尖(56.02,71.74)、左嘴角(41.55,92.37)、右嘴角(70.73,92.20)
- 代码实现:`ai_face_recog_node.cpp:851-865`
```cpp
if (cfg->align && face.has_landmarks && model_w_ == 112 && model_h_ == 112) {
const std::array<Point2f, 5> dst = { ... }; // 标准模板
SimilarityTransform t;
InvTransform inv;
if (ComputeSimilarity(face.landmarks, dst, t) && InvertSimilarity(t, inv)) {
WarpFace(src, w, h, stride, inv, face_buf_.data(), model_w_, model_h_, need_swap);
}
}
```
---
### 4.3 threshold.accept (接受阈值)
#### 含义
特征向量相似度阈值,范围 `0.0 ~ 1.0`。只有当待识别人脸与库中某人的相似度超过此值时,才认为是匹配成功。
#### 对识别结果的影响
| 设置 | 效果 | 误识率 | 拒识率 |
|------|------|--------|--------|
| **调高** (如 0.55) | 更严格,只接受高度相似 | 低 | 高 |
| **调低** (如 0.35) | 更宽松,容易匹配 | 高 | 低 |
#### 推荐值
| 场景 | 推荐值 | 说明 |
|------|--------|------|
| **高安全性场景** | 0.50 ~ 0.55 | 门禁、支付,严格控制误识 |
| **一般场景** | 0.45 ~ 0.50 | 考勤、签到,平衡准确率和体验 |
| **快速通行场景** | 0.40 ~ 0.45 | 闸机、通道,减少拒识 |
#### 代码实现
```cpp
const bool accept = (sr.best_person_id >= 0) &&
(sr.best_sim >= cfg->thr_accept) &&
((cfg->thr_margin <= 0.0f) || ((sr.best_sim - sr.second_sim) >= cfg->thr_margin));
```
---
### 4.4 threshold.margin (边距阈值)
#### 含义
要求最佳匹配与次佳匹配的相似度差距至少达到此值,用于排除模糊匹配(如两个人都很像的情况)。设为 `0` 或负数可禁用此检查。
#### 作用示例
假设待识别人脸与库中人员相似度如下:
- 张三(最佳): 0.62
- 李四(次佳): 0.58
- 差距: 0.04
如果 `margin = 0.05`,则 0.04 < 0.05匹配失败标记为 unknown
如果 `margin = 0.03`,则 0.04 > 0.03,匹配成功(识别为张三)
#### 推荐值
- **0.05**(默认):适合大多数人脸库
- **0.00** 或负数:禁用边距检查,只依赖 accept 阈值
---
### 4.5 max_faces (最大处理人脸数)
#### 含义
单帧最多处理的人脸数量。由于特征提取需要 NPU 推理,此参数直接影响处理延迟。
#### 与检测节点 max_faces 的关系
```
实际处理数 = min(face_det.max_faces, face_recog.max_faces)
```
建议两个节点的 `max_faces` 保持一致或识别节点略小。
---
### 4.6 gallery (人脸库配置)
#### 参数说明
| 参数 | 默认值 | 说明 |
|------|--------|------|
| `backend` | "sqlite" | 后端类型,目前仅支持 sqlite |
| `path` | "./models/face_gallery.db" | 人脸库数据库文件路径 |
| `load_on_start` | true | 启动时加载到内存 |
| `expected_dim` | 512 | 特征向量维度MobileFaceNet 为 512 |
| `dtype` | "auto" | 数据类型auto/float32 |
#### 人脸库管理
人脸库使用 SQLite 存储,包含以下信息:
- `person_id`人员唯一ID
- `name`:人员名称
- `embedding`特征向量512维浮点数
- 可通过 Web 管理接口或脚本添加/删除/更新人脸
---
### 4.7 normalize (输入归一化)
#### 两种归一化方式
**方式一:缩放+偏移(简单)**
```json
{
"normalize": {
"scale": 0.0078125,
"bias": 0.0
}
}
```
公式:`output = input * scale + bias`
**方式二:均值+标准差(标准)**
```json
{
"normalize": {
"mean": [127.5, 127.5, 127.5],
"std": [128.0, 128.0, 128.0]
}
}
```
公式:`output = (input - mean) / std`
#### 默认值
MobileFaceNet 模型通常使用:
- `scale`: 1.0(不对 uint8 输入做缩放,由模型内部处理)
- 或 `mean: [127.5,127.5,127.5], std: [127.5,127.5,127.5]` 归一化到 [-1, 1]
---
### 4.8 人脸识别配置示例
```json
{
"id": "face_recog",
"type": "ai_face_recog",
"role": "filter",
"enable": true,
"model_path": "./models/mobilefacenet_arcface.rknn",
"align": true,
"emit_embedding": false,
"max_faces": 50,
"input_format": "rgb",
"input_dtype": "uint8",
"threshold": {
"accept": 0.45,
"margin": 0.05
},
"gallery": {
"backend": "sqlite",
"path": "./models/face_gallery.db",
"load_on_start": true,
"expected_dim": 512,
"dtype": "auto"
}
}
```
---
### 4.9 检测+识别完整流程配置
```json
{
"graphs": [{
"nodes": [
{
"id": "scrfd",
"type": "ai_scrfd",
"conf_thresh": 0.3,
"nms_thresh": 0.4,
"max_faces": 50,
"output_landmarks": true
},
{
"id": "face_recog",
"type": "ai_face_recog",
"align": true,
"max_faces": 50,
"threshold": { "accept": 0.45, "margin": 0.05 },
"gallery": { "path": "./models/face_gallery.db" }
},
{
"id": "osd",
"type": "osd",
"draw_face_det": true,
"draw_face_bbox": true
}
],
"edges": [
["scrfd", "face_recog"],
["face_recog", "osd"]
]
}]
}
```
---
## 五、滑动窗口检测参数 (`ai_scrfd_sliding`)
`ai_scrfd_sliding` 是专为**高分辨率视频**设计的滑动窗口检测节点,通过将画面分割成多个窗口分别检测,有效提升远处小目标的检出率。
### 5.1 节点特性
| 特性 | 说明 |
|------|------|
| **原始分辨率输入** | 直接接收原始图像,保留更多细节 |
| **滑动窗口检测** | 将画面分割成多个窗口,分别检测后合并结果 |
| **近似宽高比** | 每个窗口按自身区域 resize 到 640x640宽高比轻微变形但可接受 |
| **窗口可配置** | 支持自定义窗口数量和位置 |
### 5.2 参数说明
| 参数 | 类型 | 默认值 | 说明 |
|------|------|--------|------|
| `model_path` | string | - | SCRFD 模型路径 |
| `conf_thresh` | float | 0.3 | 置信度阈值 |
| `nms_thresh` | float | 0.4 | NMS IoU 阈值 |
| `max_faces` | int | 50 | 最大检测人脸数 |
| `output_landmarks` | bool | true | 是否输出5点关键点 |
| `windows` | array | 自动计算 | 窗口配置数组 |
### 5.3 窗口配置 (`windows`)
如果不配置 `windows`,节点会根据输入分辨率自动计算窗口。
**窗口格式**
```json
{
"x": 0, // 窗口左上角 X 坐标
"y": 0, // 窗口左上角 Y 坐标
"w": 960, // 窗口宽度
"h": 1080 // 窗口高度
}
```
**窗口设计原则**
- 窗口之间应有适当重叠,避免漏检
- 窗口尺寸建议接近 640x640 的倍数resize 后变形较小)
- 对于 16:9 视频,水平分割效果较好
### 5.4 不同分辨率配置参考
#### 1080p (1920×1080) - 推荐2窗口
```json
{
"windows": [
{"x": 0, "y": 0, "w": 960, "h": 1080},
{"x": 960, "y": 0, "w": 960, "h": 1080}
]
}
```
**说明**
- 窗口 0左半边 960x1080
- 窗口 1右半边 960x1080
- 正好覆盖 1920 宽度,无重叠
- 每个窗口 resize 到 640x640比例 0.89:1
#### 1440p (2560×1440) - 推荐2窗口
```json
{
"windows": [
{"x": 0, "y": 0, "w": 1280, "h": 1440},
{"x": 1280, "y": 0, "w": 1280, "h": 1440}
]
}
```
**说明**
- 窗口 0左半边 1280x1440
- 窗口 1右半边 1280x1440
- 比例 0.89:1与 1080p 一致
#### 更高分辨率 - 增加窗口数
对于 4K (3840×2160) 等更高分辨率,可以增加窗口数量:
```json
{
"windows": [
{"x": 0, "y": 0, "w": 1280, "h": 1080},
{"x": 1280, "y": 0, "w": 1280, "h": 1080},
{"x": 2560, "y": 0, "w": 1280, "h": 1080}
]
}
```
### 5.5 配置示例
```json
{
"id": "scrfd_sliding",
"type": "ai_scrfd_sliding",
"role": "filter",
"enable": true,
"model_path": "./models/scrfd_500m_640.rknn",
"conf_thresh": 0.3,
"nms_thresh": 0.4,
"max_faces": 50,
"output_landmarks": true,
"windows": [
{"x": 0, "y": 0, "w": 960, "h": 1080},
{"x": 960, "y": 0, "w": 960, "h": 1080}
]
}
```
### 5.6 性能考量
- **窗口数 = 推理次数**2 个窗口 = 2 次模型推理
- **分辨率越高,窗口数越多**:需要在检测效果和性能之间平衡
- **建议窗口数**
- 1080p2 个窗口
- 1440p2 个窗口(或 4 个窗口用于更精细检测)
- 4K3-4 个窗口
### 5.7 滑动窗口检测常见问题
#### Q7: 窗口边缘的人脸被分割成两半?
**解决方法**
- 增加窗口重叠区域(如窗口 0 结束于 1000窗口 1 开始于 900
- NMS 会自动合并重复检测
#### Q8: 远处人脸还是检测不到?
**解决方法**
- 增加窗口数量,让每个窗口覆盖更小区域
- 降低 `conf_thresh` 让更多候选框通过
- 考虑使用更高分辨率摄像头
#### Q9: 检测延迟增加?
**解决方法**
- 减少窗口数量
- 降低 `max_faces` 减少后处理负担
- 使用更高性能硬件
---
## 六、综合配置建议
### 6.1 场景配置速查表
| 场景 | 检测节点 | 关键参数 | 说明 |
|------|----------|----------|------|
| **门禁/考勤** | `ai_face_det` | conf=0.7, max_faces=5 | 近距离,高精度 |
| **车间/厂房** | `ai_scrfd_sliding` | 2窗口 | 高位摄像头,大透视 |
| **会议室** | `ai_scrfd` | conf=0.4, max_faces=50 | 多人场景 |
| **户外/街道** | `ai_scrfd_sliding` | 2-4窗口 | 远距离检测 |
### 6.2 分辨率配置对照表
| 分辨率 | 检测节点 | 输入处理 | 建议 |
|--------|----------|----------|------|
| 720p | `ai_scrfd` | 前置缩放至640 | 通用配置 |
| 1080p | `ai_scrfd_sliding` | 2窗口(960x1080) | 滑动窗口检测 |
| 1440p | `ai_scrfd_sliding` | 2窗口(1280x1440) | 滑动窗口检测 |
| 4K | `ai_scrfd_sliding` | 3-4窗口 | 更多窗口提升精度 |
---
## 七、常见问题汇总
### Q1: 检测框抖动/闪烁
**可能原因**
- `conf_thresh` 设置过低,边缘候选框置信度波动
- `nms_thresh` 过低,相邻帧选择不同 anchor
**解决方法**:适当提高 `conf_thresh` 或调整 `nms_thresh`
### Q2: 密集场景漏检严重?
**解决方法**
- 降低 `conf_thresh` 到 0.4 左右
- 提高 `max_faces` 到 30 以上
- 适当提高 `nms_thresh` 到 0.5
### Q3: OSD 绘制卡顿?
**解决方法**
- 降低 `max_faces` 减少绘制负载
- 提高 `conf_thresh` 减少检测数量
### Q4: 识别准确率不高?
**可能原因及解决方法**
1. **对齐问题**:确保 `align: true`,且检测节点 `output_landmarks: true`
2. **阈值不合适**:调整 `threshold.accept`,根据实际测试确定最佳值
3. **人脸库质量**:确保库中人脸照片清晰、正面、光线均匀
4. **检测框质量**:适当提高检测 `conf_thresh`,过滤低质量检测框
### Q5: 远距离/小目标识别效果差?
**解决方法**
- 提高检测 `conf_thresh`,让只有清晰的人脸进入识别
- 检查摄像头分辨率,确保人脸区域至少 60x60 像素
- 考虑使用更高清的摄像头或调整安装角度
### Q6: 识别延迟高?
**优化方法**
- 降低 `max_faces`,减少单帧处理数量
- 提高检测 `conf_thresh`,减少候选框
- 确保 `gallery.load_on_start: true`,避免运行时查询数据库
---
## 相关文档
- [SCRFD 模型规格说明](../scrfd_500m_640_spec.md)
- [YOLO 检测参数配置](../config_guide.md)
- [DAG 节点与边说明](./dag_graph_node_edge.md)
- [MobileFaceNet 模型说明](../models.md)

View File

@ -542,141 +542,98 @@ inline float ExtractNCHW(const TensorView& t, int c, int h, int w, int C, int H,
}
/**
* SCRFD检测结果
* SCRFD检测结果 - ai_scrfd 使
*
* @param outputs 9 [score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32]
* @param anchors anchor
* @param anchors anchor (center_x, center_y, stride)
* @param src_w
* @param src_h
* @param model_w
* @param model_h
* @param cfg
* @param conf_thresh
* @param output_lm
* @param out
*/
inline void DecodeScrfd(const std::vector<TensorView>& outputs,
const std::vector<ScrfdAnchor>& anchors,
int src_w, int src_h,
int model_w, int model_h,
const DetectionConfig& cfg,
float conf_thresh,
bool output_lm,
FaceDetResult& out) {
if (outputs.size() != 9) {
return; // SCRFD需要9个输出
}
if (outputs.size() != 9) return;
const float sx = static_cast<float>(src_w) / static_cast<float>(model_w);
const float sy = static_cast<float>(src_h) / static_cast<float>(model_h);
std::vector<Rect> boxes;
std::vector<float> scores;
std::vector<std::array<Point2f, 5>> lmks;
// Output order: score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32
const int anchor_counts[] = {12800, 3200, 800};
const int strides[] = {8, 16, 32};
size_t anchor_idx = 0;
const int strides[3] = {8, 16, 32};
float scale_x = static_cast<float>(src_w) / model_w;
float scale_y = static_cast<float>(src_h) / model_h;
for (int s = 0; s < 3; ++s) {
int score_idx = s;
int bbox_idx = s + 3;
int kps_idx = s + 6;
int stride = strides[s];
int count = anchor_counts[s];
const TensorView& score_t = outputs[score_idx];
const TensorView& bbox_t = outputs[bbox_idx];
const TensorView& kps_t = outputs[kps_idx];
// 检查输出数据是否有效
if (outputs[s].type != RKNN_TENSOR_FLOAT32 ||
outputs[s + 3].type != RKNN_TENSOR_FLOAT32 ||
outputs[s + 6].type != RKNN_TENSOR_FLOAT32) {
continue;
}
// 检查维度
if (score_t.dims.size() < 4 || bbox_t.dims.size() < 4) continue;
const float* scores = reinterpret_cast<const float*>(outputs[s].data);
const float* bboxes = reinterpret_cast<const float*>(outputs[s + 3].data);
const float* kps = reinterpret_cast<const float*>(outputs[s + 6].data);
int C = static_cast<int>(score_t.dims[1]);
int H = static_cast<int>(score_t.dims[2]);
int W = static_cast<int>(score_t.dims[3]);
int anchors_per_loc = C / 2; // fg/bg
if (!scores || !bboxes || !kps) continue;
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
for (int a = 0; a < anchors_per_loc; ++a) {
for (int i = 0; i < count; ++i) {
if (anchor_idx >= anchors.size()) break;
// 提取前景分数 (channel a*2+1)
float score = ExtractNCHW(score_t, a * 2 + 1, h, w, C, H, W);
if (score >= cfg.conf_thresh) {
const ScrfdAnchor& anchor = anchors[anchor_idx];
// 提取bbox [dx, dy, dw, dh]
float dx = ExtractNCHW(bbox_t, a * 4 + 0, h, w,
static_cast<int>(bbox_t.dims[1]), H, W) * stride;
float dy = ExtractNCHW(bbox_t, a * 4 + 1, h, w,
static_cast<int>(bbox_t.dims[1]), H, W) * stride;
float dw = ExtractNCHW(bbox_t, a * 4 + 2, h, w,
static_cast<int>(bbox_t.dims[1]), H, W) * stride;
float dh = ExtractNCHW(bbox_t, a * 4 + 3, h, w,
static_cast<int>(bbox_t.dims[1]), H, W) * stride;
float cx = anchor.cx + dx;
float cy = anchor.cy + dy;
float x1 = (cx - dw * 0.5f) * sx;
float y1 = (cy - dh * 0.5f) * sy;
float x2 = (cx + dw * 0.5f) * sx;
float y2 = (cy + dh * 0.5f) * sy;
x1 = static_cast<float>(ClampInt(static_cast<int>(x1), 0, src_w - 1));
y1 = static_cast<float>(ClampInt(static_cast<int>(y1), 0, src_h - 1));
x2 = static_cast<float>(ClampInt(static_cast<int>(x2), 0, src_w - 1));
y2 = static_cast<float>(ClampInt(static_cast<int>(y2), 0, src_h - 1));
Rect bb;
bb.x = x1;
bb.y = y1;
bb.w = std::max(0.0f, x2 - x1);
bb.h = std::max(0.0f, y2 - y1);
if (bb.w > 1.0f && bb.h > 1.0f) {
boxes.push_back(bb);
scores.push_back(score);
// 提取关键点
if (cfg.output_landmarks) {
std::array<Point2f, 5> pts{};
for (int k = 0; k < 5; ++k) {
float lx = ExtractNCHW(kps_t, a * 10 + k * 2 + 0, h, w,
static_cast<int>(kps_t.dims[1]), H, W) * stride;
float ly = ExtractNCHW(kps_t, a * 10 + k * 2 + 1, h, w,
static_cast<int>(kps_t.dims[1]), H, W) * stride;
pts[k].x = (anchor.cx + lx) * sx;
pts[k].y = (anchor.cy + ly) * sy;
}
lmks.push_back(pts);
}
}
float score = scores[i];
if (score < conf_thresh) {
anchor_idx++;
continue;
}
++anchor_idx;
}
}
const ScrfdAnchor& pt = anchors[anchor_idx];
// BBox: [left, top, right, bottom] - distances from center
float left = bboxes[i * 4 + 0];
float top = bboxes[i * 4 + 1];
float right = bboxes[i * 4 + 2];
float bottom = bboxes[i * 4 + 3];
// Decode to image coordinates (640x640)
float x1_640 = (pt.cx - left) * stride;
float y1_640 = (pt.cy - top) * stride;
float x2_640 = (pt.cx + right) * stride;
float y2_640 = (pt.cy + bottom) * stride;
FaceDetItem det;
det.bbox.x = x1_640 * scale_x;
det.bbox.y = y1_640 * scale_y;
det.bbox.w = (x2_640 - x1_640) * scale_x;
det.bbox.h = (y2_640 - y1_640) * scale_y;
det.score = score;
det.has_landmarks = output_lm;
// Keypoints
if (output_lm) {
for (int p = 0; p < 5; ++p) {
float kps_x = kps[i * 10 + p * 2 + 0];
float kps_y = kps[i * 10 + p * 2 + 1];
float kx_640 = (pt.cx + kps_x) * stride;
float ky_640 = (pt.cy + kps_y) * stride;
det.landmarks[p].x = kx_640 * scale_x;
det.landmarks[p].y = ky_640 * scale_y;
}
}
if (boxes.empty()) return;
// NMS
std::vector<int> keep;
NmsSorted(boxes, scores, cfg.nms_thresh, keep);
if (keep.empty()) return;
// 构建输出
const int out_n = std::min<int>(cfg.max_faces, static_cast<int>(keep.size()));
out.faces.reserve(static_cast<size_t>(out_n));
for (int i = 0; i < out_n; ++i) {
const int k = keep[static_cast<size_t>(i)];
FaceDetItem item;
item.bbox = boxes[static_cast<size_t>(k)];
item.score = scores[static_cast<size_t>(k)];
item.track_id = -1;
if (cfg.output_landmarks && k < static_cast<int>(lmks.size())) {
item.has_landmarks = true;
item.landmarks = lmks[static_cast<size_t>(k)];
out.faces.push_back(det);
anchor_idx++;
}
out.faces.push_back(std::move(item));
}
}

View File

@ -0,0 +1,85 @@
#pragma once
/**
 * SCRFD face detector - shared SCRFD post-processing (anchor decode + NMS).
 * Reused by the ai_scrfd and ai_scrfd_sliding plugin nodes so both decode
 * RKNN outputs the same way.
 */
#include <vector>
#include <cstdint>
#include "face/face_result.h"
// AiScheduler is included for its BorrowedOutput type, the zero-copy
// wrapper around the raw RKNN output tensors consumed by Decode().
#include "ai_scheduler.h"
namespace rk3588 {
/**
 * One decoded SCRFD detection. Thin wrapper around FaceDetItem
 * (bbox, score, optional 5-point landmarks).
 */
struct ScrfdDetection {
FaceDetItem item;
};
/**
 * Post-processing configuration for SCRFD decoding.
 * NOTE(review): these header defaults (conf 0.5) differ from the values some
 * shipped configs use (0.3) - callers are expected to override from JSON.
 */
struct ScrfdConfig {
float conf_thresh = 0.5f;
float nms_thresh = 0.4f;
int max_faces = 50;
bool output_landmarks = true;
};
/**
 * Stateful SCRFD decoder.
 *
 * Typical usage:
 *   ScrfdDetector det;
 *   det.Init(640, 640);
 *   auto dets = det.Decode(outputs, src_w, src_h, config);
 */
class ScrfdDetector {
public:
ScrfdDetector();
~ScrfdDetector();
/**
 * Precompute the anchor center-point grid for the given model input size.
 * Must be called before Decode().
 * @param model_w model input width in pixels (typically 640)
 * @param model_h model input height in pixels (typically 640)
 */
void Init(int model_w, int model_h);
/**
 * Decode raw SCRFD outputs into face detections in source-image coordinates.
 * @param outputs the 9 SCRFD output tensors (BorrowedOutput wrappers);
 *                expected order: scores, bboxes, keypoints per stride
 * @param src_w   source image width (detections are scaled back to this)
 * @param src_h   source image height
 * @param cfg     thresholds / limits applied during post-processing
 * @return decoded detections (bbox, score, optional landmarks)
 */
std::vector<FaceDetItem> Decode(
const std::vector<AiScheduler::BorrowedOutput>& outputs,
int src_w, int src_h,
const ScrfdConfig& cfg);
/**
 * Non-maximum suppression over a candidate list; returns the kept subset.
 * Exposed publicly so the sliding-window node can NMS the merged
 * detections from multiple windows.
 */
std::vector<FaceDetItem> ApplyNMS(
std::vector<FaceDetItem>& dets,
float nms_thresh);
private:
// Anchor center (cx, cy) plus the feature-map stride it belongs to.
struct CenterPoint {
float cx, cy;
float stride;
};
std::vector<CenterPoint> center_points_;  // precomputed by Init()
int model_w_ = 640;
int model_h_ = 640;
};
} // namespace rk3588

View File

@ -269,24 +269,6 @@ set_target_properties(ai_face_det PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR}
)
# ai_face_det_zoned plugin (RKNN-based RetinaFace with distance zone detection)
add_library(ai_face_det_zoned SHARED
ai_face_det_zoned/ai_face_det_zoned_node.cpp
${CMAKE_SOURCE_DIR}/src/utils/dma_alloc.cpp
)
target_include_directories(ai_face_det_zoned PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/third_party)
target_link_libraries(ai_face_det_zoned PRIVATE project_options Threads::Threads ai_scheduler)
if(RK3588_ENABLE_RKNN AND RK_RKNN_LIB)
target_compile_definitions(ai_face_det_zoned PRIVATE RK3588_ENABLE_RKNN)
target_include_directories(ai_face_det_zoned PRIVATE ${RKNN_RUNTIME_INCLUDE_DIR})
target_link_libraries(ai_face_det_zoned PRIVATE ${RK_RKNN_LIB})
endif()
set_target_properties(ai_face_det_zoned PROPERTIES
OUTPUT_NAME "ai_face_det_zoned"
LIBRARY_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR}
RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR}
)
# ai_scrfd plugin (SCRFD 640x640 face detection)
add_library(ai_scrfd SHARED
ai_scrfd/ai_scrfd_node.cpp
@ -305,6 +287,25 @@ set_target_properties(ai_scrfd PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR}
)
# ai_scrfd_sliding plugin (SCRFD with sliding window detection)
add_library(ai_scrfd_sliding SHARED
  ai_scrfd_sliding/ai_scrfd_sliding_node.cpp
  ${CMAKE_SOURCE_DIR}/src/face/scrfd_detector.cpp
  ${CMAKE_SOURCE_DIR}/src/utils/dma_alloc.cpp
)
target_include_directories(ai_scrfd_sliding PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/third_party)
target_link_libraries(ai_scrfd_sliding PRIVATE project_options Threads::Threads ai_scheduler)
# Guard on RK_RKNN_LIB as well, consistent with the other RKNN plugins:
# linking ${RK_RKNN_LIB} when the runtime library was not found would
# otherwise fail at link time.
if(RK3588_ENABLE_RKNN AND RK_RKNN_LIB)
  target_compile_definitions(ai_scrfd_sliding PRIVATE RK3588_ENABLE_RKNN)
  target_include_directories(ai_scrfd_sliding PRIVATE ${RKNN_RUNTIME_INCLUDE_DIR})
  target_link_libraries(ai_scrfd_sliding PRIVATE ${RK_RKNN_LIB})
endif()
set_target_properties(ai_scrfd_sliding PROPERTIES
  OUTPUT_NAME "ai_scrfd_sliding"
  LIBRARY_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR}
  RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR}
)
# ai_face_recog plugin (RKNN-based ArcFace/MobileFaceNet inference)
add_library(ai_face_recog SHARED
ai_face_recog/ai_face_recog_node.cpp
@ -511,7 +512,7 @@ if(RK3588_ENABLE_ZLMEDIAKIT AND RK_ZLMK_API_LIB)
)
endif()
install(TARGETS input_rtsp input_file publish preprocess ai_yolo ai_face_det ai_face_det_zoned ai_face_recog tracker gate osd alarm logic_gate storage ai_scheduler
install(TARGETS input_rtsp input_file publish preprocess ai_yolo ai_face_det ai_scrfd ai_scrfd_sliding ai_face_recog tracker gate osd alarm logic_gate storage ai_scheduler
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/rk3588-media-server/plugins
RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}/rk3588-media-server/plugins
)

View File

@ -1,502 +0,0 @@
/**
 * ai_face_det_zoned - face detection with distance zones
 *
 * Features:
 * 1. RetinaFace detection split by camera distance zones
 * 2. Optional ROI crop plus three-zone detection
 * 3. Near (3-5m) 1.0x / mid (5-7m) 1.3x / far (7-9m) 1.8x zone scales
 * 4. Shares helpers from face_detection_utils.h
 */
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "face/face_detection_utils.h"
#include "hw/i_infer_backend.h"
#include "face/face_result.h"
#include "node.h"
#include "utils/dma_alloc.h"
#include "utils/logger.h"
namespace rk3588 {
using namespace face_detection;
// Pipeline node: RetinaFace-based face detection with an optional ROI crop
// and three horizontal "distance zones", each detected at its own scale.
// Attaches a FaceDetResult to every frame and passes the frame downstream.
class AiFaceDetZonedNode : public INode {
public:
  std::string Id() const override { return id_; }
  std::string Type() const override { return "ai_face_det_zoned"; }
  // Parses node configuration (model, ROI crop, distance zones), wires the
  // queues/backend from ctx and loads the RKNN model. Returns false on any
  // missing dependency or model load failure.
  bool Init(const SimpleJson& config, const NodeContext& ctx) override {
    id_ = config.ValueOr<std::string>("id", "face_det_zoned");
    model_path_ = config.ValueOr<std::string>("model_path",
                                              "./models/RetinaFace_mobile320.rknn");
    // Basic detection parameters.
    det_cfg_.conf_thresh = config.ValueOr<float>("conf", 0.6f);
    det_cfg_.nms_thresh = config.ValueOr<float>("nms", 0.4f);
    det_cfg_.max_faces = config.ValueOr<int>("max_faces", 10);
    det_cfg_.output_landmarks = config.ValueOr<bool>("output_landmarks", true);
    // Model input size (defaults to 320x320).
    model_w_ = config.ValueOr<int>("model_w", 320);
    model_h_ = config.ValueOr<int>("model_h", 320);
    // Prior-box strides and min sizes (RetinaFace defaults).
    det_cfg_.steps = {8, 16, 32};
    det_cfg_.min_sizes = {{16, 32}, {64, 128}, {256, 512}};
    // ROI config - supported format: "roi": {"x": 0, "y": 0, "w": 1920, "h": 1080}
    roi_enabled_ = false;
    roi_x_ = roi_y_ = roi_w_ = roi_h_ = 0;
    if (const SimpleJson* roi = config.Find("roi"); roi && roi->IsObject()) {
      // Read the flat format directly.
      roi_x_ = roi->ValueOr<int>("x", 0);
      roi_y_ = roi->ValueOr<int>("y", 0);
      roi_w_ = roi->ValueOr<int>("w", 0);
      roi_h_ = roi->ValueOr<int>("h", 0);
      // Enable the ROI only when w/h are valid.
      if (roi_w_ > 0 && roi_h_ > 0) {
        roi_enabled_ = true;
      }
      // Backward compatibility with the old format: "roi": {"crop": {...}}
      else if (const SimpleJson* crop = roi->Find("crop"); crop && crop->IsObject()) {
        roi_x_ = crop->ValueOr<int>("x", 0);
        roi_y_ = crop->ValueOr<int>("y", 0);
        roi_w_ = crop->ValueOr<int>("w", 0);
        roi_h_ = crop->ValueOr<int>("h", 0);
        if (roi_w_ > 0 && roi_h_ > 0) {
          roi_enabled_ = true;
        }
      }
    }
    // Three-zone config - two supported formats:
    // 1. legacy: "distance_zones": {"enabled": true, "boundaries": [y1, y2], "scales": [s1, s2, s3]}
    // 2. new:    "zones": {"near_zone": {"y_start": 0, "y_end": 405, "scale": 0.5}, ...}
    zones_enabled_ = false;
    boundary_y_5m_ = boundary_y_7m_ = 0;
    scale_near_ = 1.0f;
    scale_mid_ = 1.3f;
    scale_far_ = 1.8f;
    // Prefer the new "zones" format.
    if (const SimpleJson* zones = config.Find("zones");
        zones && zones->IsObject()) {
      bool has_near = false, has_mid = false, has_far = false;
      int near_y_end = 0, mid_y_end = 0;
      if (const SimpleJson* near = zones->Find("near_zone"); near && near->IsObject()) {
        near_y_end = near->ValueOr<int>("y_end", 0);
        scale_near_ = near->ValueOr<float>("scale", 1.0f);
        has_near = true;
      }
      if (const SimpleJson* mid = zones->Find("mid_zone"); mid && mid->IsObject()) {
        mid_y_end = mid->ValueOr<int>("y_end", 0);
        scale_mid_ = mid->ValueOr<float>("scale", 1.0f);
        has_mid = true;
      }
      if (const SimpleJson* far = zones->Find("far_zone"); far && far->IsObject()) {
        scale_far_ = far->ValueOr<float>("scale", 1.0f);
        has_far = true;
      }
      // Zones are all-or-nothing: enable only when all three are present.
      if (has_near && has_mid && has_far) {
        zones_enabled_ = true;
        boundary_y_5m_ = near_y_end; // near/mid boundary
        boundary_y_7m_ = mid_y_end;  // mid/far boundary
      }
    }
    // Fall back to the legacy "distance_zones" format.
    else if (const SimpleJson* zones = config.Find("distance_zones");
             zones && zones->IsObject()) {
      zones_enabled_ = zones->ValueOr<bool>("enabled", false);
      if (const SimpleJson* boundaries = zones->Find("boundaries");
          boundaries && boundaries->IsArray() && boundaries->AsArray().size() >= 2) {
        boundary_y_5m_ = boundaries->AsArray()[0].AsInt(0);
        boundary_y_7m_ = boundaries->AsArray()[1].AsInt(0);
      }
      if (const SimpleJson* scales = zones->Find("scales");
          scales && scales->IsArray() && scales->AsArray().size() >= 3) {
        scale_near_ = scales->AsArray()[0].AsNumber(1.0f);
        scale_mid_ = scales->AsArray()[1].AsNumber(1.3f);
        scale_far_ = scales->AsArray()[2].AsNumber(1.8f);
      }
    }
    input_queue_ = ctx.input_queue;
    output_queues_ = ctx.output_queues;
    if (!input_queue_) {
      LogError("[ai_face_det_zoned] no input queue for node " + id_);
      return false;
    }
    if (output_queues_.empty()) {
      LogError("[ai_face_det_zoned] no output queue for node " + id_);
      return false;
    }
    infer_backend_ = ctx.infer_backend;
    if (!infer_backend_) {
      LogError("[ai_face_det_zoned] no infer backend for node " + id_);
      return false;
    }
#if defined(RK3588_ENABLE_RKNN)
    if (model_path_.empty()) {
      LogError("[ai_face_det_zoned] model_path is required");
      return false;
    }
    std::string err;
    model_handle_ = infer_backend_->LoadModel(model_path_, err);
    if (model_handle_ == kInvalidModelHandle) {
      LogError("[ai_face_det_zoned] failed to load model: " + err);
      return false;
    }
    // Precompute prior (anchor) boxes once for the fixed model input size.
    priors_ = GeneratePriors(model_w_, model_h_, det_cfg_.steps, det_cfg_.min_sizes);
    LogInfo("[ai_face_det_zoned] model loaded: " + model_path_ +
            " (" + std::to_string(model_w_) + "x" + std::to_string(model_h_) +
            "), priors=" + std::to_string(priors_.size()));
#else
    LogWarn("[ai_face_det_zoned] RKNN disabled, will passthrough frames");
#endif
    return true;
  }
  bool Start() override {
    LogInfo("[ai_face_det_zoned] start id=" + id_ +
            " zones=" + std::string(zones_enabled_ ? "enabled" : "disabled") +
            " roi=" + std::string(roi_enabled_ ? "enabled" : "disabled") +
            " roi_xywh=" + std::to_string(roi_x_) + "," + std::to_string(roi_y_) + "," +
            std::to_string(roi_w_) + "," + std::to_string(roi_h_) +
            " boundaries=" + std::to_string(boundary_y_5m_) + "," + std::to_string(boundary_y_7m_) +
            " scales=" + std::to_string(scale_near_) + "," + std::to_string(scale_mid_) + "," + std::to_string(scale_far_));
    return true;
  }
  void Stop() override {
#if defined(RK3588_ENABLE_RKNN)
    if (model_handle_ != kInvalidModelHandle) {
      infer_backend_->UnloadModel(model_handle_);
      model_handle_ = kInvalidModelHandle;
    }
#endif
    LogInfo("[ai_face_det_zoned] stop id=" + id_);
  }
  // Runs detection (RKNN builds only) and always forwards the frame.
  NodeStatus Process(FramePtr frame) override {
    if (!frame) return NodeStatus::DROP;
#if defined(RK3588_ENABLE_RKNN)
    RunZonedDetection(frame);
#endif
    Push(frame);
    return NodeStatus::OK;
  }
private:
  void Push(FramePtr frame) {
    for (auto& q : output_queues_) q->Push(frame);
  }
#if defined(RK3588_ENABLE_RKNN)
  // Wraps an RKNN BorrowedOutput as a TensorView (no copy; borrows o.data).
  TensorView ConvertToTensorView(const AiScheduler::BorrowedOutput& o) {
    TensorView tv;
    tv.data = o.data;
    tv.size = o.size;
    tv.zp = o.zp;
    tv.scale = o.scale;
    tv.dims = o.dims;
    tv.type = o.type;
    return tv;
  }
  // Full per-frame pipeline: clamp ROI, detect (zoned or single), NMS,
  // cap to max_faces, and attach the FaceDetResult to the frame.
  void RunZonedDetection(FramePtr frame) {
    if (!frame->data || frame->data_size == 0) return;
    if (frame->format != PixelFormat::RGB && frame->format != PixelFormat::BGR) {
      LogWarn("[ai_face_det_zoned] input must be RGB/BGR");
      return;
    }
    const int src_w = frame->width;
    const int src_h = frame->height;
    // Apply the ROI crop (clamped to the frame bounds).
    int roi_x = 0, roi_y = 0, roi_w = src_w, roi_h = src_h;
    if (roi_enabled_) {
      roi_x = ClampInt(roi_x_, 0, src_w - 1);
      roi_y = ClampInt(roi_y_, 0, src_h - 1);
      roi_w = ClampInt(roi_w_, 1, src_w - roi_x);
      roi_h = ClampInt(roi_h_, 1, src_h - roi_y);
    }
    std::vector<FaceDetItem> all_detections;
    if (zones_enabled_) {
      // Three-zone detection.
      all_detections = DetectWithZones(frame, roi_x, roi_y, roi_w, roi_h);
    } else {
      // Single-zone detection over the whole ROI.
      auto dets = DetectSingleZone(frame, roi_x, roi_y, roi_w, roi_h, 1.0f);
      // Map coordinates back to the original image.
      for (auto& det : dets) {
        det.bbox.x += roi_x;
        det.bbox.y += roi_y;
        if (det.has_landmarks) {
          for (auto& lm : det.landmarks) {
            lm.x += roi_x;
            lm.y += roi_y;
          }
        }
        all_detections.push_back(det);
      }
    }
    // NMS de-duplication (zones can overlap-detect the same face).
    all_detections = ApplyNMS(all_detections, det_cfg_.nms_thresh);
    // Cap the number of reported faces.
    if (all_detections.size() > static_cast<size_t>(det_cfg_.max_faces)) {
      all_detections.resize(det_cfg_.max_faces);
    }
    // Build and attach the result.
    FaceDetResult det_result;
    det_result.img_w = src_w;
    det_result.img_h = src_h;
    det_result.model_name = "retinaface_zoned";
    det_result.faces = std::move(all_detections);
    frame->face_det = std::make_shared<FaceDetResult>(std::move(det_result));
  }
  // Detects in three horizontal bands of the ROI (near/mid/far) and maps
  // every detection back into original-image coordinates.
  std::vector<FaceDetItem> DetectWithZones(FramePtr frame,
                                           int roi_x, int roi_y,
                                           int roi_w, int roi_h) {
    std::vector<FaceDetItem> all_dets;
    // Convert the boundary lines into the ROI coordinate system.
    int by5 = ClampInt(boundary_y_5m_ - roi_y, 0, roi_h);
    int by7 = ClampInt(boundary_y_7m_ - roi_y, 0, roi_h);
    // Enforce ordering (larger y = lower in frame = nearer to camera).
    if (by5 < by7) std::swap(by5, by7);
    // Near zone (bottom of frame, large y, ~3-5m).
    if (by5 < roi_h) {
      auto dets = DetectSingleZone(frame, roi_x, roi_y + by5, roi_w, roi_h - by5, scale_near_);
      for (auto& det : dets) {
        det.bbox.x += roi_x;
        det.bbox.y += roi_y + by5;
        if (det.has_landmarks) {
          for (auto& lm : det.landmarks) {
            lm.x += roi_x;
            lm.y += roi_y + by5;
          }
        }
        all_dets.push_back(det);
      }
    }
    // Mid zone (middle of frame, ~5-7m).
    if (by7 < by5) {
      auto dets = DetectSingleZone(frame, roi_x, roi_y + by7, roi_w, by5 - by7, scale_mid_);
      for (auto& det : dets) {
        det.bbox.x += roi_x;
        det.bbox.y += roi_y + by7;
        if (det.has_landmarks) {
          for (auto& lm : det.landmarks) {
            lm.x += roi_x;
            lm.y += roi_y + by7;
          }
        }
        all_dets.push_back(det);
      }
    }
    // Far zone (top of frame, small y, ~7-9m).
    if (by7 > 0) {
      auto dets = DetectSingleZone(frame, roi_x, roi_y, roi_w, by7, scale_far_);
      for (auto& det : dets) {
        det.bbox.x += roi_x;
        det.bbox.y += roi_y;
        if (det.has_landmarks) {
          for (auto& lm : det.landmarks) {
            lm.x += roi_x;
            lm.y += roi_y;
          }
        }
        all_dets.push_back(det);
      }
    }
    return all_dets;
  }
  // Crops (x,y,w,h) from the frame, resizes to the model input, runs the
  // NPU and decodes RetinaFace outputs. Returned coordinates are relative
  // to the crop (the caller offsets them back).
  std::vector<FaceDetItem> DetectSingleZone(FramePtr frame,
                                            int x, int y, int w, int h,
                                            float scale) {
    std::vector<FaceDetItem> dets;
    if (w <= 0 || h <= 0) return dets;
    const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data;
    const int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride
                            : (frame->stride > 0 ? frame->stride : frame->width * 3);
    // Scaled size of the crop region.
    // NOTE(review): crop_w/crop_h are only used for this validity check;
    // the zone scale does not otherwise affect the resize below — confirm
    // this is intended.
    int crop_w = static_cast<int>(w * scale);
    int crop_h = static_cast<int>(h * scale);
    if (crop_w <= 0 || crop_h <= 0) return dets;
    // Allocate the model input buffer.
    input_buf_.resize(static_cast<size_t>(model_w_) * model_h_ * 3);
    // Bilinear resize to the model input size.
    // Note: crop (x,y,w,h) from the source and resize to (model_w_, model_h_).
    // Possible optimization: crop+resize directly from src without the
    // intermediate buffer. Simplified approach: crop to a temp buffer first.
    std::vector<uint8_t> crop_buf(static_cast<size_t>(w) * h * 3);
    for (int row = 0; row < h; ++row) {
      const uint8_t* src_row = src + (y + row) * src_stride + x * 3;
      uint8_t* dst_row = crop_buf.data() + row * w * 3;
      memcpy(dst_row, src_row, static_cast<size_t>(w) * 3);
    }
    // Resize to the model input size.
    ResizeRgbBilinear(crop_buf.data(), w, h, w * 3,
                      input_buf_.data(), model_w_, model_h_,
                      false); // assumes the input is already RGB
    // NPU inference.
    InferInput input;
    input.width = model_w_;
    input.height = model_h_;
    input.is_nhwc = true;
    input.data = input_buf_.data();
    input.size = input_buf_.size();
    input.type = RKNN_TENSOR_UINT8;
    auto r = infer_backend_->InferBorrowed(model_handle_, input);
    if (!r.success || r.outputs.empty()) {
      LogWarn("[ai_face_det_zoned] inference failed");
      return dets;
    }
    // Identify the three output tensors by their channel width:
    // loc=4 (bbox deltas), conf=2 (bg/face scores), landm=10 (5 points).
    NcTensor loc_tensor, conf_tensor, landm_tensor;
    bool has_loc = false, has_conf = false, has_landm = false;
    for (const auto& o : r.outputs) {
      TensorView tv = ConvertToTensorView(o);
      NcTensor tmp;
      if (!has_loc && ExtractNcTensor(tv, 4, tmp)) {
        loc_tensor = std::move(tmp);
        has_loc = true;
      } else if (!has_conf && ExtractNcTensor(tv, 2, tmp)) {
        conf_tensor = std::move(tmp);
        has_conf = true;
      } else if (!has_landm && ExtractNcTensor(tv, 10, tmp)) {
        landm_tensor = std::move(tmp);
        has_landm = true;
      }
    }
    if (!has_loc || !has_conf) return dets;
    // Decode the detections (into crop-relative coordinates).
    FaceDetResult result;
    DecodeRetinaFace(loc_tensor, conf_tensor, landm_tensor,
                     priors_, w, h, model_w_, model_h_,
                     det_cfg_, result);
    if (!result.faces.empty()) {
      LogInfo("[ai_face_det_zoned] DetectSingleZone: detected " +
              std::to_string(result.faces.size()) + " faces, max_score=" +
              std::to_string(result.faces.empty() ? 0 : result.faces[0].score));
    }
    return result.faces;
  }
  // Greedy NMS; keeps highest-score boxes, drops overlaps above threshold.
  std::vector<FaceDetItem> ApplyNMS(std::vector<FaceDetItem>& dets, float threshold) {
    if (dets.empty()) return dets;
    // Sort by descending confidence.
    std::sort(dets.begin(), dets.end(),
              [](const FaceDetItem& a, const FaceDetItem& b) {
                return a.score > b.score;
              });
    std::vector<FaceDetItem> keep;
    std::vector<bool> suppressed(dets.size(), false);
    for (size_t i = 0; i < dets.size(); ++i) {
      if (suppressed[i]) continue;
      keep.push_back(dets[i]);
      for (size_t j = i + 1; j < dets.size(); ++j) {
        if (suppressed[j]) continue;
        if (IoU(dets[i].bbox, dets[j].bbox) > threshold) {
          suppressed[j] = true;
        }
      }
    }
    return keep;
  }
#endif
  std::string id_;
  std::string model_path_;
  DetectionConfig det_cfg_;
  int model_w_ = 320;
  int model_h_ = 320;
  // ROI crop (applied before zone splitting).
  bool roi_enabled_ = false;
  int roi_x_ = 0, roi_y_ = 0, roi_w_ = 0, roi_h_ = 0;
  // Three distance zones (y boundaries in original-image coordinates).
  bool zones_enabled_ = false;
  int boundary_y_5m_ = 0;
  int boundary_y_7m_ = 0;
  float scale_near_ = 1.0f;
  float scale_mid_ = 1.3f;
  float scale_far_ = 1.8f;
  std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
  std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;
  std::shared_ptr<IInferBackend> infer_backend_;
#if defined(RK3588_ENABLE_RKNN)
  ModelHandle model_handle_ = kInvalidModelHandle;
  std::vector<Prior> priors_;
  std::vector<uint8_t> input_buf_;  // reused model-input staging buffer
#endif
};
REGISTER_NODE(AiFaceDetZonedNode, "ai_face_det_zoned");
} // namespace rk3588

View File

@ -0,0 +1,311 @@
/**
* ai_scrfd_sliding - SCRFD with sliding window detection
*
* Features:
* 1. Resize input to target height (640) keeping approximate ratio
* 2. Split into multiple 640x640 windows
* 3. Detect on each window and merge results
*
* For 1080p: resize to 1280x640, 2 windows
* For 1440p: resize to 2560x640, 4 windows
*/
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <vector>
#include "face/face_detection_utils.h"
#include "face/face_result.h"
#include "face/scrfd_detector.h"
#include "hw/i_infer_backend.h"
#include "node.h"
#include "utils/dma_alloc.h"
#include "utils/logger.h"
namespace rk3588 {
using namespace face_detection;
// Pipeline node: SCRFD face detection over sliding 640x640 windows of the
// source image. Each window is cropped, resized to the model input, run on
// the NPU, and the merged detections are NMS'd and attached to the frame.
class AiScrfdSlidingNode : public INode {
public:
  std::string Id() const override { return id_; }
  std::string Type() const override { return "ai_scrfd_sliding"; }
  // Parses configuration (thresholds, optional explicit windows), wires
  // queues/backend from ctx and loads the RKNN model.
  bool Init(const SimpleJson& config, const NodeContext& ctx) override {
    id_ = config.ValueOr<std::string>("id", "scrfd_sliding");
    model_path_ = config.ValueOr<std::string>("model_path",
                                              "./models/scrfd_500m_640.rknn");
    // Detection parameters
    det_cfg_.conf_thresh = config.ValueOr<float>("conf_thresh", 0.3f);
    det_cfg_.nms_thresh = config.ValueOr<float>("nms_thresh", 0.4f);
    det_cfg_.max_faces = config.ValueOr<int>("max_faces", 50);
    det_cfg_.output_landmarks = config.ValueOr<bool>("output_landmarks", true);
    // Model input size is fixed to 640x640 for this node.
    model_w_ = 640;
    model_h_ = 640;
    // Initialize detector (precomputes anchor center points).
    detector_.Init(model_w_, model_h_);
    // Parse sliding windows config.
    // If not configured, auto-calculate based on input resolution.
    windows_.clear();
    if (const SimpleJson* win_arr = config.Find("windows"); win_arr && win_arr->IsArray()) {
      for (const auto& w : win_arr->AsArray()) {
        if (w.IsObject()) {
          Window win;
          win.x = w.ValueOr<int>("x", 0);
          win.y = w.ValueOr<int>("y", 0);
          win.w = w.ValueOr<int>("w", 640);
          win.h = w.ValueOr<int>("h", 640);
          windows_.push_back(win);
        }
      }
    }
    // Target resize height (default 640).
    // NOTE(review): target_height_ is stored but not referenced by the
    // detection path below — confirm whether it is still needed.
    target_height_ = config.ValueOr<int>("target_height", 640);
    input_queue_ = ctx.input_queue;
    output_queues_ = ctx.output_queues;
    if (!input_queue_) {
      LogError("[ai_scrfd_sliding] no input queue");
      return false;
    }
    infer_backend_ = ctx.infer_backend;
    if (!infer_backend_) {
      LogError("[ai_scrfd_sliding] no infer backend");
      return false;
    }
#if defined(RK3588_ENABLE_RKNN)
    std::string err;
    model_handle_ = infer_backend_->LoadModel(model_path_, err);
    if (model_handle_ == kInvalidModelHandle) {
      LogError("[ai_scrfd_sliding] failed to load model: " + err);
      return false;
    }
    // NOTE(review): input_buf_ is allocated here but never used;
    // DetectWindowFromSource allocates its own per-window buffers.
    input_buf_.resize(model_w_ * model_h_ * 3);
    LogInfo("[ai_scrfd_sliding] model loaded: " + model_path_);
#else
    LogWarn("[ai_scrfd_sliding] RKNN disabled");
#endif
    return true;
  }
  bool Start() override {
    LogInfo("[ai_scrfd_sliding] start, windows=" + std::to_string(windows_.size()));
    return true;
  }
  void Stop() override {
#if defined(RK3588_ENABLE_RKNN)
    if (model_handle_ != kInvalidModelHandle) {
      infer_backend_->UnloadModel(model_handle_);
      model_handle_ = kInvalidModelHandle;
    }
#endif
    LogInfo("[ai_scrfd_sliding] stop");
  }
  // Runs detection (RKNN builds only) and always forwards the frame.
  NodeStatus Process(FramePtr frame) override {
    if (!frame) return NodeStatus::DROP;
#if defined(RK3588_ENABLE_RKNN)
    RunDetection(frame);
#endif
    Push(frame);
    return NodeStatus::OK;
  }
private:
  // Sliding window rectangle, in source-image pixel coordinates.
  struct Window {
    int x, y, w, h;
  };
  void Push(FramePtr frame) {
    for (auto& q : output_queues_) q->Push(frame);
  }
#if defined(RK3588_ENABLE_RKNN)
  // Per-frame pipeline: detect on each window, merge, NMS, cap, attach.
  // NOTE(review): unlike ai_face_det_zoned there is no frame->format check;
  // the code assumes packed 3-byte-per-pixel RGB/BGR — confirm the upstream
  // preprocess node guarantees this.
  void RunDetection(FramePtr frame) {
    if (!frame->data || frame->data_size == 0) return;
    const int src_w = frame->width;
    const int src_h = frame->height;
    // NOTE(review): SyncStart() without a matching SyncEnd() — confirm the
    // DMA buffer synchronization contract for read-only access.
    if (frame->DmaFd() >= 0) frame->SyncStart();
    // Calculate windows if not pre-configured.
    std::vector<Window> windows = windows_;
    if (windows.empty()) {
      windows = CalculateWindows(src_w, src_h);
    }
    std::vector<FaceDetItem> all_detections;
    const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data;
    const int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride
                            : (frame->stride > 0 ? frame->stride : frame->width * 3);
    // Process each window - crop from original, then resize to 640x640.
    for (size_t i = 0; i < windows.size(); ++i) {
      const auto& win = windows[i];
      auto dets = DetectWindowFromSource(src, src_w, src_h, src_stride, win);
      // Detections are already in original coordinates.
      all_detections.insert(all_detections.end(), dets.begin(), dets.end());
    }
    // Apply NMS across all windows (removes cross-window duplicates).
    all_detections = detector_.ApplyNMS(all_detections, det_cfg_.nms_thresh);
    if (all_detections.size() > static_cast<size_t>(det_cfg_.max_faces)) {
      all_detections.resize(det_cfg_.max_faces);
    }
    FaceDetResult result;
    result.img_w = src_w;
    result.img_h = src_h;
    result.model_name = "scrfd_sliding";
    result.faces = std::move(all_detections);
    frame->face_det = std::make_shared<FaceDetResult>(std::move(result));
  }
  // Tiles the source image with 640x640 windows when none were configured.
  std::vector<Window> CalculateWindows(int src_w, int src_h) {
    std::vector<Window> windows;
    // Strategy: Split source image into 640x640 regions.
    // For 1080p: 1920x1080 -> 3x2 grid (6 windows)
    // For 1440p: 2560x1440 -> 4x2 grid (8 windows)
    // Calculate step size.
    // NOTE(review): the clamp below forces step >= 640, so tiles end up
    // adjacent rather than overlapping — faces straddling a tile boundary
    // may be missed; confirm whether overlap was intended.
    int step_x = (src_w <= 640) ? src_w : (src_w - 640) / ((src_w + 639) / 640 - 1);
    int step_y = (src_h <= 640) ? src_h : (src_h - 640) / ((src_h + 639) / 640 - 1);
    if (step_x < 640) step_x = 640;
    if (step_y < 640) step_y = 640;
    for (int y = 0; y < src_h; y += step_y) {
      for (int x = 0; x < src_w; x += step_x) {
        Window win;
        win.x = x;
        win.y = y;
        win.w = 640;
        win.h = 640;
        windows.push_back(win);
        // Stop if we've covered the width.
        if (x + 640 >= src_w) break;
      }
      // Stop if we've covered the height.
      if (y + 640 >= src_h) break;
    }
    LogInfo("[ai_scrfd_sliding] Auto-calculated: " + std::to_string(windows.size()) + " windows for " + std::to_string(src_w) + "x" + std::to_string(src_h));
    return windows;
  }
  // Crops one window from the source, resizes it to 640x640, infers, and
  // maps the decoded detections back into source-image coordinates.
  // Windows are clamped to the source bounds, so edge windows may be
  // smaller than 640x640 (the resize then stretches them).
  std::vector<FaceDetItem> DetectWindowFromSource(const uint8_t* src, int src_w, int src_h, int src_stride, const Window& win) {
    std::vector<FaceDetItem> dets;
    // Clamp window to source bounds.
    int win_x = std::max(0, std::min(win.x, src_w - 1));
    int win_y = std::max(0, std::min(win.y, src_h - 1));
    int win_w = std::min(win.w, src_w - win_x);
    int win_h = std::min(win.h, src_h - win_y);
    if (win_w <= 0 || win_h <= 0) {
      LogWarn("[ai_scrfd_sliding] Invalid window");
      return dets;
    }
    // Crop from source (row-by-row copy of packed 3-byte pixels).
    std::vector<uint8_t> crop_buf(static_cast<size_t>(win_w) * win_h * 3);
    for (int row = 0; row < win_h; ++row) {
      const uint8_t* src_row = src + (win_y + row) * src_stride + win_x * 3;
      uint8_t* dst_row = crop_buf.data() + row * win_w * 3;
      memcpy(dst_row, src_row, static_cast<size_t>(win_w) * 3);
    }
    // Resize to 640x640.
    std::vector<uint8_t> model_input(640 * 640 * 3);
    ResizeRgbBilinear(crop_buf.data(), win_w, win_h, win_w * 3,
                      model_input.data(), 640, 640, false);
    // NPU inference.
    InferInput input;
    input.width = 640;
    input.height = 640;
    input.is_nhwc = true;
    input.data = model_input.data();
    input.size = model_input.size();
    input.type = RKNN_TENSOR_UINT8;
    auto r = infer_backend_->InferBorrowed(model_handle_, input);
    if (!r.success || r.outputs.empty()) {
      LogWarn("[ai_scrfd_sliding] inference failed");
      return dets;
    }
    // Decode (get detections in 640x640 coordinates).
    dets = detector_.Decode(r.outputs, 640, 640, det_cfg_);
    // Map back to original coordinates.
    float scale_x = static_cast<float>(win_w) / 640.0f;
    float scale_y = static_cast<float>(win_h) / 640.0f;
    for (auto& det : dets) {
      det.bbox.x = win_x + det.bbox.x * scale_x;
      det.bbox.y = win_y + det.bbox.y * scale_y;
      det.bbox.w *= scale_x;
      det.bbox.h *= scale_y;
      if (det.has_landmarks) {
        for (auto& lm : det.landmarks) {
          lm.x = win_x + lm.x * scale_x;
          lm.y = win_y + lm.y * scale_y;
        }
      }
    }
    return dets;
  }
#endif
  std::string id_;
  std::string model_path_;
  ScrfdConfig det_cfg_;
  ScrfdDetector detector_;
  int model_w_ = 640;
  int model_h_ = 640;
  int target_height_ = 640;
  std::vector<Window> windows_;  // empty => auto-tile per frame
  std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
  std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;
  std::shared_ptr<IInferBackend> infer_backend_;
#if defined(RK3588_ENABLE_RKNN)
  ModelHandle model_handle_ = kInvalidModelHandle;
  std::vector<uint8_t> input_buf_;  // see NOTE(review) in Init(): unused
#endif
};
REGISTER_NODE(AiScrfdSlidingNode, "ai_scrfd_sliding");
} // namespace rk3588

View File

@ -428,10 +428,7 @@ public:
for (const auto& d : frame->det->items) {
if (d.cls_id == 10) no_boots_count++;
}
if (no_boots_count > 0 || processed_frames_ % 30 == 0) {
LogInfo("[alarm] frame received, dets=" + std::to_string(frame->det->items.size()) +
" no_boots=" + std::to_string(no_boots_count));
}
// Log throttled
}
if (eval_interval_ms_ > 0 && frame->pts > 0) {

View File

@ -128,6 +128,30 @@ private:
}
}
// 将检测坐标(相对于原始图像)映射到当前帧坐标
// Maps a detection bbox (expressed in the source-image coordinate system
// recorded in transform_meta) into the current frame's pixel space.
Rect MapDetCoordToFrame(const Rect& det_bbox, FramePtr frame) {
  // No (valid) transform info: coordinates already match the frame.
  if (!frame->transform_meta || !frame->transform_meta->valid) {
    return det_bbox;
  }
  const auto& meta = *frame->transform_meta;
  const bool dims_ok = meta.src_w > 0 && meta.src_h > 0 &&
                       frame->width > 0 && frame->height > 0;
  if (!dims_ok) {
    return det_bbox;
  }
  // Detection coordinates are based on src_w x src_h; rescale to the frame.
  const float sx = static_cast<float>(frame->width) / meta.src_w;
  const float sy = static_cast<float>(frame->height) / meta.src_h;
  Rect mapped;
  mapped.x = det_bbox.x * sx;
  mapped.y = det_bbox.y * sy;
  mapped.w = det_bbox.w * sx;
  mapped.h = det_bbox.h * sy;
  return mapped;
}
void ProcessPpeBootsCheck(FramePtr frame) {
const auto& detections = frame->det->items;
@ -145,7 +169,12 @@ private:
if (config_.debug) {
LogInfo("[LogicGateNode] Persons=" + std::to_string(persons.size()) +
" Boots=" + std::to_string(boots.size()));
" Boots=" + std::to_string(boots.size()) +
" Frame=" + std::to_string(frame->width) + "x" + std::to_string(frame->height));
if (frame->transform_meta && frame->transform_meta->valid) {
LogInfo("[LogicGateNode] TransformMeta: src=" + std::to_string(frame->transform_meta->src_w) +
"x" + std::to_string(frame->transform_meta->src_h));
}
}
// 简化逻辑:必须同时检测到人和鞋,才开始判断
@ -158,7 +187,21 @@ private:
// 对每只鞋进行颜色检查
for (const auto& boot : boots) {
if (config_.enable_color_check && color_analyzer_) {
auto color_result = color_analyzer_->Analyze(*frame, boot.bbox);
// 将检测坐标映射到当前帧坐标
Rect mapped_bbox = MapDetCoordToFrame(boot.bbox, frame);
if (config_.debug) {
LogInfo("[LogicGateNode] Boot bbox: [" + std::to_string(static_cast<int>(boot.bbox.x)) +
"," + std::to_string(static_cast<int>(boot.bbox.y)) +
" " + std::to_string(static_cast<int>(boot.bbox.w)) +
"x" + std::to_string(static_cast<int>(boot.bbox.h)) +
"] -> Mapped: [" + std::to_string(static_cast<int>(mapped_bbox.x)) +
"," + std::to_string(static_cast<int>(mapped_bbox.y)) +
" " + std::to_string(static_cast<int>(mapped_bbox.w)) +
"x" + std::to_string(static_cast<int>(mapped_bbox.h)) + "]");
}
auto color_result = color_analyzer_->Analyze(*frame, mapped_bbox);
if (config_.debug) {
LogInfo("[LogicGateNode] Boot brightness=" +

157
src/face/scrfd_detector.cpp Normal file
View File

@ -0,0 +1,157 @@
/**
* SCRFD Detector Implementation
*/
#include "face/scrfd_detector.h"
#include "ai_scheduler.h" // For BorrowedOutput
#include "face/face_detection_utils.h"
#include <algorithm>
#include <cstring>
namespace rk3588 {
// Rule of Zero: the detector owns no raw resources, so the
// compiler-generated special members are sufficient.
ScrfdDetector::ScrfdDetector() = default;
ScrfdDetector::~ScrfdDetector() = default;
// Precomputes the anchor center points for the given model input size.
// SCRFD heads run at strides 8/16/32 with 2 anchors per grid cell; each
// anchor's center is stored in grid units plus its stride (Decode()
// multiplies them out to pixels).
// @param model_w model input width in pixels (e.g. 640)
// @param model_h model input height in pixels (e.g. 640)
void ScrfdDetector::Init(int model_w, int model_h) {
  model_w_ = model_w;
  model_h_ = model_h;
  // Re-initialization must start from a clean slate: the original code
  // appended to center_points_, so a second Init() doubled every anchor.
  center_points_.clear();
  static constexpr int kStrides[] = {8, 16, 32};
  static constexpr int kAnchorsPerCell = 2;
  // Reserve the exact capacity up front to avoid reallocations.
  size_t total = 0;
  for (int stride : kStrides) {
    total += static_cast<size_t>(model_w_ / stride) *
             static_cast<size_t>(model_h_ / stride) * kAnchorsPerCell;
  }
  center_points_.reserve(total);
  for (int stride : kStrides) {
    // Use independent grid extents per axis so non-square model inputs are
    // handled correctly (the original derived both axes from model_w_).
    const int grid_w = model_w_ / stride;
    const int grid_h = model_h_ / stride;
    for (int y = 0; y < grid_h; ++y) {
      for (int x = 0; x < grid_w; ++x) {
        for (int a = 0; a < kAnchorsPerCell; ++a) {
          CenterPoint pt;
          pt.cx = static_cast<float>(x);
          pt.cy = static_cast<float>(y);
          pt.stride = static_cast<float>(stride);
          center_points_.push_back(pt);
        }
      }
    }
  }
}
// Decodes the 9 raw SCRFD output tensors into face detections mapped to
// src_w x src_h coordinates. No NMS is applied here.
// Expected output order: score_8, score_16, score_32, bbox_8, bbox_16,
// bbox_32, kps_8, kps_16, kps_32.
// NOTE: outputs are read as FP32 — assumes the backend dequantizes
// (borrowed outputs are float); confirm against the infer backend contract.
std::vector<FaceDetItem> ScrfdDetector::Decode(
    const std::vector<AiScheduler::BorrowedOutput>& outputs,
    int src_w, int src_h,
    const ScrfdConfig& cfg) {
  std::vector<FaceDetItem> detections;
  if (outputs.size() != 9) return detections;
  // Per-stride anchor counts for a 640x640 input (2 anchors per cell).
  const int anchor_counts[] = {12800, 3200, 800};
  const int strides[] = {8, 16, 32};
  // Hoisted out of the loops: model -> source scaling is constant.
  const float scale_x = static_cast<float>(src_w) / model_w_;
  const float scale_y = static_cast<float>(src_h) / model_h_;
  size_t anchor_idx = 0;  // global index into center_points_, spans all strides
  for (int s = 0; s < 3; ++s) {
    const int stride = strides[s];
    const int count = anchor_counts[s];
    const auto& score_out = outputs[s];
    const auto& bbox_out = outputs[s + 3];
    const auto& kps_out = outputs[s + 6];
    const float* scores = reinterpret_cast<const float*>(score_out.data);
    const float* bboxes = reinterpret_cast<const float*>(bbox_out.data);
    const float* kps = reinterpret_cast<const float*>(kps_out.data);
    // BUGFIX: if a level is unusable we must still advance anchor_idx by its
    // full anchor count. The original code `continue`d without advancing,
    // which misaligned center points for every subsequent stride level.
    if (score_out.dims.size() < 3 || !scores || !bboxes || !kps) {
      anchor_idx += static_cast<size_t>(count);
      continue;
    }
    for (int i = 0; i < count; ++i) {
      if (anchor_idx >= center_points_.size()) break;
      const float score = scores[i];
      if (score < cfg.conf_thresh) {
        anchor_idx++;
        continue;
      }
      const CenterPoint& pt = center_points_[anchor_idx];
      // BBox regression: [left, top, right, bottom] distances from the
      // anchor center, in stride units.
      const float left = bboxes[i * 4 + 0];
      const float top = bboxes[i * 4 + 1];
      const float right = bboxes[i * 4 + 2];
      const float bottom = bboxes[i * 4 + 3];
      // Decode to model-input pixel coordinates (e.g. 640x640).
      const float x1_640 = (pt.cx - left) * stride;
      const float y1_640 = (pt.cy - top) * stride;
      const float x2_640 = (pt.cx + right) * stride;
      const float y2_640 = (pt.cy + bottom) * stride;
      FaceDetItem det;
      // Scale to the requested source image size.
      det.bbox.x = x1_640 * scale_x;
      det.bbox.y = y1_640 * scale_y;
      det.bbox.w = (x2_640 - x1_640) * scale_x;
      det.bbox.h = (y2_640 - y1_640) * scale_y;
      det.score = score;
      det.has_landmarks = cfg.output_landmarks;
      // Keypoints: 5 (x, y) offsets from the anchor center, stride units.
      if (cfg.output_landmarks) {
        for (int p = 0; p < 5; ++p) {
          const float kps_x = kps[i * 10 + p * 2 + 0];
          const float kps_y = kps[i * 10 + p * 2 + 1];
          const float kx_640 = (pt.cx + kps_x) * stride;
          const float ky_640 = (pt.cy + kps_y) * stride;
          det.landmarks[p].x = kx_640 * scale_x;
          det.landmarks[p].y = ky_640 * scale_y;
        }
      }
      detections.push_back(det);
      anchor_idx++;
    }
  }
  return detections;
}
std::vector<FaceDetItem> ScrfdDetector::ApplyNMS(
std::vector<FaceDetItem>& dets,
float nms_thresh) {
if (dets.empty()) return dets;
// Sort by score
std::sort(dets.begin(), dets.end(),
[](const FaceDetItem& a, const FaceDetItem& b) {
return a.score > b.score;
});
std::vector<FaceDetItem> keep;
std::vector<bool> suppressed(dets.size(), false);
for (size_t i = 0; i < dets.size(); ++i) {
if (suppressed[i]) continue;
keep.push_back(dets[i]);
for (size_t j = i + 1; j < dets.size(); ++j) {
if (suppressed[j]) continue;
if (face_detection::IoU(dets[i].bbox, dets[j].bbox) > nms_thresh) {
suppressed[j] = true;
}
}
}
return keep;
}
} // namespace rk3588

View File

@ -58,7 +58,6 @@
<script>
const streams = [
{ name: 'SCRFD Face Detection', url: '/hls/scrfd/index.m3u8' },
{ name: 'Cam 1', url: '/hls/cam1/index.m3u8' },
{ name: 'Cam 2', url: '/hls/cam2/index.m3u8' },
{ name: 'Cam 3', url: '/hls/cam3/index.m3u8' },