diff --git a/configs/full_pipeline_1080p.json b/configs/full_pipeline_1080p.json new file mode 100644 index 0000000..b18b20b --- /dev/null +++ b/configs/full_pipeline_1080p.json @@ -0,0 +1,311 @@ +{ + "queue": { + "size": 8, + "strategy": "drop_oldest" + }, + "graphs": [ + { + "name": "cam1_full_pipeline", + "nodes": [ + { + "id": "in_cam1", + "type": "input_rtsp", + "role": "source", + "enable": true, + "url": "rtsp://10.0.0.49:8554/cam", + "fps": 30, + "width": 1920, + "height": 1080, + "use_mpp": true, + "use_ffmpeg": false, + "force_tcp": true, + "reconnect_sec": 5, + "reconnect_backoff_max_sec": 30 + }, + { + "id": "pre_face", + "type": "preprocess", + "role": "filter", + "enable": true, + "dst_w": 1920, + "dst_h": 1080, + "dst_format": "rgb", + "dst_packed": true, + "resize_mode": "stretch", + "keep_ratio": false, + "rga_gate": "ppe_detection", + "use_rga": true + }, + { + "id": "face_det", + "type": "ai_scrfd_sliding", + "role": "filter", + "enable": true, + "infer_fps": 5, + "model_path": "./models/scrfd_500m_640.rknn", + "model_w": 640, + "model_h": 640, + "windows": [ + {"x": 0, "y": 0, "w": 960, "h": 1080}, + {"x": 960, "y": 0, "w": 960, "h": 1080} + ], + "conf_thresh": 0.5, + "nms_thresh": 0.4, + "max_faces": 50, + "debug": { + "stats": true, + "stats_interval": 30 + } + }, + { + "id": "face_recog", + "type": "ai_face_recog", + "role": "filter", + "enable": true, + "model_path": "./models/mobilefacenet_arcface.rknn", + "align": true, + "emit_embedding": false, + "max_faces": 50, + "input_format": "rgb", + "input_dtype": "uint8", + "threshold": { + "accept": 0.45, + "margin": 0.05 + }, + "gallery": { + "backend": "sqlite", + "path": "./models/face_gallery.db", + "load_on_start": true, + "expected_dim": 128, + "dtype": "auto" + } + }, + { + "id": "pre_yolo", + "type": "preprocess", + "role": "filter", + "enable": true, + "dst_w": 768, + "dst_h": 768, + "dst_format": "rgb", + "dst_packed": true, + "resize_mode": "stretch", + "keep_ratio": false, + 
"rga_gate": "ppe_detection", + "use_rga": true + }, + { + "id": "yolo_ppe", + "type": "ai_yolo", + "role": "filter", + "enable": true, + "infer_fps": 5, + "model_path": "./models/best-768.rknn", + "model_version": "v8", + "model_w": 768, + "model_h": 768, + "num_classes": 11, + "conf": 0.25, + "nms": 0.45, + "debug": { + "stats": true, + "stats_interval": 30, + "detections": false + }, + "class_filter": [3, 6, 10] + }, + { + "id": "tracker", + "type": "tracker", + "role": "filter", + "enable": true, + "mode": "bytetrack_lite", + "per_class": true, + "state_key": "ppe_detection", + "track_classes": [3, 6, 10], + "ignore_classes": [], + "allowed_models": ["yolov8"], + "high_th": 0.5, + "low_th": 0.1, + "iou_th": 0.3, + "max_age_ms": 1500, + "min_hits": 2, + "max_tracks": 128 + }, + { + "id": "logic_boots", + "type": "logic_gate", + "role": "filter", + "enable": true, + "mode": "ppe_boots_check", + "anchor_class": 6, + "boots_class": 3, + "color_check": { + "enable": true, + "method": "hsv", + "dark_threshold": 80, + "roi_expand": 1.0 + }, + "debug": false + }, + { + "id": "pre_osd", + "type": "preprocess", + "role": "filter", + "enable": true, + "dst_w": 1920, + "dst_h": 1080, + "dst_format": "nv12", + "resize_mode": "stretch", + "rga_gate": "ppe_detection", + "use_rga": true + }, + { + "id": "osd", + "type": "osd", + "role": "filter", + "enable": true, + "draw_bbox": true, + "draw_text": true, + "draw_face_det": true, + "draw_face_bbox": true, + "line_width": 2, + "font_scale": 1, + "use_rga_bbox": false, + "labels": ["helmet", "gloves", "vest", "boots", "goggles", "none", "Person", "no_helmet", "no_goggle", "no_gloves", "no_boots", "violation"] + }, + { + "id": "publish", + "type": "publish", + "role": "filter", + "enable": true, + "queue": {"size": 2, "policy": "drop_oldest"}, + "codec": "h264", + "fps": 30, + "gop": 60, + "bitrate_kbps": 4000, + "use_mpp": true, + "use_ffmpeg_mux": true, + "outputs": [ + { + "proto": "hls", + "path": "./web/hls/cam1/index.m3u8", 
+ "segment_sec": 2 + }, + { + "proto": "rtsp_server", + "port": 8555, + "path": "/live/cam1" + } + ] + }, + { + "id": "alarm", + "type": "alarm", + "role": "sink", + "enable": true, + "eval_fps": 10, + "labels": ["helmet", "gloves", "vest", "boots", "goggles", "none", "Person", "no_helmet", "no_goggle", "no_gloves", "no_boots", "violation"], + "rules": [ + { + "name": "non_compliant_boots", + "class_ids": [10], + "roi": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0}, + "min_score": 0.3, + "min_box_area_ratio": 0.01, + "require_track_id": true, + "min_duration_ms": 800, + "min_hits": 2, + "hit_window_ms": 1000, + "cooldown_ms": 5000, + "per_track_cooldown_ms": 5000 + } + ], + "face_rules": [ + { + "name": "unknown_face", + "type": "unknown", + "cooldown_ms": 7000, + "min_sim": 0.35, + "min_hits": 2, + "hit_window_ms": 1500, + "min_face_area_ratio": 0.01, + "min_face_aspect": 0.6, + "max_face_aspect": 1.6 + }, + { + "name": "known_person", + "type": "person", + "cooldown_ms": 7000, + "min_sim": 0.6, + "min_hits": 2, + "hit_window_ms": 1500, + "min_face_area_ratio": 0.01, + "min_face_aspect": 0.6, + "max_face_aspect": 1.6 + } + ], + "actions": { + "log": { + "enable": true, + "level": "info" + }, + "snapshot": { + "enable": true, + "format": "jpg", + "quality": 85, + "upload": { + "type": "minio", + "endpoint": "http://10.0.0.49:9000", + "bucket": "myminio", + "region": "us-east-1", + "access_key": "minioadmin", + "secret_key": "minioadmin" + } + }, + "clip": { + "enable": true, + "pre_sec": 5, + "post_sec": 10, + "format": "mp4", + "fps": 30, + "upload": { + "type": "minio", + "endpoint": "http://10.0.0.49:9000", + "bucket": "myminio", + "region": "us-east-1", + "access_key": "minioadmin", + "secret_key": "minioadmin" + } + }, + "external_api": { + "enable": true, + "getTokenUrl": "http://10.0.0.49:8080/api/getToken", + "putMessageUrl": "http://10.0.0.49:8080/api/putMessage", + "tenantCode": "32", + "channelNo": "cam1", + "timeout_ms": 3000, + "include_media_url": true, + 
"token_header": "X-Access-Token", + "token_json_path": "responseBody.token", + "token_cache_sec": 1200 + } + } + } + ], + "edges": [ + ["in_cam1", "pre_face"], + ["pre_face", "face_det"], + ["face_det", "face_recog"], + ["face_recog", "pre_yolo"], + ["pre_yolo", "yolo_ppe"], + ["yolo_ppe", "tracker"], + ["tracker", "logic_boots"], + ["logic_boots", "osd"], + ["osd", "pre_osd"], + ["pre_osd", "publish"], + ["publish", "alarm"] + ] + } + ] +} diff --git a/configs/test_scrfd_640.json b/configs/test_scrfd_640.json index 6a72022..79bde68 100644 --- a/configs/test_scrfd_640.json +++ b/configs/test_scrfd_640.json @@ -14,8 +14,8 @@ "enable": true, "url": "rtsp://10.0.0.49:8554/cam", "fps": 30, - "width": 1280, - "height": 720, + "width": 1920, + "height": 1080, "use_mpp": true, "use_ffmpeg": false, "force_tcp": true, @@ -33,7 +33,7 @@ "dst_packed": true, "resize_mode": "stretch", "keep_ratio": false, - "rga_gate": "scrfd_640_test", + "rga_gate": "scrfd_1080p", "use_rga": true }, { @@ -42,9 +42,9 @@ "role": "filter", "enable": true, "model_path": "./models/scrfd_500m_640.rknn", - "conf_thresh": 0.5, + "conf_thresh": 0.3, "nms_thresh": 0.4, - "max_faces": 10, + "max_faces": 50, "output_landmarks": true, "input_format": "rgb" }, @@ -67,11 +67,11 @@ "type": "preprocess", "role": "filter", "enable": true, - "dst_w": 1280, - "dst_h": 720, + "dst_w": 1920, + "dst_h": 1080, "dst_format": "nv12", "resize_mode": "stretch", - "rga_gate": "scrfd_640_test", + "rga_gate": "scrfd_1080p", "use_rga": true }, { @@ -83,13 +83,13 @@ "codec": "h264", "fps": 30, "gop": 60, - "bitrate_kbps": 2000, + "bitrate_kbps": 4000, "use_mpp": true, "use_ffmpeg_mux": true, "outputs": [ { "proto": "hls", - "path": "./web/hls/scrfd/index.m3u8", + "path": "./web/hls/cam1/index.m3u8", "segment_sec": 2 }, { diff --git a/configs/test_scrfd_640_recog.json b/configs/test_scrfd_640_recog.json new file mode 100644 index 0000000..6a6ab22 --- /dev/null +++ b/configs/test_scrfd_640_recog.json @@ -0,0 +1,136 @@ +{ + 
"queue": { + "size": 8, + "strategy": "drop_oldest" + }, + "graphs": [ + { + "name": "scrfd_640_recog_test", + "nodes": [ + { + "id": "in_cam1", + "type": "input_rtsp", + "role": "source", + "enable": true, + "url": "rtsp://10.0.0.49:8554/cam", + "fps": 30, + "width": 1920, + "height": 1080, + "use_mpp": true, + "use_ffmpeg": false, + "force_tcp": true, + "reconnect_sec": 5, + "reconnect_backoff_max_sec": 30 + }, + { + "id": "pre_cam1", + "type": "preprocess", + "role": "filter", + "enable": true, + "dst_w": 640, + "dst_h": 640, + "dst_format": "rgb", + "dst_packed": true, + "resize_mode": "stretch", + "keep_ratio": false, + "rga_gate": "scrfd_1080p", + "use_rga": true + }, + { + "id": "scrfd", + "type": "ai_scrfd", + "role": "filter", + "enable": true, + "model_path": "./models/scrfd_500m_640.rknn", + "conf_thresh": 0.3, + "nms_thresh": 0.4, + "max_faces": 50, + "output_landmarks": true, + "input_format": "rgb" + }, + { + "id": "face_recog", + "type": "ai_face_recog", + "role": "filter", + "enable": true, + "model_path": "./models/mobilefacenet_arcface.rknn", + "align": true, + "emit_embedding": false, + "max_faces": 50, + "input_format": "rgb", + "input_dtype": "uint8", + "threshold": { + "accept": 0.45, + "margin": 0.05 + }, + "gallery": { + "backend": "sqlite", + "path": "./models/face_gallery.db", + "load_on_start": true, + "expected_dim": 512, + "dtype": "auto" + } + }, + { + "id": "osd_cam1", + "type": "osd", + "role": "filter", + "enable": true, + "draw_bbox": true, + "draw_text": true, + "draw_face_det": true, + "draw_face_bbox": true, + "line_width": 2, + "font_scale": 1, + "use_rga_bbox": false, + "labels": ["face"] + }, + { + "id": "post_cam1", + "type": "preprocess", + "role": "filter", + "enable": true, + "dst_w": 1920, + "dst_h": 1080, + "dst_format": "nv12", + "resize_mode": "stretch", + "rga_gate": "scrfd_1080p", + "use_rga": true + }, + { + "id": "pub_cam1", + "type": "publish", + "role": "filter", + "enable": true, + "queue": {"size": 2, 
"policy": "drop_oldest"}, + "codec": "h264", + "fps": 30, + "gop": 60, + "bitrate_kbps": 4000, + "use_mpp": true, + "use_ffmpeg_mux": true, + "outputs": [ + { + "proto": "hls", + "path": "./web/hls/cam1/index.m3u8", + "segment_sec": 2 + }, + { + "proto": "rtsp_server", + "port": 8555, + "path": "/live/cam1" + } + ] + } + ], + "edges": [ + ["in_cam1", "pre_cam1"], + ["pre_cam1", "scrfd"], + ["scrfd", "face_recog"], + ["face_recog", "osd_cam1"], + ["osd_cam1", "post_cam1"], + ["post_cam1", "pub_cam1"] + ] + } + ] +} diff --git a/configs/test_scrfd_sliding_1080p_recog.json b/configs/test_scrfd_sliding_1080p_recog.json new file mode 100644 index 0000000..f1f6cc6 --- /dev/null +++ b/configs/test_scrfd_sliding_1080p_recog.json @@ -0,0 +1,139 @@ +{ + "queue": { + "size": 8, + "strategy": "drop_oldest" + }, + "graphs": [ + { + "name": "scrfd_sliding_1080p_recog", + "nodes": [ + { + "id": "in_cam1", + "type": "input_rtsp", + "role": "source", + "enable": true, + "url": "rtsp://10.0.0.49:8554/cam", + "fps": 30, + "width": 1920, + "height": 1080, + "use_mpp": true, + "use_ffmpeg": false, + "force_tcp": true, + "reconnect_sec": 5, + "reconnect_backoff_max_sec": 30 + }, + { + "id": "pre_cam1", + "type": "preprocess", + "role": "filter", + "enable": true, + "dst_w": 1920, + "dst_h": 1080, + "dst_format": "rgb", + "dst_packed": true, + "resize_mode": "stretch", + "keep_ratio": false, + "rga_gate": "scrfd_sliding_1080p_recog", + "use_rga": true + }, + { + "id": "scrfd_sliding", + "type": "ai_scrfd_sliding", + "role": "filter", + "enable": true, + "model_path": "./models/scrfd_500m_640.rknn", + "conf_thresh": 0.5, + "nms_thresh": 0.4, + "max_faces": 50, + "output_landmarks": true, + "windows": [ + {"x": 0, "y": 0, "w": 960, "h": 1080}, + {"x": 960, "y": 0, "w": 960, "h": 1080} + ] + }, + { + "id": "face_recog", + "type": "ai_face_recog", + "role": "filter", + "enable": true, + "model_path": "./models/mobilefacenet_arcface.rknn", + "align": true, + "emit_embedding": false, + 
"max_faces": 50, + "input_format": "rgb", + "input_dtype": "uint8", + "threshold": { + "accept": 0.45, + "margin": 0.05 + }, + "gallery": { + "backend": "sqlite", + "path": "./models/face_gallery.db", + "load_on_start": true, + "expected_dim": 512, + "dtype": "auto" + } + }, + { + "id": "osd_cam1", + "type": "osd", + "role": "filter", + "enable": true, + "draw_bbox": true, + "draw_text": true, + "draw_face_det": true, + "draw_face_bbox": true, + "line_width": 2, + "font_scale": 1, + "use_rga_bbox": false, + "labels": ["face"] + }, + { + "id": "post_cam1", + "type": "preprocess", + "role": "filter", + "enable": true, + "dst_w": 1920, + "dst_h": 1080, + "dst_format": "nv12", + "resize_mode": "stretch", + "rga_gate": "scrfd_sliding_1080p_recog", + "use_rga": true + }, + { + "id": "pub_cam1", + "type": "publish", + "role": "filter", + "enable": true, + "queue": {"size": 2, "policy": "drop_oldest"}, + "codec": "h264", + "fps": 30, + "gop": 60, + "bitrate_kbps": 4000, + "use_mpp": true, + "use_ffmpeg_mux": true, + "outputs": [ + { + "proto": "hls", + "path": "./web/hls/cam1/index.m3u8", + "segment_sec": 2 + }, + { + "proto": "rtsp_server", + "port": 8555, + "path": "/live/cam1" + } + ] + } + ], + "edges": [ + ["in_cam1", "pre_cam1"], + ["pre_cam1", "scrfd_sliding"], + ["scrfd_sliding", "face_recog"], + ["face_recog", "osd_cam1"], + ["osd_cam1", "post_cam1"], + ["post_cam1", "pub_cam1"] + ] + } + ] +} diff --git a/docs/design/detection_parameters_guide.md b/docs/design/detection_parameters_guide.md new file mode 100644 index 0000000..8332a78 --- /dev/null +++ b/docs/design/detection_parameters_guide.md @@ -0,0 +1,727 @@ +# 人脸检测参数配置指南 + +本文档详细说明人脸检测节点(`ai_face_det`, `ai_scrfd`, `ai_scrfd_sliding`)中的核心后处理参数及其对检测结果的影响。 + +--- + +## 参数概览 + +| 参数名 | 类型 | 默认值 | 范围 | 说明 | +|--------|------|--------|------|------| +| `conf_thresh` | float | 0.5/0.6/0.7 | 0.0 ~ 1.0 | 置信度阈值,过滤低置信度候选框 | +| `nms_thresh` | float | 0.4 | 0.0 ~ 1.0 | NMS IoU 阈值,控制重复框去重力度 | +| `max_faces` | int | 10/50 | 
≥ 1 | 单帧最大返回人脸数 | + +--- + +## 1. conf_thresh (置信度阈值) + +### 含义 + +模型对检测到的人脸的置信度(confidence score)阈值。神经网络在推理时会对每个候选框输出一个置信度分数,表示该位置存在人脸的确定性程度。 + +### 对检测结果的影响 + +| 设置 | 效果 | 适用场景 | +|------|------|----------| +| **调高** (如 0.7) | 减少误检,只保留高置信度人脸 | 高精度需求场景(门禁、考勤) | +| **调低** (如 0.2) | 增加检出率,更多弱特征人脸被检测 | 弱光、远距离、小目标场景 | + +### 注意事项 + +- 值过高可能导致**漏检**:模糊人脸、侧脸、小人脸可能被过滤 +- 值过低可能导致**误检**:背景中的类似人脸的纹理可能被误判 + +### 代码实现 + +在 SCRFD 后处理中 (`ai_scrfd_node.cpp:282`): + +```cpp +if (score < cfg_.conf_thresh) continue; +``` + +在 RetinaFace 后处理中 (`ai_face_det_node.cpp:784`): + +```cpp +if (score < cfg.conf_thresh) continue; +``` + +--- + +## 2. nms_thresh (NMS IoU 阈值) + +### 含义 + +非极大值抑制(Non-Maximum Suppression)的 IoU(Intersection over Union,交并比)阈值。 + +**NMS 的作用**:同一个真实人脸可能被多个 anchor/候选框检测到,NMS 用于去除重叠的重复检测框,只保留最优的一个。 + +**IoU 计算**: +``` +IoU = 两个框的交集面积 / 两个框的并集面积 +``` + +### 对检测结果的影响 + +| 设置 | 效果 | 适用场景 | +|------|------|----------| +| **调高** (如 0.6) | 保留更多重叠框,对密集人脸友好 | 多人密集场景(会议室、教室) | +| **调低** (如 0.3) | 严格去重,只保留最优框 | 单人场景、需要精确框选 | + +### 注意事项 + +- 值过高:同一人脸可能返回多个重叠框 +- 值过低:密集人脸场景可能误删相邻的不同人脸 + +### 代码实现 + +在 SCRFD 后处理中 (`ai_scrfd_node.cpp:172`): + +```cpp +detections = ApplyNMS(detections, cfg_.nms_thresh); +``` + +NMS 算法逻辑 (`ai_face_det_node.cpp:156-167`): + +```cpp +void NmsSorted(const std::vector& boxes, const std::vector& scores, + float nms_thresh, std::vector& keep) { + for (...) { + bool suppressed = false; + for (int kept : keep) { + if (IoU(boxes[idx], boxes[kept]) > nms_thresh) { + suppressed = true; // 被已保留的框抑制 + break; + } + } + if (!suppressed) keep.push_back(idx); + } +} +``` + +--- + +## 3. 
max_faces (最大人脸数) + +### 含义 + +单帧图像中最多返回的人脸检测数量限制。 + +### 对检测结果的影响 + +| 设置 | 效果 | 性能影响 | +|------|------|----------| +| **调高** (如 50) | 可检测更多人脸,不遗漏密集场景目标 | 增加后处理开销,RGA/OSD 绘制负载增大 | +| **调低** (如 5) | 仅保留置信度最高的前几个人脸 | 减少计算量,提升实时性 | + +### 注意事项 + +- 当画面中出现超过 `max_faces` 数量的人脸时,系统会按置信度排序,只保留前 N 个 +- 设置过大可能导致 RGA 任务堆积,引起 OSD 绘制卡顿 + +### 代码实现 + +在 SCRFD 后处理中 (`ai_scrfd_node.cpp:174-176`): + +```cpp +if (detections.size() > static_cast(cfg_.max_faces)) { + detections.resize(cfg_.max_faces); +} +``` + +在 RetinaFace 后处理中 (`ai_face_det_node.cpp:840`): + +```cpp +const int out_n = std::min(cfg.max_faces, static_cast(keep.size())); +``` + +--- + +## 推荐配置 + +### 按应用场景 + +| 场景 | conf_thresh | nms_thresh | max_faces | 说明 | +|------|-------------|------------|-----------|------| +| **高精度门禁/考勤** | 0.6 ~ 0.7 | 0.4 | 5 ~ 10 | 减少误识别,确保准确率 | +| **多人大场景** (会议室/教室) | 0.4 ~ 0.5 | 0.4 ~ 0.5 | 20 ~ 50 | 平衡检出率和去重效果 | +| **实时性优先** | 0.5 | 0.4 | 10 | 减少后处理开销 | +| **弱光/远距离/小目标** | 0.3 ~ 0.4 | 0.3 | 10 ~ 20 | 提高检出率,但需容忍一定误检 | +| **单人视频通话** | 0.6 | 0.4 | 1 ~ 3 | 最小化处理开销 | + +### 按硬件性能 + +| 设备性能 | max_faces 建议 | 优化策略 | +|----------|----------------|----------| +| **RK3588 高性能模式** | 20 ~ 50 | 可同时处理多路高清视频 | +| **RK3588 平衡模式** | 10 ~ 20 | 适当降低分辨率和检测频率 | +| **RK3566/RK3568** | 5 ~ 10 | 降低输入分辨率,提高 conf_thresh 减少候选框 | + +--- + +## 参数联动关系 + +这三个参数需要协同调整: + +1. **提高 `conf_thresh`** → 候选框数量减少 → 可降低 `max_faces` → NMS 压力减小 +2. **降低 `conf_thresh`** → 候选框数量增加 → 可能需要提高 `max_faces` → NMS 压力增大 +3. 
**密集场景**:适当提高 `nms_thresh` 避免误删相邻人脸,同时确保 `max_faces` 足够大 + +--- + +## 配置示例 + +### SCRFD 配置 (`ai_scrfd` 节点) + +```json +{ + "type": "ai_scrfd", + "model_path": "./models/scrfd_500m_640.rknn", + "conf_thresh": 0.5, + "nms_thresh": 0.4, + "max_faces": 50, + "output_landmarks": true, + "input_format": "rgb" +} +``` + +### RetinaFace 配置 (`ai_face_det` 节点) + +```json +{ + "type": "ai_face_det", + "model_path": "./models/RetinaFace_mobile320.rknn", + "conf": 0.7, + "nms": 0.4, + "max_faces": 10, + "output_landmarks": true, + "input_format": "rgb" +} +``` + +### 分区域检测配置 (`ai_face_det_zoned` 节点) + +```json +{ + "type": "ai_face_det_zoned", + "model_path": "./models/RetinaFace_mobile320.rknn", + "conf": 0.6, + "nms": 0.4, + "max_faces": 10, + "output_landmarks": true +} +``` + +--- + +## 调试建议 + +1. **先调 conf_thresh**:从默认值开始,观察是否漏检或误检 +2. **再调 nms_thresh**:在密集人脸场景测试,确保既不重复框选也不漏检 +3. **最后调 max_faces**:根据实际场景人数和硬件性能调整 + +### 日志查看 + +启动时节点会打印当前参数: + +``` +[ai_face_det] start id=face_det conf=0.7 nms=0.4 max_faces=10 +[ai_scrfd] start id=scrfd conf=0.5 nms=0.4 max_faces=50 +``` + +--- + +## 常见问题 + +### Q1: 为什么检测到的人脸框会抖动/闪烁? + +**可能原因**: +- `conf_thresh` 设置过低,边缘候选框置信度波动导致时有时无 +- `nms_thresh` 过低,相邻帧选择不同的 anchor + +**解决方法**:适当提高 `conf_thresh` 或调整 `nms_thresh` + +### Q2: 密集场景漏检严重怎么办? + +**解决方法**: +- 降低 `conf_thresh` 到 0.4 左右 +- 提高 `max_faces` 到 30 以上 +- 适当提高 `nms_thresh` 到 0.5,避免相邻人脸被抑制 + +### Q3: OSD 绘制卡顿,RGA 任务堆积? 
+ +**解决方法**: +- 降低 `max_faces` 减少绘制负载 +- 提高 `conf_thresh` 减少检测数量 + +--- + +## 四、人脸识别参数 (`ai_face_recog`) + +`ai_face_recog` 节点接收人脸检测结果,提取人脸特征向量并与特征库进行比对,完成人脸识别。 + +### 4.1 参数概览 + +| 参数名 | 类型 | 默认值 | 说明 | +|--------|------|--------|------| +| `align` | bool | true | 是否使用5点关键点进行人脸对齐 | +| `emit_embedding` | bool | false | 是否输出特征向量(用于调试) | +| `max_faces` | int | 10 | 单帧最大处理人脸数 | +| `input_format` | string | "rgb" | 输入图像格式:rgb/bgr | +| `input_dtype` | string | "uint8" | 输入数据类型:uint8/float | +| `threshold.accept` | float | 0.45 | 识别通过阈值,相似度超过此值才接受 | +| `threshold.margin` | float | 0.05 | 边距阈值,最佳与次佳匹配的差距要求 | +| `gallery.backend` | string | "sqlite" | 人脸库后端类型 | +| `gallery.path` | string | "./models/face_gallery.db" | 人脸库文件路径 | + +--- + +### 4.2 align (人脸对齐) + +#### 含义 + +是否使用检测到的5个面部关键点(眼睛、鼻子、嘴角)进行人脸对齐变换。 + +#### 对识别效果的影响 + +| 设置 | 效果 | 适用场景 | +|------|------|----------| +| **true** | 对齐后人脸姿态归一化,提高识别准确率 | 高位摄像头、角度倾斜、侧脸场景 | +| **false** | 直接裁剪人脸区域,计算量略小 | 正面、固定位置场景 | + +#### 对齐原理 + +使用5点关键点与标准模板进行相似变换(Similarity Transform): +- 标准模板坐标(112x112输入):左眼(38.29,51.70)、右眼(73.53,51.50)、鼻尖(56.02,71.74)、左嘴角(41.55,92.37)、右嘴角(70.73,92.20) +- 代码实现:`ai_face_recog_node.cpp:851-865` + +```cpp +if (cfg->align && face.has_landmarks && model_w_ == 112 && model_h_ == 112) { + const std::array dst = { ... 
}; // 标准模板 + SimilarityTransform t; + InvTransform inv; + if (ComputeSimilarity(face.landmarks, dst, t) && InvertSimilarity(t, inv)) { + WarpFace(src, w, h, stride, inv, face_buf_.data(), model_w_, model_h_, need_swap); + } +} +``` + +--- + +### 4.3 threshold.accept (接受阈值) + +#### 含义 + +特征向量相似度阈值,范围 `0.0 ~ 1.0`。只有当待识别人脸与库中某人的相似度超过此值时,才认为是匹配成功。 + +#### 对识别结果的影响 + +| 设置 | 效果 | 误识率 | 拒识率 | +|------|------|--------|--------| +| **调高** (如 0.55) | 更严格,只接受高度相似 | 低 | 高 | +| **调低** (如 0.35) | 更宽松,容易匹配 | 高 | 低 | + +#### 推荐值 + +| 场景 | 推荐值 | 说明 | +|------|--------|------| +| **高安全性场景** | 0.50 ~ 0.55 | 门禁、支付,严格控制误识 | +| **一般场景** | 0.45 ~ 0.50 | 考勤、签到,平衡准确率和体验 | +| **快速通行场景** | 0.40 ~ 0.45 | 闸机、通道,减少拒识 | + +#### 代码实现 + +```cpp +const bool accept = (sr.best_person_id >= 0) && + (sr.best_sim >= cfg->thr_accept) && + ((cfg->thr_margin <= 0.0f) || ((sr.best_sim - sr.second_sim) >= cfg->thr_margin)); +``` + +--- + +### 4.4 threshold.margin (边距阈值) + +#### 含义 + +要求最佳匹配与次佳匹配的相似度差距至少达到此值,用于排除模糊匹配(如两个人都很像的情况)。设为 `0` 或负数可禁用此检查。 + +#### 作用示例 + +假设待识别人脸与库中人员相似度如下: +- 张三(最佳): 0.62 +- 李四(次佳): 0.58 +- 差距: 0.04 + +如果 `margin = 0.05`,则 0.04 < 0.05,匹配失败(标记为 unknown) +如果 `margin = 0.03`,则 0.04 > 0.03,匹配成功(识别为张三) + +#### 推荐值 + +- **0.05**(默认):适合大多数人脸库 +- **0.00** 或负数:禁用边距检查,只依赖 accept 阈值 + +--- + +### 4.5 max_faces (最大处理人脸数) + +#### 含义 + +单帧最多处理的人脸数量。由于特征提取需要 NPU 推理,此参数直接影响处理延迟。 + +#### 与检测节点 max_faces 的关系 + +``` +实际处理数 = min(face_det.max_faces, face_recog.max_faces) +``` + +建议两个节点的 `max_faces` 保持一致或识别节点略小。 + +--- + +### 4.6 gallery (人脸库配置) + +#### 参数说明 + +| 参数 | 默认值 | 说明 | +|------|--------|------| +| `backend` | "sqlite" | 后端类型,目前仅支持 sqlite | +| `path` | "./models/face_gallery.db" | 人脸库数据库文件路径 | +| `load_on_start` | true | 启动时加载到内存 | +| `expected_dim` | 512 | 特征向量维度(MobileFaceNet 为 512) | +| `dtype` | "auto" | 数据类型,auto/float32 | + +#### 人脸库管理 + +人脸库使用 SQLite 存储,包含以下信息: +- `person_id`:人员唯一ID +- `name`:人员名称 +- `embedding`:特征向量(512维浮点数) +- 可通过 Web 管理接口或脚本添加/删除/更新人脸 + +--- + +### 4.7 normalize 
(输入归一化) + +#### 两种归一化方式 + +**方式一:缩放+偏移(简单)** +```json +{ + "normalize": { + "scale": 0.0078125, + "bias": 0.0 + } +} +``` +公式:`output = input * scale + bias` + +**方式二:均值+标准差(标准)** +```json +{ + "normalize": { + "mean": [127.5, 127.5, 127.5], + "std": [128.0, 128.0, 128.0] + } +} +``` +公式:`output = (input - mean) / std` + +#### 默认值 + +MobileFaceNet 模型通常使用: +- `scale`: 1.0(不对 uint8 输入做缩放,由模型内部处理) +- 或 `mean: [127.5,127.5,127.5], std: [127.5,127.5,127.5]` 归一化到 [-1, 1] + +--- + +### 4.8 人脸识别配置示例 + +```json +{ + "id": "face_recog", + "type": "ai_face_recog", + "role": "filter", + "enable": true, + "model_path": "./models/mobilefacenet_arcface.rknn", + "align": true, + "emit_embedding": false, + "max_faces": 50, + "input_format": "rgb", + "input_dtype": "uint8", + "threshold": { + "accept": 0.45, + "margin": 0.05 + }, + "gallery": { + "backend": "sqlite", + "path": "./models/face_gallery.db", + "load_on_start": true, + "expected_dim": 512, + "dtype": "auto" + } +} +``` + +--- + +### 4.9 检测+识别完整流程配置 + +```json +{ + "graphs": [{ + "nodes": [ + { + "id": "scrfd", + "type": "ai_scrfd", + "conf_thresh": 0.3, + "nms_thresh": 0.4, + "max_faces": 50, + "output_landmarks": true + }, + { + "id": "face_recog", + "type": "ai_face_recog", + "align": true, + "max_faces": 50, + "threshold": { "accept": 0.45, "margin": 0.05 }, + "gallery": { "path": "./models/face_gallery.db" } + }, + { + "id": "osd", + "type": "osd", + "draw_face_det": true, + "draw_face_bbox": true + } + ], + "edges": [ + ["scrfd", "face_recog"], + ["face_recog", "osd"] + ] + }] +} +``` + +--- + +--- + +## 五、滑动窗口检测参数 (`ai_scrfd_sliding`) + +`ai_scrfd_sliding` 是专为**高分辨率视频**设计的滑动窗口检测节点,通过将画面分割成多个窗口分别检测,有效提升远处小目标的检出率。 + +### 5.1 节点特性 + +| 特性 | 说明 | +|------|------| +| **原始分辨率输入** | 直接接收原始图像,保留更多细节 | +| **滑动窗口检测** | 将画面分割成多个窗口,分别检测后合并结果 | +| **固定模型输入尺寸** | 每个窗口直接 resize 到 640x640(不保持宽高比),轻微变形但可接受 | +| **窗口可配置** | 支持自定义窗口数量和位置 | + +### 5.2 参数说明 + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `model_path` | string | - | 
SCRFD 模型路径 | +| `conf_thresh` | float | 0.3 | 置信度阈值 | +| `nms_thresh` | float | 0.4 | NMS IoU 阈值 | +| `max_faces` | int | 50 | 最大检测人脸数 | +| `output_landmarks` | bool | true | 是否输出5点关键点 | +| `windows` | array | 自动计算 | 窗口配置数组 | + +### 5.3 窗口配置 (`windows`) + +如果不配置 `windows`,节点会根据输入分辨率自动计算窗口。 + +**窗口格式**: +```json +{ + "x": 0, // 窗口左上角 X 坐标 + "y": 0, // 窗口左上角 Y 坐标 + "w": 960, // 窗口宽度 + "h": 1080 // 窗口高度 +} +``` + +**窗口设计原则**: +- 窗口之间应有适当重叠,避免漏检 +- 窗口尺寸建议接近 640x640 的倍数(resize 后变形较小) +- 对于 16:9 视频,水平分割效果较好 + +### 5.4 不同分辨率配置参考 + +#### 1080p (1920×1080) - 推荐2窗口 + +```json +{ + "windows": [ + {"x": 0, "y": 0, "w": 960, "h": 1080}, + {"x": 960, "y": 0, "w": 960, "h": 1080} + ] +} +``` + +**说明**: +- 窗口 0:左半边 960x1080 +- 窗口 1:右半边 960x1080 +- 正好覆盖 1920 宽度,无重叠 +- 每个窗口 resize 到 640x640,比例 0.89:1 + +#### 1440p (2560×1440) - 推荐2窗口 + +```json +{ + "windows": [ + {"x": 0, "y": 0, "w": 1280, "h": 1440}, + {"x": 1280, "y": 0, "w": 1280, "h": 1440} + ] +} +``` + +**说明**: +- 窗口 0:左半边 1280x1440 +- 窗口 1:右半边 1280x1440 +- 比例 0.89:1,与 1080p 一致 + +#### 更高分辨率 - 增加窗口数 + +对于 4K (3840×2160) 等更高分辨率,可以增加窗口数量(注意:下例 3 个窗口的范围为 y=0~1080,仅覆盖画面上半部分;若需覆盖整幅画面,需为下半部分 y=1080~2160 再增加一排窗口): + +```json +{ + "windows": [ + {"x": 0, "y": 0, "w": 1280, "h": 1080}, + {"x": 1280, "y": 0, "w": 1280, "h": 1080}, + {"x": 2560, "y": 0, "w": 1280, "h": 1080} + ] +} +``` + +### 5.5 配置示例 + +```json +{ + "id": "scrfd_sliding", + "type": "ai_scrfd_sliding", + "role": "filter", + "enable": true, + "model_path": "./models/scrfd_500m_640.rknn", + "conf_thresh": 0.3, + "nms_thresh": 0.4, + "max_faces": 50, + "output_landmarks": true, + "windows": [ + {"x": 0, "y": 0, "w": 960, "h": 1080}, + {"x": 960, "y": 0, "w": 960, "h": 1080} + ] +} +``` + +### 5.6 性能考量 + +- **窗口数 = 推理次数**:2 个窗口 = 2 次模型推理 +- **分辨率越高,窗口数越多**:需要在检测效果和性能之间平衡 +- **建议窗口数**: + - 1080p:2 个窗口 + - 1440p:2 个窗口(或 4 个窗口用于更精细检测) + - 4K:3-4 个窗口 + +### 5.7 滑动窗口检测常见问题 + +#### Q7: 窗口边缘的人脸被分割成两半? + +**解决方法**: +- 增加窗口重叠区域(如窗口 0 结束于 1000,窗口 1 开始于 900) +- NMS 会自动合并重复检测 + +#### Q8: 远处人脸还是检测不到? 
+ + **解决方法**: +- 增加窗口数量,让每个窗口覆盖更小区域 +- 降低 `conf_thresh` 让更多候选框通过 +- 考虑使用更高分辨率摄像头 + +#### Q9: 检测延迟增加? + +**解决方法**: +- 减少窗口数量 +- 降低 `max_faces` 减少后处理负担 +- 使用更高性能硬件 + +--- + +## 六、综合配置建议 + +### 6.1 场景配置速查表 + +| 场景 | 检测节点 | 关键参数 | 说明 | +|------|----------|----------|------| +| **门禁/考勤** | `ai_face_det` | conf=0.7, max_faces=5 | 近距离,高精度 | +| **车间/厂房** | `ai_scrfd_sliding` | 2窗口 | 高位摄像头,大透视 | +| **会议室** | `ai_scrfd` | conf=0.4, max_faces=50 | 多人场景 | +| **户外/街道** | `ai_scrfd_sliding` | 2-4窗口 | 远距离检测 | + +### 6.2 分辨率配置对照表 + +| 分辨率 | 检测节点 | 输入处理 | 建议 | +|--------|----------|----------|------| +| 720p | `ai_scrfd` | 前置缩放至640 | 通用配置 | +| 1080p | `ai_scrfd_sliding` | 2窗口(960x1080) | 滑动窗口检测 | +| 1440p | `ai_scrfd_sliding` | 2窗口(1280x1440) | 滑动窗口检测 | +| 4K | `ai_scrfd_sliding` | 3-4窗口 | 更多窗口提升精度 | + +--- + +## 七、常见问题汇总 + +### Q1: 检测框抖动/闪烁 + +**可能原因**: +- `conf_thresh` 设置过低,边缘候选框置信度波动 +- `nms_thresh` 过低,相邻帧选择不同 anchor + +**解决方法**:适当提高 `conf_thresh` 或调整 `nms_thresh` + +### Q2: 密集场景漏检严重? + +**解决方法**: +- 降低 `conf_thresh` 到 0.4 左右 +- 提高 `max_faces` 到 30 以上 +- 适当提高 `nms_thresh` 到 0.5 + +### Q3: OSD 绘制卡顿? + +**解决方法**: +- 降低 `max_faces` 减少绘制负载 +- 提高 `conf_thresh` 减少检测数量 + +### Q4: 识别准确率不高? + +**可能原因及解决方法**: +1. **对齐问题**:确保 `align: true`,且检测节点 `output_landmarks: true` +2. **阈值不合适**:调整 `threshold.accept`,根据实际测试确定最佳值 +3. **人脸库质量**:确保库中人脸照片清晰、正面、光线均匀 +4. **检测框质量**:适当提高检测 `conf_thresh`,过滤低质量检测框 + +### Q5: 远距离/小目标识别效果差? + +**解决方法**: +- 提高检测 `conf_thresh`,让只有清晰的人脸进入识别 +- 检查摄像头分辨率,确保人脸区域至少 60x60 像素 +- 考虑使用更高清的摄像头或调整安装角度 + +### Q6: 识别延迟高? 
+ +**优化方法**: +- 降低 `max_faces`,减少单帧处理数量 +- 提高检测 `conf_thresh`,减少候选框 +- 确保 `gallery.load_on_start: true`,避免运行时查询数据库 + +--- + +## 相关文档 + +- [SCRFD 模型规格说明](../scrfd_500m_640_spec.md) +- [YOLO 检测参数配置](../config_guide.md) +- [DAG 节点与边说明](./dag_graph_node_edge.md) +- [MobileFaceNet 模型说明](../models.md) diff --git a/include/face/face_detection_utils.h b/include/face/face_detection_utils.h index 7f59eec..0ccda4e 100644 --- a/include/face/face_detection_utils.h +++ b/include/face/face_detection_utils.h @@ -542,142 +542,99 @@ inline float ExtractNCHW(const TensorView& t, int c, int h, int w, int C, int H, } /** - * 解码SCRFD检测结果 + * 解码SCRFD检测结果 - 与 ai_scrfd 节点使用相同的逻辑 * * @param outputs 9个输出张量 [score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32] - * @param anchors 预生成的anchor + * @param anchors 预生成的anchor (center_x, center_y, stride) * @param src_w 原始图像宽度 * @param src_h 原始图像高度 * @param model_w 模型输入宽度 * @param model_h 模型输入高度 - * @param cfg 检测配置 + * @param conf_thresh 置信度阈值 + * @param output_lm 是否输出关键点 * @param out 输出结果 */ inline void DecodeScrfd(const std::vector& outputs, const std::vector& anchors, int src_w, int src_h, int model_w, int model_h, - const DetectionConfig& cfg, + float conf_thresh, + bool output_lm, FaceDetResult& out) { - if (outputs.size() != 9) { - return; // SCRFD需要9个输出 - } + if (outputs.size() != 9) return; - const float sx = static_cast(src_w) / static_cast(model_w); - const float sy = static_cast(src_h) / static_cast(model_h); - - std::vector boxes; - std::vector scores; - std::vector> lmks; + // Output order: score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32 + const int anchor_counts[] = {12800, 3200, 800}; + const int strides[] = {8, 16, 32}; size_t anchor_idx = 0; - const int strides[3] = {8, 16, 32}; + float scale_x = static_cast(src_w) / model_w; + float scale_y = static_cast(src_h) / model_h; for (int s = 0; s < 3; ++s) { - int score_idx = s; - int bbox_idx = s + 3; - int kps_idx = s + 6; int stride 
= strides[s]; + int count = anchor_counts[s]; - const TensorView& score_t = outputs[score_idx]; - const TensorView& bbox_t = outputs[bbox_idx]; - const TensorView& kps_t = outputs[kps_idx]; + // 检查输出数据是否有效 + if (outputs[s].type != RKNN_TENSOR_FLOAT32 || + outputs[s + 3].type != RKNN_TENSOR_FLOAT32 || + outputs[s + 6].type != RKNN_TENSOR_FLOAT32) { + continue; + } - // 检查维度 - if (score_t.dims.size() < 4 || bbox_t.dims.size() < 4) continue; + const float* scores = reinterpret_cast(outputs[s].data); + const float* bboxes = reinterpret_cast(outputs[s + 3].data); + const float* kps = reinterpret_cast(outputs[s + 6].data); - int C = static_cast(score_t.dims[1]); - int H = static_cast(score_t.dims[2]); - int W = static_cast(score_t.dims[3]); - int anchors_per_loc = C / 2; // fg/bg + if (!scores || !bboxes || !kps) continue; - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - for (int a = 0; a < anchors_per_loc; ++a) { - if (anchor_idx >= anchors.size()) break; - - // 提取前景分数 (channel a*2+1) - float score = ExtractNCHW(score_t, a * 2 + 1, h, w, C, H, W); - - if (score >= cfg.conf_thresh) { - const ScrfdAnchor& anchor = anchors[anchor_idx]; - - // 提取bbox [dx, dy, dw, dh] - float dx = ExtractNCHW(bbox_t, a * 4 + 0, h, w, - static_cast(bbox_t.dims[1]), H, W) * stride; - float dy = ExtractNCHW(bbox_t, a * 4 + 1, h, w, - static_cast(bbox_t.dims[1]), H, W) * stride; - float dw = ExtractNCHW(bbox_t, a * 4 + 2, h, w, - static_cast(bbox_t.dims[1]), H, W) * stride; - float dh = ExtractNCHW(bbox_t, a * 4 + 3, h, w, - static_cast(bbox_t.dims[1]), H, W) * stride; - - float cx = anchor.cx + dx; - float cy = anchor.cy + dy; - float x1 = (cx - dw * 0.5f) * sx; - float y1 = (cy - dh * 0.5f) * sy; - float x2 = (cx + dw * 0.5f) * sx; - float y2 = (cy + dh * 0.5f) * sy; - - x1 = static_cast(ClampInt(static_cast(x1), 0, src_w - 1)); - y1 = static_cast(ClampInt(static_cast(y1), 0, src_h - 1)); - x2 = static_cast(ClampInt(static_cast(x2), 0, src_w - 1)); - y2 = 
static_cast(ClampInt(static_cast(y2), 0, src_h - 1)); - - Rect bb; - bb.x = x1; - bb.y = y1; - bb.w = std::max(0.0f, x2 - x1); - bb.h = std::max(0.0f, y2 - y1); - - if (bb.w > 1.0f && bb.h > 1.0f) { - boxes.push_back(bb); - scores.push_back(score); - - // 提取关键点 - if (cfg.output_landmarks) { - std::array pts{}; - for (int k = 0; k < 5; ++k) { - float lx = ExtractNCHW(kps_t, a * 10 + k * 2 + 0, h, w, - static_cast(kps_t.dims[1]), H, W) * stride; - float ly = ExtractNCHW(kps_t, a * 10 + k * 2 + 1, h, w, - static_cast(kps_t.dims[1]), H, W) * stride; - pts[k].x = (anchor.cx + lx) * sx; - pts[k].y = (anchor.cy + ly) * sy; - } - lmks.push_back(pts); - } - } - } - - ++anchor_idx; + for (int i = 0; i < count; ++i) { + if (anchor_idx >= anchors.size()) break; + + float score = scores[i]; + if (score < conf_thresh) { + anchor_idx++; + continue; + } + + const ScrfdAnchor& pt = anchors[anchor_idx]; + + // BBox: [left, top, right, bottom] - distances from center + float left = bboxes[i * 4 + 0]; + float top = bboxes[i * 4 + 1]; + float right = bboxes[i * 4 + 2]; + float bottom = bboxes[i * 4 + 3]; + + // Decode to image coordinates (640x640) + float x1_640 = (pt.cx - left) * stride; + float y1_640 = (pt.cy - top) * stride; + float x2_640 = (pt.cx + right) * stride; + float y2_640 = (pt.cy + bottom) * stride; + + FaceDetItem det; + det.bbox.x = x1_640 * scale_x; + det.bbox.y = y1_640 * scale_y; + det.bbox.w = (x2_640 - x1_640) * scale_x; + det.bbox.h = (y2_640 - y1_640) * scale_y; + det.score = score; + det.has_landmarks = output_lm; + + // Keypoints + if (output_lm) { + for (int p = 0; p < 5; ++p) { + float kps_x = kps[i * 10 + p * 2 + 0]; + float kps_y = kps[i * 10 + p * 2 + 1]; + float kx_640 = (pt.cx + kps_x) * stride; + float ky_640 = (pt.cy + kps_y) * stride; + det.landmarks[p].x = kx_640 * scale_x; + det.landmarks[p].y = ky_640 * scale_y; } } + + out.faces.push_back(det); + anchor_idx++; } } - - if (boxes.empty()) return; - - // NMS - std::vector keep; - NmsSorted(boxes, 
scores, cfg.nms_thresh, keep); - if (keep.empty()) return; - - // 构建输出 - const int out_n = std::min(cfg.max_faces, static_cast(keep.size())); - out.faces.reserve(static_cast(out_n)); - for (int i = 0; i < out_n; ++i) { - const int k = keep[static_cast(i)]; - FaceDetItem item; - item.bbox = boxes[static_cast(k)]; - item.score = scores[static_cast(k)]; - item.track_id = -1; - if (cfg.output_landmarks && k < static_cast(lmks.size())) { - item.has_landmarks = true; - item.landmarks = lmks[static_cast(k)]; - } - out.faces.push_back(std::move(item)); - } } } // namespace face_detection diff --git a/include/face/scrfd_detector.h b/include/face/scrfd_detector.h new file mode 100644 index 0000000..a6883c5 --- /dev/null +++ b/include/face/scrfd_detector.h @@ -0,0 +1,85 @@ +#pragma once + +/** + * SCRFD Detector - 可复用的 SCRFD 检测器 + * 供 ai_scrfd 和 ai_scrfd_zoned 节点使用 + */ + +#include +#include +#include "face/face_result.h" + +// 包含 AiScheduler 以使用 BorrowedOutput +#include "ai_scheduler.h" + +namespace rk3588 { + +/** + * SCRFD 检测结果 + */ +struct ScrfdDetection { + FaceDetItem item; +}; + +/** + * SCRFD 检测器配置 + */ +struct ScrfdConfig { + float conf_thresh = 0.5f; + float nms_thresh = 0.4f; + int max_faces = 50; + bool output_landmarks = true; +}; + +/** + * SCRFD 检测器 + * + * 使用示例: + * ScrfdDetector det; + * det.Init(640, 640); + * auto dets = det.Decode(outputs, src_w, src_h, config); + */ +class ScrfdDetector { +public: + ScrfdDetector(); + ~ScrfdDetector(); + + /** + * 初始化检测器 + * @param model_w 模型输入宽度 (640) + * @param model_h 模型输入高度 (640) + */ + void Init(int model_w, int model_h); + + /** + * 解码 SCRFD 输出 + * @param outputs 9个输出张量 (BorrowedOutput) + * @param src_w 原始图像宽度 + * @param src_h 原始图像高度 + * @param cfg 检测配置 + * @return 检测结果列表 + */ + std::vector Decode( + const std::vector& outputs, + int src_w, int src_h, + const ScrfdConfig& cfg); + + /** + * 应用 NMS + */ + std::vector ApplyNMS( + std::vector& dets, + float nms_thresh); + +private: + struct CenterPoint { + float cx, cy; 
+ float stride; + }; + + std::vector center_points_; + int model_w_ = 640; + int model_h_ = 640; +}; + +} // namespace rk3588 diff --git a/plugins/CMakeLists.txt b/plugins/CMakeLists.txt index b084b4d..1871599 100644 --- a/plugins/CMakeLists.txt +++ b/plugins/CMakeLists.txt @@ -269,24 +269,6 @@ set_target_properties(ai_face_det PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} ) -# ai_face_det_zoned plugin (RKNN-based RetinaFace with distance zone detection) -add_library(ai_face_det_zoned SHARED - ai_face_det_zoned/ai_face_det_zoned_node.cpp - ${CMAKE_SOURCE_DIR}/src/utils/dma_alloc.cpp -) -target_include_directories(ai_face_det_zoned PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/third_party) -target_link_libraries(ai_face_det_zoned PRIVATE project_options Threads::Threads ai_scheduler) -if(RK3588_ENABLE_RKNN AND RK_RKNN_LIB) - target_compile_definitions(ai_face_det_zoned PRIVATE RK3588_ENABLE_RKNN) - target_include_directories(ai_face_det_zoned PRIVATE ${RKNN_RUNTIME_INCLUDE_DIR}) - target_link_libraries(ai_face_det_zoned PRIVATE ${RK_RKNN_LIB}) -endif() -set_target_properties(ai_face_det_zoned PROPERTIES - OUTPUT_NAME "ai_face_det_zoned" - LIBRARY_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} - RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} -) - # ai_scrfd plugin (SCRFD 640x640 face detection) add_library(ai_scrfd SHARED ai_scrfd/ai_scrfd_node.cpp @@ -305,6 +287,25 @@ set_target_properties(ai_scrfd PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} ) +# ai_scrfd_sliding plugin (SCRFD with sliding window detection) +add_library(ai_scrfd_sliding SHARED + ai_scrfd_sliding/ai_scrfd_sliding_node.cpp + ${CMAKE_SOURCE_DIR}/src/face/scrfd_detector.cpp + ${CMAKE_SOURCE_DIR}/src/utils/dma_alloc.cpp +) +target_include_directories(ai_scrfd_sliding PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/third_party) +target_link_libraries(ai_scrfd_sliding PRIVATE project_options Threads::Threads ai_scheduler) +if(RK3588_ENABLE_RKNN) + 
target_compile_definitions(ai_scrfd_sliding PRIVATE RK3588_ENABLE_RKNN) + target_include_directories(ai_scrfd_sliding PRIVATE ${RKNN_RUNTIME_INCLUDE_DIR}) + target_link_libraries(ai_scrfd_sliding PRIVATE ${RK_RKNN_LIB}) +endif() +set_target_properties(ai_scrfd_sliding PROPERTIES + OUTPUT_NAME "ai_scrfd_sliding" + LIBRARY_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} + RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} +) + # ai_face_recog plugin (RKNN-based ArcFace/MobileFaceNet inference) add_library(ai_face_recog SHARED ai_face_recog/ai_face_recog_node.cpp @@ -511,7 +512,7 @@ if(RK3588_ENABLE_ZLMEDIAKIT AND RK_ZLMK_API_LIB) ) endif() -install(TARGETS input_rtsp input_file publish preprocess ai_yolo ai_face_det ai_face_det_zoned ai_face_recog tracker gate osd alarm logic_gate storage ai_scheduler +install(TARGETS input_rtsp input_file publish preprocess ai_yolo ai_face_det ai_scrfd ai_scrfd_sliding ai_face_recog tracker gate osd alarm logic_gate storage ai_scheduler LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/rk3588-media-server/plugins RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}/rk3588-media-server/plugins ) diff --git a/plugins/ai_face_det_zoned/ai_face_det_zoned_node.cpp b/plugins/ai_face_det_zoned/ai_face_det_zoned_node.cpp deleted file mode 100644 index 5323be8..0000000 --- a/plugins/ai_face_det_zoned/ai_face_det_zoned_node.cpp +++ /dev/null @@ -1,502 +0,0 @@ -/** - * ai_face_det_zoned - 三分区距离感知人脸检测节点 - * - * 特性: - * 1. 接收原始分辨率输入(不经过前置缩放) - * 2. 基于距离进行ROI裁剪和三分区检测 - * 3. 近区(3-5m) 1.0x / 中区(5-7m) 1.3x / 远区(7-9m) 1.8x - * 4. 
复用 face_detection_utils.h 中的公共函数 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "face/face_detection_utils.h" -#include "hw/i_infer_backend.h" -#include "face/face_result.h" -#include "node.h" -#include "utils/dma_alloc.h" -#include "utils/logger.h" - -namespace rk3588 { - -using namespace face_detection; - -class AiFaceDetZonedNode : public INode { -public: - std::string Id() const override { return id_; } - std::string Type() const override { return "ai_face_det_zoned"; } - - bool Init(const SimpleJson& config, const NodeContext& ctx) override { - id_ = config.ValueOr("id", "face_det_zoned"); - model_path_ = config.ValueOr("model_path", - "./models/RetinaFace_mobile320.rknn"); - - // 基础检测参数 - det_cfg_.conf_thresh = config.ValueOr("conf", 0.6f); - det_cfg_.nms_thresh = config.ValueOr("nms", 0.4f); - det_cfg_.max_faces = config.ValueOr("max_faces", 10); - det_cfg_.output_landmarks = config.ValueOr("output_landmarks", true); - - // 模型输入尺寸(默认320) - model_w_ = config.ValueOr("model_w", 320); - model_h_ = config.ValueOr("model_h", 320); - - // 先验框步长和最小尺寸(RetinaFace默认) - det_cfg_.steps = {8, 16, 32}; - det_cfg_.min_sizes = {{16, 32}, {64, 128}, {256, 512}}; - - // ROI配置 - 支持格式: "roi": {"x": 0, "y": 0, "w": 1920, "h": 1080} - roi_enabled_ = false; - roi_x_ = roi_y_ = roi_w_ = roi_h_ = 0; - if (const SimpleJson* roi = config.Find("roi"); roi && roi->IsObject()) { - // 直接读取平级格式 - roi_x_ = roi->ValueOr("x", 0); - roi_y_ = roi->ValueOr("y", 0); - roi_w_ = roi->ValueOr("w", 0); - roi_h_ = roi->ValueOr("h", 0); - - // 如果w/h有效,则启用ROI - if (roi_w_ > 0 && roi_h_ > 0) { - roi_enabled_ = true; - } - // 兼容旧格式: "roi": {"crop": {...}} - else if (const SimpleJson* crop = roi->Find("crop"); crop && crop->IsObject()) { - roi_x_ = crop->ValueOr("x", 0); - roi_y_ = crop->ValueOr("y", 0); - roi_w_ = crop->ValueOr("w", 0); - roi_h_ = crop->ValueOr("h", 0); - if (roi_w_ > 0 && roi_h_ > 0) { - roi_enabled_ = true; - } - } - } - - // 
三分区配置 - 支持两种格式: - // 1. 旧格式: "distance_zones": {"enabled": true, "boundaries": [y1, y2], "scales": [s1, s2, s3]} - // 2. 新格式: "zones": {"near_zone": {"y_start": 0, "y_end": 405, "scale": 0.5}, ...} - zones_enabled_ = false; - boundary_y_5m_ = boundary_y_7m_ = 0; - scale_near_ = 1.0f; - scale_mid_ = 1.3f; - scale_far_ = 1.8f; - - // 优先尝试新格式 "zones" - if (const SimpleJson* zones = config.Find("zones"); - zones && zones->IsObject()) { - bool has_near = false, has_mid = false, has_far = false; - int near_y_end = 0, mid_y_end = 0; - - if (const SimpleJson* near = zones->Find("near_zone"); near && near->IsObject()) { - near_y_end = near->ValueOr("y_end", 0); - scale_near_ = near->ValueOr("scale", 1.0f); - has_near = true; - } - - if (const SimpleJson* mid = zones->Find("mid_zone"); mid && mid->IsObject()) { - mid_y_end = mid->ValueOr("y_end", 0); - scale_mid_ = mid->ValueOr("scale", 1.0f); - has_mid = true; - } - - if (const SimpleJson* far = zones->Find("far_zone"); far && far->IsObject()) { - scale_far_ = far->ValueOr("scale", 1.0f); - has_far = true; - } - - if (has_near && has_mid && has_far) { - zones_enabled_ = true; - boundary_y_5m_ = near_y_end; // near和mid的分界 - boundary_y_7m_ = mid_y_end; // mid和far的分界 - } - } - // 兼容旧格式 - else if (const SimpleJson* zones = config.Find("distance_zones"); - zones && zones->IsObject()) { - zones_enabled_ = zones->ValueOr("enabled", false); - - if (const SimpleJson* boundaries = zones->Find("boundaries"); - boundaries && boundaries->IsArray() && boundaries->AsArray().size() >= 2) { - boundary_y_5m_ = boundaries->AsArray()[0].AsInt(0); - boundary_y_7m_ = boundaries->AsArray()[1].AsInt(0); - } - - if (const SimpleJson* scales = zones->Find("scales"); - scales && scales->IsArray() && scales->AsArray().size() >= 3) { - scale_near_ = scales->AsArray()[0].AsNumber(1.0f); - scale_mid_ = scales->AsArray()[1].AsNumber(1.3f); - scale_far_ = scales->AsArray()[2].AsNumber(1.8f); - } - } - - input_queue_ = ctx.input_queue; - output_queues_ = 
ctx.output_queues; - if (!input_queue_) { - LogError("[ai_face_det_zoned] no input queue for node " + id_); - return false; - } - if (output_queues_.empty()) { - LogError("[ai_face_det_zoned] no output queue for node " + id_); - return false; - } - - infer_backend_ = ctx.infer_backend; - if (!infer_backend_) { - LogError("[ai_face_det_zoned] no infer backend for node " + id_); - return false; - } - -#if defined(RK3588_ENABLE_RKNN) - if (model_path_.empty()) { - LogError("[ai_face_det_zoned] model_path is required"); - return false; - } - std::string err; - model_handle_ = infer_backend_->LoadModel(model_path_, err); - if (model_handle_ == kInvalidModelHandle) { - LogError("[ai_face_det_zoned] failed to load model: " + err); - return false; - } - - // 预计算先验框 - priors_ = GeneratePriors(model_w_, model_h_, det_cfg_.steps, det_cfg_.min_sizes); - - LogInfo("[ai_face_det_zoned] model loaded: " + model_path_ + - " (" + std::to_string(model_w_) + "x" + std::to_string(model_h_) + - "), priors=" + std::to_string(priors_.size())); -#else - LogWarn("[ai_face_det_zoned] RKNN disabled, will passthrough frames"); -#endif - - return true; - } - - bool Start() override { - LogInfo("[ai_face_det_zoned] start id=" + id_ + - " zones=" + std::string(zones_enabled_ ? "enabled" : "disabled") + - " roi=" + std::string(roi_enabled_ ? 
"enabled" : "disabled") + - " roi_xywh=" + std::to_string(roi_x_) + "," + std::to_string(roi_y_) + "," + - std::to_string(roi_w_) + "," + std::to_string(roi_h_) + - " boundaries=" + std::to_string(boundary_y_5m_) + "," + std::to_string(boundary_y_7m_) + - " scales=" + std::to_string(scale_near_) + "," + std::to_string(scale_mid_) + "," + std::to_string(scale_far_)); - return true; - } - - void Stop() override { -#if defined(RK3588_ENABLE_RKNN) - if (model_handle_ != kInvalidModelHandle) { - infer_backend_->UnloadModel(model_handle_); - model_handle_ = kInvalidModelHandle; - } -#endif - LogInfo("[ai_face_det_zoned] stop id=" + id_); - } - - NodeStatus Process(FramePtr frame) override { - if (!frame) return NodeStatus::DROP; - -#if defined(RK3588_ENABLE_RKNN) - RunZonedDetection(frame); -#endif - - Push(frame); - return NodeStatus::OK; - } - -private: - void Push(FramePtr frame) { - for (auto& q : output_queues_) q->Push(frame); - } - -#if defined(RK3588_ENABLE_RKNN) - - // 将RKNN输出转换为TensorView - TensorView ConvertToTensorView(const AiScheduler::BorrowedOutput& o) { - TensorView tv; - tv.data = o.data; - tv.size = o.size; - tv.zp = o.zp; - tv.scale = o.scale; - tv.dims = o.dims; - tv.type = o.type; - return tv; - } - - void RunZonedDetection(FramePtr frame) { - if (!frame->data || frame->data_size == 0) return; - if (frame->format != PixelFormat::RGB && frame->format != PixelFormat::BGR) { - LogWarn("[ai_face_det_zoned] input must be RGB/BGR"); - return; - } - - const int src_w = frame->width; - const int src_h = frame->height; - - // 应用ROI裁剪 - int roi_x = 0, roi_y = 0, roi_w = src_w, roi_h = src_h; - if (roi_enabled_) { - roi_x = ClampInt(roi_x_, 0, src_w - 1); - roi_y = ClampInt(roi_y_, 0, src_h - 1); - roi_w = ClampInt(roi_w_, 1, src_w - roi_x); - roi_h = ClampInt(roi_h_, 1, src_h - roi_y); - } - - std::vector all_detections; - - if (zones_enabled_) { - // 三分区检测 - all_detections = DetectWithZones(frame, roi_x, roi_y, roi_w, roi_h); - } else { - // 单区检测(全ROI区域) - 
auto dets = DetectSingleZone(frame, roi_x, roi_y, roi_w, roi_h, 1.0f); - // 坐标映射回原始图像 - for (auto& det : dets) { - det.bbox.x += roi_x; - det.bbox.y += roi_y; - if (det.has_landmarks) { - for (auto& lm : det.landmarks) { - lm.x += roi_x; - lm.y += roi_y; - } - } - all_detections.push_back(det); - } - } - - // NMS去重 - all_detections = ApplyNMS(all_detections, det_cfg_.nms_thresh); - - // 限制最大人脸数 - if (all_detections.size() > static_cast(det_cfg_.max_faces)) { - all_detections.resize(det_cfg_.max_faces); - } - - // 构建结果 - FaceDetResult det_result; - det_result.img_w = src_w; - det_result.img_h = src_h; - det_result.model_name = "retinaface_zoned"; - det_result.faces = std::move(all_detections); - - frame->face_det = std::make_shared(std::move(det_result)); - } - - std::vector DetectWithZones(FramePtr frame, - int roi_x, int roi_y, - int roi_w, int roi_h) { - std::vector all_dets; - - // 将分界线坐标转换到ROI坐标系 - int by5 = ClampInt(boundary_y_5m_ - roi_y, 0, roi_h); - int by7 = ClampInt(boundary_y_7m_ - roi_y, 0, roi_h); - - // 确保顺序正确(y大=下方=近距离) - if (by5 < by7) std::swap(by5, by7); - - // 近区检测 (画面下方,y大,近距离3-5m) - if (by5 < roi_h) { - auto dets = DetectSingleZone(frame, roi_x, roi_y + by5, roi_w, roi_h - by5, scale_near_); - for (auto& det : dets) { - det.bbox.x += roi_x; - det.bbox.y += roi_y + by5; - if (det.has_landmarks) { - for (auto& lm : det.landmarks) { - lm.x += roi_x; - lm.y += roi_y + by5; - } - } - all_dets.push_back(det); - } - } - - // 中区检测 (画面中部,中距离5-7m) - if (by7 < by5) { - auto dets = DetectSingleZone(frame, roi_x, roi_y + by7, roi_w, by5 - by7, scale_mid_); - for (auto& det : dets) { - det.bbox.x += roi_x; - det.bbox.y += roi_y + by7; - if (det.has_landmarks) { - for (auto& lm : det.landmarks) { - lm.x += roi_x; - lm.y += roi_y + by7; - } - } - all_dets.push_back(det); - } - } - - // 远区检测 (画面上方,y小,远距离7-9m) - if (by7 > 0) { - auto dets = DetectSingleZone(frame, roi_x, roi_y, roi_w, by7, scale_far_); - for (auto& det : dets) { - det.bbox.x += roi_x; - 
det.bbox.y += roi_y; - if (det.has_landmarks) { - for (auto& lm : det.landmarks) { - lm.x += roi_x; - lm.y += roi_y; - } - } - all_dets.push_back(det); - } - } - - return all_dets; - } - - std::vector DetectSingleZone(FramePtr frame, - int x, int y, int w, int h, - float scale) { - std::vector dets; - - if (w <= 0 || h <= 0) return dets; - - const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data; - const int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride - : (frame->stride > 0 ? frame->stride : frame->width * 3); - - // 裁剪区域缩放后的尺寸 - int crop_w = static_cast(w * scale); - int crop_h = static_cast(h * scale); - if (crop_w <= 0 || crop_h <= 0) return dets; - - // 分配缓冲区 - input_buf_.resize(static_cast(model_w_) * model_h_ * 3); - - // 双线性缩放到模型输入尺寸 - // 注意:这里从原图裁剪(x,y,w,h),缩放到(model_w_, model_h_) - // 优化:可以直接从src裁剪并缩放,避免中间buffer - - // 简化的处理:先裁剪到临时buffer,再缩放 - std::vector crop_buf(static_cast(w) * h * 3); - for (int row = 0; row < h; ++row) { - const uint8_t* src_row = src + (y + row) * src_stride + x * 3; - uint8_t* dst_row = crop_buf.data() + row * w * 3; - memcpy(dst_row, src_row, static_cast(w) * 3); - } - - // 缩放到模型输入尺寸 - ResizeRgbBilinear(crop_buf.data(), w, h, w * 3, - input_buf_.data(), model_w_, model_h_, - false); // 假设输入已经是RGB - - // NPU推理 - InferInput input; - input.width = model_w_; - input.height = model_h_; - input.is_nhwc = true; - input.data = input_buf_.data(); - input.size = input_buf_.size(); - input.type = RKNN_TENSOR_UINT8; - - auto r = infer_backend_->InferBorrowed(model_handle_, input); - if (!r.success || r.outputs.empty()) { - LogWarn("[ai_face_det_zoned] inference failed"); - return dets; - } - - // 解析输出 - NcTensor loc_tensor, conf_tensor, landm_tensor; - bool has_loc = false, has_conf = false, has_landm = false; - - for (const auto& o : r.outputs) { - TensorView tv = ConvertToTensorView(o); - NcTensor tmp; - if (!has_loc && ExtractNcTensor(tv, 4, tmp)) { - loc_tensor = std::move(tmp); - has_loc = 
true; - } else if (!has_conf && ExtractNcTensor(tv, 2, tmp)) { - conf_tensor = std::move(tmp); - has_conf = true; - } else if (!has_landm && ExtractNcTensor(tv, 10, tmp)) { - landm_tensor = std::move(tmp); - has_landm = true; - } - } - - if (!has_loc || !has_conf) return dets; - - // 解码检测结果 - FaceDetResult result; - DecodeRetinaFace(loc_tensor, conf_tensor, landm_tensor, - priors_, w, h, model_w_, model_h_, - det_cfg_, result); - - if (!result.faces.empty()) { - LogInfo("[ai_face_det_zoned] DetectSingleZone: detected " + - std::to_string(result.faces.size()) + " faces, max_score=" + - std::to_string(result.faces.empty() ? 0 : result.faces[0].score)); - } - - return result.faces; - } - - std::vector ApplyNMS(std::vector& dets, float threshold) { - if (dets.empty()) return dets; - - // 按置信度排序 - std::sort(dets.begin(), dets.end(), - [](const FaceDetItem& a, const FaceDetItem& b) { - return a.score > b.score; - }); - - std::vector keep; - std::vector suppressed(dets.size(), false); - - for (size_t i = 0; i < dets.size(); ++i) { - if (suppressed[i]) continue; - keep.push_back(dets[i]); - - for (size_t j = i + 1; j < dets.size(); ++j) { - if (suppressed[j]) continue; - if (IoU(dets[i].bbox, dets[j].bbox) > threshold) { - suppressed[j] = true; - } - } - } - - return keep; - } - -#endif - - std::string id_; - std::string model_path_; - - DetectionConfig det_cfg_; - int model_w_ = 320; - int model_h_ = 320; - - // ROI - bool roi_enabled_ = false; - int roi_x_ = 0, roi_y_ = 0, roi_w_ = 0, roi_h_ = 0; - - // 三分区 - bool zones_enabled_ = false; - int boundary_y_5m_ = 0; - int boundary_y_7m_ = 0; - float scale_near_ = 1.0f; - float scale_mid_ = 1.3f; - float scale_far_ = 1.8f; - - std::shared_ptr> input_queue_; - std::vector>> output_queues_; - std::shared_ptr infer_backend_; - -#if defined(RK3588_ENABLE_RKNN) - ModelHandle model_handle_ = kInvalidModelHandle; - std::vector priors_; - std::vector input_buf_; -#endif -}; - -REGISTER_NODE(AiFaceDetZonedNode, "ai_face_det_zoned"); 
- -} // namespace rk3588 diff --git a/plugins/ai_scrfd_sliding/ai_scrfd_sliding_node.cpp b/plugins/ai_scrfd_sliding/ai_scrfd_sliding_node.cpp new file mode 100644 index 0000000..4351033 --- /dev/null +++ b/plugins/ai_scrfd_sliding/ai_scrfd_sliding_node.cpp @@ -0,0 +1,311 @@ +/** + * ai_scrfd_sliding - SCRFD with sliding window detection + * + * Features: + * 1. Resize input to target height (640) keeping approximate ratio + * 2. Split into multiple 640x640 windows + * 3. Detect on each window and merge results + * + * For 1080p: resize to 1280x640, 2 windows + * For 1440p: resize to 2560x640, 4 windows + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "face/face_detection_utils.h" +#include "face/face_result.h" +#include "face/scrfd_detector.h" +#include "hw/i_infer_backend.h" +#include "node.h" +#include "utils/dma_alloc.h" +#include "utils/logger.h" + +namespace rk3588 { + +using namespace face_detection; + +class AiScrfdSlidingNode : public INode { +public: + std::string Id() const override { return id_; } + std::string Type() const override { return "ai_scrfd_sliding"; } + + bool Init(const SimpleJson& config, const NodeContext& ctx) override { + id_ = config.ValueOr("id", "scrfd_sliding"); + model_path_ = config.ValueOr("model_path", + "./models/scrfd_500m_640.rknn"); + + // Detection parameters + det_cfg_.conf_thresh = config.ValueOr("conf_thresh", 0.3f); + det_cfg_.nms_thresh = config.ValueOr("nms_thresh", 0.4f); + det_cfg_.max_faces = config.ValueOr("max_faces", 50); + det_cfg_.output_landmarks = config.ValueOr("output_landmarks", true); + + model_w_ = 640; + model_h_ = 640; + + // Initialize detector + detector_.Init(model_w_, model_h_); + + // Parse sliding windows config + // If not configured, auto-calculate based on input resolution + windows_.clear(); + if (const SimpleJson* win_arr = config.Find("windows"); win_arr && win_arr->IsArray()) { + for (const auto& w : win_arr->AsArray()) { + if (w.IsObject()) { + 
Window win; + win.x = w.ValueOr("x", 0); + win.y = w.ValueOr("y", 0); + win.w = w.ValueOr("w", 640); + win.h = w.ValueOr("h", 640); + windows_.push_back(win); + } + } + } + + // Target resize height (default 640) + target_height_ = config.ValueOr("target_height", 640); + + input_queue_ = ctx.input_queue; + output_queues_ = ctx.output_queues; + if (!input_queue_) { + LogError("[ai_scrfd_sliding] no input queue"); + return false; + } + + infer_backend_ = ctx.infer_backend; + if (!infer_backend_) { + LogError("[ai_scrfd_sliding] no infer backend"); + return false; + } + +#if defined(RK3588_ENABLE_RKNN) + std::string err; + model_handle_ = infer_backend_->LoadModel(model_path_, err); + if (model_handle_ == kInvalidModelHandle) { + LogError("[ai_scrfd_sliding] failed to load model: " + err); + return false; + } + + input_buf_.resize(model_w_ * model_h_ * 3); + + LogInfo("[ai_scrfd_sliding] model loaded: " + model_path_); +#else + LogWarn("[ai_scrfd_sliding] RKNN disabled"); +#endif + + return true; + } + + bool Start() override { + LogInfo("[ai_scrfd_sliding] start, windows=" + std::to_string(windows_.size())); + return true; + } + + void Stop() override { +#if defined(RK3588_ENABLE_RKNN) + if (model_handle_ != kInvalidModelHandle) { + infer_backend_->UnloadModel(model_handle_); + model_handle_ = kInvalidModelHandle; + } +#endif + LogInfo("[ai_scrfd_sliding] stop"); + } + + NodeStatus Process(FramePtr frame) override { + if (!frame) return NodeStatus::DROP; + +#if defined(RK3588_ENABLE_RKNN) + RunDetection(frame); +#endif + + Push(frame); + return NodeStatus::OK; + } + +private: + struct Window { + int x, y, w, h; + }; + + void Push(FramePtr frame) { + for (auto& q : output_queues_) q->Push(frame); + } + +#if defined(RK3588_ENABLE_RKNN) + + void RunDetection(FramePtr frame) { + if (!frame->data || frame->data_size == 0) return; + + const int src_w = frame->width; + const int src_h = frame->height; + + if (frame->DmaFd() >= 0) frame->SyncStart(); + + // Calculate windows 
if not pre-configured + std::vector windows = windows_; + if (windows.empty()) { + windows = CalculateWindows(src_w, src_h); + } + + std::vector all_detections; + + const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data; + const int src_stride = frame->planes[0].stride > 0 ? frame->planes[0].stride + : (frame->stride > 0 ? frame->stride : frame->width * 3); + + // Process each window - crop from original, then resize to 640x640 + for (size_t i = 0; i < windows.size(); ++i) { + const auto& win = windows[i]; + auto dets = DetectWindowFromSource(src, src_w, src_h, src_stride, win); + + // Detections are already in original coordinates + all_detections.insert(all_detections.end(), dets.begin(), dets.end()); + } + + // Apply NMS + all_detections = detector_.ApplyNMS(all_detections, det_cfg_.nms_thresh); + + if (all_detections.size() > static_cast(det_cfg_.max_faces)) { + all_detections.resize(det_cfg_.max_faces); + } + + FaceDetResult result; + result.img_w = src_w; + result.img_h = src_h; + result.model_name = "scrfd_sliding"; + result.faces = std::move(all_detections); + + frame->face_det = std::make_shared(std::move(result)); + } + + std::vector CalculateWindows(int src_w, int src_h) { + std::vector windows; + + // Strategy: Split source image into overlapping 640x640 regions + // For 1080p: 1920x1080 -> 3x2 grid (6 windows) + // For 1440p: 2560x1440 -> 4x2 grid (8 windows) + + // Calculate step size (with overlap) + int step_x = (src_w <= 640) ? src_w : (src_w - 640) / ((src_w + 639) / 640 - 1); + int step_y = (src_h <= 640) ? 
src_h : (src_h - 640) / ((src_h + 639) / 640 - 1); + + if (step_x < 640) step_x = 640; + if (step_y < 640) step_y = 640; + + for (int y = 0; y < src_h; y += step_y) { + for (int x = 0; x < src_w; x += step_x) { + Window win; + win.x = x; + win.y = y; + win.w = 640; + win.h = 640; + windows.push_back(win); + + // Stop if we've covered the width + if (x + 640 >= src_w) break; + } + // Stop if we've covered the height + if (y + 640 >= src_h) break; + } + + LogInfo("[ai_scrfd_sliding] Auto-calculated: " + std::to_string(windows.size()) + " windows for " + std::to_string(src_w) + "x" + std::to_string(src_h)); + + return windows; + } + + std::vector DetectWindowFromSource(const uint8_t* src, int src_w, int src_h, int src_stride, const Window& win) { + std::vector dets; + + // Clamp window to source bounds + int win_x = std::max(0, std::min(win.x, src_w - 1)); + int win_y = std::max(0, std::min(win.y, src_h - 1)); + int win_w = std::min(win.w, src_w - win_x); + int win_h = std::min(win.h, src_h - win_y); + + if (win_w <= 0 || win_h <= 0) { + LogWarn("[ai_scrfd_sliding] Invalid window"); + return dets; + } + + // Crop from source + std::vector crop_buf(static_cast(win_w) * win_h * 3); + for (int row = 0; row < win_h; ++row) { + const uint8_t* src_row = src + (win_y + row) * src_stride + win_x * 3; + uint8_t* dst_row = crop_buf.data() + row * win_w * 3; + memcpy(dst_row, src_row, static_cast(win_w) * 3); + } + + // Resize to 640x640 + std::vector model_input(640 * 640 * 3); + ResizeRgbBilinear(crop_buf.data(), win_w, win_h, win_w * 3, + model_input.data(), 640, 640, false); + + // NPU inference + InferInput input; + input.width = 640; + input.height = 640; + input.is_nhwc = true; + input.data = model_input.data(); + input.size = model_input.size(); + input.type = RKNN_TENSOR_UINT8; + + auto r = infer_backend_->InferBorrowed(model_handle_, input); + if (!r.success || r.outputs.empty()) { + LogWarn("[ai_scrfd_sliding] inference failed"); + return dets; + } + + // Decode (get 
detections in 640x640 coordinates) + dets = detector_.Decode(r.outputs, 640, 640, det_cfg_); + + // Map back to original coordinates + float scale_x = static_cast(win_w) / 640.0f; + float scale_y = static_cast(win_h) / 640.0f; + + for (auto& det : dets) { + det.bbox.x = win_x + det.bbox.x * scale_x; + det.bbox.y = win_y + det.bbox.y * scale_y; + det.bbox.w *= scale_x; + det.bbox.h *= scale_y; + if (det.has_landmarks) { + for (auto& lm : det.landmarks) { + lm.x = win_x + lm.x * scale_x; + lm.y = win_y + lm.y * scale_y; + } + } + } + + return dets; + } + +#endif + + std::string id_; + std::string model_path_; + ScrfdConfig det_cfg_; + ScrfdDetector detector_; + int model_w_ = 640; + int model_h_ = 640; + int target_height_ = 640; + + std::vector windows_; + + std::shared_ptr> input_queue_; + std::vector>> output_queues_; + std::shared_ptr infer_backend_; + +#if defined(RK3588_ENABLE_RKNN) + ModelHandle model_handle_ = kInvalidModelHandle; + std::vector input_buf_; +#endif +}; + +REGISTER_NODE(AiScrfdSlidingNode, "ai_scrfd_sliding"); + +} // namespace rk3588 diff --git a/plugins/alarm/alarm_node.cpp b/plugins/alarm/alarm_node.cpp index b3b8306..a655181 100644 --- a/plugins/alarm/alarm_node.cpp +++ b/plugins/alarm/alarm_node.cpp @@ -428,10 +428,7 @@ public: for (const auto& d : frame->det->items) { if (d.cls_id == 10) no_boots_count++; } - if (no_boots_count > 0 || processed_frames_ % 30 == 0) { - LogInfo("[alarm] frame received, dets=" + std::to_string(frame->det->items.size()) + - " no_boots=" + std::to_string(no_boots_count)); - } + // Log throttled } if (eval_interval_ms_ > 0 && frame->pts > 0) { diff --git a/plugins/logic_gate/logic_gate_node.cpp b/plugins/logic_gate/logic_gate_node.cpp index 08daa54..4251091 100644 --- a/plugins/logic_gate/logic_gate_node.cpp +++ b/plugins/logic_gate/logic_gate_node.cpp @@ -128,6 +128,30 @@ private: } } + // 将检测坐标(相对于原始图像)映射到当前帧坐标 + Rect MapDetCoordToFrame(const Rect& det_bbox, FramePtr frame) { + if (!frame->transform_meta || 
!frame->transform_meta->valid) { + return det_bbox; // 无变换信息,直接使用 + } + + const auto& meta = *frame->transform_meta; + if (meta.src_w <= 0 || meta.src_h <= 0 || frame->width <= 0 || frame->height <= 0) { + return det_bbox; + } + + // 计算缩放因子:检测坐标是基于 src_w x src_h 的 + float scale_x = static_cast(frame->width) / meta.src_w; + float scale_y = static_cast(frame->height) / meta.src_h; + + Rect mapped; + mapped.x = det_bbox.x * scale_x; + mapped.y = det_bbox.y * scale_y; + mapped.w = det_bbox.w * scale_x; + mapped.h = det_bbox.h * scale_y; + + return mapped; + } + void ProcessPpeBootsCheck(FramePtr frame) { const auto& detections = frame->det->items; @@ -145,7 +169,12 @@ private: if (config_.debug) { LogInfo("[LogicGateNode] Persons=" + std::to_string(persons.size()) + - " Boots=" + std::to_string(boots.size())); + " Boots=" + std::to_string(boots.size()) + + " Frame=" + std::to_string(frame->width) + "x" + std::to_string(frame->height)); + if (frame->transform_meta && frame->transform_meta->valid) { + LogInfo("[LogicGateNode] TransformMeta: src=" + std::to_string(frame->transform_meta->src_w) + + "x" + std::to_string(frame->transform_meta->src_h)); + } } // 简化逻辑:必须同时检测到人和鞋,才开始判断 @@ -158,7 +187,21 @@ private: // 对每只鞋进行颜色检查 for (const auto& boot : boots) { if (config_.enable_color_check && color_analyzer_) { - auto color_result = color_analyzer_->Analyze(*frame, boot.bbox); + // 将检测坐标映射到当前帧坐标 + Rect mapped_bbox = MapDetCoordToFrame(boot.bbox, frame); + + if (config_.debug) { + LogInfo("[LogicGateNode] Boot bbox: [" + std::to_string(static_cast(boot.bbox.x)) + + "," + std::to_string(static_cast(boot.bbox.y)) + + " " + std::to_string(static_cast(boot.bbox.w)) + + "x" + std::to_string(static_cast(boot.bbox.h)) + + "] -> Mapped: [" + std::to_string(static_cast(mapped_bbox.x)) + + "," + std::to_string(static_cast(mapped_bbox.y)) + + " " + std::to_string(static_cast(mapped_bbox.w)) + + "x" + std::to_string(static_cast(mapped_bbox.h)) + "]"); + } + + auto color_result = 
color_analyzer_->Analyze(*frame, mapped_bbox); if (config_.debug) { LogInfo("[LogicGateNode] Boot brightness=" + diff --git a/src/face/scrfd_detector.cpp b/src/face/scrfd_detector.cpp new file mode 100644 index 0000000..f3430a7 --- /dev/null +++ b/src/face/scrfd_detector.cpp @@ -0,0 +1,157 @@ +/** + * SCRFD Detector Implementation + */ + +#include "face/scrfd_detector.h" +#include "ai_scheduler.h" // For BorrowedOutput +#include "face/face_detection_utils.h" +#include +#include + +namespace rk3588 { + +ScrfdDetector::ScrfdDetector() = default; +ScrfdDetector::~ScrfdDetector() = default; + +void ScrfdDetector::Init(int model_w, int model_h) { + model_w_ = model_w; + model_h_ = model_h; + + // Generate center points + const int strides[] = {8, 16, 32}; + + for (int stride : strides) { + int num_grid = model_w_ / stride; + for (int y = 0; y < num_grid; ++y) { + for (int x = 0; x < num_grid; ++x) { + // 2 anchors per location + for (int a = 0; a < 2; ++a) { + CenterPoint pt; + pt.cx = static_cast(x); + pt.cy = static_cast(y); + pt.stride = static_cast(stride); + center_points_.push_back(pt); + } + } + } + } +} + +std::vector ScrfdDetector::Decode( + const std::vector& outputs, + int src_w, int src_h, + const ScrfdConfig& cfg) { + + std::vector detections; + + if (outputs.size() != 9) return detections; + + // Output order: score_8, score_16, score_32, bbox_8, bbox_16, bbox_32, kps_8, kps_16, kps_32 + const int anchor_counts[] = {12800, 3200, 800}; + const int strides[] = {8, 16, 32}; + + size_t anchor_idx = 0; + + for (int s = 0; s < 3; ++s) { + int stride = strides[s]; + int count = anchor_counts[s]; + + const auto& score_out = outputs[s]; + const auto& bbox_out = outputs[s + 3]; + const auto& kps_out = outputs[s + 6]; + + if (score_out.dims.size() < 3) continue; + + const float* scores = reinterpret_cast(score_out.data); + const float* bboxes = reinterpret_cast(bbox_out.data); + const float* kps = reinterpret_cast(kps_out.data); + + if (!scores || !bboxes || !kps) 
continue; + + for (int i = 0; i < count; ++i) { + if (anchor_idx >= center_points_.size()) break; + + float score = scores[i]; + if (score < cfg.conf_thresh) { + anchor_idx++; + continue; + } + + const CenterPoint& pt = center_points_[anchor_idx]; + + // BBox: [left, top, right, bottom] - distances from center + float left = bboxes[i * 4 + 0]; + float top = bboxes[i * 4 + 1]; + float right = bboxes[i * 4 + 2]; + float bottom = bboxes[i * 4 + 3]; + + // Decode to image coordinates (640x640) + float x1_640 = (pt.cx - left) * stride; + float y1_640 = (pt.cy - top) * stride; + float x2_640 = (pt.cx + right) * stride; + float y2_640 = (pt.cy + bottom) * stride; + + // Scale to original image size + float scale_x = static_cast(src_w) / model_w_; + float scale_y = static_cast(src_h) / model_h_; + + FaceDetItem det; + det.bbox.x = x1_640 * scale_x; + det.bbox.y = y1_640 * scale_y; + det.bbox.w = (x2_640 - x1_640) * scale_x; + det.bbox.h = (y2_640 - y1_640) * scale_y; + det.score = score; + det.has_landmarks = cfg.output_landmarks; + + // Keypoints + if (cfg.output_landmarks) { + for (int p = 0; p < 5; ++p) { + float kps_x = kps[i * 10 + p * 2 + 0]; + float kps_y = kps[i * 10 + p * 2 + 1]; + float kx_640 = (pt.cx + kps_x) * stride; + float ky_640 = (pt.cy + kps_y) * stride; + det.landmarks[p].x = kx_640 * scale_x; + det.landmarks[p].y = ky_640 * scale_y; + } + } + + detections.push_back(det); + anchor_idx++; + } + } + + return detections; +} + +std::vector ScrfdDetector::ApplyNMS( + std::vector& dets, + float nms_thresh) { + + if (dets.empty()) return dets; + + // Sort by score + std::sort(dets.begin(), dets.end(), + [](const FaceDetItem& a, const FaceDetItem& b) { + return a.score > b.score; + }); + + std::vector keep; + std::vector suppressed(dets.size(), false); + + for (size_t i = 0; i < dets.size(); ++i) { + if (suppressed[i]) continue; + + keep.push_back(dets[i]); + + for (size_t j = i + 1; j < dets.size(); ++j) { + if (suppressed[j]) continue; + if 
(face_detection::IoU(dets[i].bbox, dets[j].bbox) > nms_thresh) { + suppressed[j] = true; + } + } + } + + return keep; +} + +} // namespace rk3588 diff --git a/web/hls_player.html b/web/hls_player.html index 11627e5..0d7e01b 100644 --- a/web/hls_player.html +++ b/web/hls_player.html @@ -58,7 +58,6 @@