diff --git a/Readme.md b/Readme.md
index 25a3455..1460e36 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1266,3 +1266,40 @@ rsync -avz build-cross/ user@board_ip:/opt/media-server/
 - [docs/models.md](docs/models.md) - 模型转换与部署指南
 - [docs/Agent_API_Extensions.md](docs/Agent_API_Extensions.md) - Agent API 扩展说明
 - [docs/API_Device_RemoteMgmt_InterfaceTable.md](docs/API_Device_RemoteMgmt_InterfaceTable.md) - 远程管理接口
+---
+
+## Pose Behavior Pipeline
+
+The pose behavior pipeline is optional and does not replace the existing detection pipeline.
+
+Baseline behavior graph:
+
+```text
+input_rtsp -> preprocess -> ai_yolo -> tracker -> region_event -> action_recog -> event_fusion
+```
+
+Pose-enabled behavior graph:
+
+```text
+input_rtsp -> preprocess -> ai_yolo -> tracker -> ai_pose -> pose_assoc -> region_event -> action_recog -> event_fusion
+```
+
+Engineering rules:
+
+- `tracker` remains the source of stable `track_id` for the main pipeline.
+- `ai_pose` only produces `Frame.pose` and does not replace `Frame.det`.
+- `pose_assoc` is the canonical node for assigning `PoseItem.track_id`.
+- `action_recog` may use pose-enhanced rules, but it must still run when pose nodes are absent.
+
+Degradation semantics:
+
+- If `ai_pose` is not present, the graph keeps the original bbox-based behavior logic.
+- If `ai_pose` is present but inference fails on some frames, `Frame.pose` may be empty and downstream nodes must continue with bbox data.
+- If `pose_assoc` is absent, downstream nodes should treat `pose.track_id` as optional and may fall back to bbox matching only as a compatibility path.
+
+Sample configs:
+
+- `configs/sample_region_behavior_intrusion.json`
+  - Baseline intrusion-only graph without pose nodes. This is the reference for the non-pose deployment path.
+- `configs/sample_region_behavior_full.json`
+  - Full behavior graph with `ai_pose` and `pose_assoc`, intended for pose-aware `fall` and `fight`.
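The only wiring difference between the two graphs is the set of edges that route tracked frames through the pose nodes. The excerpt below is an illustrative sketch of that edge list, using the node ids that appear in `configs/sample_region_behavior_full.json` in this patch (`person_det`, `trk`, `pose`, `pose_assoc`, `region_evt`, `action_evt`, `fusion`); the surrounding graph schema is assumed to match that sample.

```json
[
  ["person_det", "trk"],
  ["trk", "pose"],
  ["pose", "pose_assoc"],
  ["pose_assoc", "region_evt"],
  ["region_evt", "action_evt"],
  ["action_evt", "fusion"]
]
```

In the baseline graph the three pose edges collapse back to the single `["trk", "region_evt"]` edge; nothing else in the graph changes, which is what keeps the non-pose deployment path intact.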
diff --git a/configs/sample_region_behavior_full.json b/configs/sample_region_behavior_full.json
index 55c7964..091bd97 100644
--- a/configs/sample_region_behavior_full.json
+++ b/configs/sample_region_behavior_full.json
@@ -60,6 +60,24 @@
       "max_age_ms": 1200,
       "min_hits": 2
     },
+    {
+      "id": "pose",
+      "type": "ai_pose",
+      "role": "filter",
+      "enable": true,
+      "model_path": "./models/yolov8n-pose.rknn",
+      "model_input_w": 640,
+      "model_input_h": 640,
+      "conf_thresh": 0.25,
+      "nms_thresh": 0.45
+    },
+    {
+      "id": "pose_assoc",
+      "type": "pose_assoc",
+      "role": "filter",
+      "enable": true,
+      "min_iou": 0.1
+    },
     {
       "id": "region_evt",
       "type": "region_event",
@@ -90,16 +108,38 @@
         {
           "type": "fall",
           "window_ms": 1500,
-          "min_drop_pixels": 120,
-          "min_aspect_ratio_delta": 0.35,
-          "activate_duration_ms": 300
+          "activate_duration_ms": 300,
+          "bbox": {
+            "enabled": true,
+            "min_drop_pixels": 120,
+            "min_aspect_ratio_delta": 0.35
+          },
+          "pose": {
+            "enabled": true,
+            "min_torso_drop_pixels": 120,
+            "max_upright_ratio": 0.60
+          },
+          "fusion": {
+            "match_mode": "any"
+          }
         },
         {
           "type": "fight",
           "window_ms": 1200,
-          "proximity_pixels": 220,
-          "min_motion_pixels": 90,
-          "activate_duration_ms": 200
+          "activate_duration_ms": 200,
+          "bbox": {
+            "enabled": true,
+            "proximity_pixels": 220,
+            "min_motion_pixels": 90
+          },
+          "pose": {
+            "enabled": true,
+            "min_wrist_motion_pixels": 120,
+            "max_wrist_distance_pixels": 120
+          },
+          "fusion": {
+            "match_mode": "any"
+          }
         }
       ]
     },
@@ -190,7 +230,9 @@
     ["in", "pre"],
     ["pre", "person_det"],
     ["person_det", "trk"],
-    ["trk", "region_evt"],
+    ["trk", "pose"],
+    ["pose", "pose_assoc"],
+    ["pose_assoc", "region_evt"],
     ["region_evt", "action_evt"],
     ["action_evt", "fusion"],
     ["fusion", "osd"],
diff --git a/docs/config_guide.md b/docs/config_guide.md
index 7a84dd0..3d3ba5c 100644
--- a/docs/config_guide.md
+++ b/docs/config_guide.md
@@ -554,3 +554,117 @@ input_rtsp
 **版本**:v2.0
 **更新日期**:2026-03-15
 
+## Pose-Enabled Behavior Configuration
+
+This repository now supports two behavior deployment modes.
+
+### Mode 1: Baseline Behavior Graph
+
+Use this when you only need tracked detections and rule-based region behavior:
+
+```text
+input_rtsp -> preprocess -> ai_yolo -> tracker -> region_event -> action_recog -> event_fusion
+```
+
+Reference config:
+
+- `configs/sample_region_behavior_intrusion.json`
+
+Characteristics:
+
+- No pose model dependency
+- Lowest compute cost
+- `fall` and `fight` rely on bbox-only rules
+
+### Mode 2: Pose-Enabled Behavior Graph
+
+Use this when you need pose-aware `fall` and `fight`:
+
+```text
+input_rtsp -> preprocess -> ai_yolo -> tracker -> ai_pose -> pose_assoc -> region_event -> action_recog -> event_fusion
+```
+
+Reference config:
+
+- `configs/sample_region_behavior_full.json`
+
+Characteristics:
+
+- `ai_pose` writes `Frame.pose`
+- `pose_assoc` assigns `PoseItem.track_id`
+- `action_recog` can fuse bbox and pose signals per event rule
+
+### Optional Integration Rules
+
+- `ai_pose` is optional. Do not add it to graphs that only need intrusion or climb.
+- `pose_assoc` should be placed after `ai_pose` and after tracked detections are already available on the frame.
+- `action_recog` must remain runnable when `Frame.pose` is missing or empty.
+
+### Degradation Semantics
+
+- Without `ai_pose`, the graph must still behave correctly with bbox-only behavior logic.
+- With `ai_pose` but without `pose_assoc`, `pose.track_id` is not guaranteed and downstream logic should treat it as optional compatibility mode.
+- If pose inference returns no items for a frame, the rest of the graph must keep using `Frame.det` and `track_id`.
+
+### Structured `action_recog` Rules
+
+`action_recog` supports a structured event format with `bbox`, `pose`, and `fusion` sections.
+
+Example:
+
+```json
+{
+  "id": "action_evt",
+  "type": "action_recog",
+  "events": [
+    {
+      "type": "fall",
+      "window_ms": 1500,
+      "activate_duration_ms": 300,
+      "bbox": {
+        "enabled": true,
+        "min_drop_pixels": 120,
+        "min_aspect_ratio_delta": 0.35
+      },
+      "pose": {
+        "enabled": true,
+        "min_torso_drop_pixels": 120,
+        "max_upright_ratio": 0.60
+      },
+      "fusion": {
+        "match_mode": "any"
+      }
+    },
+    {
+      "type": "fight",
+      "window_ms": 1200,
+      "activate_duration_ms": 200,
+      "bbox": {
+        "enabled": true,
+        "proximity_pixels": 220,
+        "min_motion_pixels": 90
+      },
+      "pose": {
+        "enabled": true,
+        "min_wrist_motion_pixels": 120,
+        "max_wrist_distance_pixels": 120
+      },
+      "fusion": {
+        "match_mode": "any"
+      }
+    }
+  ]
+}
+```
+
+Rules:
+
+- `bbox.enabled=false` disables bbox signal evaluation for that event.
+- `pose.enabled=false` disables pose signal evaluation for that event.
+- `fusion.match_mode="any"` means either bbox or pose may activate the event.
+- `fusion.match_mode="all"` means bbox and pose must both activate the event.
+
+Backward compatibility:
+
+- Old flat keys such as `min_drop_pixels` and `pose_min_torso_drop_pixels` are still accepted.
+- New configs should prefer the structured form above.
diff --git a/include/frame/frame.h b/include/frame/frame.h
index fbafd70..7fc9c61 100644
--- a/include/frame/frame.h
+++ b/include/frame/frame.h
@@ -16,6 +16,7 @@ namespace rk3588 {
 
 struct FaceDetResult;
 struct FaceRecogResult;
+struct PoseResult;
 
 enum class PixelFormat {
   NV12,
@@ -82,6 +83,7 @@ struct Frame {
   std::shared_ptr det;
   std::shared_ptr behavior_events;
+  std::shared_ptr<PoseResult> pose;
 
   // Face recognition pipeline meta (kept separate from user_meta to avoid conflicts with publish).
std::shared_ptr face_det; std::shared_ptr face_recog; diff --git a/include/pose/pose_result.h b/include/pose/pose_result.h new file mode 100644 index 0000000..7b5106d --- /dev/null +++ b/include/pose/pose_result.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include + +#include "frame/rect.h" + +namespace rk3588 { + +struct PosePoint2f { + float x = 0.0f; + float y = 0.0f; +}; + +struct PoseKeypoint { + PosePoint2f point{}; + float score = 0.0f; +}; + +struct PoseItem { + Rect bbox{}; + float score = 0.0f; + int track_id = -1; + std::vector keypoints; +}; + +struct PoseResult { + std::vector items; + int img_w = 0; + int img_h = 0; + std::string model_name; +}; + +} // namespace rk3588 diff --git a/plugins/CMakeLists.txt b/plugins/CMakeLists.txt index 7ad7c5b..973ab7e 100644 --- a/plugins/CMakeLists.txt +++ b/plugins/CMakeLists.txt @@ -482,6 +482,30 @@ set_target_properties(action_recog PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} ) +# ai_pose plugin (pose inference scaffold) +add_library(ai_pose SHARED + ai_pose/ai_pose_node.cpp +) +target_include_directories(ai_pose PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/third_party) +target_link_libraries(ai_pose PRIVATE project_options Threads::Threads) +set_target_properties(ai_pose PROPERTIES + OUTPUT_NAME "ai_pose" + LIBRARY_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} + RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} +) + +# pose_assoc plugin (assign pose.track_id from tracked detections) +add_library(pose_assoc SHARED + pose_assoc/pose_assoc_node.cpp +) +target_include_directories(pose_assoc PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/third_party) +target_link_libraries(pose_assoc PRIVATE project_options Threads::Threads) +set_target_properties(pose_assoc PROPERTIES + OUTPUT_NAME "pose_assoc" + LIBRARY_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} + RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} +) + # event_fusion plugin (behavior event id and lifecycle normalization) add_library(event_fusion SHARED event_fusion/event_fusion_node.cpp @@ -565,7 +589,7 @@ set_target_properties(ai_shoe_det PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${RK_PLUGIN_OUTPUT_DIR} ) -install(TARGETS input_rtsp input_file publish preprocess ai_yolo ai_face_det ai_scrfd ai_scrfd_sliding ai_face_recog tracker gate osd alarm logic_gate region_event action_recog event_fusion storage ai_scheduler ai_shoe_det +install(TARGETS input_rtsp input_file publish preprocess ai_yolo ai_face_det ai_scrfd ai_scrfd_sliding ai_face_recog tracker gate osd alarm logic_gate region_event action_recog ai_pose pose_assoc event_fusion storage ai_scheduler ai_shoe_det LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/rk3588-media-server/plugins RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}/rk3588-media-server/plugins ) diff --git a/plugins/action_recog/action_recog_node.cpp b/plugins/action_recog/action_recog_node.cpp index 3ab69df..16d65fe 100644 --- a/plugins/action_recog/action_recog_node.cpp +++ b/plugins/action_recog/action_recog_node.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -12,6 +13,7 @@ #include #include "behavior/behavior_event.h" +#include "pose/pose_result.h" #include "utils/logger.h" namespace rk3588 { @@ -22,21 +24,135 @@ enum class ActionEventKind { Fight }; +enum class ActionSignalMode { + Any, + All +}; + +struct FallBboxRule { + bool enabled = true; + float min_drop_pixels = 0.0f; + float min_aspect_ratio_delta = 0.0f; +}; + +struct FallPoseRule { + bool enabled = false; + float min_torso_drop_pixels = 0.0f; + float 
max_upright_ratio = 0.0f; +}; + +struct FightBboxRule { + bool enabled = true; + float proximity_pixels = 0.0f; + float min_motion_pixels = 0.0f; +}; + +struct FightPoseRule { + bool enabled = false; + float min_wrist_motion_pixels = 0.0f; + float max_wrist_distance_pixels = 0.0f; +}; + struct ActionEventRule { ActionEventKind kind = ActionEventKind::Fall; uint64_t window_ms = 1000; uint64_t activate_duration_ms = 0; - float min_drop_pixels = 0.0f; - float min_aspect_ratio_delta = 0.0f; - float proximity_pixels = 0.0f; - float min_motion_pixels = 0.0f; + ActionSignalMode signal_mode = ActionSignalMode::Any; + FallBboxRule fall_bbox{}; + FallPoseRule fall_pose{}; + FightBboxRule fight_bbox{}; + FightPoseRule fight_pose{}; +}; + +struct PoseTrackSample { + bool matched = false; + float torso_center_y = 0.0f; + float upright_ratio = 0.0f; + float left_wrist_x = 0.0f; + float left_wrist_y = 0.0f; + float right_wrist_x = 0.0f; + float right_wrist_y = 0.0f; + bool has_left_wrist = false; + bool has_right_wrist = false; }; struct TrackSample { uint64_t pts = 0; Rect bbox{}; + PoseTrackSample pose{}; }; +static ActionSignalMode ParseSignalMode(const std::string& value) { + if (value == "all") return ActionSignalMode::All; + return ActionSignalMode::Any; +} + +static bool EvaluateSignalMode(ActionSignalMode mode, bool bbox_trigger, bool pose_trigger, + bool bbox_enabled, bool pose_enabled) { + if (bbox_enabled && pose_enabled) { + return mode == ActionSignalMode::All ? (bbox_trigger && pose_trigger) : (bbox_trigger || pose_trigger); + } + if (bbox_enabled) return bbox_trigger; + if (pose_enabled) return pose_trigger; + return false; +} + +static bool ParseFallRule(const SimpleJson& ev, ActionEventRule& rule) { + rule.kind = ActionEventKind::Fall; + rule.fall_bbox.min_drop_pixels = ev.ValueOr("min_drop_pixels", 0.0f); + rule.fall_bbox.min_aspect_ratio_delta = ev.ValueOr("min_aspect_ratio_delta", 0.0f); + rule.fall_pose.min_torso_drop_pixels = ev.ValueOr("pose_min_torso_drop_pixels", 0.0f); + rule.fall_pose.max_upright_ratio = ev.ValueOr("pose_max_upright_ratio", 0.0f); + + if (const SimpleJson* bbox = ev.Find("bbox"); bbox && bbox->IsObject()) { + rule.fall_bbox.enabled = bbox->ValueOr("enabled", true); + rule.fall_bbox.min_drop_pixels = bbox->ValueOr("min_drop_pixels", rule.fall_bbox.min_drop_pixels); + rule.fall_bbox.min_aspect_ratio_delta = + bbox->ValueOr("min_aspect_ratio_delta", rule.fall_bbox.min_aspect_ratio_delta); + } else { + rule.fall_bbox.enabled = true; + } + + if (const SimpleJson* pose = ev.Find("pose"); pose && pose->IsObject()) { + rule.fall_pose.enabled = pose->ValueOr("enabled", true); + rule.fall_pose.min_torso_drop_pixels = + pose->ValueOr("min_torso_drop_pixels", rule.fall_pose.min_torso_drop_pixels); + rule.fall_pose.max_upright_ratio = + pose->ValueOr("max_upright_ratio", rule.fall_pose.max_upright_ratio); + } else { + rule.fall_pose.enabled = rule.fall_pose.min_torso_drop_pixels > 0.0f || rule.fall_pose.max_upright_ratio > 0.0f; + } + return true; +} + +static bool ParseFightRule(const SimpleJson& ev, ActionEventRule& rule) { + rule.kind = ActionEventKind::Fight; + rule.fight_bbox.proximity_pixels = ev.ValueOr("proximity_pixels", 0.0f); + rule.fight_bbox.min_motion_pixels = ev.ValueOr("min_motion_pixels", 0.0f); + rule.fight_pose.min_wrist_motion_pixels = ev.ValueOr("pose_min_wrist_motion_pixels", 0.0f); + rule.fight_pose.max_wrist_distance_pixels = ev.ValueOr("pose_max_wrist_distance_pixels", 0.0f); + + if (const SimpleJson* bbox = ev.Find("bbox"); bbox && 
bbox->IsObject()) { + rule.fight_bbox.enabled = bbox->ValueOr("enabled", true); + rule.fight_bbox.proximity_pixels = bbox->ValueOr("proximity_pixels", rule.fight_bbox.proximity_pixels); + rule.fight_bbox.min_motion_pixels = bbox->ValueOr("min_motion_pixels", rule.fight_bbox.min_motion_pixels); + } else { + rule.fight_bbox.enabled = true; + } + + if (const SimpleJson* pose = ev.Find("pose"); pose && pose->IsObject()) { + rule.fight_pose.enabled = pose->ValueOr("enabled", true); + rule.fight_pose.min_wrist_motion_pixels = + pose->ValueOr("min_wrist_motion_pixels", rule.fight_pose.min_wrist_motion_pixels); + rule.fight_pose.max_wrist_distance_pixels = + pose->ValueOr("max_wrist_distance_pixels", rule.fight_pose.max_wrist_distance_pixels); + } else { + rule.fight_pose.enabled = + rule.fight_pose.min_wrist_motion_pixels > 0.0f || rule.fight_pose.max_wrist_distance_pixels > 0.0f; + } + return true; +} + static bool ParseRules(const SimpleJson& config, std::vector& out, std::string& err) { const SimpleJson* events = config.Find("events"); if (!events || !events->IsArray()) { @@ -55,15 +171,15 @@ static bool ParseRules(const SimpleJson& config, std::vector& o const std::string type = ev.ValueOr("type", ""); rule.window_ms = static_cast(std::max(0, ev.ValueOr("window_ms", 1000))); rule.activate_duration_ms = static_cast(std::max(0, ev.ValueOr("activate_duration_ms", 0))); + rule.signal_mode = ParseSignalMode(ev.ValueOr("signal_mode", "any")); + if (const SimpleJson* fusion = ev.Find("fusion"); fusion && fusion->IsObject()) { + rule.signal_mode = ParseSignalMode(fusion->ValueOr("match_mode", "any")); + } if (type == "fall") { - rule.kind = ActionEventKind::Fall; - rule.min_drop_pixels = ev.ValueOr("min_drop_pixels", 0.0f); - rule.min_aspect_ratio_delta = ev.ValueOr("min_aspect_ratio_delta", 0.0f); + ParseFallRule(ev, rule); } else if (type == "fight") { - rule.kind = ActionEventKind::Fight; - rule.proximity_pixels = ev.ValueOr("proximity_pixels", 0.0f); - rule.min_motion_pixels = ev.ValueOr("min_motion_pixels", 0.0f); + ParseFightRule(ev, rule); } else { err = "unsupported event type: " + type; return false; @@ -98,6 +214,129 @@ static float AspectRatio(const Rect& rect) { return rect.h > 0.0f ? (rect.w / rect.h) : 0.0f; } +static float BboxIoU(const Rect& lhs, const Rect& rhs) { + const float x1 = std::max(lhs.x, rhs.x); + const float y1 = std::max(lhs.y, rhs.y); + const float x2 = std::min(lhs.x + lhs.w, rhs.x + rhs.w); + const float y2 = std::min(lhs.y + lhs.h, rhs.y + rhs.h); + const float iw = std::max(0.0f, x2 - x1); + const float ih = std::max(0.0f, y2 - y1); + const float inter = iw * ih; + const float uni = lhs.w * lhs.h + rhs.w * rhs.h - inter; + return uni <= 0.0f ? 
0.0f : (inter / uni); +} + +static bool IsValidKeypoint(const PoseItem& pose, int index) { + return index >= 0 && + static_cast(index) < pose.keypoints.size() && + pose.keypoints[static_cast(index)].score > 0.0f; +} + +static PoseTrackSample BuildPoseTrackSample(const PoseItem& pose) { + PoseTrackSample sample; + sample.matched = true; + + const bool has_left_shoulder = IsValidKeypoint(pose, 5); + const bool has_right_shoulder = IsValidKeypoint(pose, 6); + const bool has_left_hip = IsValidKeypoint(pose, 11); + const bool has_right_hip = IsValidKeypoint(pose, 12); + if (has_left_shoulder && has_right_shoulder && has_left_hip && has_right_hip) { + const PosePoint2f& ls = pose.keypoints[5].point; + const PosePoint2f& rs = pose.keypoints[6].point; + const PosePoint2f& lh = pose.keypoints[11].point; + const PosePoint2f& rh = pose.keypoints[12].point; + const float shoulder_center_y = (ls.y + rs.y) * 0.5f; + const float hip_center_y = (lh.y + rh.y) * 0.5f; + sample.torso_center_y = (shoulder_center_y + hip_center_y) * 0.5f; + const float min_x = std::min(std::min(ls.x, rs.x), std::min(lh.x, rh.x)); + const float max_x = std::max(std::max(ls.x, rs.x), std::max(lh.x, rh.x)); + const float min_y = std::min(std::min(ls.y, rs.y), std::min(lh.y, rh.y)); + const float max_y = std::max(std::max(ls.y, rs.y), std::max(lh.y, rh.y)); + const float horizontal_span = std::fabs(max_x - min_x); + const float vertical_span = std::fabs(max_y - min_y); + sample.upright_ratio = vertical_span > 1e-3f ? (horizontal_span / vertical_span) : std::numeric_limits::infinity(); + } + + if (IsValidKeypoint(pose, 9)) { + sample.left_wrist_x = pose.keypoints[9].point.x; + sample.left_wrist_y = pose.keypoints[9].point.y; + sample.has_left_wrist = true; + } + if (IsValidKeypoint(pose, 10)) { + sample.right_wrist_x = pose.keypoints[10].point.x; + sample.right_wrist_y = pose.keypoints[10].point.y; + sample.has_right_wrist = true; + } + return sample; +} + +static PoseTrackSample MatchPoseToDetection(const Rect& bbox, const std::shared_ptr& pose_result) { + if (!pose_result) return {}; + + float best_iou = 0.0f; + const PoseItem* best_pose = nullptr; + for (const auto& pose : pose_result->items) { + const float iou = BboxIoU(bbox, pose.bbox); + if (iou > best_iou) { + best_iou = iou; + best_pose = &pose; + } + } + if (!best_pose || best_iou <= 0.0f) return {}; + return BuildPoseTrackSample(*best_pose); +} + +static PoseTrackSample MatchPoseToTrackId(int track_id, const Rect& bbox, const std::shared_ptr& pose_result) { + if (!pose_result) return {}; + + const PoseItem* best_pose = nullptr; + float best_iou = 0.0f; + for (const auto& pose : pose_result->items) { + if (pose.track_id != track_id) continue; + const float iou = BboxIoU(bbox, pose.bbox); + if (iou >= best_iou) { + best_iou = iou; + best_pose = &pose; + } + } + if (best_pose) return BuildPoseTrackSample(*best_pose); + return MatchPoseToDetection(bbox, pose_result); +} + +static float WristMotion(const PoseTrackSample& newer, const PoseTrackSample& older) { + float motion = 0.0f; + if (newer.has_left_wrist && older.has_left_wrist) { + const float dx = newer.left_wrist_x - older.left_wrist_x; + const float dy = newer.left_wrist_y - older.left_wrist_y; + motion += std::sqrt(dx * dx + dy * dy); + } + if (newer.has_right_wrist && older.has_right_wrist) { + const float dx = newer.right_wrist_x - older.right_wrist_x; + const float dy = newer.right_wrist_y - older.right_wrist_y; + motion += std::sqrt(dx * dx + dy * dy); + } + return motion; +} + +static float 
MinimumWristDistance(const PoseTrackSample& lhs, const PoseTrackSample& rhs) { + float best = std::numeric_limits::infinity(); + auto update = [&](bool has_a, float ax, float ay, bool has_b, float bx, float by) { + if (!has_a || !has_b) return; + const float dx = ax - bx; + const float dy = ay - by; + best = std::min(best, std::sqrt(dx * dx + dy * dy)); + }; + update(lhs.has_left_wrist, lhs.left_wrist_x, lhs.left_wrist_y, + rhs.has_left_wrist, rhs.left_wrist_x, rhs.left_wrist_y); + update(lhs.has_left_wrist, lhs.left_wrist_x, lhs.left_wrist_y, + rhs.has_right_wrist, rhs.right_wrist_x, rhs.right_wrist_y); + update(lhs.has_right_wrist, lhs.right_wrist_x, lhs.right_wrist_y, + rhs.has_left_wrist, rhs.left_wrist_x, rhs.left_wrist_y); + update(lhs.has_right_wrist, lhs.right_wrist_x, lhs.right_wrist_y, + rhs.has_right_wrist, rhs.right_wrist_x, rhs.right_wrist_y); + return std::isfinite(best) ? best : std::numeric_limits::infinity(); +} + static BehaviorEventType ToBehaviorEventType(ActionEventKind kind) { return kind == ActionEventKind::Fall ? BehaviorEventType::Fall : BehaviorEventType::Fight; } @@ -152,7 +391,7 @@ NodeStatus ActionRecogNode::Process(FramePtr frame) { if (det.track_id < 0) continue; current_tracks[det.track_id] = det.bbox; auto& history = impl_->history[det.track_id]; - history.push_back(TrackSample{frame->pts, det.bbox}); + history.push_back(TrackSample{frame->pts, det.bbox, MatchPoseToTrackId(det.track_id, det.bbox, frame->pose)}); while (!history.empty() && frame->pts > history.front().pts && (frame->pts - history.front().pts) > impl_->rules.front().window_ms) { history.pop_front(); @@ -168,8 +407,18 @@ NodeStatus ActionRecogNode::Process(FramePtr frame) { const float drop = CenterY(bbox) - CenterY(first.bbox); const float aspect_delta = AspectRatio(bbox) - AspectRatio(first.bbox); const uint64_t duration = frame->pts >= first.pts ? 
(frame->pts - first.pts) : 0; - if (drop < rule.min_drop_pixels) continue; - if (aspect_delta < rule.min_aspect_ratio_delta) continue; + const bool bbox_enabled = rule.fall_bbox.enabled; + const bool pose_enabled = rule.fall_pose.enabled; + const bool bbox_trigger = drop >= rule.fall_bbox.min_drop_pixels && + aspect_delta >= rule.fall_bbox.min_aspect_ratio_delta; + bool pose_trigger = false; + if (pose_enabled && + first.pose.matched && it->second.back().pose.matched) { + const float torso_drop = it->second.back().pose.torso_center_y - first.pose.torso_center_y; + pose_trigger = torso_drop >= rule.fall_pose.min_torso_drop_pixels && + it->second.back().pose.upright_ratio >= rule.fall_pose.max_upright_ratio; + } + if (!EvaluateSignalMode(rule.signal_mode, bbox_trigger, pose_trigger, bbox_enabled, pose_enabled)) continue; if (duration < rule.activate_duration_ms) continue; BehaviorEventItem event; @@ -189,7 +438,7 @@ NodeStatus ActionRecogNode::Process(FramePtr frame) { for (auto left = current_tracks.begin(); left != current_tracks.end(); ++left) { for (auto right = std::next(left); right != current_tracks.end(); ++right) { const float proximity = CenterDistance(left->second, right->second); - if (proximity > rule.proximity_pixels) continue; + if (proximity > rule.fight_bbox.proximity_pixels) continue; const auto it_left = impl_->history.find(left->first); const auto it_right = impl_->history.find(right->first); @@ -203,7 +452,22 @@ NodeStatus ActionRecogNode::Process(FramePtr frame) { const uint64_t duration = std::min(duration_left, duration_right); const float combined_motion = left_motion + right_motion; - if (combined_motion < rule.min_motion_pixels) continue; + const bool bbox_enabled = rule.fight_bbox.enabled; + const bool pose_enabled = rule.fight_pose.enabled; + const bool bbox_trigger = combined_motion >= rule.fight_bbox.min_motion_pixels; + bool pose_trigger = false; + const auto& left_pose_now = it_left->second.back().pose; + const auto& right_pose_now = it_right->second.back().pose; + if (pose_enabled && + it_left->second.front().pose.matched && it_right->second.front().pose.matched && + left_pose_now.matched && right_pose_now.matched) { + const float pose_motion = WristMotion(left_pose_now, it_left->second.front().pose) + + WristMotion(right_pose_now, it_right->second.front().pose); + const float wrist_distance = MinimumWristDistance(left_pose_now, right_pose_now); + pose_trigger = pose_motion >= rule.fight_pose.min_wrist_motion_pixels && + wrist_distance <= rule.fight_pose.max_wrist_distance_pixels; + } + if (!EvaluateSignalMode(rule.signal_mode, bbox_trigger, pose_trigger, bbox_enabled, pose_enabled)) continue; if (duration < rule.activate_duration_ms) continue; const auto pair_key = std::make_pair(left->first, right->first); diff --git a/plugins/ai_pose/ai_pose_node.cpp b/plugins/ai_pose/ai_pose_node.cpp new file mode 100644 index 0000000..8604763 --- /dev/null +++ b/plugins/ai_pose/ai_pose_node.cpp @@ -0,0 +1,466 @@ +#include "ai_pose_node.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "face/face_detection_utils.h" +#include "hw/i_infer_backend.h" +#include "pose/pose_result.h" +#include "utils/logger.h" + +namespace rk3588 { +namespace { + +using face_detection::HalfToFloat; +using face_detection::ResizeRgbBilinear; + +constexpr int kPoseOutputCount = 4; +constexpr int kPoseLocChannels = 64; +constexpr int kPoseScoreChannel = 64; +constexpr int kDflBins = 16; +constexpr int kMaxPoseDetections = 32; 
+constexpr uint8_t kLetterboxValue = 114; + +struct PoseConfig { + bool enabled = true; + std::string model_path; + int model_input_w = 640; + int model_input_h = 640; + int expected_keypoints = 17; + float conf_thresh = 0.25f; + float nms_thresh = 0.45f; +}; + +struct LetterboxInfo { + float scale = 1.0f; + float pad_x = 0.0f; + float pad_y = 0.0f; +}; + +struct PoseCandidate { + Rect bbox{}; + float score = 0.0f; + int keypoint_index = -1; +}; + +bool ParseConfig(const SimpleJson& config, PoseConfig& out, std::string& err) { + out.enabled = config.ValueOr("enabled", true); + out.model_path = config.ValueOr("model_path", ""); + out.model_input_w = config.ValueOr("model_input_w", 640); + out.model_input_h = config.ValueOr("model_input_h", 640); + out.expected_keypoints = config.ValueOr("expected_keypoints", 17); + out.conf_thresh = config.ValueOr("conf_thresh", 0.25f); + out.nms_thresh = config.ValueOr("nms_thresh", 0.45f); + + if (out.model_input_w <= 0 || out.model_input_h <= 0) { + err = "model_input_w and model_input_h must be positive"; + return false; + } + if (out.expected_keypoints <= 0) { + err = "expected_keypoints must be positive"; + return false; + } + if (out.conf_thresh <= 0.0f || out.conf_thresh >= 1.0f) { + err = "conf_thresh must be in (0, 1)"; + return false; + } + if (out.nms_thresh <= 0.0f || out.nms_thresh >= 1.0f) { + err = "nms_thresh must be in (0, 1)"; + return false; + } + if (out.enabled && out.model_path.empty()) { + err = "model_path is required when enabled=true"; + return false; + } + return true; +} + +void PushToDownstream(const std::vector>>& output_queues, const FramePtr& frame) { + for (const auto& q : output_queues) { + if (q) q->Push(frame); + } +} + +inline float Sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +float Unsigmoid(float y) { + return -std::log((1.0f / y) - 1.0f); +} + +float ClampFloat(float v, float lo, float hi) { + return std::max(lo, std::min(v, hi)); +} + +float CalculateIoU(const Rect& lhs, const Rect& rhs) { + const float x1 = std::max(lhs.x, rhs.x); + const float y1 = std::max(lhs.y, rhs.y); + const float x2 = std::min(lhs.x + lhs.w, rhs.x + rhs.w); + const float y2 = std::min(lhs.y + lhs.h, rhs.y + rhs.h); + const float inter_w = std::max(0.0f, x2 - x1); + const float inter_h = std::max(0.0f, y2 - y1); + const float inter = inter_w * inter_h; + const float union_area = lhs.w * lhs.h + rhs.w * rhs.h - inter; + return union_area <= 0.0f ? 
0.0f : (inter / union_area); +} + +float ReadTensorValue(const InferOutput& output, size_t index) { +#if defined(RK3588_ENABLE_RKNN) + switch (output.type) { + case RKNN_TENSOR_FLOAT32: { + const auto* p = reinterpret_cast(output.data.data()); + return p[index]; + } + case RKNN_TENSOR_FLOAT16: { + const auto* p = reinterpret_cast(output.data.data()); + return HalfToFloat(p[index]); + } + case RKNN_TENSOR_INT8: { + const auto* p = reinterpret_cast(output.data.data()); + return (static_cast(p[index]) - static_cast(output.zp)) * output.scale; + } + case RKNN_TENSOR_UINT8: { + const auto* p = reinterpret_cast(output.data.data()); + return (static_cast(p[index]) - static_cast(output.zp)) * output.scale; + } + default: + break; + } +#endif + const auto* p = reinterpret_cast(output.data.data()); + return p[index]; +} + +bool BuildLetterboxedRgbInput(const Frame& frame, int dst_w, int dst_h, + std::vector& out, LetterboxInfo& letterbox) { + if (frame.width <= 0 || frame.height <= 0 || dst_w <= 0 || dst_h <= 0) return false; + if (frame.format != PixelFormat::RGB && frame.format != PixelFormat::BGR) return false; + + const uint8_t* src = frame.planes[0].data ? frame.planes[0].data : frame.data; + if (!src) return false; + const int src_stride = frame.planes[0].stride > 0 ? frame.planes[0].stride : frame.width * 3; + + out.assign(static_cast(dst_w) * static_cast(dst_h) * 3, kLetterboxValue); + + const float scale = std::min(static_cast(dst_w) / static_cast(frame.width), + static_cast(dst_h) / static_cast(frame.height)); + const int resized_w = std::max(1, static_cast(std::round(frame.width * scale))); + const int resized_h = std::max(1, static_cast(std::round(frame.height * scale))); + const int pad_x = (dst_w - resized_w) / 2; + const int pad_y = (dst_h - resized_h) / 2; + + std::vector resized(static_cast(resized_w) * static_cast(resized_h) * 3); + const bool swap_rb = (frame.format == PixelFormat::BGR); + ResizeRgbBilinear(src, frame.width, frame.height, src_stride, resized.data(), resized_w, resized_h, swap_rb); + + for (int y = 0; y < resized_h; ++y) { + uint8_t* dst_row = out.data() + (static_cast(y + pad_y) * static_cast(dst_w) + pad_x) * 3; + const uint8_t* src_row = resized.data() + static_cast(y) * static_cast(resized_w) * 3; + std::memcpy(dst_row, src_row, static_cast(resized_w) * 3); + } + + letterbox.scale = scale; + letterbox.pad_x = static_cast(pad_x); + letterbox.pad_y = static_cast(pad_y); + return true; +} + +float DecodeDfl(const InferOutput& output, int cell_index, int cells_per_head, int channel_group) { + float max_value = -std::numeric_limits::infinity(); + float logits[kDflBins]{}; + for (int i = 0; i < kDflBins; ++i) { + const size_t offset = static_cast((channel_group * kDflBins + i) * cells_per_head + cell_index); + logits[i] = ReadTensorValue(output, offset); + max_value = std::max(max_value, logits[i]); + } + + float denom = 0.0f; + float numer = 0.0f; + for (int i = 0; i < kDflBins; ++i) { + const float e = std::exp(logits[i] - max_value); + denom += e; + numer += e * static_cast(i); + } + return denom > 0.0f ? (numer / denom) : 0.0f; +} + +std::vector DecodePoseCandidates(const std::vector& outputs, + const PoseConfig& config) { + std::vector candidates; + if (outputs.size() < kPoseOutputCount) return candidates; + + const float score_threshold = Unsigmoid(config.conf_thresh); + int keypoint_base_index = 0; + + for (int head = 0; head < 3; ++head) { + const int stride = (head == 0) ? 8 : (head == 1 ? 
16 : 32); + const int grid_h = config.model_input_h / stride; + const int grid_w = config.model_input_w / stride; + const int cells = grid_h * grid_w; + if (cells <= 0) continue; + + const InferOutput& output = outputs[static_cast(head)]; + const size_t elem_count = sizeof(float) > 0 ? (output.data.size() / sizeof(float)) : 0; +#if defined(RK3588_ENABLE_RKNN) + size_t bytes_per_elem = 1; + switch (output.type) { + case RKNN_TENSOR_FLOAT32: bytes_per_elem = sizeof(float); break; + case RKNN_TENSOR_FLOAT16: bytes_per_elem = sizeof(uint16_t); break; + default: bytes_per_elem = 1; break; + } + const size_t actual_elem_count = bytes_per_elem > 0 ? (output.data.size() / bytes_per_elem) : 0; +#else + const size_t actual_elem_count = elem_count; +#endif + if (actual_elem_count < static_cast(65 * cells)) { + keypoint_base_index += cells; + continue; + } + + for (int cell = 0; cell < cells; ++cell) { + const size_t score_offset = static_cast(kPoseScoreChannel * cells + cell); + const float score_logit = ReadTensorValue(output, score_offset); + if (score_logit < score_threshold) continue; + + const float left = DecodeDfl(output, cell, cells, 0); + const float top = DecodeDfl(output, cell, cells, 1); + const float right = DecodeDfl(output, cell, cells, 2); + const float bottom = DecodeDfl(output, cell, cells, 3); + + const int grid_x = cell % grid_w; + const int grid_y = cell / grid_w; + const float center_x = (static_cast(grid_x) + 0.5f) * stride; + const float center_y = (static_cast(grid_y) + 0.5f) * stride; + const float x1 = center_x - left * stride; + const float y1 = center_y - top * stride; + const float x2 = center_x + right * stride; + const float y2 = center_y + bottom * stride; + + PoseCandidate candidate; + candidate.bbox.x = x1; + candidate.bbox.y = y1; + candidate.bbox.w = std::max(0.0f, x2 - x1); + candidate.bbox.h = std::max(0.0f, y2 - y1); + candidate.score = Sigmoid(score_logit); + candidate.keypoint_index = keypoint_base_index + cell; + candidates.push_back(candidate); + } + + keypoint_base_index += cells; + } + + std::sort(candidates.begin(), candidates.end(), [](const PoseCandidate& lhs, const PoseCandidate& rhs) { + return lhs.score > rhs.score; + }); + + std::vector kept; + kept.reserve(candidates.size()); + for (const auto& candidate : candidates) { + bool suppressed = false; + for (const auto& accepted : kept) { + if (CalculateIoU(candidate.bbox, accepted.bbox) > config.nms_thresh) { + suppressed = true; + break; + } + } + if (!suppressed) { + kept.push_back(candidate); + if (static_cast(kept.size()) >= kMaxPoseDetections) break; + } + } + return kept; +} + +float MapToSourceCoord(float value, float pad, float scale, int limit) { + if (scale <= 0.0f) return 0.0f; + return ClampFloat((value - pad) / scale, 0.0f, static_cast(std::max(0, limit))); +} + +std::shared_ptr BuildPoseResult(const std::vector& outputs, + const PoseConfig& config, + int frame_w, int frame_h, + const LetterboxInfo& letterbox) { + if (outputs.size() < kPoseOutputCount) return nullptr; + + const InferOutput& keypoint_output = outputs[3]; + const int total_points = (config.model_input_w / 8) * (config.model_input_h / 8) + + (config.model_input_w / 16) * (config.model_input_h / 16) + + (config.model_input_w / 32) * (config.model_input_h / 32); + const int values_per_point = config.expected_keypoints * 3; +#if defined(RK3588_ENABLE_RKNN) + size_t kp_bytes_per_elem = 1; + switch (keypoint_output.type) { + case RKNN_TENSOR_FLOAT32: kp_bytes_per_elem = sizeof(float); break; + case RKNN_TENSOR_FLOAT16: 
kp_bytes_per_elem = sizeof(uint16_t); break; + default: kp_bytes_per_elem = 1; break; + } +#else + const size_t kp_bytes_per_elem = sizeof(float); +#endif + const size_t kp_elem_count = kp_bytes_per_elem > 0 ? (keypoint_output.data.size() / kp_bytes_per_elem) : 0; + if (kp_elem_count < static_cast(values_per_point * total_points)) return nullptr; + + auto result = std::make_shared(); + result->img_w = frame_w; + result->img_h = frame_h; + result->model_name = "yolov8n-pose"; + + const std::vector candidates = DecodePoseCandidates(outputs, config); + result->items.reserve(candidates.size()); + for (const auto& candidate : candidates) { + if (candidate.keypoint_index < 0 || candidate.keypoint_index >= total_points) continue; + + PoseItem item; + item.score = candidate.score; + item.track_id = -1; + item.bbox.x = MapToSourceCoord(candidate.bbox.x, letterbox.pad_x, letterbox.scale, frame_w); + item.bbox.y = MapToSourceCoord(candidate.bbox.y, letterbox.pad_y, letterbox.scale, frame_h); + const float x2 = MapToSourceCoord(candidate.bbox.x + candidate.bbox.w, letterbox.pad_x, letterbox.scale, frame_w); + const float y2 = MapToSourceCoord(candidate.bbox.y + candidate.bbox.h, letterbox.pad_y, letterbox.scale, frame_h); + item.bbox.w = std::max(0.0f, x2 - item.bbox.x); + item.bbox.h = std::max(0.0f, y2 - item.bbox.y); + + item.keypoints.reserve(static_cast(config.expected_keypoints)); + for (int k = 0; k < config.expected_keypoints; ++k) { + const size_t x_offset = static_cast(k * 3 * total_points + candidate.keypoint_index); + const size_t y_offset = static_cast(k * 3 * total_points + total_points + candidate.keypoint_index); + const size_t s_offset = static_cast(k * 3 * total_points + 2 * total_points + candidate.keypoint_index); + + PoseKeypoint kp; + kp.point.x = MapToSourceCoord(ReadTensorValue(keypoint_output, x_offset), letterbox.pad_x, letterbox.scale, frame_w); + kp.point.y = MapToSourceCoord(ReadTensorValue(keypoint_output, y_offset), letterbox.pad_y, letterbox.scale, frame_h); + kp.score = ReadTensorValue(keypoint_output, s_offset); + item.keypoints.push_back(kp); + } + result->items.push_back(std::move(item)); + } + + return result; +} + +} // namespace + +struct AiPoseNode::Impl { + PoseConfig config; + std::string init_err; + std::shared_ptr infer_backend; + ModelHandle model_handle = kInvalidModelHandle; + bool started = false; + std::vector input_rgb; +}; + +AiPoseNode::AiPoseNode() : impl_(std::make_unique()) {} + +AiPoseNode::~AiPoseNode() = default; + +std::string AiPoseNode::Id() const { + return id_; +} + +std::string AiPoseNode::Type() const { + return "ai_pose"; +} + +bool AiPoseNode::Init(const SimpleJson& config, const NodeContext& ctx) { + id_ = config.ValueOr("id", "ai_pose"); + if (!ParseConfig(config, impl_->config, impl_->init_err)) { + LogError("[ai_pose] invalid config: " + impl_->init_err); + return false; + } + impl_->infer_backend = ctx.infer_backend; + output_queues_ = ctx.output_queues; + return true; +} + +bool AiPoseNode::Start() { + if (!impl_->config.enabled) { + impl_->started = true; + return true; + } + if (!impl_->infer_backend) { + LogError("[ai_pose] infer_backend is required when enabled=true"); + return false; + } + + std::string err; + impl_->model_handle = impl_->infer_backend->LoadModel(impl_->config.model_path, err); + if (impl_->model_handle == kInvalidModelHandle) { + LogError("[ai_pose] failed to load model: " + impl_->config.model_path + " err=" + err); + return false; + } + + impl_->started = true; + return true; +} + +void AiPoseNode::Stop() 
{ + if (impl_->infer_backend && impl_->model_handle != kInvalidModelHandle) { + impl_->infer_backend->UnloadModel(impl_->model_handle); + impl_->model_handle = kInvalidModelHandle; + } + impl_->started = false; +} + +NodeStatus AiPoseNode::Process(FramePtr frame) { + if (!frame) return NodeStatus::DROP; + + if (!impl_->config.enabled) { + PushToDownstream(output_queues_, frame); + return NodeStatus::OK; + } + if (!impl_->started || !impl_->infer_backend || impl_->model_handle == kInvalidModelHandle) { + LogWarn("[ai_pose] process called before node is started"); + PushToDownstream(output_queues_, frame); + return NodeStatus::ERROR; + } + + LetterboxInfo letterbox; + if (!BuildLetterboxedRgbInput(*frame, impl_->config.model_input_w, impl_->config.model_input_h, + impl_->input_rgb, letterbox)) { + LogWarn("[ai_pose] only RGB/BGR frames are supported currently"); + PushToDownstream(output_queues_, frame); + return NodeStatus::OK; + } + + InferInput input; + input.data = impl_->input_rgb.data(); + input.size = impl_->input_rgb.size(); + input.width = impl_->config.model_input_w; + input.height = impl_->config.model_input_h; + input.is_nhwc = true; + input.dma_fd = -1; + input.dma_offset = 0; + + InferResult infer_result = impl_->infer_backend->Infer(impl_->model_handle, input); + if (!infer_result.success) { + LogWarn("[ai_pose] inference failed: " + infer_result.error); + PushToDownstream(output_queues_, frame); + return NodeStatus::OK; + } + + frame->pose = BuildPoseResult(infer_result.outputs, impl_->config, frame->width, frame->height, letterbox); + if (!frame->pose) { + frame->pose = std::make_shared(); + frame->pose->img_w = frame->width; + frame->pose->img_h = frame->height; + frame->pose->model_name = "yolov8n-pose"; + } + + PushToDownstream(output_queues_, frame); + return NodeStatus::OK; +} + +#ifndef RK3588_TEST_BUILD +REGISTER_NODE(AiPoseNode, "ai_pose"); +#endif + +} // namespace rk3588 diff --git a/plugins/ai_pose/ai_pose_node.h b/plugins/ai_pose/ai_pose_node.h new file mode 100644 index 0000000..f9aa4af --- /dev/null +++ b/plugins/ai_pose/ai_pose_node.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include + +#include "node.h" + +namespace rk3588 { + +class AiPoseNode final : public INode { +public: + AiPoseNode(); + ~AiPoseNode() override; + + std::string Id() const override; + std::string Type() const override; + bool Init(const SimpleJson& config, const NodeContext& ctx) override; + bool Start() override; + void Stop() override; + NodeStatus Process(FramePtr frame) override; + +private: + struct Impl; + std::unique_ptr impl_; + std::string id_; + std::vector>> output_queues_; +}; + +} // namespace rk3588 diff --git a/plugins/pose_assoc/pose_assoc_node.cpp b/plugins/pose_assoc/pose_assoc_node.cpp new file mode 100644 index 0000000..d853c2b --- /dev/null +++ b/plugins/pose_assoc/pose_assoc_node.cpp @@ -0,0 +1,139 @@ +#include "pose_assoc_node.h" + +#include +#include +#include +#include +#include + +#include "pose/pose_result.h" +#include "utils/logger.h" + +namespace rk3588 { +namespace { + +struct PoseAssocConfig { + float min_iou = 0.1f; +}; + +struct MatchCandidate { + float iou = 0.0f; + int pose_index = -1; + int det_index = -1; +}; + +bool ParseConfig(const SimpleJson& config, PoseAssocConfig& out, std::string& err) { + out.min_iou = config.ValueOr("min_iou", 0.1f); + if (out.min_iou < 0.0f || out.min_iou > 1.0f) { + err = "min_iou must be in [0, 1]"; + return false; + } + return true; +} + +float IoU(const Rect& lhs, const Rect& rhs) { + const float x1 = 
std::max(lhs.x, rhs.x); + const float y1 = std::max(lhs.y, rhs.y); + const float x2 = std::min(lhs.x + lhs.w, rhs.x + rhs.w); + const float y2 = std::min(lhs.y + lhs.h, rhs.y + rhs.h); + const float iw = std::max(0.0f, x2 - x1); + const float ih = std::max(0.0f, y2 - y1); + const float inter = iw * ih; + const float area_l = std::max(0.0f, lhs.w) * std::max(0.0f, lhs.h); + const float area_r = std::max(0.0f, rhs.w) * std::max(0.0f, rhs.h); + const float uni = area_l + area_r - inter; + return uni <= 0.0f ? 0.0f : (inter / uni); +} + +void PushToDownstream(const std::vector>>& output_queues, const FramePtr& frame) { + for (const auto& q : output_queues) { + if (q) q->Push(frame); + } +} + +} // namespace + +struct PoseAssocNode::Impl { + PoseAssocConfig config; + std::string init_err; +}; + +PoseAssocNode::PoseAssocNode() : impl_(std::make_unique()) {} + +PoseAssocNode::~PoseAssocNode() = default; + +std::string PoseAssocNode::Id() const { + return id_; +} + +std::string PoseAssocNode::Type() const { + return "pose_assoc"; +} + +bool PoseAssocNode::Init(const SimpleJson& config, const NodeContext& ctx) { + id_ = config.ValueOr("id", "pose_assoc"); + if (!ParseConfig(config, impl_->config, impl_->init_err)) { + LogError("[pose_assoc] invalid config: " + impl_->init_err); + return false; + } + output_queues_ = ctx.output_queues; + return true; +} + +bool PoseAssocNode::Start() { + return true; +} + +void PoseAssocNode::Stop() {} + +NodeStatus PoseAssocNode::Process(FramePtr frame) { + if (!frame) return NodeStatus::DROP; + + if (frame->pose && frame->det) { + for (auto& pose : frame->pose->items) { + pose.track_id = -1; + } + + std::vector candidates; + candidates.reserve(frame->pose->items.size() * frame->det->items.size()); + for (size_t pi = 0; pi < frame->pose->items.size(); ++pi) { + const Rect& pose_bbox = frame->pose->items[pi].bbox; + for (size_t di = 0; di < frame->det->items.size(); ++di) { + const auto& det = frame->det->items[di]; + if (det.track_id < 0) continue; + const float iou = IoU(pose_bbox, det.bbox); + if (iou >= impl_->config.min_iou) { + candidates.push_back(MatchCandidate{iou, static_cast(pi), static_cast(di)}); + } + } + } + + std::sort(candidates.begin(), candidates.end(), [](const MatchCandidate& lhs, const MatchCandidate& rhs) { + if (lhs.iou != rhs.iou) return lhs.iou > rhs.iou; + if (lhs.pose_index != rhs.pose_index) return lhs.pose_index < rhs.pose_index; + return lhs.det_index < rhs.det_index; + }); + + std::vector used_pose(frame->pose->items.size(), false); + std::vector used_det(frame->det->items.size(), false); + for (const auto& candidate : candidates) { + if (used_pose[static_cast(candidate.pose_index)] || + used_det[static_cast(candidate.det_index)]) { + continue; + } + frame->pose->items[static_cast(candidate.pose_index)].track_id = + frame->det->items[static_cast(candidate.det_index)].track_id; + used_pose[static_cast(candidate.pose_index)] = true; + used_det[static_cast(candidate.det_index)] = true; + } + } + + PushToDownstream(output_queues_, frame); + return NodeStatus::OK; +} + +#ifndef RK3588_TEST_BUILD +REGISTER_NODE(PoseAssocNode, "pose_assoc"); +#endif + +} // namespace rk3588 diff --git a/plugins/pose_assoc/pose_assoc_node.h b/plugins/pose_assoc/pose_assoc_node.h new file mode 100644 index 0000000..d3621e4 --- /dev/null +++ b/plugins/pose_assoc/pose_assoc_node.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include + +#include "node.h" + +namespace rk3588 { + +class PoseAssocNode final : public INode { +public: + PoseAssocNode(); + 
~PoseAssocNode() override; + + std::string Id() const override; + std::string Type() const override; + bool Init(const SimpleJson& config, const NodeContext& ctx) override; + bool Start() override; + void Stop() override; + NodeStatus Process(FramePtr frame) override; + +private: + struct Impl; + std::unique_ptr impl_; + std::string id_; + std::vector>> output_queues_; +}; + +} // namespace rk3588 diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d510c3b..418541d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -40,6 +40,8 @@ add_executable(rk3588_gtests test_behavior_event_model.cpp test_region_event.cpp test_action_recog.cpp + test_ai_pose.cpp + test_pose_assoc.cpp test_event_fusion.cpp test_alarm_behavior_events.cpp test_infer_backend.cpp @@ -53,6 +55,8 @@ add_executable(rk3588_gtests ${CMAKE_SOURCE_DIR}/src/utils/dma_alloc.cpp ${CMAKE_SOURCE_DIR}/plugins/region_event/region_event_node.cpp ${CMAKE_SOURCE_DIR}/plugins/action_recog/action_recog_node.cpp + ${CMAKE_SOURCE_DIR}/plugins/ai_pose/ai_pose_node.cpp + ${CMAKE_SOURCE_DIR}/plugins/pose_assoc/pose_assoc_node.cpp ${CMAKE_SOURCE_DIR}/plugins/event_fusion/event_fusion_node.cpp ${CMAKE_SOURCE_DIR}/plugins/alarm/rule_engine.cpp ) diff --git a/tests/test_action_recog.cpp b/tests/test_action_recog.cpp index 01b4706..c0e926b 100644 --- a/tests/test_action_recog.cpp +++ b/tests/test_action_recog.cpp @@ -5,6 +5,7 @@ #include "frame/frame.h" #include "node.h" +#include "pose/pose_result.h" #include "utils/simple_json.h" #include "../plugins/action_recog/action_recog_node.h" @@ -124,5 +125,385 @@ TEST(ActionRecogTest, EmitsFightWhenTwoTracksStayCloseWithRepeatedMotion) { EXPECT_EQ(frame2->behavior_events->items[0].track_ids[1], 32); } +TEST(ActionRecogTest, EmitsFallFromPoseSequenceWhenBboxShapeChangeIsSmall) { + ActionRecogNode node; + + const std::string config_text = R"({ + "id": "action_evt", + "events": [ + { + "type": "fall", + "window_ms": 1500, + "min_drop_pixels": 120, + "min_aspect_ratio_delta": 10.0, + "activate_duration_ms": 0, + "pose_min_torso_drop_pixels": 120, + "pose_max_upright_ratio": 0.6 + } + ] + })"; + + SimpleJson config = ParseActionConfig(config_text); + NodeContext ctx; + auto out = std::make_shared>(4, QueueDropStrategy::DropOldest); + ctx.output_queues.push_back(out); + + ASSERT_TRUE(node.Init(config, ctx)); + ASSERT_TRUE(node.Start()); + + auto frame1 = std::make_shared(); + frame1->width = 1920; + frame1->height = 1080; + frame1->pts = 1000; + frame1->det = std::make_shared(); + frame1->det->img_w = 1920; + frame1->det->img_h = 1080; + frame1->det->items.push_back(Detection{0, 0.92f, Rect{800.0f, 220.0f, 180.0f, 300.0f}, 17}); + frame1->pose = std::make_shared(); + frame1->pose->img_w = 1920; + frame1->pose->img_h = 1080; + PoseItem pose1; + pose1.bbox = Rect{800.0f, 220.0f, 180.0f, 300.0f}; + pose1.score = 0.9f; + pose1.keypoints.resize(17); + pose1.keypoints[5] = PoseKeypoint{PosePoint2f{840.0f, 280.0f}, 0.9f}; + pose1.keypoints[6] = PoseKeypoint{PosePoint2f{940.0f, 280.0f}, 0.9f}; + pose1.keypoints[11] = PoseKeypoint{PosePoint2f{850.0f, 420.0f}, 0.9f}; + pose1.keypoints[12] = PoseKeypoint{PosePoint2f{930.0f, 420.0f}, 0.9f}; + frame1->pose->items.push_back(pose1); + + auto frame2 = std::make_shared(); + frame2->width = 1920; + frame2->height = 1080; + frame2->pts = 1600; + frame2->det = std::make_shared(); + frame2->det->img_w = 1920; + frame2->det->img_h = 1080; + frame2->det->items.push_back(Detection{0, 0.93f, Rect{780.0f, 420.0f, 200.0f, 280.0f}, 17}); + frame2->pose = 
std::make_shared(); + frame2->pose->img_w = 1920; + frame2->pose->img_h = 1080; + PoseItem pose2; + pose2.bbox = Rect{780.0f, 420.0f, 200.0f, 280.0f}; + pose2.score = 0.91f; + pose2.keypoints.resize(17); + pose2.keypoints[5] = PoseKeypoint{PosePoint2f{790.0f, 520.0f}, 0.9f}; + pose2.keypoints[6] = PoseKeypoint{PosePoint2f{970.0f, 520.0f}, 0.9f}; + pose2.keypoints[11] = PoseKeypoint{PosePoint2f{820.0f, 590.0f}, 0.9f}; + pose2.keypoints[12] = PoseKeypoint{PosePoint2f{940.0f, 590.0f}, 0.9f}; + frame2->pose->items.push_back(pose2); + + EXPECT_EQ(static_cast(node.Process(frame1)), static_cast(NodeStatus::OK)); + EXPECT_EQ(static_cast(node.Process(frame2)), static_cast(NodeStatus::OK)); + ASSERT_NE(frame2->behavior_events, nullptr); + ASSERT_EQ(frame2->behavior_events->items.size(), 1u); + EXPECT_EQ(frame2->behavior_events->items[0].type, BehaviorEventType::Fall); + ASSERT_EQ(frame2->behavior_events->items[0].track_ids.size(), 1u); + EXPECT_EQ(frame2->behavior_events->items[0].track_ids[0], 17); +} + +TEST(ActionRecogTest, EmitsFightFromPoseMotionWhenBboxMotionIsSmall) { + ActionRecogNode node; + + const std::string config_text = R"({ + "id": "action_evt", + "events": [ + { + "type": "fight", + "window_ms": 1200, + "proximity_pixels": 220, + "min_motion_pixels": 1000, + "activate_duration_ms": 0, + "pose_min_wrist_motion_pixels": 120, + "pose_max_wrist_distance_pixels": 120 + } + ] + })"; + + SimpleJson config = ParseActionConfig(config_text); + NodeContext ctx; + auto out = std::make_shared>(4, QueueDropStrategy::DropOldest); + ctx.output_queues.push_back(out); + + ASSERT_TRUE(node.Init(config, ctx)); + ASSERT_TRUE(node.Start()); + + auto frame1 = std::make_shared(); + frame1->width = 1920; + frame1->height = 1080; + frame1->pts = 1000; + frame1->det = std::make_shared(); + frame1->det->img_w = 1920; + frame1->det->img_h = 1080; + frame1->det->items.push_back(Detection{0, 0.90f, Rect{700.0f, 320.0f, 120.0f, 300.0f}, 31}); + frame1->det->items.push_back(Detection{0, 0.89f, Rect{860.0f, 320.0f, 120.0f, 300.0f}, 32}); + frame1->pose = std::make_shared(); + frame1->pose->img_w = 1920; + frame1->pose->img_h = 1080; + PoseItem left1; + left1.bbox = Rect{700.0f, 320.0f, 120.0f, 300.0f}; + left1.keypoints.resize(17); + left1.keypoints[9] = PoseKeypoint{PosePoint2f{780.0f, 430.0f}, 0.9f}; + left1.keypoints[10] = PoseKeypoint{PosePoint2f{790.0f, 450.0f}, 0.9f}; + PoseItem right1; + right1.bbox = Rect{860.0f, 320.0f, 120.0f, 300.0f}; + right1.keypoints.resize(17); + right1.keypoints[9] = PoseKeypoint{PosePoint2f{900.0f, 430.0f}, 0.9f}; + right1.keypoints[10] = PoseKeypoint{PosePoint2f{910.0f, 450.0f}, 0.9f}; + frame1->pose->items.push_back(left1); + frame1->pose->items.push_back(right1); + + auto frame2 = std::make_shared(); + frame2->width = 1920; + frame2->height = 1080; + frame2->pts = 1300; + frame2->det = std::make_shared(); + frame2->det->img_w = 1920; + frame2->det->img_h = 1080; + frame2->det->items.push_back(Detection{0, 0.91f, Rect{710.0f, 325.0f, 120.0f, 300.0f}, 31}); + frame2->det->items.push_back(Detection{0, 0.92f, Rect{850.0f, 315.0f, 120.0f, 300.0f}, 32}); + frame2->pose = std::make_shared(); + frame2->pose->img_w = 1920; + frame2->pose->img_h = 1080; + PoseItem left2; + left2.bbox = Rect{710.0f, 325.0f, 120.0f, 300.0f}; + left2.keypoints.resize(17); + left2.keypoints[9] = PoseKeypoint{PosePoint2f{860.0f, 430.0f}, 0.9f}; + left2.keypoints[10] = PoseKeypoint{PosePoint2f{870.0f, 450.0f}, 0.9f}; + PoseItem right2; + right2.bbox = Rect{850.0f, 315.0f, 120.0f, 300.0f}; + 
right2.keypoints.resize(17);
+  right2.keypoints[9] = PoseKeypoint{PosePoint2f{840.0f, 430.0f}, 0.9f};
+  right2.keypoints[10] = PoseKeypoint{PosePoint2f{830.0f, 450.0f}, 0.9f};
+  frame2->pose->items.push_back(left2);
+  frame2->pose->items.push_back(right2);
+
+  EXPECT_EQ(static_cast<int>(node.Process(frame1)), static_cast<int>(NodeStatus::OK));
+  EXPECT_EQ(static_cast<int>(node.Process(frame2)), static_cast<int>(NodeStatus::OK));
+  ASSERT_NE(frame2->behavior_events, nullptr);
+  ASSERT_EQ(frame2->behavior_events->items.size(), 1u);
+  EXPECT_EQ(frame2->behavior_events->items[0].type, BehaviorEventType::Fight);
+  ASSERT_EQ(frame2->behavior_events->items[0].track_ids.size(), 2u);
+  EXPECT_EQ(frame2->behavior_events->items[0].track_ids[0], 31);
+  EXPECT_EQ(frame2->behavior_events->items[0].track_ids[1], 32);
+}
+
+TEST(ActionRecogTest, SupportsStructuredFusionConfigForFall) {
+  ActionRecogNode node;
+
+  const std::string config_text = R"({
+    "id": "action_evt",
+    "events": [
+      {
+        "type": "fall",
+        "window_ms": 1500,
+        "activate_duration_ms": 0,
+        "bbox": {
+          "enabled": false
+        },
+        "pose": {
+          "enabled": true,
+          "min_torso_drop_pixels": 120,
+          "max_upright_ratio": 0.6
+        },
+        "fusion": {
+          "match_mode": "any"
+        }
+      }
+    ]
+  })";
+
+  SimpleJson config = ParseActionConfig(config_text);
+  NodeContext ctx;
+  ASSERT_TRUE(node.Init(config, ctx));
+  ASSERT_TRUE(node.Start());
+
+  auto frame1 = std::make_shared<Frame>();
+  frame1->width = 1920;
+  frame1->height = 1080;
+  frame1->pts = 1000;
+  frame1->det = std::make_shared<DetectionResult>();
+  frame1->det->items.push_back(Detection{0, 0.92f, Rect{800.0f, 220.0f, 180.0f, 300.0f}, 17});
+  frame1->pose = std::make_shared<PoseResult>();
+  PoseItem pose1;
+  pose1.track_id = 17;
+  pose1.bbox = Rect{800.0f, 220.0f, 180.0f, 300.0f};
+  pose1.keypoints.resize(17);
+  pose1.keypoints[5] = PoseKeypoint{PosePoint2f{840.0f, 280.0f}, 0.9f};
+  pose1.keypoints[6] = PoseKeypoint{PosePoint2f{940.0f, 280.0f}, 0.9f};
+  pose1.keypoints[11] = PoseKeypoint{PosePoint2f{850.0f, 420.0f}, 0.9f};
+  pose1.keypoints[12] = PoseKeypoint{PosePoint2f{930.0f, 420.0f}, 0.9f};
+  frame1->pose->items.push_back(pose1);
+
+  auto frame2 = std::make_shared<Frame>();
+  frame2->width = 1920;
+  frame2->height = 1080;
+  frame2->pts = 1600;
+  frame2->det = std::make_shared<DetectionResult>();
+  frame2->det->items.push_back(Detection{0, 0.93f, Rect{790.0f, 400.0f, 180.0f, 300.0f}, 17});
+  frame2->pose = std::make_shared<PoseResult>();
+  PoseItem pose2;
+  pose2.track_id = 17;
+  pose2.bbox = Rect{790.0f, 400.0f, 180.0f, 300.0f};
+  pose2.keypoints.resize(17);
+  pose2.keypoints[5] = PoseKeypoint{PosePoint2f{790.0f, 520.0f}, 0.9f};
+  pose2.keypoints[6] = PoseKeypoint{PosePoint2f{970.0f, 520.0f}, 0.9f};
+  pose2.keypoints[11] = PoseKeypoint{PosePoint2f{820.0f, 590.0f}, 0.9f};
+  pose2.keypoints[12] = PoseKeypoint{PosePoint2f{940.0f, 590.0f}, 0.9f};
+  frame2->pose->items.push_back(pose2);
+
+  EXPECT_EQ(static_cast<int>(node.Process(frame1)), static_cast<int>(NodeStatus::OK));
+  EXPECT_EQ(static_cast<int>(node.Process(frame2)), static_cast<int>(NodeStatus::OK));
+  ASSERT_NE(frame2->behavior_events, nullptr);
+  ASSERT_EQ(frame2->behavior_events->items.size(), 1u);
+  EXPECT_EQ(frame2->behavior_events->items[0].type, BehaviorEventType::Fall);
+}
+
+TEST(ActionRecogTest, SuppressesFallFalsePositiveWhenPoseShapeConditionFails) {
+  ActionRecogNode node;
+
+  const std::string config_text = R"({
+    "id": "action_evt",
+    "events": [
+      {
+        "type": "fall",
+        "window_ms": 1500,
+        "activate_duration_ms": 0,
+        "bbox": {
+          "enabled": false
+        },
+        "pose": {
+          "enabled": true,
+          "min_torso_drop_pixels": 120,
+          "max_upright_ratio": 0.8
+        },
+        "fusion": {
+          "match_mode": "any"
+        }
+      }
+    ]
+  })";
+
+  SimpleJson config = ParseActionConfig(config_text);
+  NodeContext ctx;
+  ASSERT_TRUE(node.Init(config, ctx));
+  ASSERT_TRUE(node.Start());
+
+  auto frame1 = std::make_shared<Frame>();
+  frame1->pts = 1000;
+  frame1->det = std::make_shared<DetectionResult>();
+  frame1->det->items.push_back(Detection{0, 0.92f, Rect{800.0f, 220.0f, 180.0f, 300.0f}, 17});
+  frame1->pose = std::make_shared<PoseResult>();
+  PoseItem pose1;
+  pose1.track_id = 17;
+  pose1.bbox = Rect{800.0f, 220.0f, 180.0f, 300.0f};
+  pose1.keypoints.resize(17);
+  pose1.keypoints[5] = PoseKeypoint{PosePoint2f{840.0f, 280.0f}, 0.9f};
+  pose1.keypoints[6] = PoseKeypoint{PosePoint2f{940.0f, 280.0f}, 0.9f};
+  pose1.keypoints[11] = PoseKeypoint{PosePoint2f{850.0f, 420.0f}, 0.9f};
+  pose1.keypoints[12] = PoseKeypoint{PosePoint2f{930.0f, 420.0f}, 0.9f};
+  frame1->pose->items.push_back(pose1);
+
+  auto frame2 = std::make_shared<Frame>();
+  frame2->pts = 1600;
+  frame2->det = std::make_shared<DetectionResult>();
+  frame2->det->items.push_back(Detection{0, 0.93f, Rect{790.0f, 400.0f, 180.0f, 300.0f}, 17});
+  frame2->pose = std::make_shared<PoseResult>();
+  PoseItem pose2;
+  pose2.track_id = 17;
+  pose2.bbox = Rect{790.0f, 400.0f, 180.0f, 300.0f};
+  pose2.keypoints.resize(17);
+  pose2.keypoints[5] = PoseKeypoint{PosePoint2f{860.0f, 520.0f}, 0.9f};
+  pose2.keypoints[6] = PoseKeypoint{PosePoint2f{900.0f, 520.0f}, 0.9f};
+  pose2.keypoints[11] = PoseKeypoint{PosePoint2f{865.0f, 700.0f}, 0.9f};
+  pose2.keypoints[12] = PoseKeypoint{PosePoint2f{905.0f, 700.0f}, 0.9f};
+  frame2->pose->items.push_back(pose2);
+
+  EXPECT_EQ(static_cast<int>(node.Process(frame1)), static_cast<int>(NodeStatus::OK));
+  EXPECT_EQ(static_cast<int>(node.Process(frame2)), static_cast<int>(NodeStatus::OK));
+  ASSERT_NE(frame2->behavior_events, nullptr);
+  EXPECT_TRUE(frame2->behavior_events->items.empty());
+}
+
+TEST(ActionRecogTest, SuppressesFightFalsePositiveWhenWristsStayFarApart) {
+  ActionRecogNode node;
+
+  const std::string config_text = R"({
+    "id": "action_evt",
+    "events": [
+      {
+        "type": "fight",
+        "window_ms": 1200,
+        "activate_duration_ms": 0,
+        "bbox": {
+          "enabled": false,
+          "proximity_pixels": 220,
+          "min_motion_pixels": 0
+        },
+        "pose": {
+          "enabled": true,
+          "min_wrist_motion_pixels": 120,
+          "max_wrist_distance_pixels": 60
+        },
+        "fusion": {
+          "match_mode": "any"
+        }
+      }
+    ]
+  })";
+
+  SimpleJson config = ParseActionConfig(config_text);
+  NodeContext ctx;
+  ASSERT_TRUE(node.Init(config, ctx));
+  ASSERT_TRUE(node.Start());
+
+  auto frame1 = std::make_shared<Frame>();
+  frame1->pts = 1000;
+  frame1->det = std::make_shared<DetectionResult>();
+  frame1->det->items.push_back(Detection{0, 0.90f, Rect{700.0f, 320.0f, 120.0f, 300.0f}, 31});
+  frame1->det->items.push_back(Detection{0, 0.89f, Rect{860.0f, 320.0f, 120.0f, 300.0f}, 32});
+  frame1->pose = std::make_shared<PoseResult>();
+  PoseItem left1;
+  left1.track_id = 31;
+  left1.bbox = Rect{700.0f, 320.0f, 120.0f, 300.0f};
+  left1.keypoints.resize(17);
+  left1.keypoints[9] = PoseKeypoint{PosePoint2f{760.0f, 430.0f}, 0.9f};
+  left1.keypoints[10] = PoseKeypoint{PosePoint2f{770.0f, 450.0f}, 0.9f};
+  PoseItem right1;
+  right1.track_id = 32;
+  right1.bbox = Rect{860.0f, 320.0f, 120.0f, 300.0f};
+  right1.keypoints.resize(17);
+  right1.keypoints[9] = PoseKeypoint{PosePoint2f{960.0f, 430.0f}, 0.9f};
+  right1.keypoints[10] = PoseKeypoint{PosePoint2f{970.0f, 450.0f}, 0.9f};
+  frame1->pose->items.push_back(left1);
+  frame1->pose->items.push_back(right1);
+
+  auto frame2 = std::make_shared<Frame>();
+  frame2->pts = 1300;
+  frame2->det = std::make_shared<DetectionResult>();
+  frame2->det->items.push_back(Detection{0, 0.91f, Rect{710.0f, 325.0f, 120.0f, 300.0f}, 31});
+  frame2->det->items.push_back(Detection{0, 0.92f, Rect{850.0f, 315.0f, 120.0f, 300.0f}, 32});
+  frame2->pose = std::make_shared<PoseResult>();
+  PoseItem left2;
+  left2.track_id = 31;
+  left2.bbox = Rect{710.0f, 325.0f, 120.0f, 300.0f};
+  left2.keypoints.resize(17);
+  left2.keypoints[9] = PoseKeypoint{PosePoint2f{820.0f, 430.0f}, 0.9f};
+  left2.keypoints[10] = PoseKeypoint{PosePoint2f{830.0f, 450.0f}, 0.9f};
+  PoseItem right2;
+  right2.track_id = 32;
+  right2.bbox = Rect{850.0f, 315.0f, 120.0f, 300.0f};
+  right2.keypoints.resize(17);
+  right2.keypoints[9] = PoseKeypoint{PosePoint2f{980.0f, 430.0f}, 0.9f};
+  right2.keypoints[10] = PoseKeypoint{PosePoint2f{990.0f, 450.0f}, 0.9f};
+  frame2->pose->items.push_back(left2);
+  frame2->pose->items.push_back(right2);
+
+  EXPECT_EQ(static_cast<int>(node.Process(frame1)), static_cast<int>(NodeStatus::OK));
+  EXPECT_EQ(static_cast<int>(node.Process(frame2)), static_cast<int>(NodeStatus::OK));
+  ASSERT_NE(frame2->behavior_events, nullptr);
+  EXPECT_TRUE(frame2->behavior_events->items.empty());
+}
+
 } // namespace
 } // namespace rk3588
diff --git a/tests/test_ai_pose.cpp b/tests/test_ai_pose.cpp
new file mode 100644
index 0000000..3c4314b
--- /dev/null
+++ b/tests/test_ai_pose.cpp
@@ -0,0 +1,232 @@
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "hw/i_infer_backend.h"
+#include "node.h"
+#include "pose/pose_result.h"
+#include "utils/simple_json.h"
+#include "../plugins/ai_pose/ai_pose_node.h"
+
+namespace rk3588 {
+namespace {
+
+SimpleJson ParsePoseConfig(const std::string& text) {
+  SimpleJson config;
+  std::string err;
+  const bool ok = ParseSimpleJson(text, config, err);
+  EXPECT_TRUE(ok);
+  return config;
+}
+
+class FakeInferBackend final : public IInferBackend {
+public:
+  ModelHandle LoadModel(const std::string& /*model_path*/, std::string& /*err*/) override {
+    return 1;
+  }
+
+  void UnloadModel(ModelHandle /*handle*/) override {}
+
+  bool GetModelInfo(ModelHandle /*handle*/, ModelInfo& info) const override {
+    info.input_width = 640;
+    info.input_height = 640;
+    info.input_channels = 3;
+    info.n_input = 1;
+    info.n_output = 4;
+    info.name = "fake_yolov8_pose";
+    return true;
+  }
+
+  InferResult Infer(ModelHandle /*handle*/, const InferInput& /*input*/) override {
+    InferResult result;
+    result.success = true;
+    result.outputs = outputs_;
+    return result;
+  }
+
+  AiScheduler::BorrowedInferResult InferBorrowed(ModelHandle /*handle*/, const InferInput& /*input*/) override {
+    AiScheduler::BorrowedInferResult result;
+    result.success = false;
+    result.error = "not used in unit test";
+    return result;
+  }
+
+  std::vector<InferOutput> outputs_;
+};
+
+InferOutput MakeFloatOutput(const std::vector<float>& values) {
+  InferOutput out;
+  out.size = values.size() * sizeof(float);
+  out.data.resize(out.size);
+  std::memcpy(out.data.data(), values.data(), out.size);
+  return out;
+}
+
+TEST(AiPoseNodeTest, RejectsEnabledConfigWithoutModelPath) {
+  AiPoseNode node;
+  SimpleJson config = ParsePoseConfig(R"({
+    "id": "pose0",
+    "enabled": true
+  })");
+
+  NodeContext ctx;
+  EXPECT_FALSE(node.Init(config, ctx));
+}
+
+TEST(AiPoseNodeTest, DisabledNodeStartsAndPassesFramesThrough) {
+  AiPoseNode node;
+  SimpleJson config = ParsePoseConfig(R"({
+    "id": "pose0",
+    "enabled": false
+  })");
+
+  NodeContext ctx;
+  auto out = std::make_shared<BoundedQueue<FramePtr>>(4, QueueDropStrategy::DropOldest);
+  ctx.output_queues.push_back(out);
+
+  ASSERT_TRUE(node.Init(config, ctx));
+  ASSERT_TRUE(node.Start());
+
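+  // With the node disabled, Process() must behave as a pure pass-through:
+  // the same frame object is forwarded downstream and Frame.pose stays unset.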
+  auto frame = std::make_shared<Frame>();
+  frame->width = 1280;
+  frame->height = 720;
+
+  EXPECT_EQ(static_cast<int>(node.Process(frame)), static_cast<int>(NodeStatus::OK));
+
+  FramePtr forwarded;
+  ASSERT_TRUE(out->TryPop(forwarded));
+  EXPECT_EQ(forwarded.get(), frame.get());
+  EXPECT_EQ(frame->pose, nullptr);
+}
+
+TEST(AiPoseNodeTest, EnabledNodeRequiresInferBackendAtStart) {
+  AiPoseNode node;
+  SimpleJson config = ParsePoseConfig(R"({
+    "id": "pose0",
+    "enabled": true,
+    "model_path": "models/yolov8n-pose.rknn"
+  })");
+
+  NodeContext ctx;
+  ASSERT_TRUE(node.Init(config, ctx));
+  EXPECT_FALSE(node.Start());
+}
+
+TEST(AiPoseNodeTest, DecodePoseOutputsToOriginalCoordinates) {
+  const int num_keypoints = 17;
+  const int total_points = 8400;
+
+  std::vector<float> head0(static_cast<size_t>(65 * 80 * 80), -10.0f);
+  std::vector<float> head1(static_cast<size_t>(65 * 40 * 40), -10.0f);
+  std::vector<float> head2(static_cast<size_t>(65 * 20 * 20), -10.0f);
+  std::vector<float> keypoints(static_cast<size_t>(num_keypoints * 3 * total_points), 0.0f);
+
+  const int cell_x = 10;
+  const int cell_y = 15;
+  const int grid_w = 80;
+  const int point_index = cell_y * grid_w + cell_x;
+
+  auto set_dfl_bin = [&](int channel_group, int bin) {
+    for (int i = 0; i < 16; ++i) {
+      head0[static_cast<size_t>((channel_group * 16 + i) * grid_w * 80 + point_index)] = (i == bin) ? 10.0f : -10.0f;
+    }
+  };
+
+  set_dfl_bin(0, 4);
+  set_dfl_bin(1, 6);
+  set_dfl_bin(2, 8);
+  set_dfl_bin(3, 10);
+  head0[static_cast<size_t>(64 * grid_w * 80 + point_index)] = 10.0f;
+
+  for (int k = 0; k < num_keypoints; ++k) {
+    keypoints[static_cast<size_t>(k * 3 * total_points + point_index)] = 120.0f + static_cast<float>(k);
+    keypoints[static_cast<size_t>(k * 3 * total_points + total_points + point_index)] = 140.0f + static_cast<float>(k * 2);
+    keypoints[static_cast<size_t>(k * 3 * total_points + 2 * total_points + point_index)] = 0.9f;
+  }
+
+  auto backend = std::make_shared<FakeInferBackend>();
+  backend->outputs_ = {
+    MakeFloatOutput(head0),
+    MakeFloatOutput(head1),
+    MakeFloatOutput(head2),
+    MakeFloatOutput(keypoints),
+  };
+
+  AiPoseNode node;
+  SimpleJson config = ParsePoseConfig(R"({
+    "id": "pose0",
+    "enabled": true,
+    "model_path": "models/yolov8n-pose.rknn",
+    "model_input_w": 640,
+    "model_input_h": 640,
+    "conf_thresh": 0.25,
+    "nms_thresh": 0.45
+  })");
+
+  NodeContext ctx;
+  ctx.infer_backend = backend;
+  auto out = std::make_shared<BoundedQueue<FramePtr>>(4, QueueDropStrategy::DropOldest);
+  ctx.output_queues.push_back(out);
+
+  ASSERT_TRUE(node.Init(config, ctx));
+  ASSERT_TRUE(node.Start());
+
+  auto rgb = std::make_shared<std::vector<uint8_t>>(static_cast<size_t>(320 * 320 * 3), 127);
+  auto frame = std::make_shared<Frame>();
+  frame->width = 320;
+  frame->height = 320;
+  frame->format = PixelFormat::RGB;
+  frame->data = rgb->data();
+  frame->data_size = rgb->size();
+  frame->stride = 320 * 3;
+  frame->plane_count = 1;
+  frame->planes[0] = {frame->data, frame->stride, static_cast<int>(frame->data_size), 0};
+  frame->data_owner = rgb;
+
+  EXPECT_EQ(static_cast<int>(node.Process(frame)), static_cast<int>(NodeStatus::OK));
+
+  ASSERT_NE(frame->pose, nullptr);
+  ASSERT_EQ(frame->pose->items.size(), 1u);
+  const PoseItem& item = frame->pose->items[0];
+  // One-hot DFL bins (4, 6, 8, 10) at cell (10, 15) of the stride-8 head decode
+  // to the box (52, 76)-(148, 204) in 640x640 model space, i.e. about x=26,
+  // y=38, w=48, h=64 once mapped back to the 320x320 input frame.
+  EXPECT_NEAR(item.bbox.x, 26.0f, 1.0f);
+  EXPECT_NEAR(item.bbox.y, 38.0f, 1.0f);
+  EXPECT_NEAR(item.bbox.w, 48.0f, 1.0f);
+  EXPECT_NEAR(item.bbox.h, 64.0f, 1.0f);
+  ASSERT_EQ(item.keypoints.size(), static_cast<size_t>(num_keypoints));
+  // Keypoint k was written at (120 + k, 140 + 2k) in model space, so it should
+  // land near (60 + 0.5k, 70 + k) after scaling back to the input frame.
+  EXPECT_NEAR(item.keypoints[0].point.x, 60.0f, 1.0f);
+  EXPECT_NEAR(item.keypoints[0].point.y, 70.0f, 1.0f);
+  EXPECT_NEAR(item.keypoints[16].point.x, 68.0f, 1.0f);
+  EXPECT_NEAR(item.keypoints[16].point.y, 86.0f, 1.0f);
+
+  FramePtr forwarded;
+  ASSERT_TRUE(out->TryPop(forwarded));
+  EXPECT_EQ(forwarded.get(), frame.get());
+}
+
+TEST(PoseResultTest, SupportsPerTrackKeypoints) {
+  PoseResult result;
+  result.img_w = 1920;
+  result.img_h = 1080;
+  result.model_name = "pose_model";
+
+  PoseItem item;
+  item.track_id = 42;
+  item.score = 0.93f;
+  item.bbox = Rect{100.0f, 200.0f, 300.0f, 400.0f};
+  item.keypoints.push_back(PoseKeypoint{PosePoint2f{120.0f, 220.0f}, 0.99f});
+  item.keypoints.push_back(PoseKeypoint{PosePoint2f{140.0f, 260.0f}, 0.95f});
+  result.items.push_back(item);
+
+  ASSERT_EQ(result.items.size(), 1u);
+  EXPECT_EQ(result.items[0].track_id, 42);
+  ASSERT_EQ(result.items[0].keypoints.size(), 2u);
+  EXPECT_FLOAT_EQ(result.items[0].keypoints[0].point.x, 120.0f);
+  EXPECT_FLOAT_EQ(result.items[0].keypoints[1].score, 0.95f);
+}
+
+} // namespace
+} // namespace rk3588
diff --git a/tests/test_pose_assoc.cpp b/tests/test_pose_assoc.cpp
new file mode 100644
index 0000000..140b112
--- /dev/null
+++ b/tests/test_pose_assoc.cpp
@@ -0,0 +1,117 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "frame/frame.h"
+#include "node.h"
+#include "pose/pose_result.h"
+#include "utils/simple_json.h"
+#include "../plugins/pose_assoc/pose_assoc_node.h"
+
+namespace rk3588 {
+namespace {
+
+SimpleJson ParsePoseAssocConfig(const std::string& text) {
+  SimpleJson config;
+  std::string err;
+  const bool ok = ParseSimpleJson(text, config, err);
+  EXPECT_TRUE(ok);
+  return config;
+}
+
+TEST(PoseAssocNodeTest, AssociatesPoseItemsToTrackedDetections) {
+  PoseAssocNode node;
+  SimpleJson config = ParsePoseAssocConfig(R"({
+    "id": "pose_assoc0",
+    "min_iou": 0.1
+  })");
+
+  NodeContext ctx;
+  auto out = std::make_shared<BoundedQueue<FramePtr>>(4, QueueDropStrategy::DropOldest);
+  ctx.output_queues.push_back(out);
+
+  ASSERT_TRUE(node.Init(config, ctx));
+  ASSERT_TRUE(node.Start());
+
+  auto frame = std::make_shared<Frame>();
+  frame->det = std::make_shared<DetectionResult>();
+  frame->det->items.push_back(Detection{0, 0.95f, Rect{100.0f, 100.0f, 120.0f, 240.0f}, 7});
+  frame->det->items.push_back(Detection{0, 0.94f, Rect{320.0f, 120.0f, 120.0f, 240.0f}, 8});
+
+  frame->pose = std::make_shared<PoseResult>();
+  PoseItem pose0;
+  pose0.bbox = Rect{110.0f, 110.0f, 110.0f, 220.0f};
+  PoseItem pose1;
+  pose1.bbox = Rect{330.0f, 130.0f, 100.0f, 210.0f};
+  frame->pose->items.push_back(pose0);
+  frame->pose->items.push_back(pose1);
+
+  EXPECT_EQ(static_cast<int>(node.Process(frame)), static_cast<int>(NodeStatus::OK));
+  ASSERT_EQ(frame->pose->items.size(), 2u);
+  EXPECT_EQ(frame->pose->items[0].track_id, 7);
+  EXPECT_EQ(frame->pose->items[1].track_id, 8);
+
+  FramePtr forwarded;
+  ASSERT_TRUE(out->TryPop(forwarded));
+  EXPECT_EQ(forwarded.get(), frame.get());
+}
+
+TEST(PoseAssocNodeTest, UsesOneToOneAssignmentForOverlappingCandidates) {
+  PoseAssocNode node;
+  SimpleJson config = ParsePoseAssocConfig(R"({
+    "id": "pose_assoc0",
+    "min_iou": 0.05
+  })");
+
+  NodeContext ctx;
+  ASSERT_TRUE(node.Init(config, ctx));
+  ASSERT_TRUE(node.Start());
+
+  auto frame = std::make_shared<Frame>();
+  frame->det = std::make_shared<DetectionResult>();
+  frame->det->items.push_back(Detection{0, 0.95f, Rect{100.0f, 100.0f, 150.0f, 250.0f}, 11});
+  frame->det->items.push_back(Detection{0, 0.94f, Rect{130.0f, 120.0f, 150.0f, 250.0f}, 12});
+
+  frame->pose = std::make_shared<PoseResult>();
+  PoseItem pose0;
+  pose0.bbox = Rect{105.0f, 110.0f, 145.0f, 245.0f};
+  PoseItem pose1;
+  pose1.bbox = Rect{132.0f, 125.0f, 148.0f, 245.0f};
+  frame->pose->items.push_back(pose0);
+  frame->pose->items.push_back(pose1);
+
+  EXPECT_EQ(static_cast<int>(node.Process(frame)), static_cast<int>(NodeStatus::OK));
+  EXPECT_NE(frame->pose->items[0].track_id, frame->pose->items[1].track_id);
+  EXPECT_TRUE(frame->pose->items[0].track_id == 11 || frame->pose->items[0].track_id == 12);
+  EXPECT_TRUE(frame->pose->items[1].track_id == 11 || frame->pose->items[1].track_id == 12);
+}
+
+TEST(PoseAssocNodeTest, PassesThroughWhenPoseOrDetectionMissing) {
+  PoseAssocNode node;
+  SimpleJson config = ParsePoseAssocConfig(R"({
+    "id": "pose_assoc0"
+  })");
+
+  NodeContext ctx;
+  ASSERT_TRUE(node.Init(config, ctx));
+  ASSERT_TRUE(node.Start());
+
+  auto no_pose = std::make_shared<Frame>();
+  no_pose->det = std::make_shared<DetectionResult>();
+  no_pose->det->items.push_back(Detection{0, 0.9f, Rect{0.0f, 0.0f, 10.0f, 10.0f}, 1});
+  EXPECT_EQ(static_cast<int>(node.Process(no_pose)), static_cast<int>(NodeStatus::OK));
+  EXPECT_EQ(no_pose->pose, nullptr);
+
+  auto no_det = std::make_shared<Frame>();
+  no_det->pose = std::make_shared<PoseResult>();
+  PoseItem pose;
+  pose.bbox = Rect{0.0f, 0.0f, 10.0f, 10.0f};
+  no_det->pose->items.push_back(pose);
+  EXPECT_EQ(static_cast<int>(node.Process(no_det)), static_cast<int>(NodeStatus::OK));
+  ASSERT_EQ(no_det->pose->items.size(), 1u);
+  EXPECT_EQ(no_det->pose->items[0].track_id, -1);
+}
+
+} // namespace
+} // namespace rk3588