467 lines
17 KiB
C++
467 lines
17 KiB
C++
#include "ai_pose_node.h"
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <cstdint>
|
|
#include <cstring>
|
|
#include <limits>
|
|
#include <memory>
|
|
#include <set>
|
|
#include <string>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "face/face_detection_utils.h"
|
|
#include "hw/i_infer_backend.h"
|
|
#include "pose/pose_result.h"
|
|
#include "utils/logger.h"
|
|
|
|
namespace rk3588 {
|
|
namespace {
|
|
|
|
using face_detection::HalfToFloat;
|
|
using face_detection::ResizeRgbBilinear;
|
|
|
|
// Number of raw output tensors expected from the pose model
// (three detection heads + one keypoint tensor).
constexpr int kPoseOutputCount = 4;
// Box-regression channels per head: 4 sides x kDflBins DFL bins.
constexpr int kPoseLocChannels = 64;
// Channel index of the person/objectness score within each head tensor
// (it sits immediately after the kPoseLocChannels regression channels).
constexpr int kPoseScoreChannel = 64;
// Number of discrete bins in the DFL (Distribution Focal Loss) box encoding.
constexpr int kDflBins = 16;
// Hard cap on detections kept after NMS.
constexpr int kMaxPoseDetections = 32;
// Gray fill value used for letterbox padding (common YOLO convention).
constexpr uint8_t kLetterboxValue = 114;
|
|
// Runtime configuration for the pose-estimation node, parsed from JSON
// by ParseConfig(). Defaults match a 640x640 COCO-keypoint model.
struct PoseConfig {
  bool enabled = true;          // when false the node passes frames through untouched
  std::string model_path;       // path to the pose model file (required when enabled)
  int model_input_w = 640;      // model input width in pixels
  int model_input_h = 640;      // model input height in pixels
  int expected_keypoints = 17;  // keypoints per person (17 = COCO skeleton)
  float conf_thresh = 0.25f;    // detection confidence threshold, must lie in (0, 1)
  float nms_thresh = 0.45f;     // IoU threshold for NMS suppression, must lie in (0, 1)
};
|
|
|
|
// Records how a frame was scaled and padded to fit the model input,
// so detections can later be mapped back into source-frame coordinates.
struct LetterboxInfo {
  float scale = 1.0f;  // uniform scale applied to the source frame
  float pad_x = 0.0f;  // left padding (pixels) inside the model input
  float pad_y = 0.0f;  // top padding (pixels) inside the model input
};
|
|
|
|
// One decoded detection prior to NMS, in model-input (letterboxed) coordinates.
struct PoseCandidate {
  Rect bbox{};              // box in model-input pixel space
  float score = 0.0f;       // sigmoid-activated confidence
  int keypoint_index = -1;  // flat grid-cell index into the keypoint output tensor
};
|
|
|
|
// Reads the node configuration from JSON into `out` and validates it.
// Returns false and fills `err` with the first violated constraint.
bool ParseConfig(const SimpleJson& config, PoseConfig& out, std::string& err) {
  out.enabled = config.ValueOr<bool>("enabled", true);
  out.model_path = config.ValueOr<std::string>("model_path", "");
  out.model_input_w = config.ValueOr<int>("model_input_w", 640);
  out.model_input_h = config.ValueOr<int>("model_input_h", 640);
  out.expected_keypoints = config.ValueOr<int>("expected_keypoints", 17);
  out.conf_thresh = config.ValueOr<float>("conf_thresh", 0.25f);
  out.nms_thresh = config.ValueOr<float>("nms_thresh", 0.45f);

  // Small helper so each constraint reads as a single guard clause.
  const auto fail = [&err](const char* message) {
    err = message;
    return false;
  };

  if (out.model_input_w <= 0 || out.model_input_h <= 0) {
    return fail("model_input_w and model_input_h must be positive");
  }
  if (out.expected_keypoints <= 0) {
    return fail("expected_keypoints must be positive");
  }
  if (out.conf_thresh <= 0.0f || out.conf_thresh >= 1.0f) {
    return fail("conf_thresh must be in (0, 1)");
  }
  if (out.nms_thresh <= 0.0f || out.nms_thresh >= 1.0f) {
    return fail("nms_thresh must be in (0, 1)");
  }
  if (out.enabled && out.model_path.empty()) {
    return fail("model_path is required when enabled=true");
  }
  return true;
}
|
|
|
|
// Forwards `frame` to every non-null downstream queue.
void PushToDownstream(const std::vector<std::shared_ptr<SpscQueue<FramePtr>>>& output_queues, const FramePtr& frame) {
  for (const auto& queue : output_queues) {
    if (!queue) continue;
    queue->Push(frame);
  }
}
|
|
|
|
// Standard logistic function: maps any real logit into (0, 1).
inline float Sigmoid(float x) {
  const float neg_exp = std::exp(-x);
  return 1.0f / (1.0f + neg_exp);
}
|
|
|
|
// Inverse of Sigmoid (the logit function); only valid for y strictly in (0, 1).
float Unsigmoid(float y) {
  const float inverse = 1.0f / y;
  return -std::log(inverse - 1.0f);
}
|
|
|
|
// Clamps v into [lo, hi]; requires lo <= hi.
float ClampFloat(float v, float lo, float hi) {
  const float upper_bounded = std::min(v, hi);
  return std::max(lo, upper_bounded);
}
|
|
|
|
float CalculateIoU(const Rect& lhs, const Rect& rhs) {
|
|
const float x1 = std::max(lhs.x, rhs.x);
|
|
const float y1 = std::max(lhs.y, rhs.y);
|
|
const float x2 = std::min(lhs.x + lhs.w, rhs.x + rhs.w);
|
|
const float y2 = std::min(lhs.y + lhs.h, rhs.y + rhs.h);
|
|
const float inter_w = std::max(0.0f, x2 - x1);
|
|
const float inter_h = std::max(0.0f, y2 - y1);
|
|
const float inter = inter_w * inter_h;
|
|
const float union_area = lhs.w * lhs.h + rhs.w * rhs.h - inter;
|
|
return union_area <= 0.0f ? 0.0f : (inter / union_area);
|
|
}
|
|
|
|
// Reads element `index` from an inference output tensor as a float.
// When built with RKNN support the tensor's declared element type drives
// the conversion, including affine dequantization of INT8/UINT8 tensors.
// For unhandled types (and in non-RKNN builds) the buffer is assumed to
// hold raw float32 data.
// NOTE(review): `index` is an element index, not a byte offset — callers
// must ensure it is within the tensor's element count.
float ReadTensorValue(const InferOutput& output, size_t index) {
#if defined(RK3588_ENABLE_RKNN)
  switch (output.type) {
    case RKNN_TENSOR_FLOAT32: {
      const auto* p = reinterpret_cast<const float*>(output.data.data());
      return p[index];
    }
    case RKNN_TENSOR_FLOAT16: {
      const auto* p = reinterpret_cast<const uint16_t*>(output.data.data());
      return HalfToFloat(p[index]);
    }
    case RKNN_TENSOR_INT8: {
      // Affine dequantization: real = (quantized - zero_point) * scale.
      const auto* p = reinterpret_cast<const int8_t*>(output.data.data());
      return (static_cast<float>(p[index]) - static_cast<float>(output.zp)) * output.scale;
    }
    case RKNN_TENSOR_UINT8: {
      const auto* p = reinterpret_cast<const uint8_t*>(output.data.data());
      return (static_cast<float>(p[index]) - static_cast<float>(output.zp)) * output.scale;
    }
    default:
      break;
  }
#endif
  // Fallback: treat the buffer as float32.
  const auto* p = reinterpret_cast<const float*>(output.data.data());
  return p[index];
}
|
|
|
|
bool BuildLetterboxedRgbInput(const Frame& frame, int dst_w, int dst_h,
|
|
std::vector<uint8_t>& out, LetterboxInfo& letterbox) {
|
|
if (frame.width <= 0 || frame.height <= 0 || dst_w <= 0 || dst_h <= 0) return false;
|
|
if (frame.format != PixelFormat::RGB && frame.format != PixelFormat::BGR) return false;
|
|
|
|
const uint8_t* src = frame.planes[0].data ? frame.planes[0].data : frame.data;
|
|
if (!src) return false;
|
|
const int src_stride = frame.planes[0].stride > 0 ? frame.planes[0].stride : frame.width * 3;
|
|
|
|
out.assign(static_cast<size_t>(dst_w) * static_cast<size_t>(dst_h) * 3, kLetterboxValue);
|
|
|
|
const float scale = std::min(static_cast<float>(dst_w) / static_cast<float>(frame.width),
|
|
static_cast<float>(dst_h) / static_cast<float>(frame.height));
|
|
const int resized_w = std::max(1, static_cast<int>(std::round(frame.width * scale)));
|
|
const int resized_h = std::max(1, static_cast<int>(std::round(frame.height * scale)));
|
|
const int pad_x = (dst_w - resized_w) / 2;
|
|
const int pad_y = (dst_h - resized_h) / 2;
|
|
|
|
std::vector<uint8_t> resized(static_cast<size_t>(resized_w) * static_cast<size_t>(resized_h) * 3);
|
|
const bool swap_rb = (frame.format == PixelFormat::BGR);
|
|
ResizeRgbBilinear(src, frame.width, frame.height, src_stride, resized.data(), resized_w, resized_h, swap_rb);
|
|
|
|
for (int y = 0; y < resized_h; ++y) {
|
|
uint8_t* dst_row = out.data() + (static_cast<size_t>(y + pad_y) * static_cast<size_t>(dst_w) + pad_x) * 3;
|
|
const uint8_t* src_row = resized.data() + static_cast<size_t>(y) * static_cast<size_t>(resized_w) * 3;
|
|
std::memcpy(dst_row, src_row, static_cast<size_t>(resized_w) * 3);
|
|
}
|
|
|
|
letterbox.scale = scale;
|
|
letterbox.pad_x = static_cast<float>(pad_x);
|
|
letterbox.pad_y = static_cast<float>(pad_y);
|
|
return true;
|
|
}
|
|
|
|
float DecodeDfl(const InferOutput& output, int cell_index, int cells_per_head, int channel_group) {
|
|
float max_value = -std::numeric_limits<float>::infinity();
|
|
float logits[kDflBins]{};
|
|
for (int i = 0; i < kDflBins; ++i) {
|
|
const size_t offset = static_cast<size_t>((channel_group * kDflBins + i) * cells_per_head + cell_index);
|
|
logits[i] = ReadTensorValue(output, offset);
|
|
max_value = std::max(max_value, logits[i]);
|
|
}
|
|
|
|
float denom = 0.0f;
|
|
float numer = 0.0f;
|
|
for (int i = 0; i < kDflBins; ++i) {
|
|
const float e = std::exp(logits[i] - max_value);
|
|
denom += e;
|
|
numer += e * static_cast<float>(i);
|
|
}
|
|
return denom > 0.0f ? (numer / denom) : 0.0f;
|
|
}
|
|
|
|
std::vector<PoseCandidate> DecodePoseCandidates(const std::vector<InferOutput>& outputs,
|
|
const PoseConfig& config) {
|
|
std::vector<PoseCandidate> candidates;
|
|
if (outputs.size() < kPoseOutputCount) return candidates;
|
|
|
|
const float score_threshold = Unsigmoid(config.conf_thresh);
|
|
int keypoint_base_index = 0;
|
|
|
|
for (int head = 0; head < 3; ++head) {
|
|
const int stride = (head == 0) ? 8 : (head == 1 ? 16 : 32);
|
|
const int grid_h = config.model_input_h / stride;
|
|
const int grid_w = config.model_input_w / stride;
|
|
const int cells = grid_h * grid_w;
|
|
if (cells <= 0) continue;
|
|
|
|
const InferOutput& output = outputs[static_cast<size_t>(head)];
|
|
const size_t elem_count = sizeof(float) > 0 ? (output.data.size() / sizeof(float)) : 0;
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
size_t bytes_per_elem = 1;
|
|
switch (output.type) {
|
|
case RKNN_TENSOR_FLOAT32: bytes_per_elem = sizeof(float); break;
|
|
case RKNN_TENSOR_FLOAT16: bytes_per_elem = sizeof(uint16_t); break;
|
|
default: bytes_per_elem = 1; break;
|
|
}
|
|
const size_t actual_elem_count = bytes_per_elem > 0 ? (output.data.size() / bytes_per_elem) : 0;
|
|
#else
|
|
const size_t actual_elem_count = elem_count;
|
|
#endif
|
|
if (actual_elem_count < static_cast<size_t>(65 * cells)) {
|
|
keypoint_base_index += cells;
|
|
continue;
|
|
}
|
|
|
|
for (int cell = 0; cell < cells; ++cell) {
|
|
const size_t score_offset = static_cast<size_t>(kPoseScoreChannel * cells + cell);
|
|
const float score_logit = ReadTensorValue(output, score_offset);
|
|
if (score_logit < score_threshold) continue;
|
|
|
|
const float left = DecodeDfl(output, cell, cells, 0);
|
|
const float top = DecodeDfl(output, cell, cells, 1);
|
|
const float right = DecodeDfl(output, cell, cells, 2);
|
|
const float bottom = DecodeDfl(output, cell, cells, 3);
|
|
|
|
const int grid_x = cell % grid_w;
|
|
const int grid_y = cell / grid_w;
|
|
const float center_x = (static_cast<float>(grid_x) + 0.5f) * stride;
|
|
const float center_y = (static_cast<float>(grid_y) + 0.5f) * stride;
|
|
const float x1 = center_x - left * stride;
|
|
const float y1 = center_y - top * stride;
|
|
const float x2 = center_x + right * stride;
|
|
const float y2 = center_y + bottom * stride;
|
|
|
|
PoseCandidate candidate;
|
|
candidate.bbox.x = x1;
|
|
candidate.bbox.y = y1;
|
|
candidate.bbox.w = std::max(0.0f, x2 - x1);
|
|
candidate.bbox.h = std::max(0.0f, y2 - y1);
|
|
candidate.score = Sigmoid(score_logit);
|
|
candidate.keypoint_index = keypoint_base_index + cell;
|
|
candidates.push_back(candidate);
|
|
}
|
|
|
|
keypoint_base_index += cells;
|
|
}
|
|
|
|
std::sort(candidates.begin(), candidates.end(), [](const PoseCandidate& lhs, const PoseCandidate& rhs) {
|
|
return lhs.score > rhs.score;
|
|
});
|
|
|
|
std::vector<PoseCandidate> kept;
|
|
kept.reserve(candidates.size());
|
|
for (const auto& candidate : candidates) {
|
|
bool suppressed = false;
|
|
for (const auto& accepted : kept) {
|
|
if (CalculateIoU(candidate.bbox, accepted.bbox) > config.nms_thresh) {
|
|
suppressed = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!suppressed) {
|
|
kept.push_back(candidate);
|
|
if (static_cast<int>(kept.size()) >= kMaxPoseDetections) break;
|
|
}
|
|
}
|
|
return kept;
|
|
}
|
|
|
|
// Maps a coordinate from model-input (letterboxed) space back into the
// source frame, clamping to [0, limit]. Returns 0 for non-positive scale.
float MapToSourceCoord(float value, float pad, float scale, int limit) {
  if (scale <= 0.0f) return 0.0f;
  const float unpadded = (value - pad) / scale;
  const float upper = static_cast<float>(std::max(0, limit));
  return std::clamp(unpadded, 0.0f, upper);
}
|
|
|
|
// Builds a PoseResult from the raw model outputs: decodes candidates via
// DecodePoseCandidates(), then maps boxes and keypoints back into
// source-frame coordinates using the letterbox info.
// Returns nullptr when the outputs are missing or the keypoint tensor is
// smaller than expected; callers treat that as "no result".
std::shared_ptr<PoseResult> BuildPoseResult(const std::vector<InferOutput>& outputs,
                                            const PoseConfig& config,
                                            int frame_w, int frame_h,
                                            const LetterboxInfo& letterbox) {
  if (outputs.size() < kPoseOutputCount) return nullptr;

  // Layout: outputs[0..2] are the detection heads; outputs[3] holds the
  // keypoints for every grid cell across all three heads.
  const InferOutput& keypoint_output = outputs[3];
  // Total grid cells over strides 8/16/32 — must match the keypoint_index
  // numbering produced by DecodePoseCandidates().
  const int total_points = (config.model_input_w / 8) * (config.model_input_h / 8) +
                           (config.model_input_w / 16) * (config.model_input_h / 16) +
                           (config.model_input_w / 32) * (config.model_input_h / 32);
  const int values_per_point = config.expected_keypoints * 3;  // x, y, score per keypoint
#if defined(RK3588_ENABLE_RKNN)
  size_t kp_bytes_per_elem = 1;  // quantized tensors: 1 byte per element
  switch (keypoint_output.type) {
    case RKNN_TENSOR_FLOAT32: kp_bytes_per_elem = sizeof(float); break;
    case RKNN_TENSOR_FLOAT16: kp_bytes_per_elem = sizeof(uint16_t); break;
    default: kp_bytes_per_elem = 1; break;
  }
#else
  const size_t kp_bytes_per_elem = sizeof(float);
#endif
  const size_t kp_elem_count = kp_bytes_per_elem > 0 ? (keypoint_output.data.size() / kp_bytes_per_elem) : 0;
  if (kp_elem_count < static_cast<size_t>(values_per_point * total_points)) return nullptr;

  auto result = std::make_shared<PoseResult>();
  result->img_w = frame_w;
  result->img_h = frame_h;
  result->model_name = "yolov8n-pose";

  const std::vector<PoseCandidate> candidates = DecodePoseCandidates(outputs, config);
  result->items.reserve(candidates.size());
  for (const auto& candidate : candidates) {
    // Skip candidates whose cell index falls outside the keypoint tensor.
    if (candidate.keypoint_index < 0 || candidate.keypoint_index >= total_points) continue;

    PoseItem item;
    item.score = candidate.score;
    item.track_id = -1;  // tracking id is assigned downstream, if at all
    // Map the letterboxed box corners back into source-frame pixels; width
    // and height are recomputed from the clamped corners.
    item.bbox.x = MapToSourceCoord(candidate.bbox.x, letterbox.pad_x, letterbox.scale, frame_w);
    item.bbox.y = MapToSourceCoord(candidate.bbox.y, letterbox.pad_y, letterbox.scale, frame_h);
    const float x2 = MapToSourceCoord(candidate.bbox.x + candidate.bbox.w, letterbox.pad_x, letterbox.scale, frame_w);
    const float y2 = MapToSourceCoord(candidate.bbox.y + candidate.bbox.h, letterbox.pad_y, letterbox.scale, frame_h);
    item.bbox.w = std::max(0.0f, x2 - item.bbox.x);
    item.bbox.h = std::max(0.0f, y2 - item.bbox.y);

    // Keypoint tensor layout (as addressed below): channel-major,
    // [k*3 + {x, y, score}][cell], with coordinates apparently already in
    // model-input pixels. NOTE(review): the keypoint score is stored raw —
    // confirm whether the exported model applies sigmoid to it.
    item.keypoints.reserve(static_cast<size_t>(config.expected_keypoints));
    for (int k = 0; k < config.expected_keypoints; ++k) {
      const size_t x_offset = static_cast<size_t>(k * 3 * total_points + candidate.keypoint_index);
      const size_t y_offset = static_cast<size_t>(k * 3 * total_points + total_points + candidate.keypoint_index);
      const size_t s_offset = static_cast<size_t>(k * 3 * total_points + 2 * total_points + candidate.keypoint_index);

      PoseKeypoint kp;
      kp.point.x = MapToSourceCoord(ReadTensorValue(keypoint_output, x_offset), letterbox.pad_x, letterbox.scale, frame_w);
      kp.point.y = MapToSourceCoord(ReadTensorValue(keypoint_output, y_offset), letterbox.pad_y, letterbox.scale, frame_h);
      kp.score = ReadTensorValue(keypoint_output, s_offset);
      item.keypoints.push_back(kp);
    }
    result->items.push_back(std::move(item));
  }

  return result;
}
|
|
|
|
} // namespace
|
|
|
|
// Private state for AiPoseNode (pimpl idiom).
struct AiPoseNode::Impl {
  PoseConfig config;                               // validated node configuration
  std::string init_err;                            // last config-parse error message
  std::shared_ptr<IInferBackend> infer_backend;    // inference backend captured in Init()
  ModelHandle model_handle = kInvalidModelHandle;  // loaded pose model, if any
  bool started = false;                            // set by Start(), cleared by Stop()
  std::vector<uint8_t> input_rgb;                  // reusable letterboxed input buffer
};
|
|
|
|
AiPoseNode::AiPoseNode() : impl_(std::make_unique<Impl>()) {}

// Defaulted here (not in the header) so Impl is a complete type for unique_ptr.
AiPoseNode::~AiPoseNode() = default;
|
|
|
|
// Returns the node's instance id (set from config in Init, default "ai_pose").
std::string AiPoseNode::Id() const {
  return id_;
}
|
|
|
|
// Returns the node's type string used for factory registration.
std::string AiPoseNode::Type() const {
  return "ai_pose";
}
|
|
|
|
bool AiPoseNode::Init(const SimpleJson& config, const NodeContext& ctx) {
|
|
id_ = config.ValueOr<std::string>("id", "ai_pose");
|
|
if (!ParseConfig(config, impl_->config, impl_->init_err)) {
|
|
LogError("[ai_pose] invalid config: " + impl_->init_err);
|
|
return false;
|
|
}
|
|
impl_->infer_backend = ctx.infer_backend;
|
|
output_queues_ = ctx.output_queues;
|
|
return true;
|
|
}
|
|
|
|
bool AiPoseNode::Start() {
|
|
if (!impl_->config.enabled) {
|
|
impl_->started = true;
|
|
return true;
|
|
}
|
|
if (!impl_->infer_backend) {
|
|
LogError("[ai_pose] infer_backend is required when enabled=true");
|
|
return false;
|
|
}
|
|
|
|
std::string err;
|
|
impl_->model_handle = impl_->infer_backend->LoadModel(impl_->config.model_path, err);
|
|
if (impl_->model_handle == kInvalidModelHandle) {
|
|
LogError("[ai_pose] failed to load model: " + impl_->config.model_path + " err=" + err);
|
|
return false;
|
|
}
|
|
|
|
impl_->started = true;
|
|
return true;
|
|
}
|
|
|
|
void AiPoseNode::Stop() {
|
|
if (impl_->infer_backend && impl_->model_handle != kInvalidModelHandle) {
|
|
impl_->infer_backend->UnloadModel(impl_->model_handle);
|
|
impl_->model_handle = kInvalidModelHandle;
|
|
}
|
|
impl_->started = false;
|
|
}
|
|
|
|
// Runs pose inference on the frame and attaches the result as frame->pose.
// Best-effort stage: the frame is always forwarded downstream, even when
// preprocessing or inference fails (only a null frame is dropped).
NodeStatus AiPoseNode::Process(FramePtr frame) {
  if (!frame) return NodeStatus::DROP;

  // Disabled node: pure pass-through.
  if (!impl_->config.enabled) {
    PushToDownstream(output_queues_, frame);
    return NodeStatus::OK;
  }
  // Misuse guard: Process() before a successful Start() forwards the frame
  // but reports ERROR.
  if (!impl_->started || !impl_->infer_backend || impl_->model_handle == kInvalidModelHandle) {
    LogWarn("[ai_pose] process called before node is started");
    PushToDownstream(output_queues_, frame);
    return NodeStatus::ERROR;
  }

  // Letterbox the frame into the model's input resolution, reusing the
  // persistent input_rgb buffer to avoid per-frame allocation.
  LetterboxInfo letterbox;
  if (!BuildLetterboxedRgbInput(*frame, impl_->config.model_input_w, impl_->config.model_input_h,
                                impl_->input_rgb, letterbox)) {
    LogWarn("[ai_pose] only RGB/BGR frames are supported currently");
    PushToDownstream(output_queues_, frame);
    return NodeStatus::OK;
  }

  InferInput input;
  input.data = impl_->input_rgb.data();
  input.size = impl_->input_rgb.size();
  input.width = impl_->config.model_input_w;
  input.height = impl_->config.model_input_h;
  input.is_nhwc = true;  // letterboxed buffer is packed interleaved RGB (HWC)
  // NOTE(review): dma_fd = -1 presumably signals a plain CPU buffer rather
  // than a DMA-backed one — confirm against the IInferBackend contract.
  input.dma_fd = -1;
  input.dma_offset = 0;

  InferResult infer_result = impl_->infer_backend->Infer(impl_->model_handle, input);
  if (!infer_result.success) {
    LogWarn("[ai_pose] inference failed: " + infer_result.error);
    PushToDownstream(output_queues_, frame);
    return NodeStatus::OK;
  }

  frame->pose = BuildPoseResult(infer_result.outputs, impl_->config, frame->width, frame->height, letterbox);
  // Guarantee downstream consumers always see a non-null (possibly empty)
  // pose result once this node has run.
  if (!frame->pose) {
    frame->pose = std::make_shared<PoseResult>();
    frame->pose->img_w = frame->width;
    frame->pose->img_h = frame->height;
    frame->pose->model_name = "yolov8n-pose";
  }

  PushToDownstream(output_queues_, frame);
  return NodeStatus::OK;
}
|
|
|
|
// Register the node with the pipeline factory under type string "ai_pose".
// Skipped in test builds, where the registry is not linked.
#ifndef RK3588_TEST_BUILD
REGISTER_NODE(AiPoseNode, "ai_pose");
#endif
|
|
|
|
} // namespace rk3588
|