// OrangePi3588Media/plugins/ai_pose/ai_pose_node.cpp
// (viewer metadata: 467 lines, 17 KiB, C++)
#include "ai_pose_node.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "face/face_detection_utils.h"
#include "hw/i_infer_backend.h"
#include "pose/pose_result.h"
#include "utils/logger.h"
namespace rk3588 {
namespace {
using face_detection::HalfToFloat;
using face_detection::ResizeRgbBilinear;
constexpr int kPoseOutputCount = 4;
constexpr int kPoseLocChannels = 64;
constexpr int kPoseScoreChannel = 64;
constexpr int kDflBins = 16;
constexpr int kMaxPoseDetections = 32;
constexpr uint8_t kLetterboxValue = 114;
// Runtime configuration for the pose node, parsed from JSON by ParseConfig.
struct PoseConfig {
bool enabled = true; // when false the node passes frames through untouched
std::string model_path; // required when enabled=true
int model_input_w = 640; // model input width in pixels
int model_input_h = 640; // model input height in pixels
int expected_keypoints = 17; // keypoints per person (17 presumably = COCO layout -- TODO confirm)
float conf_thresh = 0.25f; // detection confidence threshold, must be in (0, 1)
float nms_thresh = 0.45f; // NMS IoU threshold, must be in (0, 1)
};
// Records how a frame was scaled and padded into the model input so that
// detections can be mapped back to source coordinates (see MapToSourceCoord).
struct LetterboxInfo {
float scale = 1.0f; // uniform resize factor applied to the source frame
float pad_x = 0.0f; // left padding in model-input pixels
float pad_y = 0.0f; // top padding in model-input pixels
};
// One raw detection surviving score filtering, in model-input coordinates.
struct PoseCandidate {
Rect bbox{};
float score = 0.0f; // sigmoid-activated confidence
int keypoint_index = -1; // flat grid-cell index into the keypoint tensor
};
// Parses the pose-node configuration into `out`, validating every field.
// On the first violated constraint, stores a human-readable reason in `err`
// and returns false; `out` keeps whatever was parsed so far.
bool ParseConfig(const SimpleJson& config, PoseConfig& out, std::string& err) {
  out.enabled = config.ValueOr<bool>("enabled", true);
  out.model_path = config.ValueOr<std::string>("model_path", "");
  out.model_input_w = config.ValueOr<int>("model_input_w", 640);
  out.model_input_h = config.ValueOr<int>("model_input_h", 640);
  out.expected_keypoints = config.ValueOr<int>("expected_keypoints", 17);
  out.conf_thresh = config.ValueOr<float>("conf_thresh", 0.25f);
  out.nms_thresh = config.ValueOr<float>("nms_thresh", 0.45f);
  // Small helper so each validation rule reads as a single guard clause.
  const auto fail = [&err](const char* reason) {
    err = reason;
    return false;
  };
  if (out.model_input_w <= 0 || out.model_input_h <= 0) {
    return fail("model_input_w and model_input_h must be positive");
  }
  if (out.expected_keypoints <= 0) {
    return fail("expected_keypoints must be positive");
  }
  if (out.conf_thresh <= 0.0f || out.conf_thresh >= 1.0f) {
    return fail("conf_thresh must be in (0, 1)");
  }
  if (out.nms_thresh <= 0.0f || out.nms_thresh >= 1.0f) {
    return fail("nms_thresh must be in (0, 1)");
  }
  if (out.enabled && out.model_path.empty()) {
    return fail("model_path is required when enabled=true");
  }
  return true;
}
// Fans `frame` out to every attached downstream queue, skipping null entries.
void PushToDownstream(const std::vector<std::shared_ptr<SpscQueue<FramePtr>>>& output_queues, const FramePtr& frame) {
  for (const auto& queue : output_queues) {
    if (!queue) continue;
    queue->Push(frame);
  }
}
// Logistic function: maps any real x into (0, 1).
inline float Sigmoid(float x) {
  const float e = std::exp(-x);
  return 1.0f / (1.0f + e);
}
// Inverse of Sigmoid (the logit). Only valid for y strictly inside (0, 1);
// callers pass the validated conf_thresh.
float Unsigmoid(float y) {
  const float inverse_odds = (1.0f / y) - 1.0f;
  return -std::log(inverse_odds);
}
// Clamps v into [lo, hi]; callers guarantee lo <= hi.
float ClampFloat(float v, float lo, float hi) {
  const float bounded_above = std::min(v, hi);
  return std::max(lo, bounded_above);
}
// Intersection-over-union of two axis-aligned rects given as (x, y, w, h).
// Returns 0 when the union area is non-positive (degenerate boxes).
float CalculateIoU(const Rect& lhs, const Rect& rhs) {
  const float ix1 = std::max(lhs.x, rhs.x);
  const float iy1 = std::max(lhs.y, rhs.y);
  const float ix2 = std::min(lhs.x + lhs.w, rhs.x + rhs.w);
  const float iy2 = std::min(lhs.y + lhs.h, rhs.y + rhs.h);
  const float overlap_w = std::max(0.0f, ix2 - ix1);
  const float overlap_h = std::max(0.0f, iy2 - iy1);
  const float overlap = overlap_w * overlap_h;
  const float total = lhs.w * lhs.h + rhs.w * rhs.h - overlap;
  if (total <= 0.0f) return 0.0f;
  return overlap / total;
}
// Reads the scalar at flat `index` from an inference output, converting the
// tensor's native element type to float. Quantized int8/uint8 values are
// dequantized as (raw - zero_point) * scale.
float ReadTensorValue(const InferOutput& output, size_t index) {
#if defined(RK3588_ENABLE_RKNN)
switch (output.type) {
case RKNN_TENSOR_FLOAT32: {
const auto* p = reinterpret_cast<const float*>(output.data.data());
return p[index];
}
case RKNN_TENSOR_FLOAT16: {
const auto* p = reinterpret_cast<const uint16_t*>(output.data.data());
return HalfToFloat(p[index]);
}
case RKNN_TENSOR_INT8: {
const auto* p = reinterpret_cast<const int8_t*>(output.data.data());
return (static_cast<float>(p[index]) - static_cast<float>(output.zp)) * output.scale;
}
case RKNN_TENSOR_UINT8: {
const auto* p = reinterpret_cast<const uint8_t*>(output.data.data());
return (static_cast<float>(p[index]) - static_cast<float>(output.zp)) * output.scale;
}
default:
break;
}
#endif
// Fallback (and the only path in non-RKNN builds): interpret data as float32.
// NOTE(review): an unhandled RKNN tensor type also lands here and would be
// misread as float32 -- confirm all deployed models use the types above.
const auto* p = reinterpret_cast<const float*>(output.data.data());
return p[index];
}
// Converts an RGB or BGR frame into a dst_w x dst_h letterboxed RGB buffer:
// aspect-preserving bilinear resize, centered on a gray (kLetterboxValue)
// canvas. Fills `letterbox` with the scale and padding needed to map
// detections back to source coordinates. Returns false on unsupported
// formats, degenerate dimensions, or a missing pixel pointer.
bool BuildLetterboxedRgbInput(const Frame& frame, int dst_w, int dst_h,
std::vector<uint8_t>& out, LetterboxInfo& letterbox) {
if (frame.width <= 0 || frame.height <= 0 || dst_w <= 0 || dst_h <= 0) return false;
if (frame.format != PixelFormat::RGB && frame.format != PixelFormat::BGR) return false;
// Prefer the plane pointer, falling back to the packed data pointer.
const uint8_t* src = frame.planes[0].data ? frame.planes[0].data : frame.data;
if (!src) return false;
// Assume tightly packed 3-byte pixels when no stride is provided.
const int src_stride = frame.planes[0].stride > 0 ? frame.planes[0].stride : frame.width * 3;
// Pre-fill the whole canvas with the letterbox gray value.
out.assign(static_cast<size_t>(dst_w) * static_cast<size_t>(dst_h) * 3, kLetterboxValue);
// Uniform scale that fits the frame inside the destination.
const float scale = std::min(static_cast<float>(dst_w) / static_cast<float>(frame.width),
static_cast<float>(dst_h) / static_cast<float>(frame.height));
const int resized_w = std::max(1, static_cast<int>(std::round(frame.width * scale)));
const int resized_h = std::max(1, static_cast<int>(std::round(frame.height * scale)));
// Center the resized image (any odd leftover pixel goes to the right/bottom).
const int pad_x = (dst_w - resized_w) / 2;
const int pad_y = (dst_h - resized_h) / 2;
std::vector<uint8_t> resized(static_cast<size_t>(resized_w) * static_cast<size_t>(resized_h) * 3);
// BGR input is swapped to RGB during the resize.
const bool swap_rb = (frame.format == PixelFormat::BGR);
ResizeRgbBilinear(src, frame.width, frame.height, src_stride, resized.data(), resized_w, resized_h, swap_rb);
// Copy each resized row into the padded canvas.
for (int y = 0; y < resized_h; ++y) {
uint8_t* dst_row = out.data() + (static_cast<size_t>(y + pad_y) * static_cast<size_t>(dst_w) + pad_x) * 3;
const uint8_t* src_row = resized.data() + static_cast<size_t>(y) * static_cast<size_t>(resized_w) * 3;
std::memcpy(dst_row, src_row, static_cast<size_t>(resized_w) * 3);
}
letterbox.scale = scale;
letterbox.pad_x = static_cast<float>(pad_x);
letterbox.pad_y = static_cast<float>(pad_y);
return true;
}
// Distribution Focal Loss decode: softmax over kDflBins logits for one box
// side of one grid cell, returning the expected bin index (a sub-cell
// distance in stride units). `channel_group` selects the side (0..3 =
// left/top/right/bottom); tensors are plane-major (channel * cells + cell).
float DecodeDfl(const InferOutput& output, int cell_index, int cells_per_head, int channel_group) {
  float logits[kDflBins]{};
  float peak = -std::numeric_limits<float>::infinity();
  for (int bin = 0; bin < kDflBins; ++bin) {
    const size_t offset = static_cast<size_t>((channel_group * kDflBins + bin) * cells_per_head + cell_index);
    logits[bin] = ReadTensorValue(output, offset);
    peak = std::max(peak, logits[bin]);
  }
  // Numerically stable softmax expectation (shift by the peak logit).
  float sum = 0.0f;
  float weighted = 0.0f;
  for (int bin = 0; bin < kDflBins; ++bin) {
    const float e = std::exp(logits[bin] - peak);
    sum += e;
    weighted += e * static_cast<float>(bin);
  }
  return sum > 0.0f ? (weighted / sum) : 0.0f;
}
std::vector<PoseCandidate> DecodePoseCandidates(const std::vector<InferOutput>& outputs,
const PoseConfig& config) {
std::vector<PoseCandidate> candidates;
if (outputs.size() < kPoseOutputCount) return candidates;
const float score_threshold = Unsigmoid(config.conf_thresh);
int keypoint_base_index = 0;
for (int head = 0; head < 3; ++head) {
const int stride = (head == 0) ? 8 : (head == 1 ? 16 : 32);
const int grid_h = config.model_input_h / stride;
const int grid_w = config.model_input_w / stride;
const int cells = grid_h * grid_w;
if (cells <= 0) continue;
const InferOutput& output = outputs[static_cast<size_t>(head)];
const size_t elem_count = sizeof(float) > 0 ? (output.data.size() / sizeof(float)) : 0;
#if defined(RK3588_ENABLE_RKNN)
size_t bytes_per_elem = 1;
switch (output.type) {
case RKNN_TENSOR_FLOAT32: bytes_per_elem = sizeof(float); break;
case RKNN_TENSOR_FLOAT16: bytes_per_elem = sizeof(uint16_t); break;
default: bytes_per_elem = 1; break;
}
const size_t actual_elem_count = bytes_per_elem > 0 ? (output.data.size() / bytes_per_elem) : 0;
#else
const size_t actual_elem_count = elem_count;
#endif
if (actual_elem_count < static_cast<size_t>(65 * cells)) {
keypoint_base_index += cells;
continue;
}
for (int cell = 0; cell < cells; ++cell) {
const size_t score_offset = static_cast<size_t>(kPoseScoreChannel * cells + cell);
const float score_logit = ReadTensorValue(output, score_offset);
if (score_logit < score_threshold) continue;
const float left = DecodeDfl(output, cell, cells, 0);
const float top = DecodeDfl(output, cell, cells, 1);
const float right = DecodeDfl(output, cell, cells, 2);
const float bottom = DecodeDfl(output, cell, cells, 3);
const int grid_x = cell % grid_w;
const int grid_y = cell / grid_w;
const float center_x = (static_cast<float>(grid_x) + 0.5f) * stride;
const float center_y = (static_cast<float>(grid_y) + 0.5f) * stride;
const float x1 = center_x - left * stride;
const float y1 = center_y - top * stride;
const float x2 = center_x + right * stride;
const float y2 = center_y + bottom * stride;
PoseCandidate candidate;
candidate.bbox.x = x1;
candidate.bbox.y = y1;
candidate.bbox.w = std::max(0.0f, x2 - x1);
candidate.bbox.h = std::max(0.0f, y2 - y1);
candidate.score = Sigmoid(score_logit);
candidate.keypoint_index = keypoint_base_index + cell;
candidates.push_back(candidate);
}
keypoint_base_index += cells;
}
std::sort(candidates.begin(), candidates.end(), [](const PoseCandidate& lhs, const PoseCandidate& rhs) {
return lhs.score > rhs.score;
});
std::vector<PoseCandidate> kept;
kept.reserve(candidates.size());
for (const auto& candidate : candidates) {
bool suppressed = false;
for (const auto& accepted : kept) {
if (CalculateIoU(candidate.bbox, accepted.bbox) > config.nms_thresh) {
suppressed = true;
break;
}
}
if (!suppressed) {
kept.push_back(candidate);
if (static_cast<int>(kept.size()) >= kMaxPoseDetections) break;
}
}
return kept;
}
// Maps a letterboxed model-space coordinate back into source-image space,
// clamping the result to [0, limit]. A non-positive scale (degenerate
// letterbox) yields 0.
float MapToSourceCoord(float value, float pad, float scale, int limit) {
  if (scale <= 0.0f) return 0.0f;
  const float unpadded = (value - pad) / scale;
  const float upper = static_cast<float>(std::max(0, limit));
  return std::max(0.0f, std::min(unpadded, upper));
}
// Assembles the final PoseResult: decodes box candidates from heads 0-2 and
// reads per-person keypoints from output 3, mapping everything from the
// letterboxed model space back into source-frame coordinates.
// The code indexes output 3 as [keypoint][x-plane | y-plane | score-plane][cell],
// i.e. three planes of `total_points` values per keypoint.
// Returns nullptr when outputs are missing or the keypoint tensor is too
// small; the caller (Process) substitutes an empty result in that case.
std::shared_ptr<PoseResult> BuildPoseResult(const std::vector<InferOutput>& outputs,
const PoseConfig& config,
int frame_w, int frame_h,
const LetterboxInfo& letterbox) {
if (outputs.size() < kPoseOutputCount) return nullptr;
const InferOutput& keypoint_output = outputs[3];
// Total grid cells across the three heads (stride 8, 16, 32).
const int total_points = (config.model_input_w / 8) * (config.model_input_h / 8) +
(config.model_input_w / 16) * (config.model_input_h / 16) +
(config.model_input_w / 32) * (config.model_input_h / 32);
// x, y and score per keypoint.
const int values_per_point = config.expected_keypoints * 3;
#if defined(RK3588_ENABLE_RKNN)
size_t kp_bytes_per_elem = 1;
switch (keypoint_output.type) {
case RKNN_TENSOR_FLOAT32: kp_bytes_per_elem = sizeof(float); break;
case RKNN_TENSOR_FLOAT16: kp_bytes_per_elem = sizeof(uint16_t); break;
default: kp_bytes_per_elem = 1; break;
}
#else
const size_t kp_bytes_per_elem = sizeof(float);
#endif
// Reject a keypoint tensor that cannot hold every cell's keypoints.
const size_t kp_elem_count = kp_bytes_per_elem > 0 ? (keypoint_output.data.size() / kp_bytes_per_elem) : 0;
if (kp_elem_count < static_cast<size_t>(values_per_point * total_points)) return nullptr;
auto result = std::make_shared<PoseResult>();
result->img_w = frame_w;
result->img_h = frame_h;
result->model_name = "yolov8n-pose";
const std::vector<PoseCandidate> candidates = DecodePoseCandidates(outputs, config);
result->items.reserve(candidates.size());
for (const auto& candidate : candidates) {
// Guard against a keypoint index outside the tensor.
if (candidate.keypoint_index < 0 || candidate.keypoint_index >= total_points) continue;
PoseItem item;
item.score = candidate.score;
item.track_id = -1;
// Map both bbox corners back to source space, then rebuild w/h.
item.bbox.x = MapToSourceCoord(candidate.bbox.x, letterbox.pad_x, letterbox.scale, frame_w);
item.bbox.y = MapToSourceCoord(candidate.bbox.y, letterbox.pad_y, letterbox.scale, frame_h);
const float x2 = MapToSourceCoord(candidate.bbox.x + candidate.bbox.w, letterbox.pad_x, letterbox.scale, frame_w);
const float y2 = MapToSourceCoord(candidate.bbox.y + candidate.bbox.h, letterbox.pad_y, letterbox.scale, frame_h);
item.bbox.w = std::max(0.0f, x2 - item.bbox.x);
item.bbox.h = std::max(0.0f, y2 - item.bbox.y);
item.keypoints.reserve(static_cast<size_t>(config.expected_keypoints));
for (int k = 0; k < config.expected_keypoints; ++k) {
// Planar offsets for keypoint k: x-plane, y-plane, score-plane.
const size_t x_offset = static_cast<size_t>(k * 3 * total_points + candidate.keypoint_index);
const size_t y_offset = static_cast<size_t>(k * 3 * total_points + total_points + candidate.keypoint_index);
const size_t s_offset = static_cast<size_t>(k * 3 * total_points + 2 * total_points + candidate.keypoint_index);
PoseKeypoint kp;
// NOTE(review): keypoint x/y are used as-is as model-input pixel
// coordinates (no stride multiply or sigmoid here) and the score is the
// raw tensor value -- confirm this matches the exported model's head.
kp.point.x = MapToSourceCoord(ReadTensorValue(keypoint_output, x_offset), letterbox.pad_x, letterbox.scale, frame_w);
kp.point.y = MapToSourceCoord(ReadTensorValue(keypoint_output, y_offset), letterbox.pad_y, letterbox.scale, frame_h);
kp.score = ReadTensorValue(keypoint_output, s_offset);
item.keypoints.push_back(kp);
}
result->items.push_back(std::move(item));
}
return result;
}
} // namespace
// Private state for AiPoseNode (pimpl idiom).
struct AiPoseNode::Impl {
PoseConfig config;
std::string init_err; // last config-parse error, kept for diagnostics
std::shared_ptr<IInferBackend> infer_backend; // shared inference backend from NodeContext
ModelHandle model_handle = kInvalidModelHandle; // valid only between Start() and Stop()
bool started = false;
std::vector<uint8_t> input_rgb; // reusable letterbox buffer, avoids per-frame allocation
};
// Pimpl construction; Impl is defined above in this translation unit, so the
// defaulted destructor must also live here.
AiPoseNode::AiPoseNode() : impl_(std::make_unique<Impl>()) {}
AiPoseNode::~AiPoseNode() = default;
// Instance id as configured via the "id" key (defaults to "ai_pose").
std::string AiPoseNode::Id() const {
return id_;
}
// Node-type string used by the registry (see REGISTER_NODE at file end).
std::string AiPoseNode::Type() const {
return "ai_pose";
}
// Parses configuration and captures the shared inference backend and the
// downstream queues from the context. Model loading is deferred to Start().
bool AiPoseNode::Init(const SimpleJson& config, const NodeContext& ctx) {
id_ = config.ValueOr<std::string>("id", "ai_pose");
if (!ParseConfig(config, impl_->config, impl_->init_err)) {
LogError("[ai_pose] invalid config: " + impl_->init_err);
return false;
}
impl_->infer_backend = ctx.infer_backend;
output_queues_ = ctx.output_queues;
return true;
}
// Loads the pose model via the shared inference backend. A disabled node
// "starts" successfully without touching the backend (pass-through mode).
bool AiPoseNode::Start() {
  if (!impl_->config.enabled) {
    impl_->started = true;
    return true;
  }
  if (!impl_->infer_backend) {
    LogError("[ai_pose] infer_backend is required when enabled=true");
    return false;
  }
  std::string load_err;
  impl_->model_handle = impl_->infer_backend->LoadModel(impl_->config.model_path, load_err);
  const bool loaded = impl_->model_handle != kInvalidModelHandle;
  if (!loaded) {
    LogError("[ai_pose] failed to load model: " + impl_->config.model_path + " err=" + load_err);
    return false;
  }
  impl_->started = true;
  return true;
}
// Releases the loaded model (if any) and marks the node stopped.
// Safe to call repeatedly or before Start().
void AiPoseNode::Stop() {
  const bool model_loaded = impl_->infer_backend && impl_->model_handle != kInvalidModelHandle;
  if (model_loaded) {
    impl_->infer_backend->UnloadModel(impl_->model_handle);
    impl_->model_handle = kInvalidModelHandle;
  }
  impl_->started = false;
}
// Runs pose inference on one frame, attaching the result as frame->pose.
// The frame is always forwarded downstream -- even on failure -- so the
// pipeline keeps flowing; failures only affect whether pose data is present.
NodeStatus AiPoseNode::Process(FramePtr frame) {
if (!frame) return NodeStatus::DROP;
// Disabled node: pure pass-through.
if (!impl_->config.enabled) {
PushToDownstream(output_queues_, frame);
return NodeStatus::OK;
}
// Misuse guard: Process() before a successful Start().
if (!impl_->started || !impl_->infer_backend || impl_->model_handle == kInvalidModelHandle) {
LogWarn("[ai_pose] process called before node is started");
PushToDownstream(output_queues_, frame);
return NodeStatus::ERROR;
}
LetterboxInfo letterbox;
// Letterbox into the reusable input buffer; unsupported pixel formats are
// forwarded without pose data rather than treated as errors.
if (!BuildLetterboxedRgbInput(*frame, impl_->config.model_input_w, impl_->config.model_input_h,
impl_->input_rgb, letterbox)) {
LogWarn("[ai_pose] only RGB/BGR frames are supported currently");
PushToDownstream(output_queues_, frame);
return NodeStatus::OK;
}
InferInput input;
input.data = impl_->input_rgb.data();
input.size = impl_->input_rgb.size();
input.width = impl_->config.model_input_w;
input.height = impl_->config.model_input_h;
input.is_nhwc = true; // packed RGB is HWC layout
input.dma_fd = -1; // CPU buffer: no DMA import
input.dma_offset = 0;
InferResult infer_result = impl_->infer_backend->Infer(impl_->model_handle, input);
if (!infer_result.success) {
LogWarn("[ai_pose] inference failed: " + infer_result.error);
PushToDownstream(output_queues_, frame);
return NodeStatus::OK;
}
frame->pose = BuildPoseResult(infer_result.outputs, impl_->config, frame->width, frame->height, letterbox);
// Guarantee downstream consumers always see a (possibly empty) PoseResult.
if (!frame->pose) {
frame->pose = std::make_shared<PoseResult>();
frame->pose->img_w = frame->width;
frame->pose->img_h = frame->height;
frame->pose->model_name = "yolov8n-pose";
}
PushToDownstream(output_queues_, frame);
return NodeStatus::OK;
}
#ifndef RK3588_TEST_BUILD
REGISTER_NODE(AiPoseNode, "ai_pose");
#endif
} // namespace rk3588