添加 letterbox 模式:可保持目标宽高比、避免变形,并通过 FrameTransformMeta 自动处理坐标映射。
This commit is contained in:
parent
d6880498f9
commit
ebd9cd687e
@ -31,6 +31,7 @@
|
||||
"dst_h": 768,
|
||||
"dst_format": "rgb",
|
||||
"dst_packed": true,
|
||||
"resize_mode": "letterbox",
|
||||
"keep_ratio": false,
|
||||
"rga_gate": "cam_ppe11_detection",
|
||||
"use_rga": true
|
||||
|
||||
@ -44,6 +44,19 @@ struct DetectionResult {
|
||||
std::string model_name;
|
||||
};
|
||||
|
||||
// Describes how a frame was geometrically transformed (e.g. resized) so that
// coordinates in the transformed image can be mapped back to the source image.
// Consumers in the detection code map back with: src = (dst - pad) / scale.
struct FrameTransformMeta {
    bool valid{false};      // Consumers ignore every other field unless this is true.
    bool letterbox{false};  // presumably set when aspect-preserving padding was applied — confirm with producer
    int src_w{0};           // Source image width; used as the output coordinate extent.
    int src_h{0};           // Source image height; used as the output coordinate extent.
    int dst_w{0};           // Transformed image width (not read by the detection code shown here).
    int dst_h{0};           // Transformed image height (not read by the detection code shown here).
    float scale_x{1.0f};    // Horizontal source -> destination scale factor.
    float scale_y{1.0f};    // Vertical source -> destination scale factor.
    float pad_x{0.0f};      // Horizontal offset subtracted from destination x before un-scaling.
    float pad_y{0.0f};      // Vertical offset subtracted from destination y before un-scaling.
};
|
||||
|
||||
struct FramePlane {
|
||||
uint8_t* data = nullptr;
|
||||
int stride = 0; // bytes per row
|
||||
@ -76,6 +89,7 @@ struct Frame {
|
||||
// Face recognition pipeline meta (kept separate from user_meta to avoid conflicts with publish).
|
||||
std::shared_ptr<FaceDetResult> face_det;
|
||||
std::shared_ptr<FaceRecogResult> face_recog;
|
||||
std::shared_ptr<FrameTransformMeta> transform_meta;
|
||||
std::shared_ptr<void> user_meta;
|
||||
|
||||
// Returns buffer->DmaFd() when `buffer` is set; otherwise falls back to the `dma_fd` member.
int DmaFd() const { return buffer ? buffer->DmaFd() : dma_fd; }
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <thread>
|
||||
@ -33,6 +34,8 @@ const int kAnchor1[6] = {30, 61, 62, 45, 59, 119};
|
||||
// Six anchor values; presumably three (width, height) pairs for one YOLOv5 detection head — TODO confirm against ProcessFeatureMapV5.
const int kAnchor2[6] = {116, 90, 156, 198, 373, 326};
|
||||
|
||||
// YOLO model family handled by this node; results are labelled "yolov5" for V5 and "yolov8" otherwise.
enum class YoloVersion { V5, V8 };
|
||||
// Interpretation of the four raw box values of a YOLOv8 output: centre + size (CxCyWh),
// two corners (XyXy), or top-left + size (XyWh). Auto lets the decoder pick per box by scoring all three.
enum class V8BoxFormat { Auto, CxCyWh, XyXy, XyWh };
|
||||
// Whether YOLOv8 class scores still need a sigmoid: None = never, Sigmoid = always,
// Auto = decide at runtime from the observed value range of the class outputs.
enum class V8ClsActivation { Auto, None, Sigmoid };
|
||||
|
||||
const char* kCocoLabels[kObjClassNum] = {
|
||||
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
|
||||
@ -51,6 +54,67 @@ inline int Clamp(float val, int min_val, int max_val) {
|
||||
return val > min_val ? (val < max_val ? static_cast<int>(val) : max_val) : min_val;
|
||||
}
|
||||
|
||||
// Everything needed to map a box from model-input coordinates to output
// (reported) image coordinates.
struct DetCoordContext {
    bool has_transform{false};     // true: use scale_*/pad_*; false: use fallback_scale_*.
    int out_w{0};                  // Width of the output coordinate space (boxes are clamped to it).
    int out_h{0};                  // Height of the output coordinate space (boxes are clamped to it).
    float scale_x{1.0f};           // Output -> model-input horizontal scale (transform path).
    float scale_y{1.0f};           // Output -> model-input vertical scale (transform path).
    float pad_x{0.0f};             // Horizontal offset removed before un-scaling (transform path).
    float pad_y{0.0f};             // Vertical offset removed before un-scaling (transform path).
    float fallback_scale_w{1.0f};  // model_input_w / frame width, used without a transform.
    float fallback_scale_h{1.0f};  // model_input_h / frame height, used without a transform.
};
|
||||
|
||||
// Builds the coordinate-mapping context for one frame.
//
// The fallback (plain stretch) scales and the frame's own size are always
// filled in. When the frame carries a valid transform_meta with a positive
// source size and scales above 1e-6, that transform takes over and the output
// coordinate space becomes the transform's source image size.
DetCoordContext BuildDetCoordContext(const Frame& frame, int model_input_w, int model_input_h) {
    DetCoordContext ctx{};
    ctx.out_w = frame.width;
    ctx.out_h = frame.height;
    if (frame.width > 0) {
        ctx.fallback_scale_w = static_cast<float>(model_input_w) / frame.width;
    } else {
        ctx.fallback_scale_w = 1.0f;
    }
    if (frame.height > 0) {
        ctx.fallback_scale_h = static_cast<float>(model_input_h) / frame.height;
    } else {
        ctx.fallback_scale_h = 1.0f;
    }

    const auto& meta = frame.transform_meta;
    if (!meta || !meta->valid) {
        return ctx;
    }
    const bool has_source_size = meta->src_w > 0 && meta->src_h > 0;
    const bool has_usable_scale = meta->scale_x > 1e-6f && meta->scale_y > 1e-6f;
    if (!has_source_size || !has_usable_scale) {
        return ctx;
    }

    ctx.has_transform = true;
    ctx.out_w = meta->src_w;
    ctx.out_h = meta->src_h;
    ctx.scale_x = meta->scale_x;
    ctx.scale_y = meta->scale_y;
    ctx.pad_x = meta->pad_x;
    ctx.pad_y = meta->pad_y;
    return ctx;
}
|
||||
|
||||
// Maps a box (top-left x/y plus width/height) from model-input coordinates to
// output image coordinates and clips it to [0, out_w] x [0, out_h].
//
// With a transform the mapping is out = (in - pad) / scale; otherwise the
// plain stretch scales are used. The result holds integer-valued floats.
//
// A box that starts left of / above the image (typical for detections that
// touch the letterbox padding) has its size reduced by the clipped amount, so
// the right/bottom edge stays where the model put it instead of shifting
// outward when the origin is clamped to 0.
Rect DecodeToOutputRect(float x, float y, float w, float h, const DetCoordContext& ctx) {
    const float sx = ctx.has_transform ? ctx.scale_x : ctx.fallback_scale_w;
    const float sy = ctx.has_transform ? ctx.scale_y : ctx.fallback_scale_h;
    const float px = ctx.has_transform ? ctx.pad_x : 0.0f;
    const float py = ctx.has_transform ? ctx.pad_y : 0.0f;

    float ox = (x - px) / sx;
    float oy = (y - py) / sy;
    float ow = w / sx;
    float oh = h / sy;

    // Clip against the left/top border while keeping the far edge fixed.
    if (ox < 0.0f) {
        ow += ox;
        ox = 0.0f;
    }
    if (oy < 0.0f) {
        oh += oy;
        oy = 0.0f;
    }

    Rect r{};
    const int out_w = std::max(1, ctx.out_w);
    const int out_h = std::max(1, ctx.out_h);
    r.x = static_cast<float>(Clamp(static_cast<int>(ox), 0, out_w));
    r.y = static_cast<float>(Clamp(static_cast<int>(oy), 0, out_h));
    // A size that went negative after clipping collapses to 0 here.
    r.w = static_cast<float>(Clamp(static_cast<int>(ow), 0, out_w - static_cast<int>(r.x)));
    r.h = static_cast<float>(Clamp(static_cast<int>(oh), 0, out_h - static_cast<int>(r.y)));
    return r;
}
|
||||
|
||||
// Clamps `val` to [min_val, max_val] and truncates the result to int32_t.
// NaN is mapped to `min_val`: it would otherwise fail both range checks and
// reach the float-to-integer cast, which is undefined behaviour for NaN.
inline int32_t ClipFloat(float val, float min_val, float max_val) {
    // !(val > min_val) equals (val <= min_val) for ordinary values and is also true for NaN.
    if (!(val > min_val)) {
        return static_cast<int32_t>(min_val);
    }
    return static_cast<int32_t>(val >= max_val ? max_val : val);
}
|
||||
@ -64,39 +128,29 @@ inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) {
|
||||
return (static_cast<float>(qnt) - static_cast<float>(zp)) * scale;
|
||||
}
|
||||
|
||||
// Logistic sigmoid: maps a logit to (0, 1) as 1 / (1 + e^-x).
inline float Sigmoid(float x) {
    return 1.0f / (1.0f + std::exp(-x));
}
|
||||
|
||||
// Converts an IEEE 754 half-precision value (1 sign, 5 exponent, 10 mantissa
// bits) to float.
//
// The value is rebuilt arithmetically with std::ldexp rather than by
// assembling float bits, so zero, subnormal, normal, infinity and NaN inputs
// each take a short, explicit path. NaN payloads are not preserved.
inline float Fp16ToFp32(uint16_t h) {
    const bool negative = (h & 0x8000u) != 0;
    const int exponent = (h >> 10) & 0x1F;
    const int mantissa = h & 0x03FF;

    float magnitude = 0.0f;
    if (exponent == 0) {
        // Zero or subnormal: mantissa * 2^-24 (yields exactly 0 for mantissa == 0).
        magnitude = std::ldexp(static_cast<float>(mantissa), -24);
    } else if (exponent == 0x1F) {
        // All-ones exponent: infinity when the mantissa is zero, NaN otherwise.
        magnitude = (mantissa == 0) ? std::numeric_limits<float>::infinity()
                                    : std::numeric_limits<float>::quiet_NaN();
    } else {
        // Normal: (1024 + mantissa) * 2^(exponent - 25), i.e. 1.m * 2^(exponent - 15).
        magnitude = std::ldexp(static_cast<float>(mantissa + 1024), exponent - 25);
    }
    // Negating 0.0f yields -0.0f, so signed zero is preserved.
    return negative ? -magnitude : magnitude;
}
|
||||
|
||||
float CalculateIoU(float x1_min, float y1_min, float x1_max, float y1_max,
|
||||
@ -212,18 +266,239 @@ int ProcessFeatureMapV5(int8_t* input, const int* anchor, int grid_h, int grid_w
|
||||
return valid_count;
|
||||
}
|
||||
|
||||
// Size in bytes of one element of the given RKNN tensor type.
// FLOAT32 is 4 bytes and FLOAT16 is 2; INT8, UINT8 and every type not listed
// here are reported as 1 byte.
uint32_t TensorTypeSizeBytes(rknn_tensor_type t) {
    if (t == RKNN_TENSOR_FLOAT32) {
        return 4;
    }
    if (t == RKNN_TENSOR_FLOAT16) {
        return 2;
    }
    return 1;
}
|
||||
|
||||
// Number of candidate boxes for a model input of model_h x model_w: the sum of
// the grid cell counts at strides 8, 16 and 32 (integer division per axis).
// Returns 0 when either dimension is non-positive.
int DefaultV8NumBoxes(int model_h, int model_w) {
    if (model_h <= 0 || model_w <= 0) {
        return 0;
    }
    constexpr int kStrides[] = {8, 16, 32};
    int total = 0;
    for (const int stride : kStrides) {
        total += (model_h / stride) * (model_w / stride);
    }
    return total;
}
|
||||
|
||||
// Resolved memory layout of a single-tensor YOLOv8 output.
struct V8LayoutInfo {
    int num_boxes{0};           // Number of candidate boxes in the tensor.
    bool channels_first{true};  // true: channels x boxes (CxN); false: boxes x channels (NxC).
};
|
||||
|
||||
// Plausibility score for a decoded box (top-left x/y, width/height) against a
// model input of model_w x model_h. A strictly positive size is worth 3
// points; each of six loose bounds checks (size and far edge within 120% of
// the input, origin no further than 10% outside it) is worth 1. Maximum is 9.
float ScoreBoxCandidate(float x, float y, float w, float h, int model_w, int model_h) {
    const bool bounds_ok[] = {
        w <= model_w * 1.2f,
        h <= model_h * 1.2f,
        x >= -model_w * 0.1f,
        y >= -model_h * 0.1f,
        (x + w) <= model_w * 1.2f,
        (y + h) <= model_h * 1.2f,
    };

    float score = (w > 0.0f && h > 0.0f) ? 3.0f : 0.0f;
    for (const bool ok : bounds_ok) {
        if (ok) {
            score += 1.0f;
        }
    }
    return score;
}
|
||||
|
||||
// Heuristic: true when all four raw box values lie in [-0.05, 2.5], which is
// taken to mean they are normalised rather than pixel coordinates.
// Any NaN makes the result false.
bool SeemsNormalized(float a, float b, float c, float d) {
    const float raw[] = {a, b, c, d};
    for (const float v : raw) {
        // Written as a negated in-range test so NaN counts as out of range.
        if (!(v >= -0.05f && v <= 2.5f)) {
            return false;
        }
    }
    return true;
}
|
||||
|
||||
// Short lowercase name of a box format for log output. Auto and any value not
// listed map to "auto".
const char* V8BoxFormatName(V8BoxFormat fmt) {
    if (fmt == V8BoxFormat::CxCyWh) {
        return "cxcywh";
    }
    if (fmt == V8BoxFormat::XyXy) {
        return "xyxy";
    }
    if (fmt == V8BoxFormat::XyWh) {
        return "xywh";
    }
    return "auto";
}
|
||||
|
||||
void DecodeV8Box(float a, float b, float c, float d, int model_w, int model_h, V8BoxFormat fmt,
|
||||
float& out_x, float& out_y, float& out_w, float& out_h, V8BoxFormat* used_fmt = nullptr) {
|
||||
if (SeemsNormalized(a, b, c, d)) {
|
||||
a *= static_cast<float>(model_w);
|
||||
b *= static_cast<float>(model_h);
|
||||
c *= static_cast<float>(model_w);
|
||||
d *= static_cast<float>(model_h);
|
||||
}
|
||||
|
||||
auto decode_cxcywh = [&](float& x, float& y, float& w, float& h) {
|
||||
x = a - c / 2.0f;
|
||||
y = b - d / 2.0f;
|
||||
w = c;
|
||||
h = d;
|
||||
};
|
||||
auto decode_xyxy = [&](float& x, float& y, float& w, float& h) {
|
||||
x = a;
|
||||
y = b;
|
||||
w = c - a;
|
||||
h = d - b;
|
||||
};
|
||||
auto decode_xywh = [&](float& x, float& y, float& w, float& h) {
|
||||
x = a;
|
||||
y = b;
|
||||
w = c;
|
||||
h = d;
|
||||
};
|
||||
|
||||
if (fmt == V8BoxFormat::CxCyWh) {
|
||||
decode_cxcywh(out_x, out_y, out_w, out_h);
|
||||
if (used_fmt) *used_fmt = V8BoxFormat::CxCyWh;
|
||||
return;
|
||||
}
|
||||
if (fmt == V8BoxFormat::XyXy) {
|
||||
decode_xyxy(out_x, out_y, out_w, out_h);
|
||||
if (used_fmt) *used_fmt = V8BoxFormat::XyXy;
|
||||
return;
|
||||
}
|
||||
if (fmt == V8BoxFormat::XyWh) {
|
||||
decode_xywh(out_x, out_y, out_w, out_h);
|
||||
if (used_fmt) *used_fmt = V8BoxFormat::XyWh;
|
||||
return;
|
||||
}
|
||||
|
||||
float x1 = 0.0f, y1 = 0.0f, w1 = 0.0f, h1 = 0.0f;
|
||||
float x2 = 0.0f, y2 = 0.0f, w2 = 0.0f, h2 = 0.0f;
|
||||
float x3 = 0.0f, y3 = 0.0f, w3 = 0.0f, h3 = 0.0f;
|
||||
decode_cxcywh(x1, y1, w1, h1);
|
||||
decode_xyxy(x2, y2, w2, h2);
|
||||
decode_xywh(x3, y3, w3, h3);
|
||||
|
||||
const float s1 = ScoreBoxCandidate(x1, y1, w1, h1, model_w, model_h);
|
||||
const float s2 = ScoreBoxCandidate(x2, y2, w2, h2, model_w, model_h);
|
||||
const float s3 = ScoreBoxCandidate(x3, y3, w3, h3, model_w, model_h);
|
||||
if (s2 >= s1 && s2 >= s3) {
|
||||
out_x = x2; out_y = y2; out_w = w2; out_h = h2;
|
||||
if (used_fmt) *used_fmt = V8BoxFormat::XyXy;
|
||||
} else if (s3 >= s1 && s3 >= s2) {
|
||||
out_x = x3; out_y = y3; out_w = w3; out_h = h3;
|
||||
if (used_fmt) *used_fmt = V8BoxFormat::XyWh;
|
||||
} else {
|
||||
out_x = x1; out_y = y1; out_w = w1; out_h = h1;
|
||||
if (used_fmt) *used_fmt = V8BoxFormat::CxCyWh;
|
||||
}
|
||||
}
|
||||
|
||||
bool ResolveV8ApplySigmoid(const float* output, int num_boxes, int num_classes, bool channels_first,
|
||||
V8ClsActivation act_mode) {
|
||||
if (act_mode == V8ClsActivation::None) return false;
|
||||
if (act_mode == V8ClsActivation::Sigmoid) return true;
|
||||
if (!output || num_boxes <= 0 || num_classes <= 0) return false;
|
||||
|
||||
const int num_channels = 4 + num_classes;
|
||||
const int sample_boxes = std::min(num_boxes, 64);
|
||||
float min_v = 1e9f;
|
||||
float max_v = -1e9f;
|
||||
for (int i = 0; i < sample_boxes; ++i) {
|
||||
for (int c = 0; c < num_classes; ++c) {
|
||||
const float v = channels_first ? output[(4 + c) * num_boxes + i]
|
||||
: output[i * num_channels + (4 + c)];
|
||||
if (v < min_v) min_v = v;
|
||||
if (v > max_v) max_v = v;
|
||||
}
|
||||
}
|
||||
// If class outputs clearly look like logits, enable sigmoid.
|
||||
return (min_v < -0.1f || max_v > 1.5f);
|
||||
}
|
||||
|
||||
// Resolves the layout of a single-tensor YOLOv8 output.
//
// num_boxes is derived from the buffer itself: the number of whole
// (4 + num_classes)-element records that fit in `byte_size`. It is never
// larger than what the buffer can back, so a short or empty buffer yields 0
// and callers bail out instead of reading past the end. (Previously a
// dims-derived count or the 8400 default could be returned in that case.)
//
// channels_first is inferred from `dims`: the first dimension equal to the
// channel count is located, then its nearest non-unit neighbours are compared.
//   - only a following non-unit dim  -> channels first (CxN)
//   - only a preceding non-unit dim  -> channels last  (NxC)
//   - both                           -> channels first iff next >= prev
//   - neither / channel dim absent   -> channels first
//
// model_h / model_w are kept for interface compatibility; the box count no
// longer falls back to a grid-size estimate.
V8LayoutInfo ResolveV8Layout(const std::vector<uint32_t>& dims, size_t byte_size,
                             rknn_tensor_type type, int num_classes,
                             int model_h, int model_w) {
    (void)model_h;
    (void)model_w;

    V8LayoutInfo info;
    const int num_channels = 4 + num_classes;
    if (num_channels <= 0) return info;

    const uint32_t elem_bytes = TensorTypeSizeBytes(type);
    const size_t total_elems = elem_bytes > 0 ? (byte_size / elem_bytes) : 0;
    const size_t max_boxes_from_data = total_elems / static_cast<size_t>(num_channels);

    // Cap before narrowing so an oversized buffer cannot wrap to a negative int.
    const size_t int_max = static_cast<size_t>(std::numeric_limits<int>::max());
    info.num_boxes = static_cast<int>(max_boxes_from_data < int_max ? max_boxes_from_data : int_max);

    int ch_idx = -1;
    for (size_t i = 0; i < dims.size(); ++i) {
        if (dims[i] == static_cast<uint32_t>(num_channels)) {
            ch_idx = static_cast<int>(i);
            break;
        }
    }
    if (ch_idx < 0) {
        // No dimension matches the channel count: keep the CxN default.
        return info;
    }

    // Nearest dimension larger than 1 before the channel dimension (1 if none).
    int prev_non1 = 1;
    for (int i = ch_idx - 1; i >= 0; --i) {
        if (dims[static_cast<size_t>(i)] > 1U) {
            prev_non1 = static_cast<int>(dims[static_cast<size_t>(i)]);
            break;
        }
    }
    // Nearest dimension larger than 1 after the channel dimension (1 if none).
    int next_non1 = 1;
    for (size_t i = static_cast<size_t>(ch_idx + 1); i < dims.size(); ++i) {
        if (dims[i] > 1U) {
            next_non1 = static_cast<int>(dims[i]);
            break;
        }
    }

    if (prev_non1 > 1 && next_non1 <= 1) {
        info.channels_first = false;
    } else if (prev_non1 > 1 && next_non1 > 1) {
        info.channels_first = next_non1 >= prev_non1;
    } else {
        info.channels_first = true;
    }
    return info;
}
|
||||
|
||||
// YOLOv8 output processing (anchor-free, single output tensor)
|
||||
int ProcessOutputV8(float* output, int num_boxes, int num_classes,
|
||||
int model_h, int model_w,
|
||||
std::vector<float>& boxes, std::vector<float>& obj_probs,
|
||||
std::vector<int>& class_ids, float conf_thresh) {
|
||||
std::vector<int>& class_ids, float conf_thresh,
|
||||
bool channels_first, V8BoxFormat box_format, bool apply_sigmoid,
|
||||
bool debug_decode, int* debug_left) {
|
||||
int valid_count = 0;
|
||||
const int num_channels = 4 + num_classes;
|
||||
|
||||
for (int i = 0; i < num_boxes; ++i) {
|
||||
float max_score = 0.0f;
|
||||
int max_cls_id = 0;
|
||||
for (int c = 0; c < num_classes; ++c) {
|
||||
float score = output[(4 + c) * num_boxes + i];
|
||||
float score = channels_first ? output[(4 + c) * num_boxes + i]
|
||||
: output[i * num_channels + (4 + c)];
|
||||
if (apply_sigmoid) score = Sigmoid(score);
|
||||
if (score > max_score) {
|
||||
max_score = score;
|
||||
max_cls_id = c;
|
||||
@ -231,13 +506,28 @@ int ProcessOutputV8(float* output, int num_boxes, int num_classes,
|
||||
}
|
||||
|
||||
if (max_score >= conf_thresh) {
|
||||
float cx = output[0 * num_boxes + i];
|
||||
float cy = output[1 * num_boxes + i];
|
||||
float w = output[2 * num_boxes + i];
|
||||
float h = output[3 * num_boxes + i];
|
||||
|
||||
float x1 = cx - w / 2.0f;
|
||||
float y1 = cy - h / 2.0f;
|
||||
const float a = channels_first ? output[0 * num_boxes + i] : output[i * num_channels + 0];
|
||||
const float b = channels_first ? output[1 * num_boxes + i] : output[i * num_channels + 1];
|
||||
const float c = channels_first ? output[2 * num_boxes + i] : output[i * num_channels + 2];
|
||||
const float d = channels_first ? output[3 * num_boxes + i] : output[i * num_channels + 3];
|
||||
if (!std::isfinite(a) || !std::isfinite(b) || !std::isfinite(c) || !std::isfinite(d)) {
|
||||
continue;
|
||||
}
|
||||
float x1 = 0.0f, y1 = 0.0f, w = 0.0f, h = 0.0f;
|
||||
V8BoxFormat used_fmt = box_format;
|
||||
DecodeV8Box(a, b, c, d, model_w, model_h, box_format, x1, y1, w, h, &used_fmt);
|
||||
if (!std::isfinite(x1) || !std::isfinite(y1) || !std::isfinite(w) || !std::isfinite(h)) {
|
||||
continue;
|
||||
}
|
||||
if (w <= 1e-3f || h <= 1e-3f) continue;
|
||||
if (debug_decode && debug_left && *debug_left > 0) {
|
||||
--(*debug_left);
|
||||
LogInfo("[ai_yolo] v8 decode f32: raw4(" + std::to_string(a) + "," + std::to_string(b) + "," +
|
||||
std::to_string(c) + "," + std::to_string(d) + ") fmt=" + V8BoxFormatName(used_fmt) +
|
||||
" -> xywh(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
|
||||
std::to_string(w) + "," + std::to_string(h) + ") cls=" +
|
||||
std::to_string(max_cls_id) + " score=" + std::to_string(max_score));
|
||||
}
|
||||
|
||||
boxes.push_back(x1);
|
||||
boxes.push_back(y1);
|
||||
@ -256,15 +546,18 @@ int ProcessOutputV8Int8(int8_t* output, int num_boxes, int num_classes,
|
||||
int model_h, int model_w,
|
||||
std::vector<float>& boxes, std::vector<float>& obj_probs,
|
||||
std::vector<int>& class_ids, float conf_thresh,
|
||||
int32_t zp, float scale) {
|
||||
int32_t zp, float scale, bool channels_first, V8BoxFormat box_format,
|
||||
bool debug_decode, int* debug_left) {
|
||||
int valid_count = 0;
|
||||
int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh, zp, scale);
|
||||
const int num_channels = 4 + num_classes;
|
||||
|
||||
for (int i = 0; i < num_boxes; ++i) {
|
||||
int8_t max_score_i8 = -128;
|
||||
int max_cls_id = 0;
|
||||
for (int c = 0; c < num_classes; ++c) {
|
||||
int8_t score = output[(4 + c) * num_boxes + i];
|
||||
int8_t score = channels_first ? output[(4 + c) * num_boxes + i]
|
||||
: output[i * num_channels + (4 + c)];
|
||||
if (score > max_score_i8) {
|
||||
max_score_i8 = score;
|
||||
max_cls_id = c;
|
||||
@ -272,14 +565,33 @@ int ProcessOutputV8Int8(int8_t* output, int num_boxes, int num_classes,
|
||||
}
|
||||
|
||||
if (max_score_i8 >= thresh_i8) {
|
||||
float cx = DequantizeAffineToF32(output[0 * num_boxes + i], zp, scale);
|
||||
float cy = DequantizeAffineToF32(output[1 * num_boxes + i], zp, scale);
|
||||
float w = DequantizeAffineToF32(output[2 * num_boxes + i], zp, scale);
|
||||
float h = DequantizeAffineToF32(output[3 * num_boxes + i], zp, scale);
|
||||
float a = DequantizeAffineToF32(
|
||||
channels_first ? output[0 * num_boxes + i] : output[i * num_channels + 0], zp, scale);
|
||||
float b = DequantizeAffineToF32(
|
||||
channels_first ? output[1 * num_boxes + i] : output[i * num_channels + 1], zp, scale);
|
||||
float c = DequantizeAffineToF32(
|
||||
channels_first ? output[2 * num_boxes + i] : output[i * num_channels + 2], zp, scale);
|
||||
float d = DequantizeAffineToF32(
|
||||
channels_first ? output[3 * num_boxes + i] : output[i * num_channels + 3], zp, scale);
|
||||
float max_score = DequantizeAffineToF32(max_score_i8, zp, scale);
|
||||
|
||||
float x1 = cx - w / 2.0f;
|
||||
float y1 = cy - h / 2.0f;
|
||||
if (!std::isfinite(a) || !std::isfinite(b) || !std::isfinite(c) || !std::isfinite(d)) {
|
||||
continue;
|
||||
}
|
||||
float x1 = 0.0f, y1 = 0.0f, w = 0.0f, h = 0.0f;
|
||||
V8BoxFormat used_fmt = box_format;
|
||||
DecodeV8Box(a, b, c, d, model_w, model_h, box_format, x1, y1, w, h, &used_fmt);
|
||||
if (!std::isfinite(x1) || !std::isfinite(y1) || !std::isfinite(w) || !std::isfinite(h)) {
|
||||
continue;
|
||||
}
|
||||
if (w <= 1e-3f || h <= 1e-3f) continue;
|
||||
if (debug_decode && debug_left && *debug_left > 0) {
|
||||
--(*debug_left);
|
||||
LogInfo("[ai_yolo] v8 decode int8: raw4(" + std::to_string(a) + "," + std::to_string(b) + "," +
|
||||
std::to_string(c) + "," + std::to_string(d) + ") fmt=" + V8BoxFormatName(used_fmt) +
|
||||
" -> xywh(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
|
||||
std::to_string(w) + "," + std::to_string(h) + ") cls=" +
|
||||
std::to_string(max_cls_id) + " score=" + std::to_string(max_score));
|
||||
}
|
||||
|
||||
boxes.push_back(x1);
|
||||
boxes.push_back(y1);
|
||||
@ -309,6 +621,28 @@ public:
|
||||
model_input_w_ = config.ValueOr<int>("model_w", 640);
|
||||
model_input_h_ = config.ValueOr<int>("model_h", 640);
|
||||
num_classes_ = config.ValueOr<int>("num_classes", 80);
|
||||
{
|
||||
const std::string bf = config.ValueOr<std::string>("v8_box_format", "cxcywh");
|
||||
if (bf == "xyxy") {
|
||||
v8_box_format_ = V8BoxFormat::XyXy;
|
||||
} else if (bf == "xywh") {
|
||||
v8_box_format_ = V8BoxFormat::XyWh;
|
||||
} else if (bf == "cxcywh") {
|
||||
v8_box_format_ = V8BoxFormat::CxCyWh;
|
||||
} else {
|
||||
v8_box_format_ = V8BoxFormat::Auto;
|
||||
}
|
||||
}
|
||||
{
|
||||
const std::string act = config.ValueOr<std::string>("v8_cls_activation", "auto");
|
||||
if (act == "sigmoid") {
|
||||
v8_cls_activation_ = V8ClsActivation::Sigmoid;
|
||||
} else if (act == "none") {
|
||||
v8_cls_activation_ = V8ClsActivation::None;
|
||||
} else {
|
||||
v8_cls_activation_ = V8ClsActivation::Auto;
|
||||
}
|
||||
}
|
||||
|
||||
if (const SimpleJson* dbg = config.Find("debug"); dbg && dbg->IsObject()) {
|
||||
stats_log_ = dbg->ValueOr<bool>("stats", stats_log_);
|
||||
@ -432,10 +766,9 @@ public:
|
||||
PushToDownstream(frame);
|
||||
++processed_;
|
||||
|
||||
// Stats log disabled to reduce log spam
|
||||
// if (stats_log_ && stats_interval_ > 0 && (processed_ % stats_interval_) == 0) {
|
||||
// LogInfo("[ai_yolo] processed=" + std::to_string(processed_) + " id=" + id_);
|
||||
// }
|
||||
if (stats_log_ && stats_interval_ > 0 && (processed_ % stats_interval_) == 0) {
|
||||
LogInfo("[ai_yolo] processed=" + std::to_string(processed_) + " id=" + id_);
|
||||
}
|
||||
return NodeStatus::OK;
|
||||
}
|
||||
|
||||
@ -540,29 +873,42 @@ private:
|
||||
outputs[2].zp, outputs[2].scale);
|
||||
valid_count = cnt0 + cnt1 + cnt2;
|
||||
} else {
|
||||
if (outputs.empty()) return;
|
||||
if (outputs.empty()) return;
|
||||
if (!outputs[0].data || outputs[0].size == 0) return;
|
||||
|
||||
int num_boxes = 0;
|
||||
int num_channels = 4 + num_classes_;
|
||||
|
||||
if (outputs[0].dims.size() >= 3) {
|
||||
if (outputs[0].dims[1] == static_cast<uint32_t>(num_channels)) {
|
||||
num_boxes = static_cast<int>(outputs[0].dims[2]);
|
||||
} else if (outputs[0].dims[2] == static_cast<uint32_t>(num_channels)) {
|
||||
num_boxes = static_cast<int>(outputs[0].dims[1]);
|
||||
} else {
|
||||
num_boxes = 8400;
|
||||
const V8LayoutInfo layout = ResolveV8Layout(outputs[0].dims, outputs[0].size,
|
||||
outputs[0].type, num_classes_,
|
||||
model_input_h_, model_input_w_);
|
||||
const int num_boxes = layout.num_boxes;
|
||||
int debug_decode_left = (debug_det_ && processed_ < 20) ? 5 : 0;
|
||||
if (num_boxes <= 0) return;
|
||||
if (debug_det_ && processed_ < 5) {
|
||||
std::string dims_s;
|
||||
for (size_t di = 0; di < outputs[0].dims.size(); ++di) {
|
||||
dims_s += (di == 0 ? "[" : ",");
|
||||
dims_s += std::to_string(outputs[0].dims[di]);
|
||||
}
|
||||
} else {
|
||||
num_boxes = static_cast<int>(outputs[0].size) / num_channels;
|
||||
dims_s += "]";
|
||||
LogInfo("[ai_yolo] v8 out type=" + std::to_string(static_cast<int>(outputs[0].type)) +
|
||||
" size=" + std::to_string(outputs[0].size) +
|
||||
" dims=" + dims_s +
|
||||
" num_boxes=" + std::to_string(num_boxes) +
|
||||
" layout=" + std::string(layout.channels_first ? "CxN" : "NxC"));
|
||||
}
|
||||
|
||||
if (outputs[0].type == RKNN_TENSOR_FLOAT32) {
|
||||
const bool apply_sigmoid = ResolveV8ApplySigmoid(
|
||||
reinterpret_cast<float*>(const_cast<uint8_t*>(outputs[0].data)),
|
||||
num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
|
||||
if (debug_det_ && processed_ < 5) {
|
||||
LogInfo("[ai_yolo] v8 cls activation=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
|
||||
}
|
||||
valid_count = ProcessOutputV8(reinterpret_cast<float*>(const_cast<uint8_t*>(outputs[0].data)),
|
||||
num_boxes, num_classes_,
|
||||
model_input_h_, model_input_w_,
|
||||
boxes, obj_probs, class_ids, conf_thresh_);
|
||||
boxes, obj_probs, class_ids, conf_thresh_,
|
||||
layout.channels_first, v8_box_format_, apply_sigmoid,
|
||||
debug_det_, &debug_decode_left);
|
||||
} else if (outputs[0].type == RKNN_TENSOR_FLOAT16) {
|
||||
// Convert FP16 to FP32
|
||||
size_t num_elements = outputs[0].size / sizeof(uint16_t);
|
||||
@ -571,16 +917,25 @@ private:
|
||||
for (size_t i = 0; i < num_elements; ++i) {
|
||||
fp32_buffer_[i] = Fp16ToFp32(fp16_data[i]);
|
||||
}
|
||||
const bool apply_sigmoid = ResolveV8ApplySigmoid(
|
||||
fp32_buffer_.data(), num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
|
||||
if (debug_det_ && processed_ < 5) {
|
||||
LogInfo("[ai_yolo] v8 cls activation=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
|
||||
}
|
||||
valid_count = ProcessOutputV8(fp32_buffer_.data(),
|
||||
num_boxes, num_classes_,
|
||||
model_input_h_, model_input_w_,
|
||||
boxes, obj_probs, class_ids, conf_thresh_);
|
||||
boxes, obj_probs, class_ids, conf_thresh_,
|
||||
layout.channels_first, v8_box_format_, apply_sigmoid,
|
||||
debug_det_, &debug_decode_left);
|
||||
} else {
|
||||
valid_count = ProcessOutputV8Int8(reinterpret_cast<int8_t*>(const_cast<uint8_t*>(outputs[0].data)),
|
||||
num_boxes, num_classes_,
|
||||
model_input_h_, model_input_w_,
|
||||
boxes, obj_probs, class_ids, conf_thresh_,
|
||||
outputs[0].zp, outputs[0].scale);
|
||||
outputs[0].zp, outputs[0].scale,
|
||||
layout.channels_first, v8_box_format_,
|
||||
debug_det_, &debug_decode_left);
|
||||
}
|
||||
}
|
||||
|
||||
@ -596,12 +951,11 @@ private:
|
||||
NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
|
||||
}
|
||||
|
||||
float scale_w = static_cast<float>(model_input_w_) / frame->width;
|
||||
float scale_h = static_cast<float>(model_input_h_) / frame->height;
|
||||
const DetCoordContext coord_ctx = BuildDetCoordContext(*frame, model_input_w_, model_input_h_);
|
||||
|
||||
auto det_result = std::make_shared<DetectionResult>();
|
||||
det_result->img_w = frame->width;
|
||||
det_result->img_h = frame->height;
|
||||
det_result->img_w = coord_ctx.out_w;
|
||||
det_result->img_h = coord_ctx.out_h;
|
||||
det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8";
|
||||
|
||||
for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) {
|
||||
@ -621,22 +975,23 @@ private:
|
||||
Detection det;
|
||||
det.cls_id = cls_id;
|
||||
det.score = obj_probs[i];
|
||||
det.bbox.x = static_cast<float>(Clamp(static_cast<int>(x1 / scale_w), 0, frame->width));
|
||||
det.bbox.y = static_cast<float>(Clamp(static_cast<int>(y1 / scale_h), 0, frame->height));
|
||||
det.bbox.w = static_cast<float>(Clamp(static_cast<int>(w / scale_w), 0, frame->width - static_cast<int>(det.bbox.x)));
|
||||
det.bbox.h = static_cast<float>(Clamp(static_cast<int>(h / scale_h), 0, frame->height - static_cast<int>(det.bbox.y)));
|
||||
det.bbox = DecodeToOutputRect(x1, y1, w, h, coord_ctx);
|
||||
det.track_id = -1;
|
||||
|
||||
if (debug_det_ && det_result->items.size() < 3 && processed_ < 10) {
|
||||
LogDebug("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
|
||||
std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" +
|
||||
std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," +
|
||||
std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" +
|
||||
std::to_string(cls_id) + " score=" + std::to_string(det.score));
|
||||
if (debug_det_ && det_result->items.size() < 5 && processed_ < 20) {
|
||||
LogInfo("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
|
||||
std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" +
|
||||
std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," +
|
||||
std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" +
|
||||
std::to_string(cls_id) + " score=" + std::to_string(det.score));
|
||||
}
|
||||
|
||||
det_result->items.push_back(det);
|
||||
}
|
||||
if (debug_det_ && processed_ < 20) {
|
||||
LogInfo("[ai_yolo] det summary: valid_count=" + std::to_string(valid_count) +
|
||||
" final=" + std::to_string(det_result->items.size()));
|
||||
}
|
||||
|
||||
frame->det = det_result;
|
||||
}
|
||||
@ -671,33 +1026,66 @@ private:
|
||||
} else {
|
||||
if (outputs.empty()) return;
|
||||
|
||||
int num_boxes = 0;
|
||||
int num_channels = 4 + num_classes_;
|
||||
|
||||
if (outputs[0].dims.size() >= 3) {
|
||||
if (outputs[0].dims[1] == static_cast<uint32_t>(num_channels)) {
|
||||
num_boxes = outputs[0].dims[2];
|
||||
} else if (outputs[0].dims[2] == static_cast<uint32_t>(num_channels)) {
|
||||
num_boxes = outputs[0].dims[1];
|
||||
} else {
|
||||
num_boxes = 8400;
|
||||
const V8LayoutInfo layout = ResolveV8Layout(outputs[0].dims, outputs[0].data.size(),
|
||||
outputs[0].type, num_classes_,
|
||||
model_input_h_, model_input_w_);
|
||||
const int num_boxes = layout.num_boxes;
|
||||
int debug_decode_left = (debug_det_ && processed_ < 20) ? 5 : 0;
|
||||
if (num_boxes <= 0) return;
|
||||
if (debug_det_ && processed_ < 5) {
|
||||
std::string dims_s;
|
||||
for (size_t di = 0; di < outputs[0].dims.size(); ++di) {
|
||||
dims_s += (di == 0 ? "[" : ",");
|
||||
dims_s += std::to_string(outputs[0].dims[di]);
|
||||
}
|
||||
} else {
|
||||
num_boxes = outputs[0].data.size() / num_channels;
|
||||
dims_s += "]";
|
||||
LogInfo("[ai_yolo] v8 out(type copy) type=" + std::to_string(static_cast<int>(outputs[0].type)) +
|
||||
" size=" + std::to_string(outputs[0].data.size()) +
|
||||
" dims=" + dims_s +
|
||||
" num_boxes=" + std::to_string(num_boxes) +
|
||||
" layout=" + std::string(layout.channels_first ? "CxN" : "NxC"));
|
||||
}
|
||||
|
||||
if (outputs[0].type == RKNN_TENSOR_FLOAT32 ||
|
||||
outputs[0].type == RKNN_TENSOR_FLOAT16) {
|
||||
if (outputs[0].type == RKNN_TENSOR_FLOAT32) {
|
||||
const bool apply_sigmoid = ResolveV8ApplySigmoid(
|
||||
reinterpret_cast<float*>(outputs[0].data.data()),
|
||||
num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
|
||||
if (debug_det_ && processed_ < 5) {
|
||||
LogInfo("[ai_yolo] v8 cls activation(copy)=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
|
||||
}
|
||||
valid_count = ProcessOutputV8(reinterpret_cast<float*>(outputs[0].data.data()),
|
||||
num_boxes, num_classes_,
|
||||
model_input_h_, model_input_w_,
|
||||
boxes, obj_probs, class_ids, conf_thresh_);
|
||||
boxes, obj_probs, class_ids, conf_thresh_,
|
||||
layout.channels_first, v8_box_format_, apply_sigmoid,
|
||||
debug_det_, &debug_decode_left);
|
||||
} else if (outputs[0].type == RKNN_TENSOR_FLOAT16) {
|
||||
// Convert FP16 to FP32
|
||||
size_t num_elements = outputs[0].data.size() / sizeof(uint16_t);
|
||||
fp32_buffer_.resize(num_elements);
|
||||
const uint16_t* fp16_data = reinterpret_cast<const uint16_t*>(outputs[0].data.data());
|
||||
for (size_t i = 0; i < num_elements; ++i) {
|
||||
fp32_buffer_[i] = Fp16ToFp32(fp16_data[i]);
|
||||
}
|
||||
const bool apply_sigmoid = ResolveV8ApplySigmoid(
|
||||
fp32_buffer_.data(), num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
|
||||
if (debug_det_ && processed_ < 5) {
|
||||
LogInfo("[ai_yolo] v8 cls activation(copy)=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
|
||||
}
|
||||
valid_count = ProcessOutputV8(fp32_buffer_.data(),
|
||||
num_boxes, num_classes_,
|
||||
model_input_h_, model_input_w_,
|
||||
boxes, obj_probs, class_ids, conf_thresh_,
|
||||
layout.channels_first, v8_box_format_, apply_sigmoid,
|
||||
debug_det_, &debug_decode_left);
|
||||
} else {
|
||||
valid_count = ProcessOutputV8Int8(reinterpret_cast<int8_t*>(outputs[0].data.data()),
|
||||
num_boxes, num_classes_,
|
||||
model_input_h_, model_input_w_,
|
||||
boxes, obj_probs, class_ids, conf_thresh_,
|
||||
outputs[0].zp, outputs[0].scale);
|
||||
outputs[0].zp, outputs[0].scale,
|
||||
layout.channels_first, v8_box_format_,
|
||||
debug_det_, &debug_decode_left);
|
||||
}
|
||||
}
|
||||
|
||||
@ -713,12 +1101,11 @@ private:
|
||||
NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
|
||||
}
|
||||
|
||||
float scale_w = static_cast<float>(model_input_w_) / frame->width;
|
||||
float scale_h = static_cast<float>(model_input_h_) / frame->height;
|
||||
const DetCoordContext coord_ctx = BuildDetCoordContext(*frame, model_input_w_, model_input_h_);
|
||||
|
||||
auto det_result = std::make_shared<DetectionResult>();
|
||||
det_result->img_w = frame->width;
|
||||
det_result->img_h = frame->height;
|
||||
det_result->img_w = coord_ctx.out_w;
|
||||
det_result->img_h = coord_ctx.out_h;
|
||||
det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8";
|
||||
|
||||
for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) {
|
||||
@ -738,23 +1125,23 @@ private:
|
||||
Detection det;
|
||||
det.cls_id = cls_id;
|
||||
det.score = obj_probs[i];
|
||||
det.bbox.x = static_cast<float>(Clamp(static_cast<int>(x1 / scale_w), 0, frame->width));
|
||||
det.bbox.y = static_cast<float>(Clamp(static_cast<int>(y1 / scale_h), 0, frame->height));
|
||||
det.bbox.w = static_cast<float>(Clamp(static_cast<int>(w / scale_w), 0, frame->width - static_cast<int>(det.bbox.x)));
|
||||
det.bbox.h = static_cast<float>(Clamp(static_cast<int>(h / scale_h), 0, frame->height - static_cast<int>(det.bbox.y)));
|
||||
det.bbox = DecodeToOutputRect(x1, y1, w, h, coord_ctx);
|
||||
det.track_id = -1;
|
||||
|
||||
// Debug output for first few detections
|
||||
if (debug_det_ && det_result->items.size() < 3 && processed_ < 10) {
|
||||
LogDebug("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
|
||||
std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" +
|
||||
std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," +
|
||||
std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" +
|
||||
std::to_string(cls_id) + " score=" + std::to_string(det.score));
|
||||
if (debug_det_ && det_result->items.size() < 5 && processed_ < 20) {
|
||||
LogInfo("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
|
||||
std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" +
|
||||
std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," +
|
||||
std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" +
|
||||
std::to_string(cls_id) + " score=" + std::to_string(det.score));
|
||||
}
|
||||
|
||||
det_result->items.push_back(det);
|
||||
}
|
||||
if (debug_det_ && processed_ < 20) {
|
||||
LogInfo("[ai_yolo] det summary(copy): valid_count=" + std::to_string(valid_count) +
|
||||
" final=" + std::to_string(det_result->items.size()));
|
||||
}
|
||||
|
||||
frame->det = det_result;
|
||||
}
|
||||
@ -767,6 +1154,8 @@ private:
|
||||
int model_input_w_ = 640;
|
||||
int model_input_h_ = 640;
|
||||
int num_classes_ = 80;
|
||||
V8BoxFormat v8_box_format_ = V8BoxFormat::CxCyWh;
|
||||
V8ClsActivation v8_cls_activation_ = V8ClsActivation::Auto;
|
||||
YoloVersion yolo_version_ = YoloVersion::V8;
|
||||
bool auto_detect_version_ = false;
|
||||
std::set<int> class_filter_;
|
||||
|
||||
@ -63,6 +63,20 @@ inline int Clamp(int val, int min_val, int max_val) {
|
||||
return val < min_val ? min_val : (val > max_val ? max_val : val);
|
||||
}
|
||||
|
||||
/// Rescales a rectangle from a src_w x src_h coordinate space into a
/// dst_w x dst_h space and clips the result to the destination bounds.
///
/// @param in     Rectangle expressed in the source coordinate space.
/// @param src_w  Width of the source coordinate space.
/// @param src_h  Height of the source coordinate space.
/// @param dst_w  Width of the destination coordinate space.
/// @param dst_h  Height of the destination coordinate space.
/// @return The scaled rectangle, clipped to [0, dst_w] x [0, dst_h]; an empty
///         Rect when any of the four dimensions is not positive.
Rect MapRectToFrame(const Rect& in, int src_w, int src_h, int dst_w, int dst_h) {
    if (src_w <= 0 || src_h <= 0 || dst_w <= 0 || dst_h <= 0) return Rect{};

    const float sx = static_cast<float>(dst_w) / static_cast<float>(src_w);
    const float sy = static_cast<float>(dst_h) / static_cast<float>(src_h);
    const float max_x = static_cast<float>(dst_w);
    const float max_y = static_cast<float>(dst_h);

    // Work on corners rather than origin + extent: clamping only the origin
    // would keep the full width/height and silently grow a box that crosses
    // the left/top edge instead of clipping it.
    const float left = in.x * sx;
    const float top = in.y * sy;
    const float right = left + std::max(0.0f, in.w * sx);
    const float bottom = top + std::max(0.0f, in.h * sy);

    const float x1 = std::max(0.0f, std::min(left, max_x));
    const float y1 = std::max(0.0f, std::min(top, max_y));
    const float x2 = std::max(0.0f, std::min(right, max_x));
    const float y2 = std::max(0.0f, std::min(bottom, max_y));

    Rect out{};
    out.x = x1;
    out.y = y1;
    out.w = x2 - x1;  // never negative: right >= left and the clamp is monotonic
    out.h = y2 - y1;
    return out;
}
|
||||
|
||||
#if defined(RK3588_ENABLE_RGA)
|
||||
inline uint32_t PackColorArgb(const Color& c) {
|
||||
return (0xFFu << 24) | (static_cast<uint32_t>(c.r) << 16) |
|
||||
@ -561,6 +575,9 @@ private:
|
||||
|
||||
int w = frame->width;
|
||||
int h = frame->height;
|
||||
const int det_w = frame->det->img_w > 0 ? frame->det->img_w : w;
|
||||
const int det_h = frame->det->img_h > 0 ? frame->det->img_h : h;
|
||||
const bool map_det_to_frame = (det_w != w || det_h != h);
|
||||
uint8_t* data = frame->planes[0].data ? frame->planes[0].data : frame->data;
|
||||
PixelFormat fmt = frame->format;
|
||||
|
||||
@ -587,10 +604,11 @@ private:
|
||||
} else {
|
||||
bool ok = true;
|
||||
for (const auto& det : frame->det->items) {
|
||||
int x = Clamp(static_cast<int>(det.bbox.x), 0, w - 1);
|
||||
int y = Clamp(static_cast<int>(det.bbox.y), 0, h - 1);
|
||||
int rw = static_cast<int>(det.bbox.w);
|
||||
int rh = static_cast<int>(det.bbox.h);
|
||||
const Rect draw = map_det_to_frame ? MapRectToFrame(det.bbox, det_w, det_h, w, h) : det.bbox;
|
||||
int x = Clamp(static_cast<int>(draw.x), 0, w - 1);
|
||||
int y = Clamp(static_cast<int>(draw.y), 0, h - 1);
|
||||
int rw = static_cast<int>(draw.w);
|
||||
int rh = static_cast<int>(draw.h);
|
||||
rw = Clamp(rw, 1, w - x);
|
||||
rh = Clamp(rh, 1, h - y);
|
||||
im_rect rect{x, y, rw, rh};
|
||||
@ -636,10 +654,11 @@ private:
|
||||
}
|
||||
|
||||
for (const auto& det : frame->det->items) {
|
||||
int x1 = static_cast<int>(det.bbox.x);
|
||||
int y1 = static_cast<int>(det.bbox.y);
|
||||
int x2 = static_cast<int>(det.bbox.x + det.bbox.w);
|
||||
int y2 = static_cast<int>(det.bbox.y + det.bbox.h);
|
||||
const Rect draw = map_det_to_frame ? MapRectToFrame(det.bbox, det_w, det_h, w, h) : det.bbox;
|
||||
int x1 = static_cast<int>(draw.x);
|
||||
int y1 = static_cast<int>(draw.y);
|
||||
int x2 = static_cast<int>(draw.x + draw.w);
|
||||
int y2 = static_cast<int>(draw.y + draw.h);
|
||||
|
||||
Color color = GetClassColor(det.cls_id);
|
||||
|
||||
|
||||
@ -1,16 +1,27 @@
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "face/face_result.h"
|
||||
#include "hw/i_image_processor.h"
|
||||
#include "node.h"
|
||||
#include "utils/dma_alloc.h"
|
||||
#include "utils/logger.h"
|
||||
|
||||
namespace rk3588 {
|
||||
|
||||
namespace {
|
||||
|
||||
// How PreprocessNode fits an input frame into the configured dst_w x dst_h.
enum class ResizeMode {
|
||||
// Scale straight to the destination size; aspect ratio is not preserved.
Stretch,
|
||||
// Scale by the smaller of the two axis ratios; the output frame takes the scaled size.
KeepRatio,
|
||||
// Scale by the smaller axis ratio, then centre the result on a black dst_w x dst_h canvas.
Letterbox,
|
||||
};
|
||||
|
||||
PixelFormat ParseFormat(const std::string& s) {
|
||||
if (s == "nv12" || s == "NV12") return PixelFormat::NV12;
|
||||
if (s == "yuv420" || s == "YUV420") return PixelFormat::YUV420;
|
||||
@ -19,6 +30,229 @@ PixelFormat ParseFormat(const std::string& s) {
|
||||
return PixelFormat::UNKNOWN;
|
||||
}
|
||||
|
||||
/// Maps the "resize_mode" config string to a ResizeMode.
///
/// Matching ignores ASCII case so that values such as "Letterbox" or
/// "LETTERBOX" are honoured instead of silently falling back.
///
/// @param s           Mode name: "stretch", "keep_ratio", "fit" or "letterbox".
/// @param keep_ratio  Legacy flag, used only when `s` is empty or unrecognised.
/// @return The parsed mode; otherwise KeepRatio when `keep_ratio` is true,
///         Stretch when it is false.
ResizeMode ParseResizeMode(const std::string& s, bool keep_ratio) {
    // ASCII-only lower-casing; the accepted keywords are plain ASCII.
    std::string key = s;
    for (char& c : key) {
        if (c >= 'A' && c <= 'Z') c = static_cast<char>(c - 'A' + 'a');
    }

    if (key == "stretch") return ResizeMode::Stretch;
    if (key == "keep_ratio" || key == "fit") return ResizeMode::KeepRatio;
    if (key == "letterbox") return ResizeMode::Letterbox;
    return keep_ratio ? ResizeMode::KeepRatio : ResizeMode::Stretch;
}
|
||||
|
||||
/// Returns true for the two YUV pixel formats (NV12 and YUV420), false otherwise.
inline bool IsYuvFormat(PixelFormat fmt) {
    switch (fmt) {
        case PixelFormat::NV12:
        case PixelFormat::YUV420:
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
/// Limits `v` to the closed interval [lo, hi].
/// The upper bound is applied first, then the lower bound.
inline float ClampFloat(float v, float lo, float hi) {
    const float capped = (hi < v) ? hi : v;
    return (lo < capped) ? capped : lo;
}
|
||||
|
||||
/// Rounds a positive value down to the nearest even number.
/// Zero and negative inputs yield 0.
inline int MakeEvenFloor(int v) {
    return (v > 0) ? (v - (v & 1)) : 0;
}
|
||||
|
||||
/// Byte size of a tightly packed w x h image in the given format.
/// NV12/YUV420 use 1.5 bytes per pixel, RGB/BGR use 3 bytes per pixel.
/// Returns 0 for non-positive dimensions or any other format.
size_t CalcImageSize(int w, int h, PixelFormat fmt) {
    if (w <= 0 || h <= 0) return 0;

    const size_t pixels = static_cast<size_t>(w) * static_cast<size_t>(h);
    if (fmt == PixelFormat::NV12 || fmt == PixelFormat::YUV420) {
        return pixels * 3 / 2;
    }
    if (fmt == PixelFormat::RGB || fmt == PixelFormat::BGR) {
        return pixels * 3;
    }
    return 0;
}
|
||||
|
||||
/// Fills in stride, plane_count and the plane table of `f` for a tightly
/// packed buffer that starts at f.data, then calls f.SyncBufferFromFrame().
/// Does nothing when the frame has no data pointer or a non-positive size.
void SetupPlanes(Frame& f) {
    const bool usable = f.data && f.width > 0 && f.height > 0;
    if (!usable) return;

    switch (f.format) {
        case PixelFormat::NV12: {
            // Full-resolution Y plane followed by one half-height plane with the same row stride.
            const int luma_stride = f.width;
            const int luma_bytes = luma_stride * f.height;
            const int chroma_bytes = luma_stride * (f.height / 2);
            f.stride = luma_stride;
            f.plane_count = 2;
            f.planes[0] = {f.data, luma_stride, luma_bytes, 0};
            f.planes[1] = {f.data + luma_bytes, luma_stride, chroma_bytes, luma_bytes};
            break;
        }
        case PixelFormat::YUV420: {
            // Y plane followed by two half-width, half-height planes.
            const int luma_stride = f.width;
            const int luma_bytes = luma_stride * f.height;
            const int chroma_stride = f.width / 2;
            const int chroma_bytes = chroma_stride * (f.height / 2);
            f.stride = luma_stride;
            f.plane_count = 3;
            f.planes[0] = {f.data, luma_stride, luma_bytes, 0};
            f.planes[1] = {f.data + luma_bytes, chroma_stride, chroma_bytes, luma_bytes};
            f.planes[2] = {f.data + luma_bytes + chroma_bytes, chroma_stride, chroma_bytes,
                           luma_bytes + chroma_bytes};
            break;
        }
        default: {
            // Every other format is laid out as a single plane with 3 bytes per pixel.
            const int row_bytes = f.width * 3;
            f.stride = row_bytes;
            f.plane_count = 1;
            f.planes[0] = {f.data, row_bytes, static_cast<int>(f.data_size), 0};
            break;
        }
    }

    f.SyncBufferFromFrame();
}
|
||||
|
||||
/// Allocates backing storage for `f` based on its width, height and format.
/// Tries DmaAlloc first and falls back to a heap-allocated byte vector.
/// On success the frame owns the storage and its planes are set up.
/// @return false when CalcImageSize reports 0 bytes for the frame, true otherwise.
bool InitFrameStorage(Frame& f) {
    const size_t bytes = CalcImageSize(f.width, f.height, f.format);
    if (bytes == 0) return false;

    {
        // Preferred path: DMA-backed buffer.
        auto dma = DmaAlloc(bytes);
        if (dma && dma->valid()) {
            f.SetDmaFd(dma->fd);
            f.data = dma->data();
            f.data_size = dma->size;
            f.SetOwner(dma);
            SetupPlanes(f);
            return true;
        }
    }

    // Fallback: plain heap memory, marked with an invalid DMA fd.
    auto heap = std::make_shared<std::vector<uint8_t>>(bytes);
    f.SetDmaFd(-1);
    f.data = heap->data();
    f.data_size = heap->size();
    f.SetOwner(heap);
    SetupPlanes(f);
    return true;
}
|
||||
|
||||
/// Paints the whole frame black, assuming a tightly packed buffer at f.data.
/// YUV formats get Y = 0 and chroma = 128; every other format is zero-filled
/// over f.data_size bytes. The writes are bracketed by DmaSyncStartFd /
/// DmaSyncEndFd when the frame has a DMA fd.
void FillBlack(Frame& f) {
    if (!f.data || f.data_size == 0) return;

    const int fd = f.DmaFd();
    const bool dma_backed = fd >= 0;
    if (dma_backed) DmaSyncStartFd(fd);

    switch (f.format) {
        case PixelFormat::NV12: {
            const int luma_bytes = f.width * f.height;
            std::memset(f.data, 0, static_cast<size_t>(luma_bytes));
            std::memset(f.data + luma_bytes, 128, static_cast<size_t>(f.width * f.height / 2));
            break;
        }
        case PixelFormat::YUV420: {
            const int luma_bytes = f.width * f.height;
            const int chroma_bytes = (f.width / 2) * (f.height / 2);
            uint8_t* const u_plane = f.data + luma_bytes;
            uint8_t* const v_plane = u_plane + chroma_bytes;
            std::memset(f.data, 0, static_cast<size_t>(luma_bytes));
            std::memset(u_plane, 128, static_cast<size_t>(chroma_bytes));
            std::memset(v_plane, 128, static_cast<size_t>(chroma_bytes));
            break;
        }
        default:
            std::memset(f.data, 0, f.data_size);
            break;
    }

    if (dma_backed) DmaSyncEndFd(fd);
}
|
||||
|
||||
bool BlitLetterbox(const Frame& src, Frame& dst, int pad_x, int pad_y) {
|
||||
if (!src.data || !dst.data || src.format != dst.format) return false;
|
||||
if (pad_x < 0 || pad_y < 0) return false;
|
||||
if (src.width + pad_x > dst.width || src.height + pad_y > dst.height) return false;
|
||||
|
||||
if (src.DmaFd() >= 0) src.SyncStart();
|
||||
if (dst.DmaFd() >= 0) DmaSyncStartFd(dst.DmaFd());
|
||||
|
||||
if (src.format == PixelFormat::RGB || src.format == PixelFormat::BGR) {
|
||||
const int src_stride = src.planes[0].stride > 0 ? src.planes[0].stride : src.width * 3;
|
||||
const int dst_stride = dst.planes[0].stride > 0 ? dst.planes[0].stride : dst.width * 3;
|
||||
const uint8_t* src_ptr = src.planes[0].data ? src.planes[0].data : src.data;
|
||||
uint8_t* dst_ptr = dst.planes[0].data ? dst.planes[0].data : dst.data;
|
||||
const size_t row_bytes = static_cast<size_t>(src.width) * 3;
|
||||
for (int y = 0; y < src.height; ++y) {
|
||||
std::memcpy(dst_ptr + static_cast<size_t>(y + pad_y) * dst_stride + static_cast<size_t>(pad_x) * 3,
|
||||
src_ptr + static_cast<size_t>(y) * src_stride,
|
||||
row_bytes);
|
||||
}
|
||||
} else if (src.format == PixelFormat::NV12) {
|
||||
const int src_y_stride = src.planes[0].stride > 0 ? src.planes[0].stride : src.width;
|
||||
const int src_uv_stride = src.planes[1].stride > 0 ? src.planes[1].stride : src.width;
|
||||
const int dst_y_stride = dst.planes[0].stride > 0 ? dst.planes[0].stride : dst.width;
|
||||
const int dst_uv_stride = dst.planes[1].stride > 0 ? dst.planes[1].stride : dst.width;
|
||||
const uint8_t* src_y = src.planes[0].data ? src.planes[0].data : src.data;
|
||||
const uint8_t* src_uv = src.planes[1].data ? src.planes[1].data : (src.data + src.width * src.height);
|
||||
uint8_t* dst_y = dst.planes[0].data ? dst.planes[0].data : dst.data;
|
||||
uint8_t* dst_uv = dst.planes[1].data ? dst.planes[1].data : (dst.data + dst.width * dst.height);
|
||||
|
||||
for (int y = 0; y < src.height; ++y) {
|
||||
std::memcpy(dst_y + static_cast<size_t>(y + pad_y) * dst_y_stride + pad_x,
|
||||
src_y + static_cast<size_t>(y) * src_y_stride,
|
||||
static_cast<size_t>(src.width));
|
||||
}
|
||||
|
||||
const int uv_rows = src.height / 2;
|
||||
for (int y = 0; y < uv_rows; ++y) {
|
||||
std::memcpy(dst_uv + static_cast<size_t>(y + pad_y / 2) * dst_uv_stride + pad_x,
|
||||
src_uv + static_cast<size_t>(y) * src_uv_stride,
|
||||
static_cast<size_t>(src.width));
|
||||
}
|
||||
} else if (src.format == PixelFormat::YUV420) {
|
||||
const int src_y_stride = src.planes[0].stride > 0 ? src.planes[0].stride : src.width;
|
||||
const int src_u_stride = src.planes[1].stride > 0 ? src.planes[1].stride : src.width / 2;
|
||||
const int src_v_stride = src.planes[2].stride > 0 ? src.planes[2].stride : src.width / 2;
|
||||
const int dst_y_stride = dst.planes[0].stride > 0 ? dst.planes[0].stride : dst.width;
|
||||
const int dst_u_stride = dst.planes[1].stride > 0 ? dst.planes[1].stride : dst.width / 2;
|
||||
const int dst_v_stride = dst.planes[2].stride > 0 ? dst.planes[2].stride : dst.width / 2;
|
||||
const uint8_t* src_y = src.planes[0].data ? src.planes[0].data : src.data;
|
||||
const uint8_t* src_u = src.planes[1].data ? src.planes[1].data : (src.data + src.width * src.height);
|
||||
const uint8_t* src_v = src.planes[2].data ? src.planes[2].data : (src_u + (src.width / 2) * (src.height / 2));
|
||||
uint8_t* dst_y = dst.planes[0].data ? dst.planes[0].data : dst.data;
|
||||
uint8_t* dst_u = dst.planes[1].data ? dst.planes[1].data : (dst.data + dst.width * dst.height);
|
||||
uint8_t* dst_v = dst.planes[2].data ? dst.planes[2].data : (dst_u + (dst.width / 2) * (dst.height / 2));
|
||||
|
||||
for (int y = 0; y < src.height; ++y) {
|
||||
std::memcpy(dst_y + static_cast<size_t>(y + pad_y) * dst_y_stride + pad_x,
|
||||
src_y + static_cast<size_t>(y) * src_y_stride,
|
||||
static_cast<size_t>(src.width));
|
||||
}
|
||||
|
||||
const int uv_rows = src.height / 2;
|
||||
const int uv_pad_x = pad_x / 2;
|
||||
const int uv_pad_y = pad_y / 2;
|
||||
const int uv_cols = src.width / 2;
|
||||
for (int y = 0; y < uv_rows; ++y) {
|
||||
std::memcpy(dst_u + static_cast<size_t>(y + uv_pad_y) * dst_u_stride + uv_pad_x,
|
||||
src_u + static_cast<size_t>(y) * src_u_stride,
|
||||
static_cast<size_t>(uv_cols));
|
||||
std::memcpy(dst_v + static_cast<size_t>(y + uv_pad_y) * dst_v_stride + uv_pad_x,
|
||||
src_v + static_cast<size_t>(y) * src_v_stride,
|
||||
static_cast<size_t>(uv_cols));
|
||||
}
|
||||
} else {
|
||||
if (src.DmaFd() >= 0) src.SyncEnd();
|
||||
if (dst.DmaFd() >= 0) DmaSyncEndFd(dst.DmaFd());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst.DmaFd() >= 0) DmaSyncEndFd(dst.DmaFd());
|
||||
if (src.DmaFd() >= 0) src.SyncEnd();
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Applies the affine map (x * sx + tx, y * sy + ty) to a rectangle in place
/// and clips the result to [0, out_w] x [0, out_h].
///
/// @param r      Rectangle to transform; overwritten with the result.
/// @param sx,sy  Per-axis scale factors.
/// @param tx,ty  Per-axis translation applied after scaling.
/// @param out_w  Width of the output coordinate space.
/// @param out_h  Height of the output coordinate space.
///
/// When out_w or out_h is not positive the rectangle is reset to Rect{}.
void TransformRect(Rect& r, float sx, float sy, float tx, float ty, int out_w, int out_h) {
    if (out_w <= 0 || out_h <= 0) {
        r = Rect{};
        return;
    }
    const float fw = static_cast<float>(out_w);
    const float fh = static_cast<float>(out_h);

    // Transform both corners and clip each one. Clamping only the origin
    // would keep the full extent and enlarge a box that crosses the left or
    // top edge instead of cutting off the part that lies outside.
    const float left = r.x * sx + tx;
    const float top = r.y * sy + ty;
    const float right = left + std::max(0.0f, r.w * sx);
    const float bottom = top + std::max(0.0f, r.h * sy);

    const float x1 = ClampFloat(left, 0.0f, fw);
    const float y1 = ClampFloat(top, 0.0f, fh);
    const float x2 = ClampFloat(right, 0.0f, fw);
    const float y2 = ClampFloat(bottom, 0.0f, fh);

    r.x = x1;
    r.y = y1;
    r.w = x2 - x1;  // never negative: right >= left and the clamp is monotonic
    r.h = y2 - y1;
}
|
||||
|
||||
/// Applies (x * sx + tx, y * sy + ty) to a point in place and clamps each
/// coordinate to [0, out_w] and [0, out_h] respectively.
/// The point is reset to Point2f{} when out_w or out_h is not positive.
void TransformPoint(Point2f& p, float sx, float sy, float tx, float ty, int out_w, int out_h) {
    const bool has_area = out_w > 0 && out_h > 0;
    if (!has_area) {
        p = Point2f{};
        return;
    }

    const float max_x = static_cast<float>(out_w);
    const float max_y = static_cast<float>(out_h);
    const float mapped_x = p.x * sx + tx;
    const float mapped_y = p.y * sy + ty;
    p.x = ClampFloat(mapped_x, 0.0f, max_x);
    p.y = ClampFloat(mapped_y, 0.0f, max_y);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
class PreprocessNode : public INode {
|
||||
@ -31,6 +265,7 @@ public:
|
||||
dst_w_ = config.ValueOr<int>("dst_w", 640);
|
||||
dst_h_ = config.ValueOr<int>("dst_h", 640);
|
||||
keep_ratio_ = config.ValueOr<bool>("keep_ratio", false);
|
||||
resize_mode_ = ParseResizeMode(config.ValueOr<std::string>("resize_mode", ""), keep_ratio_);
|
||||
|
||||
std::string fmt_str = config.ValueOr<std::string>("dst_format", "");
|
||||
if (!fmt_str.empty()) {
|
||||
@ -81,8 +316,11 @@ public:
|
||||
}
|
||||
|
||||
bool Start() override {
|
||||
std::string mode = "stretch";
|
||||
if (resize_mode_ == ResizeMode::KeepRatio) mode = "keep_ratio";
|
||||
if (resize_mode_ == ResizeMode::Letterbox) mode = "letterbox";
|
||||
LogInfo("[preprocess] start id=" + id_ + " dst=" + std::to_string(dst_w_) + "x" +
|
||||
std::to_string(dst_h_) + (use_rga_ ? " (rga)" : " (swscale)"));
|
||||
std::to_string(dst_h_) + " mode=" + mode + (use_rga_ ? " (rga)" : " (swscale)"));
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -95,52 +333,80 @@ public:
|
||||
PixelFormat out_fmt = (dst_fmt_ != PixelFormat::UNKNOWN) ? dst_fmt_ : frame->format;
|
||||
int out_w = dst_w_;
|
||||
int out_h = dst_h_;
|
||||
|
||||
if (out_w <= 0) out_w = frame->width;
|
||||
if (out_h <= 0) out_h = frame->height;
|
||||
|
||||
if (keep_ratio_ && dst_w_ > 0 && dst_h_ > 0 && frame->width > 0 && frame->height > 0) {
|
||||
float scale = std::min(static_cast<float>(dst_w_) / frame->width,
|
||||
static_cast<float>(dst_h_) / frame->height);
|
||||
out_w = static_cast<int>(frame->width * scale);
|
||||
out_h = static_cast<int>(frame->height * scale);
|
||||
out_w = (out_w + 1) & ~1;
|
||||
out_h = (out_h + 1) & ~1;
|
||||
}
|
||||
|
||||
const bool need_resize = (frame->width != out_w || frame->height != out_h);
|
||||
const bool need_cvt = (frame->format != out_fmt);
|
||||
|
||||
if (need_resize) {
|
||||
WarnMetaResizeOnce(frame, out_w, out_h);
|
||||
}
|
||||
|
||||
if (!need_resize && !need_cvt) {
|
||||
ProcessPassthrough(frame);
|
||||
return NodeStatus::OK;
|
||||
}
|
||||
FrameTransformMeta tx{};
|
||||
tx.valid = true;
|
||||
tx.src_w = frame->width;
|
||||
tx.src_h = frame->height;
|
||||
|
||||
Frame out;
|
||||
out.width = out_w;
|
||||
out.height = out_h;
|
||||
out.format = out_fmt;
|
||||
if (resize_mode_ == ResizeMode::Letterbox && dst_w_ > 0 && dst_h_ > 0 &&
|
||||
frame->width > 0 && frame->height > 0) {
|
||||
Status lb = BuildLetterbox(*frame, out_fmt, out_w, out_h, out, tx);
|
||||
if (lb.Failed()) {
|
||||
LogError("[preprocess] letterbox failed: " + lb.ErrMessage());
|
||||
return NodeStatus::ERROR;
|
||||
}
|
||||
} else {
|
||||
if (resize_mode_ == ResizeMode::KeepRatio && dst_w_ > 0 && dst_h_ > 0 &&
|
||||
frame->width > 0 && frame->height > 0) {
|
||||
float scale = std::min(static_cast<float>(dst_w_) / frame->width,
|
||||
static_cast<float>(dst_h_) / frame->height);
|
||||
out_w = static_cast<int>(std::round(frame->width * scale));
|
||||
out_h = static_cast<int>(std::round(frame->height * scale));
|
||||
if (IsYuvFormat(out_fmt)) {
|
||||
out_w = std::max(2, MakeEvenFloor(out_w));
|
||||
out_h = std::max(2, MakeEvenFloor(out_h));
|
||||
}
|
||||
if (out_w <= 0) out_w = frame->width;
|
||||
if (out_h <= 0) out_h = frame->height;
|
||||
}
|
||||
|
||||
Status st = image_processor_->Resize(*frame, out);
|
||||
if (st.Failed()) {
|
||||
if (!use_rga_ && st.ErrMessage().find("unsupported format") != std::string::npos) {
|
||||
const bool need_resize = (frame->width != out_w || frame->height != out_h);
|
||||
const bool need_cvt = (frame->format != out_fmt);
|
||||
|
||||
tx.letterbox = false;
|
||||
tx.dst_w = out_w;
|
||||
tx.dst_h = out_h;
|
||||
tx.scale_x = frame->width > 0 ? static_cast<float>(out_w) / frame->width : 1.0f;
|
||||
tx.scale_y = frame->height > 0 ? static_cast<float>(out_h) / frame->height : 1.0f;
|
||||
tx.pad_x = 0.0f;
|
||||
tx.pad_y = 0.0f;
|
||||
|
||||
if (!need_resize && !need_cvt) {
|
||||
auto t = std::make_shared<FrameTransformMeta>(tx);
|
||||
frame->transform_meta = t;
|
||||
ProcessPassthrough(frame);
|
||||
return NodeStatus::OK;
|
||||
}
|
||||
LogError("[preprocess] " + st.ErrMessage());
|
||||
return NodeStatus::ERROR;
|
||||
|
||||
if (need_resize) {
|
||||
WarnMetaResizeOnce(frame, out_w, out_h);
|
||||
}
|
||||
|
||||
out.width = out_w;
|
||||
out.height = out_h;
|
||||
out.format = out_fmt;
|
||||
Status st = image_processor_->Resize(*frame, out);
|
||||
if (st.Failed()) {
|
||||
if (!use_rga_ && st.ErrMessage().find("unsupported format") != std::string::npos) {
|
||||
auto t = std::make_shared<FrameTransformMeta>(tx);
|
||||
frame->transform_meta = t;
|
||||
ProcessPassthrough(frame);
|
||||
return NodeStatus::OK;
|
||||
}
|
||||
LogError("[preprocess] " + st.ErrMessage());
|
||||
return NodeStatus::ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
auto out_frame = std::make_shared<Frame>(out);
|
||||
out_frame->pts = frame->pts;
|
||||
out_frame->frame_id = frame->frame_id;
|
||||
out_frame->det = frame->det;
|
||||
out_frame->face_det = frame->face_det;
|
||||
out_frame->face_recog = frame->face_recog;
|
||||
out_frame->transform_meta = std::make_shared<FrameTransformMeta>(tx);
|
||||
ScaleMeta(*frame, *out_frame, tx);
|
||||
out_frame->user_meta = frame->user_meta;
|
||||
|
||||
PushToDownstream(out_frame);
|
||||
@ -150,7 +416,7 @@ public:
|
||||
LogInfo("[preprocess] " + std::string(use_rga_ ? "rga" : "swscale") +
|
||||
" frame=" + std::to_string(out_frame->frame_id) +
|
||||
" " + std::to_string(frame->width) + "x" + std::to_string(frame->height) +
|
||||
" -> " + std::to_string(out_w) + "x" + std::to_string(out_h) +
|
||||
" -> " + std::to_string(out_frame->width) + "x" + std::to_string(out_frame->height) +
|
||||
" id=" + id_);
|
||||
}
|
||||
|
||||
@ -158,6 +424,142 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
/// Produces a dst_w x dst_h letterboxed copy of `input` in `out`.
///
/// Steps: convert to `out_fmt` if the input format differs, resize so the
/// image fits inside the destination while keeping its aspect ratio,
/// allocate and black-fill the destination, then blit the resized image
/// centred on it. The applied mapping is recorded in `tx`.
///
/// @param input    Source frame.
/// @param out_fmt  Pixel format of the letterboxed output.
/// @param dst_w    Output width in pixels.
/// @param dst_h    Output height in pixels.
/// @param out      Receives the letterboxed frame (size, format, storage).
/// @param tx       Receives source/destination sizes, per-axis scale and padding.
/// @return OkStatus on success; otherwise the failing processor status or a
///         FailStatus describing the invalid size / allocation / blit error.
Status BuildLetterbox(const Frame& input, PixelFormat out_fmt, int dst_w, int dst_h, Frame& out,
                      FrameTransformMeta& tx) const {
    Frame src = input;
    if (input.format != out_fmt) {
        // NOTE(review): `src` is a copy of `input` at this point and is handed
        // to CvtColor as the output frame — confirm CvtColor does not write
        // through the copied buffer pointers.
        src.width = input.width;
        src.height = input.height;
        src.format = out_fmt;
        Status cvt = image_processor_->CvtColor(input, src, out_fmt);
        if (cvt.Failed()) return cvt;
    }

    if (src.width <= 0 || src.height <= 0 || dst_w <= 0 || dst_h <= 0) {
        return FailStatus("invalid letterbox size");
    }

    // Uniform scale that makes the image fit inside the destination.
    const float scale = std::min(static_cast<float>(dst_w) / static_cast<float>(src.width),
                                 static_cast<float>(dst_h) / static_cast<float>(src.height));
    int inner_w = std::max(1, static_cast<int>(std::round(src.width * scale)));
    int inner_h = std::max(1, static_cast<int>(std::round(src.height * scale)));
    const bool yuv = IsYuvFormat(out_fmt);
    if (yuv) {
        // Subsampled chroma needs even image dimensions.
        inner_w = std::max(2, MakeEvenFloor(inner_w));
        inner_h = std::max(2, MakeEvenFloor(inner_h));
    }
    if (inner_w <= 0 || inner_h <= 0 || inner_w > dst_w || inner_h > dst_h) {
        return FailStatus("invalid inner letterbox size");
    }

    Frame resized;
    resized.width = inner_w;
    resized.height = inner_h;
    resized.format = out_fmt;
    Status st = image_processor_->Resize(src, resized);
    if (st.Failed()) return st;

    out.width = dst_w;
    out.height = dst_h;
    out.format = out_fmt;
    if (!InitFrameStorage(out)) {
        return FailStatus("alloc letterbox output failed");
    }
    FillBlack(out);

    int pad_x = (dst_w - inner_w) / 2;
    int pad_y = (dst_h - inner_h) / 2;
    if (yuv) {
        // Keep the image origin on an even pixel so luma and subsampled
        // chroma stay aligned (an odd offset would split a chroma sample and,
        // for NV12, start a chroma row in the middle of a U,V pair).
        pad_x = MakeEvenFloor(pad_x);
        pad_y = MakeEvenFloor(pad_y);
    }
    if (!BlitLetterbox(resized, out, pad_x, pad_y)) {
        return FailStatus("blit letterbox failed");
    }

    // Record the mapping actually applied: out = in * scale + pad.
    tx.letterbox = true;
    tx.src_w = input.width;
    tx.src_h = input.height;
    tx.dst_w = dst_w;
    tx.dst_h = dst_h;
    tx.scale_x = static_cast<float>(inner_w) / static_cast<float>(input.width);
    tx.scale_y = static_cast<float>(inner_h) / static_cast<float>(input.height);
    tx.pad_x = static_cast<float>(pad_x);
    tx.pad_y = static_cast<float>(pad_y);
    return OkStatus();
}
|
||||
|
||||
void ScaleMeta(const Frame& in_frame, Frame& out_frame, const FrameTransformMeta& tx) const {
|
||||
if (in_frame.det) {
|
||||
auto det = std::make_shared<DetectionResult>(*in_frame.det);
|
||||
const int src_meta_w = det->img_w > 0 ? det->img_w : in_frame.width;
|
||||
const int src_meta_h = det->img_h > 0 ? det->img_h : in_frame.height;
|
||||
const float to_frame_x = src_meta_w > 0 ? static_cast<float>(in_frame.width) / src_meta_w : 1.0f;
|
||||
const float to_frame_y = src_meta_h > 0 ? static_cast<float>(in_frame.height) / src_meta_h : 1.0f;
|
||||
for (auto& it : det->items) {
|
||||
TransformRect(it.bbox,
|
||||
tx.scale_x * to_frame_x,
|
||||
tx.scale_y * to_frame_y,
|
||||
tx.pad_x, tx.pad_y,
|
||||
out_frame.width, out_frame.height);
|
||||
}
|
||||
det->img_w = out_frame.width;
|
||||
det->img_h = out_frame.height;
|
||||
out_frame.det = std::move(det);
|
||||
}
|
||||
|
||||
if (in_frame.face_det) {
|
||||
auto face_det = std::make_shared<FaceDetResult>(*in_frame.face_det);
|
||||
const int src_meta_w = face_det->img_w > 0 ? face_det->img_w : in_frame.width;
|
||||
const int src_meta_h = face_det->img_h > 0 ? face_det->img_h : in_frame.height;
|
||||
const float to_frame_x = src_meta_w > 0 ? static_cast<float>(in_frame.width) / src_meta_w : 1.0f;
|
||||
const float to_frame_y = src_meta_h > 0 ? static_cast<float>(in_frame.height) / src_meta_h : 1.0f;
|
||||
for (auto& it : face_det->faces) {
|
||||
TransformRect(it.bbox,
|
||||
tx.scale_x * to_frame_x,
|
||||
tx.scale_y * to_frame_y,
|
||||
tx.pad_x, tx.pad_y,
|
||||
out_frame.width, out_frame.height);
|
||||
if (it.has_landmarks) {
|
||||
for (auto& lm : it.landmarks) {
|
||||
TransformPoint(lm,
|
||||
tx.scale_x * to_frame_x,
|
||||
tx.scale_y * to_frame_y,
|
||||
tx.pad_x, tx.pad_y,
|
||||
out_frame.width, out_frame.height);
|
||||
}
|
||||
}
|
||||
}
|
||||
face_det->img_w = out_frame.width;
|
||||
face_det->img_h = out_frame.height;
|
||||
out_frame.face_det = std::move(face_det);
|
||||
}
|
||||
|
||||
if (in_frame.face_recog) {
|
||||
auto face_recog = std::make_shared<FaceRecogResult>(*in_frame.face_recog);
|
||||
const int src_meta_w = face_recog->img_w > 0 ? face_recog->img_w : in_frame.width;
|
||||
const int src_meta_h = face_recog->img_h > 0 ? face_recog->img_h : in_frame.height;
|
||||
const float to_frame_x = src_meta_w > 0 ? static_cast<float>(in_frame.width) / src_meta_w : 1.0f;
|
||||
const float to_frame_y = src_meta_h > 0 ? static_cast<float>(in_frame.height) / src_meta_h : 1.0f;
|
||||
for (auto& it : face_recog->items) {
|
||||
TransformRect(it.bbox,
|
||||
tx.scale_x * to_frame_x,
|
||||
tx.scale_y * to_frame_y,
|
||||
tx.pad_x, tx.pad_y,
|
||||
out_frame.width, out_frame.height);
|
||||
if (it.has_landmarks) {
|
||||
for (auto& lm : it.landmarks) {
|
||||
TransformPoint(lm,
|
||||
tx.scale_x * to_frame_x,
|
||||
tx.scale_y * to_frame_y,
|
||||
tx.pad_x, tx.pad_y,
|
||||
out_frame.width, out_frame.height);
|
||||
}
|
||||
}
|
||||
}
|
||||
face_recog->img_w = out_frame.width;
|
||||
face_recog->img_h = out_frame.height;
|
||||
out_frame.face_recog = std::move(face_recog);
|
||||
}
|
||||
}
|
||||
|
||||
void PushToDownstream(FramePtr frame) {
|
||||
for (auto& q : output_queues_) {
|
||||
q->Push(frame);
|
||||
@ -170,7 +572,7 @@ private:
|
||||
if (frame->width == out_w && frame->height == out_h) return;
|
||||
if (!frame->det && !frame->face_det && !frame->face_recog) return;
|
||||
warned_meta_resize_ = true;
|
||||
LogWarn("[preprocess] resized frame but forwarded det/face meta without coordinate scaling; ensure det/recog/osd use same resolution (id=" + id_ + ")");
|
||||
LogInfo("[preprocess] resized frame and scaled det/face meta to destination resolution (id=" + id_ + ")");
|
||||
}
|
||||
|
||||
void ProcessPassthrough(FramePtr frame) {
|
||||
@ -185,12 +587,12 @@ private:
|
||||
int dst_w_ = 640;
|
||||
int dst_h_ = 640;
|
||||
bool keep_ratio_ = false;
|
||||
ResizeMode resize_mode_ = ResizeMode::Stretch;
|
||||
PixelFormat dst_fmt_ = PixelFormat::UNKNOWN;
|
||||
bool use_rga_ = true;
|
||||
|
||||
bool stats_log_ = false;
|
||||
uint64_t stats_interval_ = 100;
|
||||
|
||||
bool warned_meta_resize_ = false;
|
||||
|
||||
std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user