diff --git a/configs/sample_cam_ppe11.json b/configs/sample_cam_ppe11.json index 7237b74..53815f8 100644 --- a/configs/sample_cam_ppe11.json +++ b/configs/sample_cam_ppe11.json @@ -31,6 +31,7 @@ "dst_h": 768, "dst_format": "rgb", "dst_packed": true, + "resize_mode": "letterbox", "keep_ratio": false, "rga_gate": "cam_ppe11_detection", "use_rga": true diff --git a/include/frame/frame.h b/include/frame/frame.h index 23ea702..c928b86 100644 --- a/include/frame/frame.h +++ b/include/frame/frame.h @@ -44,6 +44,19 @@ struct DetectionResult { std::string model_name; }; +struct FrameTransformMeta { + bool valid = false; + bool letterbox = false; + int src_w = 0; + int src_h = 0; + int dst_w = 0; + int dst_h = 0; + float scale_x = 1.0f; + float scale_y = 1.0f; + float pad_x = 0.0f; + float pad_y = 0.0f; +}; + struct FramePlane { uint8_t* data = nullptr; int stride = 0; // bytes per row @@ -76,6 +89,7 @@ struct Frame { // Face recognition pipeline meta (kept separate from user_meta to avoid conflicts with publish). std::shared_ptr face_det; std::shared_ptr face_recog; + std::shared_ptr transform_meta; std::shared_ptr user_meta; int DmaFd() const { return buffer ? buffer->DmaFd() : dma_fd; } diff --git a/plugins/ai_yolo/ai_yolo_node.cpp b/plugins/ai_yolo/ai_yolo_node.cpp index e1fa69a..f76db2e 100644 --- a/plugins/ai_yolo/ai_yolo_node.cpp +++ b/plugins/ai_yolo/ai_yolo_node.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,8 @@ const int kAnchor1[6] = {30, 61, 62, 45, 59, 119}; const int kAnchor2[6] = {116, 90, 156, 198, 373, 326}; enum class YoloVersion { V5, V8 }; +enum class V8BoxFormat { Auto, CxCyWh, XyXy, XyWh }; +enum class V8ClsActivation { Auto, None, Sigmoid }; const char* kCocoLabels[kObjClassNum] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", @@ -51,6 +54,67 @@ inline int Clamp(float val, int min_val, int max_val) { return val > min_val ? (val < max_val ? static_cast(val) : max_val) : min_val; } +struct DetCoordContext { + bool has_transform = false; + int out_w = 0; + int out_h = 0; + float scale_x = 1.0f; + float scale_y = 1.0f; + float pad_x = 0.0f; + float pad_y = 0.0f; + float fallback_scale_w = 1.0f; + float fallback_scale_h = 1.0f; +}; + +DetCoordContext BuildDetCoordContext(const Frame& frame, int model_input_w, int model_input_h) { + DetCoordContext ctx{}; + ctx.fallback_scale_w = frame.width > 0 ? static_cast(model_input_w) / frame.width : 1.0f; + ctx.fallback_scale_h = frame.height > 0 ? static_cast(model_input_h) / frame.height : 1.0f; + ctx.out_w = frame.width; + ctx.out_h = frame.height; + + if (frame.transform_meta && frame.transform_meta->valid && + frame.transform_meta->src_w > 0 && frame.transform_meta->src_h > 0 && + frame.transform_meta->scale_x > 1e-6f && frame.transform_meta->scale_y > 1e-6f) { + ctx.has_transform = true; + ctx.out_w = frame.transform_meta->src_w; + ctx.out_h = frame.transform_meta->src_h; + ctx.scale_x = frame.transform_meta->scale_x; + ctx.scale_y = frame.transform_meta->scale_y; + ctx.pad_x = frame.transform_meta->pad_x; + ctx.pad_y = frame.transform_meta->pad_y; + } + return ctx; +} + +Rect DecodeToOutputRect(float x, float y, float w, float h, const DetCoordContext& ctx) { + float ox = x; + float oy = y; + float ow = w; + float oh = h; + + if (ctx.has_transform) { + ox = (x - ctx.pad_x) / ctx.scale_x; + oy = (y - ctx.pad_y) / ctx.scale_y; + ow = w / ctx.scale_x; + oh = h / ctx.scale_y; + } else { + ox = x / ctx.fallback_scale_w; + oy = y / ctx.fallback_scale_h; + ow = w / ctx.fallback_scale_w; + oh = h / ctx.fallback_scale_h; + } + + Rect r{}; + const int out_w = std::max(1, ctx.out_w); + const int out_h = std::max(1, ctx.out_h); + r.x = static_cast(Clamp(static_cast(ox), 0, out_w)); + r.y = static_cast(Clamp(static_cast(oy), 0, out_h)); + r.w = static_cast(Clamp(static_cast(ow), 0, out_w - static_cast(r.x))); + r.h = static_cast(Clamp(static_cast(oh), 0, out_h - static_cast(r.y))); + return r; +} + inline int32_t ClipFloat(float val, float min_val, float max_val) { return static_cast(val <= min_val ? min_val : (val >= max_val ? max_val : val)); } @@ -64,39 +128,29 @@ inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) { return (static_cast(qnt) - static_cast(zp)) * scale; } -// FP16 (half) to FP32 conversion -// IEEE 754 half-precision: 1 sign bit, 5 exponent bits, 10 mantissa bits +inline float Sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +// FP16 (half) to FP32 conversion. +// Uses arithmetic reconstruction to avoid undefined behavior on subnormals. inline float Fp16ToFp32(uint16_t h) { - uint32_t sign = (h >> 15) & 0x1; - uint32_t exp = (h >> 10) & 0x1F; - uint32_t mant = h & 0x3FF; - - uint32_t f; + const int sign = (h & 0x8000) ? -1 : 1; + const int exp = (h >> 10) & 0x1F; + const int mant = h & 0x03FF; + if (exp == 0) { - // Zero or subnormal - if (mant == 0) { - f = (sign << 31); // Signed zero - } else { - // Subnormal: convert to normal - exp = 1; - while ((mant & 0x400) == 0) { - mant <<= 1; - exp--; - } - mant &= 0x3FF; - f = (sign << 31) | ((exp + 112) << 23) | (mant << 13); - } - } else if (exp == 0x1F) { - // Infinity or NaN - f = (sign << 31) | (0xFF << 23) | (mant << 13); - } else { - // Normal number - f = (sign << 31) | ((exp + 112) << 23) | (mant << 13); + if (mant == 0) return sign < 0 ? -0.0f : 0.0f; + // subnormal: mant * 2^-24 + return static_cast(sign) * std::ldexp(static_cast(mant), -24); } - - float result; - memcpy(&result, &f, sizeof(float)); - return result; + if (exp == 0x1F) { + if (mant == 0) return sign < 0 ? -INFINITY : INFINITY; + return std::numeric_limits::quiet_NaN(); + } + // normal: (mant + 1024) * 2^(exp-25) + return static_cast(sign) * + std::ldexp(static_cast(mant + 1024), exp - 25); } float CalculateIoU(float x1_min, float y1_min, float x1_max, float y1_max, @@ -212,18 +266,239 @@ int ProcessFeatureMapV5(int8_t* input, const int* anchor, int grid_h, int grid_w return valid_count; } +uint32_t TensorTypeSizeBytes(rknn_tensor_type t) { + switch (t) { + case RKNN_TENSOR_INT8: + case RKNN_TENSOR_UINT8: + return 1; + case RKNN_TENSOR_FLOAT16: + return 2; + case RKNN_TENSOR_FLOAT32: + return 4; + default: + return 1; + } +} + +int DefaultV8NumBoxes(int model_h, int model_w) { + if (model_h <= 0 || model_w <= 0) return 0; + return (model_h / 8) * (model_w / 8) + + (model_h / 16) * (model_w / 16) + + (model_h / 32) * (model_w / 32); +} + +struct V8LayoutInfo { + int num_boxes = 0; + bool channels_first = true; // true: CxN, false: NxC +}; + +float ScoreBoxCandidate(float x, float y, float w, float h, int model_w, int model_h) { + float s = 0.0f; + if (w > 0.0f && h > 0.0f) s += 3.0f; + if (w <= model_w * 1.2f) s += 1.0f; + if (h <= model_h * 1.2f) s += 1.0f; + if (x >= -model_w * 0.1f) s += 1.0f; + if (y >= -model_h * 0.1f) s += 1.0f; + if ((x + w) <= model_w * 1.2f) s += 1.0f; + if ((y + h) <= model_h * 1.2f) s += 1.0f; + return s; +} + +bool SeemsNormalized(float a, float b, float c, float d) { + auto in_range = [](float v) { return v >= -0.05f && v <= 2.5f; }; + return in_range(a) && in_range(b) && in_range(c) && in_range(d); +} + +const char* V8BoxFormatName(V8BoxFormat fmt) { + switch (fmt) { + case V8BoxFormat::CxCyWh: return "cxcywh"; + case V8BoxFormat::XyXy: return "xyxy"; + case V8BoxFormat::XyWh: return "xywh"; + default: return "auto"; + } +} + +void DecodeV8Box(float a, float b, float c, float d, int model_w, int model_h, V8BoxFormat fmt, + float& out_x, float& out_y, float& out_w, float& out_h, V8BoxFormat* used_fmt = nullptr) { + if (SeemsNormalized(a, b, c, d)) { + a *= static_cast(model_w); + b *= static_cast(model_h); + c *= static_cast(model_w); + d *= static_cast(model_h); + } + + auto decode_cxcywh = [&](float& x, float& y, float& w, float& h) { + x = a - c / 2.0f; + y = b - d / 2.0f; + w = c; + h = d; + }; + auto decode_xyxy = [&](float& x, float& y, float& w, float& h) { + x = a; + y = b; + w = c - a; + h = d - b; + }; + auto decode_xywh = [&](float& x, float& y, float& w, float& h) { + x = a; + y = b; + w = c; + h = d; + }; + + if (fmt == V8BoxFormat::CxCyWh) { + decode_cxcywh(out_x, out_y, out_w, out_h); + if (used_fmt) *used_fmt = V8BoxFormat::CxCyWh; + return; + } + if (fmt == V8BoxFormat::XyXy) { + decode_xyxy(out_x, out_y, out_w, out_h); + if (used_fmt) *used_fmt = V8BoxFormat::XyXy; + return; + } + if (fmt == V8BoxFormat::XyWh) { + decode_xywh(out_x, out_y, out_w, out_h); + if (used_fmt) *used_fmt = V8BoxFormat::XyWh; + return; + } + + float x1 = 0.0f, y1 = 0.0f, w1 = 0.0f, h1 = 0.0f; + float x2 = 0.0f, y2 = 0.0f, w2 = 0.0f, h2 = 0.0f; + float x3 = 0.0f, y3 = 0.0f, w3 = 0.0f, h3 = 0.0f; + decode_cxcywh(x1, y1, w1, h1); + decode_xyxy(x2, y2, w2, h2); + decode_xywh(x3, y3, w3, h3); + + const float s1 = ScoreBoxCandidate(x1, y1, w1, h1, model_w, model_h); + const float s2 = ScoreBoxCandidate(x2, y2, w2, h2, model_w, model_h); + const float s3 = ScoreBoxCandidate(x3, y3, w3, h3, model_w, model_h); + if (s2 >= s1 && s2 >= s3) { + out_x = x2; out_y = y2; out_w = w2; out_h = h2; + if (used_fmt) *used_fmt = V8BoxFormat::XyXy; + } else if (s3 >= s1 && s3 >= s2) { + out_x = x3; out_y = y3; out_w = w3; out_h = h3; + if (used_fmt) *used_fmt = V8BoxFormat::XyWh; + } else { + out_x = x1; out_y = y1; out_w = w1; out_h = h1; + if (used_fmt) *used_fmt = V8BoxFormat::CxCyWh; + } +} + +bool ResolveV8ApplySigmoid(const float* output, int num_boxes, int num_classes, bool channels_first, + V8ClsActivation act_mode) { + if (act_mode == V8ClsActivation::None) return false; + if (act_mode == V8ClsActivation::Sigmoid) return true; + if (!output || num_boxes <= 0 || num_classes <= 0) return false; + + const int num_channels = 4 + num_classes; + const int sample_boxes = std::min(num_boxes, 64); + float min_v = 1e9f; + float max_v = -1e9f; + for (int i = 0; i < sample_boxes; ++i) { + for (int c = 0; c < num_classes; ++c) { + const float v = channels_first ? output[(4 + c) * num_boxes + i] + : output[i * num_channels + (4 + c)]; + if (v < min_v) min_v = v; + if (v > max_v) max_v = v; + } + } + // If class outputs clearly look like logits, enable sigmoid. + return (min_v < -0.1f || max_v > 1.5f); +} + +V8LayoutInfo ResolveV8Layout(const std::vector& dims, size_t byte_size, + rknn_tensor_type type, int num_classes, + int model_h, int model_w) { + V8LayoutInfo info; + const int num_channels = 4 + num_classes; + if (num_channels <= 0) return info; + + const uint32_t elem_bytes = TensorTypeSizeBytes(type); + const size_t total_elems = elem_bytes > 0 ? (byte_size / elem_bytes) : 0; + const size_t max_boxes_from_data = static_cast(num_channels) > 0 + ? (total_elems / static_cast(num_channels)) + : 0; + + int ch_idx = -1; + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] == static_cast(num_channels)) { + ch_idx = static_cast(i); + break; + } + } + + if (ch_idx >= 0 && total_elems >= static_cast(num_channels)) { + info.num_boxes = static_cast(max_boxes_from_data); + + int prev_non1 = 1; + for (int i = ch_idx - 1; i >= 0; --i) { + if (dims[static_cast(i)] > 1U) { + prev_non1 = static_cast(dims[static_cast(i)]); + break; + } + } + int next_non1 = 1; + for (size_t i = static_cast(ch_idx + 1); i < dims.size(); ++i) { + if (dims[i] > 1U) { + next_non1 = static_cast(dims[i]); + break; + } + } + + if (next_non1 > 1 && prev_non1 <= 1) { + info.channels_first = true; + } else if (prev_non1 > 1 && next_non1 <= 1) { + info.channels_first = false; + } else if (next_non1 > 1 && prev_non1 > 1) { + info.channels_first = next_non1 >= prev_non1; + } else { + info.channels_first = true; + } + } else if (dims.size() >= 3) { + // Compatibility with old rank-3 assumptions. + if (dims[1] == static_cast(num_channels)) { + info.num_boxes = static_cast(dims[2]); + info.channels_first = true; + } else if (dims[2] == static_cast(num_channels)) { + info.num_boxes = static_cast(dims[1]); + info.channels_first = false; + } + } + + if (info.num_boxes <= 0 && max_boxes_from_data > 0) { + info.num_boxes = static_cast(max_boxes_from_data); + } + if (info.num_boxes <= 0) { + info.num_boxes = DefaultV8NumBoxes(model_h, model_w); + } + if (info.num_boxes <= 0) { + info.num_boxes = 8400; + } + + if (max_boxes_from_data > 0 && static_cast(info.num_boxes) > max_boxes_from_data) { + info.num_boxes = static_cast(max_boxes_from_data); + } + if (info.num_boxes < 0) info.num_boxes = 0; + return info; +} + // YOLOv8 output processing (anchor-free, single output tensor) int ProcessOutputV8(float* output, int num_boxes, int num_classes, int model_h, int model_w, std::vector& boxes, std::vector& obj_probs, - std::vector& class_ids, float conf_thresh) { + std::vector& class_ids, float conf_thresh, + bool channels_first, V8BoxFormat box_format, bool apply_sigmoid, + bool debug_decode, int* debug_left) { int valid_count = 0; + const int num_channels = 4 + num_classes; for (int i = 0; i < num_boxes; ++i) { float max_score = 0.0f; int max_cls_id = 0; for (int c = 0; c < num_classes; ++c) { - float score = output[(4 + c) * num_boxes + i]; + float score = channels_first ? output[(4 + c) * num_boxes + i] + : output[i * num_channels + (4 + c)]; + if (apply_sigmoid) score = Sigmoid(score); if (score > max_score) { max_score = score; max_cls_id = c; @@ -231,13 +506,28 @@ int ProcessOutputV8(float* output, int num_boxes, int num_classes, } if (max_score >= conf_thresh) { - float cx = output[0 * num_boxes + i]; - float cy = output[1 * num_boxes + i]; - float w = output[2 * num_boxes + i]; - float h = output[3 * num_boxes + i]; - - float x1 = cx - w / 2.0f; - float y1 = cy - h / 2.0f; + const float a = channels_first ? output[0 * num_boxes + i] : output[i * num_channels + 0]; + const float b = channels_first ? output[1 * num_boxes + i] : output[i * num_channels + 1]; + const float c = channels_first ? output[2 * num_boxes + i] : output[i * num_channels + 2]; + const float d = channels_first ? output[3 * num_boxes + i] : output[i * num_channels + 3]; + if (!std::isfinite(a) || !std::isfinite(b) || !std::isfinite(c) || !std::isfinite(d)) { + continue; + } + float x1 = 0.0f, y1 = 0.0f, w = 0.0f, h = 0.0f; + V8BoxFormat used_fmt = box_format; + DecodeV8Box(a, b, c, d, model_w, model_h, box_format, x1, y1, w, h, &used_fmt); + if (!std::isfinite(x1) || !std::isfinite(y1) || !std::isfinite(w) || !std::isfinite(h)) { + continue; + } + if (w <= 1e-3f || h <= 1e-3f) continue; + if (debug_decode && debug_left && *debug_left > 0) { + --(*debug_left); + LogInfo("[ai_yolo] v8 decode f32: raw4(" + std::to_string(a) + "," + std::to_string(b) + "," + + std::to_string(c) + "," + std::to_string(d) + ") fmt=" + V8BoxFormatName(used_fmt) + + " -> xywh(" + std::to_string(x1) + "," + std::to_string(y1) + "," + + std::to_string(w) + "," + std::to_string(h) + ") cls=" + + std::to_string(max_cls_id) + " score=" + std::to_string(max_score)); + } boxes.push_back(x1); boxes.push_back(y1); @@ -256,15 +546,18 @@ int ProcessOutputV8Int8(int8_t* output, int num_boxes, int num_classes, int model_h, int model_w, std::vector& boxes, std::vector& obj_probs, std::vector& class_ids, float conf_thresh, - int32_t zp, float scale) { + int32_t zp, float scale, bool channels_first, V8BoxFormat box_format, + bool debug_decode, int* debug_left) { int valid_count = 0; int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh, zp, scale); + const int num_channels = 4 + num_classes; for (int i = 0; i < num_boxes; ++i) { int8_t max_score_i8 = -128; int max_cls_id = 0; for (int c = 0; c < num_classes; ++c) { - int8_t score = output[(4 + c) * num_boxes + i]; + int8_t score = channels_first ? output[(4 + c) * num_boxes + i] + : output[i * num_channels + (4 + c)]; if (score > max_score_i8) { max_score_i8 = score; max_cls_id = c; @@ -272,14 +565,33 @@ int ProcessOutputV8Int8(int8_t* output, int num_boxes, int num_classes, } if (max_score_i8 >= thresh_i8) { - float cx = DequantizeAffineToF32(output[0 * num_boxes + i], zp, scale); - float cy = DequantizeAffineToF32(output[1 * num_boxes + i], zp, scale); - float w = DequantizeAffineToF32(output[2 * num_boxes + i], zp, scale); - float h = DequantizeAffineToF32(output[3 * num_boxes + i], zp, scale); + float a = DequantizeAffineToF32( + channels_first ? output[0 * num_boxes + i] : output[i * num_channels + 0], zp, scale); + float b = DequantizeAffineToF32( + channels_first ? output[1 * num_boxes + i] : output[i * num_channels + 1], zp, scale); + float c = DequantizeAffineToF32( + channels_first ? output[2 * num_boxes + i] : output[i * num_channels + 2], zp, scale); + float d = DequantizeAffineToF32( + channels_first ? output[3 * num_boxes + i] : output[i * num_channels + 3], zp, scale); float max_score = DequantizeAffineToF32(max_score_i8, zp, scale); - - float x1 = cx - w / 2.0f; - float y1 = cy - h / 2.0f; + if (!std::isfinite(a) || !std::isfinite(b) || !std::isfinite(c) || !std::isfinite(d)) { + continue; + } + float x1 = 0.0f, y1 = 0.0f, w = 0.0f, h = 0.0f; + V8BoxFormat used_fmt = box_format; + DecodeV8Box(a, b, c, d, model_w, model_h, box_format, x1, y1, w, h, &used_fmt); + if (!std::isfinite(x1) || !std::isfinite(y1) || !std::isfinite(w) || !std::isfinite(h)) { + continue; + } + if (w <= 1e-3f || h <= 1e-3f) continue; + if (debug_decode && debug_left && *debug_left > 0) { + --(*debug_left); + LogInfo("[ai_yolo] v8 decode int8: raw4(" + std::to_string(a) + "," + std::to_string(b) + "," + + std::to_string(c) + "," + std::to_string(d) + ") fmt=" + V8BoxFormatName(used_fmt) + + " -> xywh(" + std::to_string(x1) + "," + std::to_string(y1) + "," + + std::to_string(w) + "," + std::to_string(h) + ") cls=" + + std::to_string(max_cls_id) + " score=" + std::to_string(max_score)); + } boxes.push_back(x1); boxes.push_back(y1); @@ -309,6 +621,28 @@ public: model_input_w_ = config.ValueOr("model_w", 640); model_input_h_ = config.ValueOr("model_h", 640); num_classes_ = config.ValueOr("num_classes", 80); + { + const std::string bf = config.ValueOr("v8_box_format", "cxcywh"); + if (bf == "xyxy") { + v8_box_format_ = V8BoxFormat::XyXy; + } else if (bf == "xywh") { + v8_box_format_ = V8BoxFormat::XyWh; + } else if (bf == "cxcywh") { + v8_box_format_ = V8BoxFormat::CxCyWh; + } else { + v8_box_format_ = V8BoxFormat::Auto; + } + } + { + const std::string act = config.ValueOr("v8_cls_activation", "auto"); + if (act == "sigmoid") { + v8_cls_activation_ = V8ClsActivation::Sigmoid; + } else if (act == "none") { + v8_cls_activation_ = V8ClsActivation::None; + } else { + v8_cls_activation_ = V8ClsActivation::Auto; + } + } if (const SimpleJson* dbg = config.Find("debug"); dbg && dbg->IsObject()) { stats_log_ = dbg->ValueOr("stats", stats_log_); @@ -432,10 +766,9 @@ public: PushToDownstream(frame); ++processed_; - // Stats log disabled to reduce log spam - // if (stats_log_ && stats_interval_ > 0 && (processed_ % stats_interval_) == 0) { - // LogInfo("[ai_yolo] processed=" + std::to_string(processed_) + " id=" + id_); - // } + if (stats_log_ && stats_interval_ > 0 && (processed_ % stats_interval_) == 0) { + LogInfo("[ai_yolo] processed=" + std::to_string(processed_) + " id=" + id_); + } return NodeStatus::OK; } @@ -540,29 +873,42 @@ private: outputs[2].zp, outputs[2].scale); valid_count = cnt0 + cnt1 + cnt2; } else { - if (outputs.empty()) return; + if (outputs.empty()) return; if (!outputs[0].data || outputs[0].size == 0) return; - int num_boxes = 0; - int num_channels = 4 + num_classes_; - - if (outputs[0].dims.size() >= 3) { - if (outputs[0].dims[1] == static_cast(num_channels)) { - num_boxes = static_cast(outputs[0].dims[2]); - } else if (outputs[0].dims[2] == static_cast(num_channels)) { - num_boxes = static_cast(outputs[0].dims[1]); - } else { - num_boxes = 8400; + const V8LayoutInfo layout = ResolveV8Layout(outputs[0].dims, outputs[0].size, + outputs[0].type, num_classes_, + model_input_h_, model_input_w_); + const int num_boxes = layout.num_boxes; + int debug_decode_left = (debug_det_ && processed_ < 20) ? 5 : 0; + if (num_boxes <= 0) return; + if (debug_det_ && processed_ < 5) { + std::string dims_s; + for (size_t di = 0; di < outputs[0].dims.size(); ++di) { + dims_s += (di == 0 ? "[" : ","); + dims_s += std::to_string(outputs[0].dims[di]); } - } else { - num_boxes = static_cast(outputs[0].size) / num_channels; + dims_s += "]"; + LogInfo("[ai_yolo] v8 out type=" + std::to_string(static_cast(outputs[0].type)) + + " size=" + std::to_string(outputs[0].size) + + " dims=" + dims_s + + " num_boxes=" + std::to_string(num_boxes) + + " layout=" + std::string(layout.channels_first ? "CxN" : "NxC")); } if (outputs[0].type == RKNN_TENSOR_FLOAT32) { + const bool apply_sigmoid = ResolveV8ApplySigmoid( + reinterpret_cast(const_cast(outputs[0].data)), + num_boxes, num_classes_, layout.channels_first, v8_cls_activation_); + if (debug_det_ && processed_ < 5) { + LogInfo("[ai_yolo] v8 cls activation=" + std::string(apply_sigmoid ? "sigmoid" : "none")); + } valid_count = ProcessOutputV8(reinterpret_cast(const_cast(outputs[0].data)), num_boxes, num_classes_, model_input_h_, model_input_w_, - boxes, obj_probs, class_ids, conf_thresh_); + boxes, obj_probs, class_ids, conf_thresh_, + layout.channels_first, v8_box_format_, apply_sigmoid, + debug_det_, &debug_decode_left); } else if (outputs[0].type == RKNN_TENSOR_FLOAT16) { // Convert FP16 to FP32 size_t num_elements = outputs[0].size / sizeof(uint16_t); @@ -571,16 +917,25 @@ private: for (size_t i = 0; i < num_elements; ++i) { fp32_buffer_[i] = Fp16ToFp32(fp16_data[i]); } + const bool apply_sigmoid = ResolveV8ApplySigmoid( + fp32_buffer_.data(), num_boxes, num_classes_, layout.channels_first, v8_cls_activation_); + if (debug_det_ && processed_ < 5) { + LogInfo("[ai_yolo] v8 cls activation=" + std::string(apply_sigmoid ? "sigmoid" : "none")); + } valid_count = ProcessOutputV8(fp32_buffer_.data(), num_boxes, num_classes_, model_input_h_, model_input_w_, - boxes, obj_probs, class_ids, conf_thresh_); + boxes, obj_probs, class_ids, conf_thresh_, + layout.channels_first, v8_box_format_, apply_sigmoid, + debug_det_, &debug_decode_left); } else { valid_count = ProcessOutputV8Int8(reinterpret_cast(const_cast(outputs[0].data)), num_boxes, num_classes_, model_input_h_, model_input_w_, boxes, obj_probs, class_ids, conf_thresh_, - outputs[0].zp, outputs[0].scale); + outputs[0].zp, outputs[0].scale, + layout.channels_first, v8_box_format_, + debug_det_, &debug_decode_left); } } @@ -596,12 +951,11 @@ private: NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_); } - float scale_w = static_cast(model_input_w_) / frame->width; - float scale_h = static_cast(model_input_h_) / frame->height; + const DetCoordContext coord_ctx = BuildDetCoordContext(*frame, model_input_w_, model_input_h_); auto det_result = std::make_shared(); - det_result->img_w = frame->width; - det_result->img_h = frame->height; + det_result->img_w = coord_ctx.out_w; + det_result->img_h = coord_ctx.out_h; det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8"; for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) { @@ -621,22 +975,23 @@ private: Detection det; det.cls_id = cls_id; det.score = obj_probs[i]; - det.bbox.x = static_cast(Clamp(static_cast(x1 / scale_w), 0, frame->width)); - det.bbox.y = static_cast(Clamp(static_cast(y1 / scale_h), 0, frame->height)); - det.bbox.w = static_cast(Clamp(static_cast(w / scale_w), 0, frame->width - static_cast(det.bbox.x))); - det.bbox.h = static_cast(Clamp(static_cast(h / scale_h), 0, frame->height - static_cast(det.bbox.y))); + det.bbox = DecodeToOutputRect(x1, y1, w, h, coord_ctx); det.track_id = -1; - if (debug_det_ && det_result->items.size() < 3 && processed_ < 10) { - LogDebug("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," + - std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" + - std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," + - std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" + - std::to_string(cls_id) + " score=" + std::to_string(det.score)); + if (debug_det_ && det_result->items.size() < 5 && processed_ < 20) { + LogInfo("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," + + std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" + + std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," + + std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" + + std::to_string(cls_id) + " score=" + std::to_string(det.score)); } det_result->items.push_back(det); } + if (debug_det_ && processed_ < 20) { + LogInfo("[ai_yolo] det summary: valid_count=" + std::to_string(valid_count) + + " final=" + std::to_string(det_result->items.size())); + } frame->det = det_result; } @@ -671,33 +1026,66 @@ private: } else { if (outputs.empty()) return; - int num_boxes = 0; - int num_channels = 4 + num_classes_; - - if (outputs[0].dims.size() >= 3) { - if (outputs[0].dims[1] == static_cast(num_channels)) { - num_boxes = outputs[0].dims[2]; - } else if (outputs[0].dims[2] == static_cast(num_channels)) { - num_boxes = outputs[0].dims[1]; - } else { - num_boxes = 8400; + const V8LayoutInfo layout = ResolveV8Layout(outputs[0].dims, outputs[0].data.size(), + outputs[0].type, num_classes_, + model_input_h_, model_input_w_); + const int num_boxes = layout.num_boxes; + int debug_decode_left = (debug_det_ && processed_ < 20) ? 5 : 0; + if (num_boxes <= 0) return; + if (debug_det_ && processed_ < 5) { + std::string dims_s; + for (size_t di = 0; di < outputs[0].dims.size(); ++di) { + dims_s += (di == 0 ? "[" : ","); + dims_s += std::to_string(outputs[0].dims[di]); } - } else { - num_boxes = outputs[0].data.size() / num_channels; + dims_s += "]"; + LogInfo("[ai_yolo] v8 out(type copy) type=" + std::to_string(static_cast(outputs[0].type)) + + " size=" + std::to_string(outputs[0].data.size()) + + " dims=" + dims_s + + " num_boxes=" + std::to_string(num_boxes) + + " layout=" + std::string(layout.channels_first ? "CxN" : "NxC")); } - if (outputs[0].type == RKNN_TENSOR_FLOAT32 || - outputs[0].type == RKNN_TENSOR_FLOAT16) { + if (outputs[0].type == RKNN_TENSOR_FLOAT32) { + const bool apply_sigmoid = ResolveV8ApplySigmoid( + reinterpret_cast(outputs[0].data.data()), + num_boxes, num_classes_, layout.channels_first, v8_cls_activation_); + if (debug_det_ && processed_ < 5) { + LogInfo("[ai_yolo] v8 cls activation(copy)=" + std::string(apply_sigmoid ? "sigmoid" : "none")); + } valid_count = ProcessOutputV8(reinterpret_cast(outputs[0].data.data()), num_boxes, num_classes_, model_input_h_, model_input_w_, - boxes, obj_probs, class_ids, conf_thresh_); + boxes, obj_probs, class_ids, conf_thresh_, + layout.channels_first, v8_box_format_, apply_sigmoid, + debug_det_, &debug_decode_left); + } else if (outputs[0].type == RKNN_TENSOR_FLOAT16) { + // Convert FP16 to FP32 + size_t num_elements = outputs[0].data.size() / sizeof(uint16_t); + fp32_buffer_.resize(num_elements); + const uint16_t* fp16_data = reinterpret_cast(outputs[0].data.data()); + for (size_t i = 0; i < num_elements; ++i) { + fp32_buffer_[i] = Fp16ToFp32(fp16_data[i]); + } + const bool apply_sigmoid = ResolveV8ApplySigmoid( + fp32_buffer_.data(), num_boxes, num_classes_, layout.channels_first, v8_cls_activation_); + if (debug_det_ && processed_ < 5) { + LogInfo("[ai_yolo] v8 cls activation(copy)=" + std::string(apply_sigmoid ? "sigmoid" : "none")); + } + valid_count = ProcessOutputV8(fp32_buffer_.data(), + num_boxes, num_classes_, + model_input_h_, model_input_w_, + boxes, obj_probs, class_ids, conf_thresh_, + layout.channels_first, v8_box_format_, apply_sigmoid, + debug_det_, &debug_decode_left); } else { valid_count = ProcessOutputV8Int8(reinterpret_cast(outputs[0].data.data()), num_boxes, num_classes_, model_input_h_, model_input_w_, boxes, obj_probs, class_ids, conf_thresh_, - outputs[0].zp, outputs[0].scale); + outputs[0].zp, outputs[0].scale, + layout.channels_first, v8_box_format_, + debug_det_, &debug_decode_left); } } @@ -713,12 +1101,11 @@ private: NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_); } - float scale_w = static_cast(model_input_w_) / frame->width; - float scale_h = static_cast(model_input_h_) / frame->height; + const DetCoordContext coord_ctx = BuildDetCoordContext(*frame, model_input_w_, model_input_h_); auto det_result = std::make_shared(); - det_result->img_w = frame->width; - det_result->img_h = frame->height; + det_result->img_w = coord_ctx.out_w; + det_result->img_h = coord_ctx.out_h; det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8"; for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) { @@ -738,23 +1125,23 @@ private: Detection det; det.cls_id = cls_id; det.score = obj_probs[i]; - det.bbox.x = static_cast(Clamp(static_cast(x1 / scale_w), 0, frame->width)); - det.bbox.y = static_cast(Clamp(static_cast(y1 / scale_h), 0, frame->height)); - det.bbox.w = static_cast(Clamp(static_cast(w / scale_w), 0, frame->width - static_cast(det.bbox.x))); - det.bbox.h = static_cast(Clamp(static_cast(h / scale_h), 0, frame->height - static_cast(det.bbox.y))); + det.bbox = DecodeToOutputRect(x1, y1, w, h, coord_ctx); det.track_id = -1; - // Debug output for first few detections - if (debug_det_ && det_result->items.size() < 3 && processed_ < 10) { - LogDebug("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," + - std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" + - std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," + - std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" + - std::to_string(cls_id) + " score=" + std::to_string(det.score)); + if (debug_det_ && det_result->items.size() < 5 && processed_ < 20) { + LogInfo("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," + + std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" + + std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," + + std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" + + std::to_string(cls_id) + " score=" + std::to_string(det.score)); } det_result->items.push_back(det); } + if (debug_det_ && processed_ < 20) { + LogInfo("[ai_yolo] det summary(copy): valid_count=" + std::to_string(valid_count) + + " final=" + std::to_string(det_result->items.size())); + } frame->det = det_result; } @@ -767,6 +1154,8 @@ private: int model_input_w_ = 640; int model_input_h_ = 640; int num_classes_ = 80; + V8BoxFormat v8_box_format_ = V8BoxFormat::CxCyWh; + V8ClsActivation v8_cls_activation_ = V8ClsActivation::Auto; YoloVersion yolo_version_ = YoloVersion::V8; bool auto_detect_version_ = false; std::set class_filter_; diff --git a/plugins/osd/osd_node.cpp b/plugins/osd/osd_node.cpp index 4841924..d3dfed1 100644 --- a/plugins/osd/osd_node.cpp +++ b/plugins/osd/osd_node.cpp @@ -63,6 +63,20 @@ inline int Clamp(int val, int min_val, int max_val) { return val < min_val ? min_val : (val > max_val ? max_val : val); } +Rect MapRectToFrame(const Rect& in, int src_w, int src_h, int dst_w, int dst_h) { + if (src_w <= 0 || src_h <= 0 || dst_w <= 0 || dst_h <= 0) return Rect{}; + const float sx = static_cast(dst_w) / static_cast(src_w); + const float sy = static_cast(dst_h) / static_cast(src_h); + Rect out{}; + out.x = std::max(0.0f, in.x * sx); + out.y = std::max(0.0f, in.y * sy); + out.w = std::max(0.0f, in.w * sx); + out.h = std::max(0.0f, in.h * sy); + if (out.x + out.w > static_cast(dst_w)) out.w = std::max(0.0f, static_cast(dst_w) - out.x); + if (out.y + out.h > static_cast(dst_h)) out.h = std::max(0.0f, static_cast(dst_h) - out.y); + return out; +} + #if defined(RK3588_ENABLE_RGA) inline uint32_t PackColorArgb(const Color& c) { return (0xFFu << 24) | (static_cast(c.r) << 16) | @@ -561,6 +575,9 @@ private: int w = frame->width; int h = frame->height; + const int det_w = frame->det->img_w > 0 ? frame->det->img_w : w; + const int det_h = frame->det->img_h > 0 ? frame->det->img_h : h; + const bool map_det_to_frame = (det_w != w || det_h != h); uint8_t* data = frame->planes[0].data ? frame->planes[0].data : frame->data; PixelFormat fmt = frame->format; @@ -587,10 +604,11 @@ private: } else { bool ok = true; for (const auto& det : frame->det->items) { - int x = Clamp(static_cast(det.bbox.x), 0, w - 1); - int y = Clamp(static_cast(det.bbox.y), 0, h - 1); - int rw = static_cast(det.bbox.w); - int rh = static_cast(det.bbox.h); + const Rect draw = map_det_to_frame ? MapRectToFrame(det.bbox, det_w, det_h, w, h) : det.bbox; + int x = Clamp(static_cast(draw.x), 0, w - 1); + int y = Clamp(static_cast(draw.y), 0, h - 1); + int rw = static_cast(draw.w); + int rh = static_cast(draw.h); rw = Clamp(rw, 1, w - x); rh = Clamp(rh, 1, h - y); im_rect rect{x, y, rw, rh}; @@ -636,10 +654,11 @@ private: } for (const auto& det : frame->det->items) { - int x1 = static_cast(det.bbox.x); - int y1 = static_cast(det.bbox.y); - int x2 = static_cast(det.bbox.x + det.bbox.w); - int y2 = static_cast(det.bbox.y + det.bbox.h); + const Rect draw = map_det_to_frame ? MapRectToFrame(det.bbox, det_w, det_h, w, h) : det.bbox; + int x1 = static_cast(draw.x); + int y1 = static_cast(draw.y); + int x2 = static_cast(draw.x + draw.w); + int y2 = static_cast(draw.y + draw.h); Color color = GetClassColor(det.cls_id); diff --git a/plugins/preprocess/preprocess_node.cpp b/plugins/preprocess/preprocess_node.cpp index 36538c7..d55ad38 100644 --- a/plugins/preprocess/preprocess_node.cpp +++ b/plugins/preprocess/preprocess_node.cpp @@ -1,16 +1,27 @@ #include +#include #include +#include +#include #include #include +#include "face/face_result.h" #include "hw/i_image_processor.h" #include "node.h" +#include "utils/dma_alloc.h" #include "utils/logger.h" namespace rk3588 { namespace { +enum class ResizeMode { + Stretch, + KeepRatio, + Letterbox, +}; + PixelFormat ParseFormat(const std::string& s) { if (s == "nv12" || s == "NV12") return PixelFormat::NV12; if (s == "yuv420" || s == "YUV420") return PixelFormat::YUV420; @@ -19,6 +30,229 @@ PixelFormat ParseFormat(const std::string& s) { return PixelFormat::UNKNOWN; } +ResizeMode ParseResizeMode(const std::string& s, bool keep_ratio) { + if (s == "stretch") return ResizeMode::Stretch; + if (s == "keep_ratio" || s == "fit") return ResizeMode::KeepRatio; + if (s == "letterbox") return ResizeMode::Letterbox; + return keep_ratio ? ResizeMode::KeepRatio : ResizeMode::Stretch; +} + +inline bool IsYuvFormat(PixelFormat fmt) { + return fmt == PixelFormat::NV12 || fmt == PixelFormat::YUV420; +} + +inline float ClampFloat(float v, float lo, float hi) { + return std::max(lo, std::min(v, hi)); +} + +inline int MakeEvenFloor(int v) { + if (v <= 0) return 0; + return v & ~1; +} + +size_t CalcImageSize(int w, int h, PixelFormat fmt) { + if (w <= 0 || h <= 0) return 0; + switch (fmt) { + case PixelFormat::NV12: + case PixelFormat::YUV420: + return static_cast(w) * static_cast(h) * 3 / 2; + case PixelFormat::RGB: + case PixelFormat::BGR: + return static_cast(w) * static_cast(h) * 3; + default: + return 0; + } +} + +void SetupPlanes(Frame& f) { + if (!f.data || f.width <= 0 || f.height <= 0) return; + if (f.format == PixelFormat::NV12) { + const int y_stride = f.width; + const int y_size = y_stride * f.height; + const int uv_size = y_stride * (f.height / 2); + f.stride = y_stride; + f.plane_count = 2; + f.planes[0] = {f.data, y_stride, y_size, 0}; + f.planes[1] = {f.data + y_size, y_stride, uv_size, y_size}; + } else if (f.format == PixelFormat::YUV420) { + const int y_stride = f.width; + const int y_size = y_stride * f.height; + const int uv_stride = f.width / 2; + const int u_size = uv_stride * (f.height / 2); + f.stride = y_stride; + f.plane_count = 3; + f.planes[0] = {f.data, y_stride, y_size, 0}; + f.planes[1] = {f.data + y_size, uv_stride, u_size, y_size}; + f.planes[2] = {f.data + y_size + u_size, uv_stride, u_size, y_size + u_size}; + } else { + const int stride = f.width * 3; + f.stride = stride; + f.plane_count = 1; + f.planes[0] = {f.data, stride, static_cast(f.data_size), 0}; + } + f.SyncBufferFromFrame(); +} + +bool InitFrameStorage(Frame& f) { + const size_t need = CalcImageSize(f.width, f.height, f.format); + if (need == 0) return false; + + if (auto dma = DmaAlloc(need); dma && dma->valid()) { + f.SetDmaFd(dma->fd); + f.data = dma->data(); + f.data_size = dma->size; + f.SetOwner(dma); + SetupPlanes(f); + return true; + } + + auto buf = std::make_shared>(need); + f.SetDmaFd(-1); + f.data = buf->data(); + f.data_size = buf->size(); + f.SetOwner(buf); + SetupPlanes(f); + return true; +} + +void FillBlack(Frame& f) { + if (!f.data || f.data_size == 0) return; + if (f.DmaFd() >= 0) { + DmaSyncStartFd(f.DmaFd()); + } + if (f.format == PixelFormat::NV12) { + const int y_size = f.width * f.height; + std::memset(f.data, 0, static_cast(y_size)); + std::memset(f.data + y_size, 128, static_cast(f.width * f.height / 2)); + } else if (f.format == PixelFormat::YUV420) { + const int y_size = f.width * f.height; + const int u_size = (f.width / 2) * (f.height / 2); + std::memset(f.data, 0, static_cast(y_size)); + std::memset(f.data + y_size, 128, static_cast(u_size)); + std::memset(f.data + y_size + u_size, 128, static_cast(u_size)); + } else { + std::memset(f.data, 0, f.data_size); + } + if (f.DmaFd() >= 0) { + DmaSyncEndFd(f.DmaFd()); + } +} + +bool BlitLetterbox(const Frame& src, Frame& dst, int pad_x, int pad_y) { + if (!src.data || !dst.data || src.format != dst.format) return false; + if (pad_x < 0 || pad_y < 0) return false; + if (src.width + pad_x > dst.width || src.height + pad_y > dst.height) return false; + + if (src.DmaFd() >= 0) src.SyncStart(); + if (dst.DmaFd() >= 0) DmaSyncStartFd(dst.DmaFd()); + + if (src.format == PixelFormat::RGB || src.format == PixelFormat::BGR) { + const int src_stride = src.planes[0].stride > 0 ? src.planes[0].stride : src.width * 3; + const int dst_stride = dst.planes[0].stride > 0 ? dst.planes[0].stride : dst.width * 3; + const uint8_t* src_ptr = src.planes[0].data ? src.planes[0].data : src.data; + uint8_t* dst_ptr = dst.planes[0].data ? dst.planes[0].data : dst.data; + const size_t row_bytes = static_cast(src.width) * 3; + for (int y = 0; y < src.height; ++y) { + std::memcpy(dst_ptr + static_cast(y + pad_y) * dst_stride + static_cast(pad_x) * 3, + src_ptr + static_cast(y) * src_stride, + row_bytes); + } + } else if (src.format == PixelFormat::NV12) { + const int src_y_stride = src.planes[0].stride > 0 ? src.planes[0].stride : src.width; + const int src_uv_stride = src.planes[1].stride > 0 ? src.planes[1].stride : src.width; + const int dst_y_stride = dst.planes[0].stride > 0 ? dst.planes[0].stride : dst.width; + const int dst_uv_stride = dst.planes[1].stride > 0 ? dst.planes[1].stride : dst.width; + const uint8_t* src_y = src.planes[0].data ? src.planes[0].data : src.data; + const uint8_t* src_uv = src.planes[1].data ? src.planes[1].data : (src.data + src.width * src.height); + uint8_t* dst_y = dst.planes[0].data ? dst.planes[0].data : dst.data; + uint8_t* dst_uv = dst.planes[1].data ? dst.planes[1].data : (dst.data + dst.width * dst.height); + + for (int y = 0; y < src.height; ++y) { + std::memcpy(dst_y + static_cast(y + pad_y) * dst_y_stride + pad_x, + src_y + static_cast(y) * src_y_stride, + static_cast(src.width)); + } + + const int uv_rows = src.height / 2; + for (int y = 0; y < uv_rows; ++y) { + std::memcpy(dst_uv + static_cast(y + pad_y / 2) * dst_uv_stride + pad_x, + src_uv + static_cast(y) * src_uv_stride, + static_cast(src.width)); + } + } else if (src.format == PixelFormat::YUV420) { + const int src_y_stride = src.planes[0].stride > 0 ? src.planes[0].stride : src.width; + const int src_u_stride = src.planes[1].stride > 0 ? src.planes[1].stride : src.width / 2; + const int src_v_stride = src.planes[2].stride > 0 ? src.planes[2].stride : src.width / 2; + const int dst_y_stride = dst.planes[0].stride > 0 ? dst.planes[0].stride : dst.width; + const int dst_u_stride = dst.planes[1].stride > 0 ? dst.planes[1].stride : dst.width / 2; + const int dst_v_stride = dst.planes[2].stride > 0 ? dst.planes[2].stride : dst.width / 2; + const uint8_t* src_y = src.planes[0].data ? src.planes[0].data : src.data; + const uint8_t* src_u = src.planes[1].data ? src.planes[1].data : (src.data + src.width * src.height); + const uint8_t* src_v = src.planes[2].data ? src.planes[2].data : (src_u + (src.width / 2) * (src.height / 2)); + uint8_t* dst_y = dst.planes[0].data ? dst.planes[0].data : dst.data; + uint8_t* dst_u = dst.planes[1].data ? dst.planes[1].data : (dst.data + dst.width * dst.height); + uint8_t* dst_v = dst.planes[2].data ? dst.planes[2].data : (dst_u + (dst.width / 2) * (dst.height / 2)); + + for (int y = 0; y < src.height; ++y) { + std::memcpy(dst_y + static_cast(y + pad_y) * dst_y_stride + pad_x, + src_y + static_cast(y) * src_y_stride, + static_cast(src.width)); + } + + const int uv_rows = src.height / 2; + const int uv_pad_x = pad_x / 2; + const int uv_pad_y = pad_y / 2; + const int uv_cols = src.width / 2; + for (int y = 0; y < uv_rows; ++y) { + std::memcpy(dst_u + static_cast(y + uv_pad_y) * dst_u_stride + uv_pad_x, + src_u + static_cast(y) * src_u_stride, + static_cast(uv_cols)); + std::memcpy(dst_v + static_cast(y + uv_pad_y) * dst_v_stride + uv_pad_x, + src_v + static_cast(y) * src_v_stride, + static_cast(uv_cols)); + } + } else { + if (src.DmaFd() >= 0) src.SyncEnd(); + if (dst.DmaFd() >= 0) DmaSyncEndFd(dst.DmaFd()); + return false; + } + + if (dst.DmaFd() >= 0) DmaSyncEndFd(dst.DmaFd()); + if (src.DmaFd() >= 0) src.SyncEnd(); + return true; +} + +void TransformRect(Rect& r, float sx, float sy, float tx, float ty, int out_w, int out_h) { + if (out_w <= 0 || out_h <= 0) { + r = Rect{}; + return; + } + const float fw = static_cast(out_w); + const float fh = static_cast(out_h); + + const float x = ClampFloat(r.x * sx + tx, 0.0f, fw); + const float y = ClampFloat(r.y * sy + ty, 0.0f, fh); + float w = std::max(0.0f, r.w * sx); + float h = std::max(0.0f, r.h * sy); + + if (x + w > fw) w = std::max(0.0f, fw - x); + if (y + h > fh) h = std::max(0.0f, fh - y); + + r.x = x; + r.y = y; + r.w = w; + r.h = h; +} + +void TransformPoint(Point2f& p, float sx, float sy, float tx, float ty, int out_w, int out_h) { + if (out_w <= 0 || out_h <= 0) { + p = Point2f{}; + return; + } + p.x = ClampFloat(p.x * sx + tx, 0.0f, static_cast(out_w)); + p.y = ClampFloat(p.y * sy + ty, 0.0f, static_cast(out_h)); +} + } // namespace class PreprocessNode : public INode { @@ -31,6 +265,7 @@ public: dst_w_ = config.ValueOr("dst_w", 640); dst_h_ = config.ValueOr("dst_h", 640); keep_ratio_ = config.ValueOr("keep_ratio", false); + resize_mode_ = ParseResizeMode(config.ValueOr("resize_mode", ""), keep_ratio_); std::string fmt_str = config.ValueOr("dst_format", ""); if (!fmt_str.empty()) { @@ -81,8 +316,11 @@ public: } bool Start() override { + std::string mode = "stretch"; + if (resize_mode_ == ResizeMode::KeepRatio) mode = "keep_ratio"; + if (resize_mode_ == ResizeMode::Letterbox) mode = "letterbox"; LogInfo("[preprocess] start id=" + id_ + " dst=" + std::to_string(dst_w_) + "x" + - std::to_string(dst_h_) + (use_rga_ ? " (rga)" : " (swscale)")); + std::to_string(dst_h_) + " mode=" + mode + (use_rga_ ? " (rga)" : " (swscale)")); return true; } @@ -95,52 +333,80 @@ public: PixelFormat out_fmt = (dst_fmt_ != PixelFormat::UNKNOWN) ? dst_fmt_ : frame->format; int out_w = dst_w_; int out_h = dst_h_; - if (out_w <= 0) out_w = frame->width; if (out_h <= 0) out_h = frame->height; - if (keep_ratio_ && dst_w_ > 0 && dst_h_ > 0 && frame->width > 0 && frame->height > 0) { - float scale = std::min(static_cast(dst_w_) / frame->width, - static_cast(dst_h_) / frame->height); - out_w = static_cast(frame->width * scale); - out_h = static_cast(frame->height * scale); - out_w = (out_w + 1) & ~1; - out_h = (out_h + 1) & ~1; - } - - const bool need_resize = (frame->width != out_w || frame->height != out_h); - const bool need_cvt = (frame->format != out_fmt); - - if (need_resize) { - WarnMetaResizeOnce(frame, out_w, out_h); - } - - if (!need_resize && !need_cvt) { - ProcessPassthrough(frame); - return NodeStatus::OK; - } + FrameTransformMeta tx{}; + tx.valid = true; + tx.src_w = frame->width; + tx.src_h = frame->height; Frame out; - out.width = out_w; - out.height = out_h; - out.format = out_fmt; + if (resize_mode_ == ResizeMode::Letterbox && dst_w_ > 0 && dst_h_ > 0 && + frame->width > 0 && frame->height > 0) { + Status lb = BuildLetterbox(*frame, out_fmt, out_w, out_h, out, tx); + if (lb.Failed()) { + LogError("[preprocess] letterbox failed: " + lb.ErrMessage()); + return NodeStatus::ERROR; + } + } else { + if (resize_mode_ == ResizeMode::KeepRatio && dst_w_ > 0 && dst_h_ > 0 && + frame->width > 0 && frame->height > 0) { + float scale = std::min(static_cast(dst_w_) / frame->width, + static_cast(dst_h_) / frame->height); + out_w = static_cast(std::round(frame->width * scale)); + out_h = static_cast(std::round(frame->height * scale)); + if (IsYuvFormat(out_fmt)) { + out_w = std::max(2, MakeEvenFloor(out_w)); + out_h = std::max(2, MakeEvenFloor(out_h)); + } + if (out_w <= 0) out_w = frame->width; + if (out_h <= 0) out_h = frame->height; + } - Status st = image_processor_->Resize(*frame, out); - if (st.Failed()) { - if (!use_rga_ && st.ErrMessage().find("unsupported format") != std::string::npos) { + const bool need_resize = (frame->width != out_w || frame->height != out_h); + const bool need_cvt = (frame->format != out_fmt); + + tx.letterbox = false; + tx.dst_w = out_w; + tx.dst_h = out_h; + tx.scale_x = frame->width > 0 ? static_cast(out_w) / frame->width : 1.0f; + tx.scale_y = frame->height > 0 ? static_cast(out_h) / frame->height : 1.0f; + tx.pad_x = 0.0f; + tx.pad_y = 0.0f; + + if (!need_resize && !need_cvt) { + auto t = std::make_shared(tx); + frame->transform_meta = t; ProcessPassthrough(frame); return NodeStatus::OK; } - LogError("[preprocess] " + st.ErrMessage()); - return NodeStatus::ERROR; + + if (need_resize) { + WarnMetaResizeOnce(frame, out_w, out_h); + } + + out.width = out_w; + out.height = out_h; + out.format = out_fmt; + Status st = image_processor_->Resize(*frame, out); + if (st.Failed()) { + if (!use_rga_ && st.ErrMessage().find("unsupported format") != std::string::npos) { + auto t = std::make_shared(tx); + frame->transform_meta = t; + ProcessPassthrough(frame); + return NodeStatus::OK; + } + LogError("[preprocess] " + st.ErrMessage()); + return NodeStatus::ERROR; + } } auto out_frame = std::make_shared(out); out_frame->pts = frame->pts; out_frame->frame_id = frame->frame_id; - out_frame->det = frame->det; - out_frame->face_det = frame->face_det; - out_frame->face_recog = frame->face_recog; + out_frame->transform_meta = std::make_shared(tx); + ScaleMeta(*frame, *out_frame, tx); out_frame->user_meta = frame->user_meta; PushToDownstream(out_frame); @@ -150,7 +416,7 @@ public: LogInfo("[preprocess] " + std::string(use_rga_ ? "rga" : "swscale") + " frame=" + std::to_string(out_frame->frame_id) + " " + std::to_string(frame->width) + "x" + std::to_string(frame->height) + - " -> " + std::to_string(out_w) + "x" + std::to_string(out_h) + + " -> " + std::to_string(out_frame->width) + "x" + std::to_string(out_frame->height) + " id=" + id_); } @@ -158,6 +424,142 @@ public: } private: + Status BuildLetterbox(const Frame& input, PixelFormat out_fmt, int dst_w, int dst_h, Frame& out, + FrameTransformMeta& tx) const { + Frame src = input; + if (input.format != out_fmt) { + src.width = input.width; + src.height = input.height; + src.format = out_fmt; + Status cvt = image_processor_->CvtColor(input, src, out_fmt); + if (cvt.Failed()) return cvt; + } + + if (src.width <= 0 || src.height <= 0 || dst_w <= 0 || dst_h <= 0) { + return FailStatus("invalid letterbox size"); + } + + float scale = std::min(static_cast(dst_w) / static_cast(src.width), + static_cast(dst_h) / static_cast(src.height)); + int inner_w = std::max(1, static_cast(std::round(src.width * scale))); + int inner_h = std::max(1, static_cast(std::round(src.height * scale))); + if (IsYuvFormat(out_fmt)) { + inner_w = std::max(2, MakeEvenFloor(inner_w)); + inner_h = std::max(2, MakeEvenFloor(inner_h)); + if (((dst_w - inner_w) & 1) != 0) inner_w = std::max(2, inner_w - 2); + if (((dst_h - inner_h) & 1) != 0) inner_h = std::max(2, inner_h - 2); + } + if (inner_w <= 0 || inner_h <= 0 || inner_w > dst_w || inner_h > dst_h) { + return FailStatus("invalid inner letterbox size"); + } + + Frame resized; + resized.width = inner_w; + resized.height = inner_h; + resized.format = out_fmt; + Status st = image_processor_->Resize(src, resized); + if (st.Failed()) return st; + + out.width = dst_w; + out.height = dst_h; + out.format = out_fmt; + if (!InitFrameStorage(out)) { + return FailStatus("alloc letterbox output failed"); + } + FillBlack(out); + + const int pad_x = (dst_w - inner_w) / 2; + const int pad_y = (dst_h - inner_h) / 2; + if (!BlitLetterbox(resized, out, pad_x, pad_y)) { + return FailStatus("blit letterbox failed"); + } + + tx.letterbox = true; + tx.src_w = input.width; + tx.src_h = input.height; + tx.dst_w = dst_w; + tx.dst_h = dst_h; + tx.scale_x = static_cast(inner_w) / static_cast(input.width); + tx.scale_y = static_cast(inner_h) / static_cast(input.height); + tx.pad_x = static_cast(pad_x); + tx.pad_y = static_cast(pad_y); + return OkStatus(); + } + + void ScaleMeta(const Frame& in_frame, Frame& out_frame, const FrameTransformMeta& tx) const { + if (in_frame.det) { + auto det = std::make_shared(*in_frame.det); + const int src_meta_w = det->img_w > 0 ? det->img_w : in_frame.width; + const int src_meta_h = det->img_h > 0 ? det->img_h : in_frame.height; + const float to_frame_x = src_meta_w > 0 ? static_cast(in_frame.width) / src_meta_w : 1.0f; + const float to_frame_y = src_meta_h > 0 ? static_cast(in_frame.height) / src_meta_h : 1.0f; + for (auto& it : det->items) { + TransformRect(it.bbox, + tx.scale_x * to_frame_x, + tx.scale_y * to_frame_y, + tx.pad_x, tx.pad_y, + out_frame.width, out_frame.height); + } + det->img_w = out_frame.width; + det->img_h = out_frame.height; + out_frame.det = std::move(det); + } + + if (in_frame.face_det) { + auto face_det = std::make_shared(*in_frame.face_det); + const int src_meta_w = face_det->img_w > 0 ? face_det->img_w : in_frame.width; + const int src_meta_h = face_det->img_h > 0 ? face_det->img_h : in_frame.height; + const float to_frame_x = src_meta_w > 0 ? static_cast(in_frame.width) / src_meta_w : 1.0f; + const float to_frame_y = src_meta_h > 0 ? static_cast(in_frame.height) / src_meta_h : 1.0f; + for (auto& it : face_det->faces) { + TransformRect(it.bbox, + tx.scale_x * to_frame_x, + tx.scale_y * to_frame_y, + tx.pad_x, tx.pad_y, + out_frame.width, out_frame.height); + if (it.has_landmarks) { + for (auto& lm : it.landmarks) { + TransformPoint(lm, + tx.scale_x * to_frame_x, + tx.scale_y * to_frame_y, + tx.pad_x, tx.pad_y, + out_frame.width, out_frame.height); + } + } + } + face_det->img_w = out_frame.width; + face_det->img_h = out_frame.height; + out_frame.face_det = std::move(face_det); + } + + if (in_frame.face_recog) { + auto face_recog = std::make_shared(*in_frame.face_recog); + const int src_meta_w = face_recog->img_w > 0 ? face_recog->img_w : in_frame.width; + const int src_meta_h = face_recog->img_h > 0 ? face_recog->img_h : in_frame.height; + const float to_frame_x = src_meta_w > 0 ? static_cast(in_frame.width) / src_meta_w : 1.0f; + const float to_frame_y = src_meta_h > 0 ? static_cast(in_frame.height) / src_meta_h : 1.0f; + for (auto& it : face_recog->items) { + TransformRect(it.bbox, + tx.scale_x * to_frame_x, + tx.scale_y * to_frame_y, + tx.pad_x, tx.pad_y, + out_frame.width, out_frame.height); + if (it.has_landmarks) { + for (auto& lm : it.landmarks) { + TransformPoint(lm, + tx.scale_x * to_frame_x, + tx.scale_y * to_frame_y, + tx.pad_x, tx.pad_y, + out_frame.width, out_frame.height); + } + } + } + face_recog->img_w = out_frame.width; + face_recog->img_h = out_frame.height; + out_frame.face_recog = std::move(face_recog); + } + } + void PushToDownstream(FramePtr frame) { for (auto& q : output_queues_) { q->Push(frame); @@ -170,7 +572,7 @@ private: if (frame->width == out_w && frame->height == out_h) return; if (!frame->det && !frame->face_det && !frame->face_recog) return; warned_meta_resize_ = true; - LogWarn("[preprocess] resized frame but forwarded det/face meta without coordinate scaling; ensure det/recog/osd use same resolution (id=" + id_ + ")"); + LogInfo("[preprocess] resized frame and scaled det/face meta to destination resolution (id=" + id_ + ")"); } void ProcessPassthrough(FramePtr frame) { @@ -185,12 +587,12 @@ private: int dst_w_ = 640; int dst_h_ = 640; bool keep_ratio_ = false; + ResizeMode resize_mode_ = ResizeMode::Stretch; PixelFormat dst_fmt_ = PixelFormat::UNKNOWN; bool use_rga_ = true; bool stats_log_ = false; uint64_t stats_interval_ = 100; - bool warned_meta_resize_ = false; std::shared_ptr> input_queue_;