diff --git a/configs/sample_cam_ppe11.json b/configs/sample_cam_ppe11.json
index 7237b74..53815f8 100644
--- a/configs/sample_cam_ppe11.json
+++ b/configs/sample_cam_ppe11.json
@@ -31,6 +31,7 @@
           "dst_h": 768,
           "dst_format": "rgb",
           "dst_packed": true,
+          "resize_mode": "letterbox",
           "keep_ratio": false,
           "rga_gate": "cam_ppe11_detection",
           "use_rga": true
diff --git a/include/frame/frame.h b/include/frame/frame.h
index 23ea702..c928b86 100644
--- a/include/frame/frame.h
+++ b/include/frame/frame.h
@@ -44,6 +44,19 @@ struct DetectionResult {
     std::string model_name;
 };
 
+struct FrameTransformMeta {  // Geometry of a resize applied to a frame, so consumers can map coordinates back to the source image.
+    bool valid = false;  // Consumers should ignore the remaining fields unless this is true.
+    bool letterbox = false;  // presumably set when the resize preserved aspect ratio with padding — TODO confirm with the producer
+    int src_w = 0;  // Width of the source image, in pixels.
+    int src_h = 0;  // Height of the source image, in pixels.
+    int dst_w = 0;  // presumably the width of the resized output — TODO confirm with the producer
+    int dst_h = 0;  // presumably the height of the resized output — TODO confirm with the producer
+    float scale_x = 1.0f;  // Horizontal factor; the YOLO decoder inverts it as src_x = (dst_x - pad_x) / scale_x.
+    float scale_y = 1.0f;  // Vertical factor; the YOLO decoder inverts it as src_y = (dst_y - pad_y) / scale_y.
+    float pad_x = 0.0f;  // Horizontal offset in destination pixels, removed before scaling back.
+    float pad_y = 0.0f;  // Vertical offset in destination pixels, removed before scaling back.
+};
+
 struct FramePlane {
     uint8_t* data = nullptr;
     int stride = 0;   // bytes per row
@@ -76,6 +89,7 @@ struct Frame {
     // Face recognition pipeline meta (kept separate from user_meta to avoid conflicts with publish).
     std::shared_ptr<FaceDetResult> face_det;
     std::shared_ptr<FaceRecogResult> face_recog;
+    std::shared_ptr<FrameTransformMeta> transform_meta;
     std::shared_ptr<void> user_meta;
 
     int DmaFd() const { return buffer ? buffer->DmaFd() : dma_fd; }
diff --git a/plugins/ai_yolo/ai_yolo_node.cpp b/plugins/ai_yolo/ai_yolo_node.cpp
index e1fa69a..f76db2e 100644
--- a/plugins/ai_yolo/ai_yolo_node.cpp
+++ b/plugins/ai_yolo/ai_yolo_node.cpp
@@ -4,6 +4,7 @@
 #include <cmath>
 #include <cstddef>
 #include <cstring>
+#include <limits>
 #include <memory>
 #include <set>
 #include <thread>
@@ -33,6 +34,8 @@ const int kAnchor1[6] = {30, 61, 62, 45, 59, 119};
 const int kAnchor2[6] = {116, 90, 156, 198, 373, 326};
 
 enum class YoloVersion { V5, V8 };
+enum class V8BoxFormat { Auto, CxCyWh, XyXy, XyWh };  // Meaning of the 4 box values per YOLOv8 candidate; Auto chooses per box by plausibility scoring.
+enum class V8ClsActivation { Auto, None, Sigmoid };  // Whether class scores are passed through a sigmoid; Auto decides from the range of sampled scores.
 
 const char* kCocoLabels[kObjClassNum] = {
     "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
@@ -51,6 +54,67 @@ inline int Clamp(float val, int min_val, int max_val) {
     return val > min_val ? (val < max_val ? static_cast<int>(val) : max_val) : min_val;
 }
 
+struct DetCoordContext {  // Parameters for mapping model-space boxes to output-image pixels.
+    bool has_transform = false;  // True when a usable frame transform supplied scale/pad; otherwise the fallback scales apply.
+    int out_w = 0;  // Width of the coordinate space that decoded boxes are clamped to.
+    int out_h = 0;  // Height of the coordinate space that decoded boxes are clamped to.
+    float scale_x = 1.0f;  // Transform path: out_x = (model_x - pad_x) / scale_x.
+    float scale_y = 1.0f;  // Transform path: out_y = (model_y - pad_y) / scale_y.
+    float pad_x = 0.0f;  // Horizontal offset subtracted before scaling (transform path only).
+    float pad_y = 0.0f;  // Vertical offset subtracted before scaling (transform path only).
+    float fallback_scale_w = 1.0f;  // model_input_w / frame.width; used as the divisor when there is no transform.
+    float fallback_scale_h = 1.0f;  // model_input_h / frame.height; used as the divisor when there is no transform.
+};
+
+DetCoordContext BuildDetCoordContext(const Frame& frame, int model_input_w, int model_input_h) {  // Builds the model-to-output mapping for one frame, preferring frame.transform_meta when it is usable.
+    DetCoordContext ctx{};
+    ctx.fallback_scale_w = frame.width > 0 ? static_cast<float>(model_input_w) / frame.width : 1.0f;  // A non-positive frame width falls back to 1.0 instead of dividing by zero.
+    ctx.fallback_scale_h = frame.height > 0 ? static_cast<float>(model_input_h) / frame.height : 1.0f;  // Same guard for the height.
+    ctx.out_w = frame.width;  // Without a transform, boxes are reported in the frame's own coordinates.
+    ctx.out_h = frame.height;
+
+    if (frame.transform_meta && frame.transform_meta->valid &&
+        frame.transform_meta->src_w > 0 && frame.transform_meta->src_h > 0 &&
+        frame.transform_meta->scale_x > 1e-6f && frame.transform_meta->scale_y > 1e-6f) {  // Scales are later used as divisors, so near-zero or negative values are rejected.
+        ctx.has_transform = true;
+        ctx.out_w = frame.transform_meta->src_w;  // With a transform, boxes are reported in source-image coordinates.
+        ctx.out_h = frame.transform_meta->src_h;
+        ctx.scale_x = frame.transform_meta->scale_x;
+        ctx.scale_y = frame.transform_meta->scale_y;
+        ctx.pad_x = frame.transform_meta->pad_x;
+        ctx.pad_y = frame.transform_meta->pad_y;
+    }
+    return ctx;
+}
+
+Rect DecodeToOutputRect(float x, float y, float w, float h, const DetCoordContext& ctx) {  // Maps a model-space box (top-left x, y, width, height) to a whole-pixel Rect clamped to the output image.
+    float ox = x;  // These four initial values are overwritten by whichever branch runs below.
+    float oy = y;
+    float ow = w;
+    float oh = h;
+
+    if (ctx.has_transform) {
+        ox = (x - ctx.pad_x) / ctx.scale_x;  // Remove the padding offset, then undo the scale.
+        oy = (y - ctx.pad_y) / ctx.scale_y;
+        ow = w / ctx.scale_x;  // Sizes are only scaled; padding does not affect them.
+        oh = h / ctx.scale_y;
+    } else {
+        ox = x / ctx.fallback_scale_w;  // Plain stretch mapping from model size to frame size.
+        oy = y / ctx.fallback_scale_h;
+        ow = w / ctx.fallback_scale_w;
+        oh = h / ctx.fallback_scale_h;
+    }
+
+    Rect r{};
+    const int out_w = std::max(1, ctx.out_w);  // Treat a non-positive output size as 1 so the clamp ranges stay well-formed.
+    const int out_h = std::max(1, ctx.out_h);
+    r.x = static_cast<float>(Clamp(static_cast<int>(ox), 0, out_w));  // Truncate to an integer, then clamp into [0, out_w].
+    r.y = static_cast<float>(Clamp(static_cast<int>(oy), 0, out_h));
+    r.w = static_cast<float>(Clamp(static_cast<int>(ow), 0, out_w - static_cast<int>(r.x)));  // Limit the width so the box cannot extend past the right edge.
+    r.h = static_cast<float>(Clamp(static_cast<int>(oh), 0, out_h - static_cast<int>(r.y)));  // Limit the height so the box cannot extend past the bottom edge.
+    return r;
+}
+
 inline int32_t ClipFloat(float val, float min_val, float max_val) {
     return static_cast<int32_t>(val <= min_val ? min_val : (val >= max_val ? max_val : val));
 }
@@ -64,39 +128,29 @@ inline float DequantizeAffineToF32(int8_t qnt, int32_t zp, float scale) {
     return (static_cast<float>(qnt) - static_cast<float>(zp)) * scale;
 }
 
-// FP16 (half) to FP32 conversion
-// IEEE 754 half-precision: 1 sign bit, 5 exponent bits, 10 mantissa bits
+inline float Sigmoid(float x) {  // Logistic function: 1 / (1 + e^-x).
+    return 1.0f / (1.0f + std::exp(-x));
+}
+
+// FP16 (half) to FP32 conversion.
+// Rebuilds the value arithmetically with std::ldexp instead of assembling IEEE-754 bits; NaN sign and payload are not preserved.
 inline float Fp16ToFp32(uint16_t h) {
-    uint32_t sign = (h >> 15) & 0x1;
-    uint32_t exp = (h >> 10) & 0x1F;
-    uint32_t mant = h & 0x3FF;
-    
-    uint32_t f;
+    const int sign = (h & 0x8000) ? -1 : 1;
+    const int exp = (h >> 10) & 0x1F;
+    const int mant = h & 0x03FF;
+
     if (exp == 0) {
-        // Zero or subnormal
-        if (mant == 0) {
-            f = (sign << 31);  // Signed zero
-        } else {
-            // Subnormal: convert to normal
-            exp = 1;
-            while ((mant & 0x400) == 0) {
-                mant <<= 1;
-                exp--;
-            }
-            mant &= 0x3FF;
-            f = (sign << 31) | ((exp + 112) << 23) | (mant << 13);
-        }
-    } else if (exp == 0x1F) {
-        // Infinity or NaN
-        f = (sign << 31) | (0xFF << 23) | (mant << 13);
-    } else {
-        // Normal number
-        f = (sign << 31) | ((exp + 112) << 23) | (mant << 13);
+        if (mant == 0) return sign < 0 ? -0.0f : 0.0f;
+        // subnormal: mant * 2^-24
+        return static_cast<float>(sign) * std::ldexp(static_cast<float>(mant), -24);
     }
-    
-    float result;
-    memcpy(&result, &f, sizeof(float));
-    return result;
+    if (exp == 0x1F) {
+        if (mant == 0) return sign < 0 ? -INFINITY : INFINITY;
+        return std::numeric_limits<float>::quiet_NaN();
+    }
+    // normal: (mant + 1024) * 2^(exp-25)
+    return static_cast<float>(sign) *
+           std::ldexp(static_cast<float>(mant + 1024), exp - 25);
 }
 
 float CalculateIoU(float x1_min, float y1_min, float x1_max, float y1_max,
@@ -212,18 +266,239 @@ int ProcessFeatureMapV5(int8_t* input, const int* anchor, int grid_h, int grid_w
     return valid_count;
 }
 
+uint32_t TensorTypeSizeBytes(rknn_tensor_type t) {  // Returns the size in bytes of one element of tensor type t.
+    switch (t) {
+        case RKNN_TENSOR_INT8:
+        case RKNN_TENSOR_UINT8:
+            return 1;
+        case RKNN_TENSOR_FLOAT16:
+            return 2;
+        case RKNN_TENSOR_FLOAT32:
+            return 4;
+        default:  // NOTE(review): every other type is treated as 1 byte — confirm no wider type (e.g. a 16-bit integer) can reach this.
+            return 1;
+    }
+}
+
+int DefaultV8NumBoxes(int model_h, int model_w) {  // Box count for grids at strides 8, 16 and 32, one box per cell (8400 for 640x640).
+    if (model_h <= 0 || model_w <= 0) return 0;  // An unknown model size yields 0 so the caller can apply its own fallback.
+    return (model_h / 8) * (model_w / 8) +
+           (model_h / 16) * (model_w / 16) +
+           (model_h / 32) * (model_w / 32);
+}
+
+struct V8LayoutInfo {  // Resolved shape of a single YOLOv8 output tensor.
+    int num_boxes = 0;  // Number of candidate boxes (N) in the tensor.
+    bool channels_first = true;  // true: CxN, false: NxC
+};
+
+float ScoreBoxCandidate(float x, float y, float w, float h, int model_w, int model_h) {  // Heuristic plausibility score in [0, 9] for a decoded box (top-left x, y, width, height) in model pixels.
+    float s = 0.0f;
+    if (w > 0.0f && h > 0.0f) s += 3.0f;  // A positive size carries the most weight.
+    if (w <= model_w * 1.2f) s += 1.0f;  // Size may exceed the model input by at most 20%.
+    if (h <= model_h * 1.2f) s += 1.0f;
+    if (x >= -model_w * 0.1f) s += 1.0f;  // The top-left corner may sit at most 10% outside the image.
+    if (y >= -model_h * 0.1f) s += 1.0f;
+    if ((x + w) <= model_w * 1.2f) s += 1.0f;  // The bottom-right corner may sit at most 20% outside the image.
+    if ((y + h) <= model_h * 1.2f) s += 1.0f;
+    return s;
+}
+
+bool SeemsNormalized(float a, float b, float c, float d) {  // True when all four values lie in [-0.05, 2.5]; DecodeV8Box takes that as fractions of the model size rather than pixels.
+    auto in_range = [](float v) { return v >= -0.05f && v <= 2.5f; };  // NOTE(review): a genuine pixel-space box with all four values <= 2.5 would also match — confirm acceptable.
+    return in_range(a) && in_range(b) && in_range(c) && in_range(d);
+}
+
+const char* V8BoxFormatName(V8BoxFormat fmt) {  // Returns a lowercase label for fmt.
+    switch (fmt) {
+        case V8BoxFormat::CxCyWh: return "cxcywh";
+        case V8BoxFormat::XyXy: return "xyxy";
+        case V8BoxFormat::XyWh: return "xywh";
+        default: return "auto";  // Covers Auto and any unexpected value.
+    }
+}
+
+void DecodeV8Box(float a, float b, float c, float d, int model_w, int model_h, V8BoxFormat fmt,  // Converts four raw box values into top-left x/y plus width/height in model pixels.
+                 float& out_x, float& out_y, float& out_w, float& out_h, V8BoxFormat* used_fmt = nullptr) {  // used_fmt, when non-null, receives the format actually applied.
+    if (SeemsNormalized(a, b, c, d)) {  // Values that look like fractions are scaled up to model pixels first.
+        a *= static_cast<float>(model_w);
+        b *= static_cast<float>(model_h);
+        c *= static_cast<float>(model_w);
+        d *= static_cast<float>(model_h);
+    }
+
+    auto decode_cxcywh = [&](float& x, float& y, float& w, float& h) {  // (a, b) is the box center, (c, d) its size.
+        x = a - c / 2.0f;
+        y = b - d / 2.0f;
+        w = c;
+        h = d;
+    };
+    auto decode_xyxy = [&](float& x, float& y, float& w, float& h) {  // (a, b) is the top-left corner, (c, d) the bottom-right corner.
+        x = a;
+        y = b;
+        w = c - a;
+        h = d - b;
+    };
+    auto decode_xywh = [&](float& x, float& y, float& w, float& h) {  // (a, b) is the top-left corner, (c, d) the size.
+        x = a;
+        y = b;
+        w = c;
+        h = d;
+    };
+
+    if (fmt == V8BoxFormat::CxCyWh) {  // An explicit format is applied directly, without scoring.
+        decode_cxcywh(out_x, out_y, out_w, out_h);
+        if (used_fmt) *used_fmt = V8BoxFormat::CxCyWh;
+        return;
+    }
+    if (fmt == V8BoxFormat::XyXy) {
+        decode_xyxy(out_x, out_y, out_w, out_h);
+        if (used_fmt) *used_fmt = V8BoxFormat::XyXy;
+        return;
+    }
+    if (fmt == V8BoxFormat::XyWh) {
+        decode_xywh(out_x, out_y, out_w, out_h);
+        if (used_fmt) *used_fmt = V8BoxFormat::XyWh;
+        return;
+    }
+
+    float x1 = 0.0f, y1 = 0.0f, w1 = 0.0f, h1 = 0.0f;  // Auto mode: decode all three ways and keep the most plausible result.
+    float x2 = 0.0f, y2 = 0.0f, w2 = 0.0f, h2 = 0.0f;
+    float x3 = 0.0f, y3 = 0.0f, w3 = 0.0f, h3 = 0.0f;
+    decode_cxcywh(x1, y1, w1, h1);
+    decode_xyxy(x2, y2, w2, h2);
+    decode_xywh(x3, y3, w3, h3);
+
+    const float s1 = ScoreBoxCandidate(x1, y1, w1, h1, model_w, model_h);
+    const float s2 = ScoreBoxCandidate(x2, y2, w2, h2, model_w, model_h);
+    const float s3 = ScoreBoxCandidate(x3, y3, w3, h3, model_w, model_h);
+    if (s2 >= s1 && s2 >= s3) {  // Ties are resolved in the order xyxy, xywh, cxcywh.
+        out_x = x2; out_y = y2; out_w = w2; out_h = h2;
+        if (used_fmt) *used_fmt = V8BoxFormat::XyXy;
+    } else if (s3 >= s1 && s3 >= s2) {  // NOTE(review): a cxcywh/xywh tie selects xywh; cxcywh wins only with a strictly higher score — confirm intended.
+        out_x = x3; out_y = y3; out_w = w3; out_h = h3;
+        if (used_fmt) *used_fmt = V8BoxFormat::XyWh;
+    } else {
+        out_x = x1; out_y = y1; out_w = w1; out_h = h1;
+        if (used_fmt) *used_fmt = V8BoxFormat::CxCyWh;
+    }
+}
+
+bool ResolveV8ApplySigmoid(const float* output, int num_boxes, int num_classes, bool channels_first,  // Decides whether class scores need a sigmoid before thresholding.
+                           V8ClsActivation act_mode) {  // None/Sigmoid force the answer; Auto inspects the data.
+    if (act_mode == V8ClsActivation::None) return false;
+    if (act_mode == V8ClsActivation::Sigmoid) return true;
+    if (!output || num_boxes <= 0 || num_classes <= 0) return false;  // Nothing to inspect: leave scores untouched.
+
+    const int num_channels = 4 + num_classes;  // 4 box values followed by one score per class.
+    const int sample_boxes = std::min(num_boxes, 64);  // Only the first 64 boxes are sampled.
+    float min_v = 1e9f;
+    float max_v = -1e9f;
+    for (int i = 0; i < sample_boxes; ++i) {
+        for (int c = 0; c < num_classes; ++c) {
+            const float v = channels_first ? output[(4 + c) * num_boxes + i]  // CxN: one row of num_boxes values per channel.
+                                           : output[i * num_channels + (4 + c)];  // NxC: one row of num_channels values per box.
+            if (v < min_v) min_v = v;
+            if (v > max_v) max_v = v;
+        }
+    }
+    // Probabilities lie in [0, 1]; sampled class values below -0.1 or above 1.5 are treated as raw logits, so sigmoid is enabled.
+    return (min_v < -0.1f || max_v > 1.5f);
+}
+
+V8LayoutInfo ResolveV8Layout(const std::vector<uint32_t>& dims, size_t byte_size,  // Derives box count and channel ordering of a YOLOv8 output from its dims and byte size.
+                             rknn_tensor_type type, int num_classes,
+                             int model_h, int model_w) {  // model_h/model_w are only used for the default box-count fallback.
+    V8LayoutInfo info;
+    const int num_channels = 4 + num_classes;  // 4 box values followed by one score per class.
+    if (num_channels <= 0) return info;
+
+    const uint32_t elem_bytes = TensorTypeSizeBytes(type);
+    const size_t total_elems = elem_bytes > 0 ? (byte_size / elem_bytes) : 0;
+    const size_t max_boxes_from_data = static_cast<size_t>(num_channels) > 0  // Upper bound on boxes that the buffer can actually hold.
+                                           ? (total_elems / static_cast<size_t>(num_channels))
+                                           : 0;
+
+    int ch_idx = -1;  // Index of the first dimension whose extent equals num_channels, or -1.
+    for (size_t i = 0; i < dims.size(); ++i) {
+        if (dims[i] == static_cast<uint32_t>(num_channels)) {
+            ch_idx = static_cast<int>(i);
+            break;
+        }
+    }
+
+    if (ch_idx >= 0 && total_elems >= static_cast<size_t>(num_channels)) {
+        info.num_boxes = static_cast<int>(max_boxes_from_data);  // Box count comes from the buffer size rather than from dims.
+
+        int prev_non1 = 1;  // Nearest extent > 1 before the channel dimension (1 if none).
+        for (int i = ch_idx - 1; i >= 0; --i) {
+            if (dims[static_cast<size_t>(i)] > 1U) {
+                prev_non1 = static_cast<int>(dims[static_cast<size_t>(i)]);
+                break;
+            }
+        }
+        int next_non1 = 1;  // Nearest extent > 1 after the channel dimension (1 if none).
+        for (size_t i = static_cast<size_t>(ch_idx + 1); i < dims.size(); ++i) {
+            if (dims[i] > 1U) {
+                next_non1 = static_cast<int>(dims[i]);
+                break;
+            }
+        }
+
+        if (next_non1 > 1 && prev_non1 <= 1) {  // Boxes follow channels: CxN.
+            info.channels_first = true;
+        } else if (prev_non1 > 1 && next_non1 <= 1) {  // Boxes precede channels: NxC.
+            info.channels_first = false;
+        } else if (next_non1 > 1 && prev_non1 > 1) {  // Both sides have an extent > 1: take the larger one as the box axis.
+            info.channels_first = next_non1 >= prev_non1;
+        } else {
+            info.channels_first = true;
+        }
+    } else if (dims.size() >= 3) {  // NOTE(review): the checks below can only match when ch_idx >= 0 but the buffer holds fewer than num_channels elements — confirm this branch is still needed.
+        // Fallback to the previous rank-3 convention: channels in dims[1] means CxN, channels in dims[2] means NxC.
+        if (dims[1] == static_cast<uint32_t>(num_channels)) {
+            info.num_boxes = static_cast<int>(dims[2]);
+            info.channels_first = true;
+        } else if (dims[2] == static_cast<uint32_t>(num_channels)) {
+            info.num_boxes = static_cast<int>(dims[1]);
+            info.channels_first = false;
+        }
+    }
+
+    if (info.num_boxes <= 0 && max_boxes_from_data > 0) {  // Fallback 1: as many boxes as the buffer can hold.
+        info.num_boxes = static_cast<int>(max_boxes_from_data);
+    }
+    if (info.num_boxes <= 0) {  // Fallback 2: box count implied by the model input size.
+        info.num_boxes = DefaultV8NumBoxes(model_h, model_w);
+    }
+    if (info.num_boxes <= 0) {  // Fallback 3: the 640x640 default.
+        info.num_boxes = 8400;
+    }
+
+    if (max_boxes_from_data > 0 && static_cast<size_t>(info.num_boxes) > max_boxes_from_data) {  // Never report more boxes than the buffer contains.
+        info.num_boxes = static_cast<int>(max_boxes_from_data);
+    }
+    if (info.num_boxes < 0) info.num_boxes = 0;
+    return info;
+}
+
 // YOLOv8 output processing (anchor-free, single output tensor)
 int ProcessOutputV8(float* output, int num_boxes, int num_classes,
                     int model_h, int model_w,
                     std::vector<float>& boxes, std::vector<float>& obj_probs,
-                    std::vector<int>& class_ids, float conf_thresh) {
+                    std::vector<int>& class_ids, float conf_thresh,
+                    bool channels_first, V8BoxFormat box_format, bool apply_sigmoid,
+                    bool debug_decode, int* debug_left) {
     int valid_count = 0;
+    const int num_channels = 4 + num_classes;
 
     for (int i = 0; i < num_boxes; ++i) {
         float max_score = 0.0f;
         int max_cls_id = 0;
         for (int c = 0; c < num_classes; ++c) {
-            float score = output[(4 + c) * num_boxes + i];
+            float score = channels_first ? output[(4 + c) * num_boxes + i]
+                                         : output[i * num_channels + (4 + c)];
+            if (apply_sigmoid) score = Sigmoid(score);
             if (score > max_score) {
                 max_score = score;
                 max_cls_id = c;
@@ -231,13 +506,28 @@ int ProcessOutputV8(float* output, int num_boxes, int num_classes,
         }
 
         if (max_score >= conf_thresh) {
-            float cx = output[0 * num_boxes + i];
-            float cy = output[1 * num_boxes + i];
-            float w = output[2 * num_boxes + i];
-            float h = output[3 * num_boxes + i];
-
-            float x1 = cx - w / 2.0f;
-            float y1 = cy - h / 2.0f;
+            const float a = channels_first ? output[0 * num_boxes + i] : output[i * num_channels + 0];
+            const float b = channels_first ? output[1 * num_boxes + i] : output[i * num_channels + 1];
+            const float c = channels_first ? output[2 * num_boxes + i] : output[i * num_channels + 2];
+            const float d = channels_first ? output[3 * num_boxes + i] : output[i * num_channels + 3];
+            if (!std::isfinite(a) || !std::isfinite(b) || !std::isfinite(c) || !std::isfinite(d)) {
+                continue;
+            }
+            float x1 = 0.0f, y1 = 0.0f, w = 0.0f, h = 0.0f;
+            V8BoxFormat used_fmt = box_format;
+            DecodeV8Box(a, b, c, d, model_w, model_h, box_format, x1, y1, w, h, &used_fmt);
+            if (!std::isfinite(x1) || !std::isfinite(y1) || !std::isfinite(w) || !std::isfinite(h)) {
+                continue;
+            }
+            if (w <= 1e-3f || h <= 1e-3f) continue;
+            if (debug_decode && debug_left && *debug_left > 0) {
+                --(*debug_left);
+                LogInfo("[ai_yolo] v8 decode f32: raw4(" + std::to_string(a) + "," + std::to_string(b) + "," +
+                        std::to_string(c) + "," + std::to_string(d) + ") fmt=" + V8BoxFormatName(used_fmt) +
+                        " -> xywh(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
+                        std::to_string(w) + "," + std::to_string(h) + ") cls=" +
+                        std::to_string(max_cls_id) + " score=" + std::to_string(max_score));
+            }
 
             boxes.push_back(x1);
             boxes.push_back(y1);
@@ -256,15 +546,18 @@ int ProcessOutputV8Int8(int8_t* output, int num_boxes, int num_classes,
                         int model_h, int model_w,
                         std::vector<float>& boxes, std::vector<float>& obj_probs,
                         std::vector<int>& class_ids, float conf_thresh,
-                        int32_t zp, float scale) {
+                        int32_t zp, float scale, bool channels_first, V8BoxFormat box_format,
+                        bool debug_decode, int* debug_left) {
     int valid_count = 0;
     int8_t thresh_i8 = QuantizeF32ToAffine(conf_thresh, zp, scale);
+    const int num_channels = 4 + num_classes;
 
     for (int i = 0; i < num_boxes; ++i) {
         int8_t max_score_i8 = -128;
         int max_cls_id = 0;
         for (int c = 0; c < num_classes; ++c) {
-            int8_t score = output[(4 + c) * num_boxes + i];
+            int8_t score = channels_first ? output[(4 + c) * num_boxes + i]
+                                          : output[i * num_channels + (4 + c)];
             if (score > max_score_i8) {
                 max_score_i8 = score;
                 max_cls_id = c;
@@ -272,14 +565,33 @@ int ProcessOutputV8Int8(int8_t* output, int num_boxes, int num_classes,
         }
 
         if (max_score_i8 >= thresh_i8) {
-            float cx = DequantizeAffineToF32(output[0 * num_boxes + i], zp, scale);
-            float cy = DequantizeAffineToF32(output[1 * num_boxes + i], zp, scale);
-            float w = DequantizeAffineToF32(output[2 * num_boxes + i], zp, scale);
-            float h = DequantizeAffineToF32(output[3 * num_boxes + i], zp, scale);
+            float a = DequantizeAffineToF32(
+                channels_first ? output[0 * num_boxes + i] : output[i * num_channels + 0], zp, scale);
+            float b = DequantizeAffineToF32(
+                channels_first ? output[1 * num_boxes + i] : output[i * num_channels + 1], zp, scale);
+            float c = DequantizeAffineToF32(
+                channels_first ? output[2 * num_boxes + i] : output[i * num_channels + 2], zp, scale);
+            float d = DequantizeAffineToF32(
+                channels_first ? output[3 * num_boxes + i] : output[i * num_channels + 3], zp, scale);
             float max_score = DequantizeAffineToF32(max_score_i8, zp, scale);
-
-            float x1 = cx - w / 2.0f;
-            float y1 = cy - h / 2.0f;
+            if (!std::isfinite(a) || !std::isfinite(b) || !std::isfinite(c) || !std::isfinite(d)) {
+                continue;
+            }
+            float x1 = 0.0f, y1 = 0.0f, w = 0.0f, h = 0.0f;
+            V8BoxFormat used_fmt = box_format;
+            DecodeV8Box(a, b, c, d, model_w, model_h, box_format, x1, y1, w, h, &used_fmt);
+            if (!std::isfinite(x1) || !std::isfinite(y1) || !std::isfinite(w) || !std::isfinite(h)) {
+                continue;
+            }
+            if (w <= 1e-3f || h <= 1e-3f) continue;
+            if (debug_decode && debug_left && *debug_left > 0) {
+                --(*debug_left);
+                LogInfo("[ai_yolo] v8 decode int8: raw4(" + std::to_string(a) + "," + std::to_string(b) + "," +
+                        std::to_string(c) + "," + std::to_string(d) + ") fmt=" + V8BoxFormatName(used_fmt) +
+                        " -> xywh(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
+                        std::to_string(w) + "," + std::to_string(h) + ") cls=" +
+                        std::to_string(max_cls_id) + " score=" + std::to_string(max_score));
+            }
 
             boxes.push_back(x1);
             boxes.push_back(y1);
@@ -309,6 +621,28 @@ public:
         model_input_w_ = config.ValueOr<int>("model_w", 640);
         model_input_h_ = config.ValueOr<int>("model_h", 640);
         num_classes_ = config.ValueOr<int>("num_classes", 80);
+        {
+            const std::string bf = config.ValueOr<std::string>("v8_box_format", "cxcywh");
+            if (bf == "xyxy") {
+                v8_box_format_ = V8BoxFormat::XyXy;
+            } else if (bf == "xywh") {
+                v8_box_format_ = V8BoxFormat::XyWh;
+            } else if (bf == "cxcywh") {
+                v8_box_format_ = V8BoxFormat::CxCyWh;
+            } else {
+                v8_box_format_ = V8BoxFormat::Auto;
+            }
+        }
+        {
+            const std::string act = config.ValueOr<std::string>("v8_cls_activation", "auto");
+            if (act == "sigmoid") {
+                v8_cls_activation_ = V8ClsActivation::Sigmoid;
+            } else if (act == "none") {
+                v8_cls_activation_ = V8ClsActivation::None;
+            } else {
+                v8_cls_activation_ = V8ClsActivation::Auto;
+            }
+        }
 
         if (const SimpleJson* dbg = config.Find("debug"); dbg && dbg->IsObject()) {
             stats_log_ = dbg->ValueOr<bool>("stats", stats_log_);
@@ -432,10 +766,9 @@ public:
         PushToDownstream(frame);
         ++processed_;
 
-        // Stats log disabled to reduce log spam
-        // if (stats_log_ && stats_interval_ > 0 && (processed_ % stats_interval_) == 0) {
-        //     LogInfo("[ai_yolo] processed=" + std::to_string(processed_) + " id=" + id_);
-        // }
+        if (stats_log_ && stats_interval_ > 0 && (processed_ % stats_interval_) == 0) {
+            LogInfo("[ai_yolo] processed=" + std::to_string(processed_) + " id=" + id_);
+        }
         return NodeStatus::OK;
     }
 
@@ -540,29 +873,42 @@ private:
                                            outputs[2].zp, outputs[2].scale);
             valid_count = cnt0 + cnt1 + cnt2;
         } else {
-                if (outputs.empty()) return;
+            if (outputs.empty()) return;
             if (!outputs[0].data || outputs[0].size == 0) return;
 
-            int num_boxes = 0;
-            int num_channels = 4 + num_classes_;
-
-            if (outputs[0].dims.size() >= 3) {
-                if (outputs[0].dims[1] == static_cast<uint32_t>(num_channels)) {
-                    num_boxes = static_cast<int>(outputs[0].dims[2]);
-                } else if (outputs[0].dims[2] == static_cast<uint32_t>(num_channels)) {
-                    num_boxes = static_cast<int>(outputs[0].dims[1]);
-                } else {
-                    num_boxes = 8400;
+            const V8LayoutInfo layout = ResolveV8Layout(outputs[0].dims, outputs[0].size,
+                                                        outputs[0].type, num_classes_,
+                                                        model_input_h_, model_input_w_);
+            const int num_boxes = layout.num_boxes;
+            int debug_decode_left = (debug_det_ && processed_ < 20) ? 5 : 0;
+            if (num_boxes <= 0) return;
+            if (debug_det_ && processed_ < 5) {
+                std::string dims_s;
+                for (size_t di = 0; di < outputs[0].dims.size(); ++di) {
+                    dims_s += (di == 0 ? "[" : ",");
+                    dims_s += std::to_string(outputs[0].dims[di]);
                 }
-            } else {
-                num_boxes = static_cast<int>(outputs[0].size) / num_channels;
+                dims_s += "]";
+                LogInfo("[ai_yolo] v8 out type=" + std::to_string(static_cast<int>(outputs[0].type)) +
+                        " size=" + std::to_string(outputs[0].size) +
+                        " dims=" + dims_s +
+                        " num_boxes=" + std::to_string(num_boxes) +
+                        " layout=" + std::string(layout.channels_first ? "CxN" : "NxC"));
             }
 
             if (outputs[0].type == RKNN_TENSOR_FLOAT32) {
+                const bool apply_sigmoid = ResolveV8ApplySigmoid(
+                    reinterpret_cast<float*>(const_cast<uint8_t*>(outputs[0].data)),
+                    num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
+                if (debug_det_ && processed_ < 5) {
+                    LogInfo("[ai_yolo] v8 cls activation=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
+                }
                 valid_count = ProcessOutputV8(reinterpret_cast<float*>(const_cast<uint8_t*>(outputs[0].data)),
                                               num_boxes, num_classes_,
                                               model_input_h_, model_input_w_,
-                                              boxes, obj_probs, class_ids, conf_thresh_);
+                                              boxes, obj_probs, class_ids, conf_thresh_,
+                                              layout.channels_first, v8_box_format_, apply_sigmoid,
+                                              debug_det_, &debug_decode_left);
             } else if (outputs[0].type == RKNN_TENSOR_FLOAT16) {
                 // Convert FP16 to FP32
                 size_t num_elements = outputs[0].size / sizeof(uint16_t);
@@ -571,16 +917,25 @@ private:
                 for (size_t i = 0; i < num_elements; ++i) {
                     fp32_buffer_[i] = Fp16ToFp32(fp16_data[i]);
                 }
+                const bool apply_sigmoid = ResolveV8ApplySigmoid(
+                    fp32_buffer_.data(), num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
+                if (debug_det_ && processed_ < 5) {
+                    LogInfo("[ai_yolo] v8 cls activation=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
+                }
                 valid_count = ProcessOutputV8(fp32_buffer_.data(),
                                               num_boxes, num_classes_,
                                               model_input_h_, model_input_w_,
-                                              boxes, obj_probs, class_ids, conf_thresh_);
+                                              boxes, obj_probs, class_ids, conf_thresh_,
+                                              layout.channels_first, v8_box_format_, apply_sigmoid,
+                                              debug_det_, &debug_decode_left);
             } else {
                 valid_count = ProcessOutputV8Int8(reinterpret_cast<int8_t*>(const_cast<uint8_t*>(outputs[0].data)),
                                                   num_boxes, num_classes_,
                                                   model_input_h_, model_input_w_,
                                                   boxes, obj_probs, class_ids, conf_thresh_,
-                                                  outputs[0].zp, outputs[0].scale);
+                                                  outputs[0].zp, outputs[0].scale,
+                                                  layout.channels_first, v8_box_format_,
+                                                  debug_det_, &debug_decode_left);
             }
         }
 
@@ -596,12 +951,11 @@ private:
             NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
         }
 
-        float scale_w = static_cast<float>(model_input_w_) / frame->width;
-        float scale_h = static_cast<float>(model_input_h_) / frame->height;
+        const DetCoordContext coord_ctx = BuildDetCoordContext(*frame, model_input_w_, model_input_h_);
 
         auto det_result = std::make_shared<DetectionResult>();
-        det_result->img_w = frame->width;
-        det_result->img_h = frame->height;
+        det_result->img_w = coord_ctx.out_w;
+        det_result->img_h = coord_ctx.out_h;
         det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8";
 
         for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) {
@@ -621,22 +975,23 @@ private:
             Detection det;
             det.cls_id = cls_id;
             det.score = obj_probs[i];
-            det.bbox.x = static_cast<float>(Clamp(static_cast<int>(x1 / scale_w), 0, frame->width));
-            det.bbox.y = static_cast<float>(Clamp(static_cast<int>(y1 / scale_h), 0, frame->height));
-            det.bbox.w = static_cast<float>(Clamp(static_cast<int>(w / scale_w), 0, frame->width - static_cast<int>(det.bbox.x)));
-            det.bbox.h = static_cast<float>(Clamp(static_cast<int>(h / scale_h), 0, frame->height - static_cast<int>(det.bbox.y)));
+            det.bbox = DecodeToOutputRect(x1, y1, w, h, coord_ctx);
             det.track_id = -1;
 
-            if (debug_det_ && det_result->items.size() < 3 && processed_ < 10) {
-                LogDebug("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
-                         std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" +
-                         std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," +
-                         std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" +
-                         std::to_string(cls_id) + " score=" + std::to_string(det.score));
+            if (debug_det_ && det_result->items.size() < 5 && processed_ < 20) {
+                LogInfo("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
+                        std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" +
+                        std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," +
+                        std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" +
+                        std::to_string(cls_id) + " score=" + std::to_string(det.score));
             }
 
             det_result->items.push_back(det);
         }
+        if (debug_det_ && processed_ < 20) {
+            LogInfo("[ai_yolo] det summary: valid_count=" + std::to_string(valid_count) +
+                    " final=" + std::to_string(det_result->items.size()));
+        }
 
         frame->det = det_result;
     }
@@ -671,33 +1026,66 @@ private:
         } else {
             if (outputs.empty()) return;
 
-            int num_boxes = 0;
-            int num_channels = 4 + num_classes_;
-
-            if (outputs[0].dims.size() >= 3) {
-                if (outputs[0].dims[1] == static_cast<uint32_t>(num_channels)) {
-                    num_boxes = outputs[0].dims[2];
-                } else if (outputs[0].dims[2] == static_cast<uint32_t>(num_channels)) {
-                    num_boxes = outputs[0].dims[1];
-                } else {
-                    num_boxes = 8400;
+            const V8LayoutInfo layout = ResolveV8Layout(outputs[0].dims, outputs[0].data.size(),
+                                                        outputs[0].type, num_classes_,
+                                                        model_input_h_, model_input_w_);
+            const int num_boxes = layout.num_boxes;
+            int debug_decode_left = (debug_det_ && processed_ < 20) ? 5 : 0;
+            if (num_boxes <= 0) return;
+            if (debug_det_ && processed_ < 5) {
+                std::string dims_s;
+                for (size_t di = 0; di < outputs[0].dims.size(); ++di) {
+                    dims_s += (di == 0 ? "[" : ",");
+                    dims_s += std::to_string(outputs[0].dims[di]);
                 }
-            } else {
-                num_boxes = outputs[0].data.size() / num_channels;
+                dims_s += "]";
+                LogInfo("[ai_yolo] v8 out(type copy) type=" + std::to_string(static_cast<int>(outputs[0].type)) +
+                        " size=" + std::to_string(outputs[0].data.size()) +
+                        " dims=" + dims_s +
+                        " num_boxes=" + std::to_string(num_boxes) +
+                        " layout=" + std::string(layout.channels_first ? "CxN" : "NxC"));
             }
 
-            if (outputs[0].type == RKNN_TENSOR_FLOAT32 ||
-                outputs[0].type == RKNN_TENSOR_FLOAT16) {
+            if (outputs[0].type == RKNN_TENSOR_FLOAT32) {
+                const bool apply_sigmoid = ResolveV8ApplySigmoid(
+                    reinterpret_cast<float*>(outputs[0].data.data()),
+                    num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
+                if (debug_det_ && processed_ < 5) {
+                    LogInfo("[ai_yolo] v8 cls activation(copy)=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
+                }
                 valid_count = ProcessOutputV8(reinterpret_cast<float*>(outputs[0].data.data()),
                                               num_boxes, num_classes_,
                                               model_input_h_, model_input_w_,
-                                              boxes, obj_probs, class_ids, conf_thresh_);
+                                              boxes, obj_probs, class_ids, conf_thresh_,
+                                              layout.channels_first, v8_box_format_, apply_sigmoid,
+                                              debug_det_, &debug_decode_left);
+            } else if (outputs[0].type == RKNN_TENSOR_FLOAT16) {
+                // Convert FP16 to FP32
+                size_t num_elements = outputs[0].data.size() / sizeof(uint16_t);
+                fp32_buffer_.resize(num_elements);
+                const uint16_t* fp16_data = reinterpret_cast<const uint16_t*>(outputs[0].data.data());
+                for (size_t i = 0; i < num_elements; ++i) {
+                    fp32_buffer_[i] = Fp16ToFp32(fp16_data[i]);
+                }
+                const bool apply_sigmoid = ResolveV8ApplySigmoid(
+                    fp32_buffer_.data(), num_boxes, num_classes_, layout.channels_first, v8_cls_activation_);
+                if (debug_det_ && processed_ < 5) {
+                    LogInfo("[ai_yolo] v8 cls activation(copy)=" + std::string(apply_sigmoid ? "sigmoid" : "none"));
+                }
+                valid_count = ProcessOutputV8(fp32_buffer_.data(),
+                                              num_boxes, num_classes_,
+                                              model_input_h_, model_input_w_,
+                                              boxes, obj_probs, class_ids, conf_thresh_,
+                                              layout.channels_first, v8_box_format_, apply_sigmoid,
+                                              debug_det_, &debug_decode_left);
             } else {
                 valid_count = ProcessOutputV8Int8(reinterpret_cast<int8_t*>(outputs[0].data.data()),
                                                   num_boxes, num_classes_,
                                                   model_input_h_, model_input_w_,
                                                   boxes, obj_probs, class_ids, conf_thresh_,
-                                                  outputs[0].zp, outputs[0].scale);
+                                                  outputs[0].zp, outputs[0].scale,
+                                                  layout.channels_first, v8_box_format_,
+                                                  debug_det_, &debug_decode_left);
             }
         }
 
@@ -713,12 +1101,11 @@ private:
             NMS(valid_count, boxes, class_ids, indices, c, nms_thresh_);
         }
 
-        float scale_w = static_cast<float>(model_input_w_) / frame->width;
-        float scale_h = static_cast<float>(model_input_h_) / frame->height;
+        const DetCoordContext coord_ctx = BuildDetCoordContext(*frame, model_input_w_, model_input_h_);
 
         auto det_result = std::make_shared<DetectionResult>();
-        det_result->img_w = frame->width;
-        det_result->img_h = frame->height;
+        det_result->img_w = coord_ctx.out_w;
+        det_result->img_h = coord_ctx.out_h;
         det_result->model_name = (yolo_version_ == YoloVersion::V5) ? "yolov5" : "yolov8";
 
         for (int i = 0; i < valid_count && det_result->items.size() < kMaxDetections; ++i) {
@@ -738,23 +1125,23 @@ private:
             Detection det;
             det.cls_id = cls_id;
             det.score = obj_probs[i];
-            det.bbox.x = static_cast<float>(Clamp(static_cast<int>(x1 / scale_w), 0, frame->width));
-            det.bbox.y = static_cast<float>(Clamp(static_cast<int>(y1 / scale_h), 0, frame->height));
-            det.bbox.w = static_cast<float>(Clamp(static_cast<int>(w / scale_w), 0, frame->width - static_cast<int>(det.bbox.x)));
-            det.bbox.h = static_cast<float>(Clamp(static_cast<int>(h / scale_h), 0, frame->height - static_cast<int>(det.bbox.y)));
+            det.bbox = DecodeToOutputRect(x1, y1, w, h, coord_ctx);
             det.track_id = -1;
 
-            // Debug output for first few detections
-            if (debug_det_ && det_result->items.size() < 3 && processed_ < 10) {
-                LogDebug("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
-                         std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" +
-                         std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," +
-                         std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" +
-                         std::to_string(cls_id) + " score=" + std::to_string(det.score));
+            if (debug_det_ && det_result->items.size() < 5 && processed_ < 20) {
+                LogInfo("[ai_yolo] det: raw(" + std::to_string(x1) + "," + std::to_string(y1) + "," +
+                        std::to_string(w) + "," + std::to_string(h) + ") -> bbox(" +
+                        std::to_string(det.bbox.x) + "," + std::to_string(det.bbox.y) + "," +
+                        std::to_string(det.bbox.w) + "," + std::to_string(det.bbox.h) + ") cls=" +
+                        std::to_string(cls_id) + " score=" + std::to_string(det.score));
             }
 
             det_result->items.push_back(det);
         }
+        if (debug_det_ && processed_ < 20) {
+            LogInfo("[ai_yolo] det summary(copy): valid_count=" + std::to_string(valid_count) +
+                    " final=" + std::to_string(det_result->items.size()));
+        }
 
         frame->det = det_result;
     }
@@ -767,6 +1154,8 @@ private:
     int model_input_w_ = 640;
     int model_input_h_ = 640;
     int num_classes_ = 80;
+    V8BoxFormat v8_box_format_ = V8BoxFormat::CxCyWh;
+    V8ClsActivation v8_cls_activation_ = V8ClsActivation::Auto;
     YoloVersion yolo_version_ = YoloVersion::V8;
     bool auto_detect_version_ = false;
     std::set<int> class_filter_;
diff --git a/plugins/osd/osd_node.cpp b/plugins/osd/osd_node.cpp
index 4841924..d3dfed1 100644
--- a/plugins/osd/osd_node.cpp
+++ b/plugins/osd/osd_node.cpp
@@ -63,6 +63,20 @@ inline int Clamp(int val, int min_val, int max_val) {
     return val < min_val ? min_val : (val > max_val ? max_val : val);
 }
 
+Rect MapRectToFrame(const Rect& in, int src_w, int src_h, int dst_w, int dst_h) {  // src-space rect -> dst frame
+    if (src_w <= 0 || src_h <= 0 || dst_w <= 0 || dst_h <= 0) return Rect{};  // unusable size: empty rect
+    const float sx = static_cast<float>(dst_w) / static_cast<float>(src_w);
+    const float sy = static_cast<float>(dst_h) / static_cast<float>(src_h);
+    Rect out{};
+    out.x = std::max(0.0f, in.x * sx);
+    out.y = std::max(0.0f, in.y * sy);
+    out.w = std::max(0.0f, in.w * sx + std::min(0.0f, in.x * sx));  // a negative origin trims the extent by the
+    out.h = std::max(0.0f, in.h * sy + std::min(0.0f, in.y * sy));  // part cut off, instead of sliding the box
+    if (out.x + out.w > static_cast<float>(dst_w)) out.w = std::max(0.0f, static_cast<float>(dst_w) - out.x);
+    if (out.y + out.h > static_cast<float>(dst_h)) out.h = std::max(0.0f, static_cast<float>(dst_h) - out.y);
+    return out;
+}
+
 #if defined(RK3588_ENABLE_RGA)
 inline uint32_t PackColorArgb(const Color& c) {
     return (0xFFu << 24) | (static_cast<uint32_t>(c.r) << 16) |
@@ -561,6 +575,9 @@ private:
 
         int w = frame->width;
         int h = frame->height;
+        const int det_w = frame->det->img_w > 0 ? frame->det->img_w : w;
+        const int det_h = frame->det->img_h > 0 ? frame->det->img_h : h;
+        const bool map_det_to_frame = (det_w != w || det_h != h);
         uint8_t* data = frame->planes[0].data ? frame->planes[0].data : frame->data;
         PixelFormat fmt = frame->format;
 
@@ -587,10 +604,11 @@ private:
                     } else {
                         bool ok = true;
                         for (const auto& det : frame->det->items) {
-                            int x = Clamp(static_cast<int>(det.bbox.x), 0, w - 1);
-                            int y = Clamp(static_cast<int>(det.bbox.y), 0, h - 1);
-                            int rw = static_cast<int>(det.bbox.w);
-                            int rh = static_cast<int>(det.bbox.h);
+                            const Rect draw = map_det_to_frame ? MapRectToFrame(det.bbox, det_w, det_h, w, h) : det.bbox;
+                            int x = Clamp(static_cast<int>(draw.x), 0, w - 1);
+                            int y = Clamp(static_cast<int>(draw.y), 0, h - 1);
+                            int rw = static_cast<int>(draw.w);
+                            int rh = static_cast<int>(draw.h);
                             rw = Clamp(rw, 1, w - x);
                             rh = Clamp(rh, 1, h - y);
                             im_rect rect{x, y, rw, rh};
@@ -636,10 +654,11 @@ private:
         }
 
         for (const auto& det : frame->det->items) {
-            int x1 = static_cast<int>(det.bbox.x);
-            int y1 = static_cast<int>(det.bbox.y);
-            int x2 = static_cast<int>(det.bbox.x + det.bbox.w);
-            int y2 = static_cast<int>(det.bbox.y + det.bbox.h);
+            const Rect draw = map_det_to_frame ? MapRectToFrame(det.bbox, det_w, det_h, w, h) : det.bbox;
+            int x1 = static_cast<int>(draw.x);
+            int y1 = static_cast<int>(draw.y);
+            int x2 = static_cast<int>(draw.x + draw.w);
+            int y2 = static_cast<int>(draw.y + draw.h);
 
             Color color = GetClassColor(det.cls_id);
 
diff --git a/plugins/preprocess/preprocess_node.cpp b/plugins/preprocess/preprocess_node.cpp
index 36538c7..d55ad38 100644
--- a/plugins/preprocess/preprocess_node.cpp
+++ b/plugins/preprocess/preprocess_node.cpp
@@ -1,16 +1,27 @@
 #include <algorithm>
+#include <cmath>
 #include <cstdint>
+#include <cstring>
+#include <memory>
 #include <string>
 #include <vector>
 
+#include "face/face_result.h"
 #include "hw/i_image_processor.h"
 #include "node.h"
+#include "utils/dma_alloc.h"
 #include "utils/logger.h"
 
 namespace rk3588 {
 
 namespace {
 
+enum class ResizeMode {  // How PreprocessNode maps the input frame onto the configured dst_w x dst_h.
+    Stretch,    // resize straight to dst_w x dst_h, ignoring the source aspect ratio
+    KeepRatio,  // uniform scale that fits inside dst_w x dst_h; the output frame itself is smaller, no padding
+    Letterbox,  // uniform scale, then placed on a black-filled dst_w x dst_h canvas
+};
+
 PixelFormat ParseFormat(const std::string& s) {
     if (s == "nv12" || s == "NV12") return PixelFormat::NV12;
     if (s == "yuv420" || s == "YUV420") return PixelFormat::YUV420;
@@ -19,6 +30,229 @@ PixelFormat ParseFormat(const std::string& s) {
     return PixelFormat::UNKNOWN;
 }
 
+// Maps the "resize_mode" string to a ResizeMode; unrecognised text (including "") defers to keep_ratio.
+ResizeMode ParseResizeMode(const std::string& s, bool keep_ratio) {
+    if (s == "letterbox") return ResizeMode::Letterbox;
+    if (s == "stretch") return ResizeMode::Stretch;
+    return (keep_ratio || s == "fit" || s == "keep_ratio") ? ResizeMode::KeepRatio : ResizeMode::Stretch;
+}
+
+// Returns true when `fmt` is NV12 or YUV420.
+// Every other value, UNKNOWN included, yields false.
+inline bool IsYuvFormat(PixelFormat fmt) { return fmt == PixelFormat::YUV420 || fmt == PixelFormat::NV12; }
+
+// Limits `v` to the closed interval [lo, hi].
+// With inverted bounds (lo > hi) the lower bound wins and `lo` is returned.
+inline float ClampFloat(float v, float lo, float hi) { return (v > hi) ? std::max(lo, hi) : std::max(lo, v); }
+
+// Rounds a positive `v` down to the nearest even integer; zero and negative inputs collapse to 0.
+inline int MakeEvenFloor(int v) {
+    return (v > 0) ? (v - (v % 2)) : 0;
+}
+
+// Byte size of a tightly packed w x h image: 3/2 bytes per pixel for NV12/YUV420, 3 for RGB/BGR.
+// Returns 0 for non-positive dimensions or any other format.
+size_t CalcImageSize(int w, int h, PixelFormat fmt) {
+    if (w <= 0 || h <= 0) return 0;
+    const size_t pixels = static_cast<size_t>(w) * static_cast<size_t>(h);
+    if (fmt == PixelFormat::NV12 || fmt == PixelFormat::YUV420) {
+        return pixels * 3 / 2;
+    }
+    if (fmt == PixelFormat::RGB || fmt == PixelFormat::BGR) {
+        return pixels * 3;
+    }
+    return 0;
+}
+
+void SetupPlanes(Frame& f) {  // Describes a tightly packed buffer at f.data: sets f.stride, f.plane_count, f.planes[].
+    if (!f.data || f.width <= 0 || f.height <= 0) return;  // no buffer or no area: leave f untouched
+    if (f.format == PixelFormat::NV12) {
+        const int y_stride = f.width;
+        const int y_size = y_stride * f.height;
+        const int uv_size = y_stride * (f.height / 2);  // UV rows are as wide as Y rows, half as many of them
+        f.stride = y_stride;
+        f.plane_count = 2;
+        f.planes[0] = {f.data, y_stride, y_size, 0};  // presumably {data, stride, size, offset} — confirm vs FramePlane
+        f.planes[1] = {f.data + y_size, y_stride, uv_size, y_size};  // UV plane starts right after Y
+    } else if (f.format == PixelFormat::YUV420) {
+        const int y_stride = f.width;
+        const int y_size = y_stride * f.height;
+        const int uv_stride = f.width / 2;  // U and V rows are half the Y width
+        const int u_size = uv_stride * (f.height / 2);  // also used as the V plane size
+        f.stride = y_stride;
+        f.plane_count = 3;
+        f.planes[0] = {f.data, y_stride, y_size, 0};
+        f.planes[1] = {f.data + y_size, uv_stride, u_size, y_size};
+        f.planes[2] = {f.data + y_size + u_size, uv_stride, u_size, y_size + u_size};
+    } else {  // every other format is described as one packed plane of 3 bytes per pixel
+        const int stride = f.width * 3;
+        f.stride = stride;
+        f.plane_count = 1;
+        f.planes[0] = {f.data, stride, static_cast<int>(f.data_size), 0};  // the single plane spans all of data_size
+    }
+    f.SyncBufferFromFrame();  // NOTE(review): presumably mirrors these fields into f.buffer — confirm in Frame
+}
+
+bool InitFrameStorage(Frame& f) {  // Allocates pixel storage for f's current width/height/format and sets up planes.
+    const size_t need = CalcImageSize(f.width, f.height, f.format);
+    if (need == 0) return false;  // non-positive size or a format CalcImageSize does not handle
+
+    if (auto dma = DmaAlloc(need); dma && dma->valid()) {  // first choice: a DMA buffer
+        f.SetDmaFd(dma->fd);
+        f.data = dma->data();
+        f.data_size = dma->size;  // NOTE(review): may differ from `need` if DmaAlloc rounds the size — confirm
+        f.SetOwner(dma);  // presumably keeps the allocation alive as long as the frame — verify against Frame
+        SetupPlanes(f);
+        return true;
+    }
+
+    auto buf = std::make_shared<std::vector<uint8_t>>(need);  // fallback: zero-initialised heap buffer
+    f.SetDmaFd(-1);  // -1 = not DMA-backed; the helpers in this file test DmaFd() >= 0 before syncing
+    f.data = buf->data();
+    f.data_size = buf->size();
+    f.SetOwner(buf);
+    SetupPlanes(f);
+    return true;
+}
+
+// Paints the whole frame black: zero luma plus neutral chroma (128) for NV12/YUV420, all-zero bytes otherwise.
+// Plane offsets are derived from width/height alone, i.e. a tightly packed buffer is assumed.
+void FillBlack(Frame& f) {
+    if (!f.data || f.data_size == 0) return;
+    const int dma_fd = f.DmaFd();
+    if (dma_fd >= 0) DmaSyncStartFd(dma_fd);
+    if (f.format == PixelFormat::YUV420) {
+        const int luma_bytes = f.width * f.height;
+        const int chroma_bytes = (f.width / 2) * (f.height / 2);
+        auto* const u_plane = f.data + luma_bytes;
+        std::memset(f.data, 0, static_cast<size_t>(luma_bytes));
+        std::memset(u_plane, 128, static_cast<size_t>(chroma_bytes));
+        std::memset(u_plane + chroma_bytes, 128, static_cast<size_t>(chroma_bytes));
+    } else if (f.format == PixelFormat::NV12) {
+        const int luma_bytes = f.width * f.height;
+        std::memset(f.data, 0, static_cast<size_t>(luma_bytes));
+        std::memset(f.data + luma_bytes, 128, static_cast<size_t>(f.width * f.height / 2));
+    } else {
+        std::memset(f.data, 0, f.data_size);
+    }
+    if (dma_fd >= 0) DmaSyncEndFd(dma_fd);
+}
+
+// Copies `src` into `dst` with its top-left corner at (pad_x, pad_y); both frames must share one pixel format.
+// Returns false without writing anything when a buffer is missing, the formats differ or are not RGB/BGR/NV12/
+// YUV420, a pad is negative, or the padded source does not fit in `dst`. Pixels outside the copy are untouched.
+bool BlitLetterbox(const Frame& src, Frame& dst, int pad_x, int pad_y) {
+    if (!src.data || !dst.data || src.format != dst.format) return false;
+    if (pad_x < 0 || pad_y < 0 || src.width + pad_x > dst.width || src.height + pad_y > dst.height) return false;
+    const bool packed_rgb = (src.format == PixelFormat::RGB || src.format == PixelFormat::BGR);
+    if (!packed_rgb && !IsYuvFormat(src.format)) return false;  // unsupported: bail out before any cache sync
+
+    if (src.DmaFd() >= 0) src.SyncStart();
+    if (dst.DmaFd() >= 0) DmaSyncStartFd(dst.DmaFd());
+
+    if (packed_rgb) {
+        const int src_stride = src.planes[0].stride > 0 ? src.planes[0].stride : src.width * 3;
+        const int dst_stride = dst.planes[0].stride > 0 ? dst.planes[0].stride : dst.width * 3;
+        const uint8_t* src_ptr = src.planes[0].data ? src.planes[0].data : src.data;
+        uint8_t* dst_ptr = dst.planes[0].data ? dst.planes[0].data : dst.data;
+        const size_t row_bytes = static_cast<size_t>(src.width) * 3;
+        for (int y = 0; y < src.height; ++y) {
+            std::memcpy(dst_ptr + static_cast<size_t>(y + pad_y) * dst_stride + static_cast<size_t>(pad_x) * 3,
+                        src_ptr + static_cast<size_t>(y) * src_stride,
+                        row_bytes);
+        }
+    } else if (src.format == PixelFormat::NV12) {
+        const int src_y_stride = src.planes[0].stride > 0 ? src.planes[0].stride : src.width;
+        const int src_uv_stride = src.planes[1].stride > 0 ? src.planes[1].stride : src.width;
+        const int dst_y_stride = dst.planes[0].stride > 0 ? dst.planes[0].stride : dst.width;
+        const int dst_uv_stride = dst.planes[1].stride > 0 ? dst.planes[1].stride : dst.width;
+        const uint8_t* src_y = src.planes[0].data ? src.planes[0].data : src.data;
+        const uint8_t* src_uv = src.planes[1].data ? src.planes[1].data : (src.data + src.width * src.height);
+        uint8_t* dst_y = dst.planes[0].data ? dst.planes[0].data : dst.data;
+        uint8_t* dst_uv = dst.planes[1].data ? dst.planes[1].data : (dst.data + dst.width * dst.height);
+        // Chroma is stored as interleaved U,V byte pairs, so its byte offset must stay even: an odd pad_x used
+        // directly would start on a V byte and swap the two channels across the whole image.
+        const int uv_pad_x = (pad_x / 2) * 2;
+        for (int y = 0; y < src.height; ++y) {
+            std::memcpy(dst_y + static_cast<size_t>(y + pad_y) * dst_y_stride + pad_x,
+                        src_y + static_cast<size_t>(y) * src_y_stride,
+                        static_cast<size_t>(src.width));
+        }
+        const int uv_rows = src.height / 2;
+        for (int y = 0; y < uv_rows; ++y) {
+            std::memcpy(dst_uv + static_cast<size_t>(y + pad_y / 2) * dst_uv_stride + uv_pad_x,
+                        src_uv + static_cast<size_t>(y) * src_uv_stride,
+                        static_cast<size_t>(src.width));
+        }
+    } else if (src.format == PixelFormat::YUV420) {
+        const int src_y_stride = src.planes[0].stride > 0 ? src.planes[0].stride : src.width;
+        const int src_u_stride = src.planes[1].stride > 0 ? src.planes[1].stride : src.width / 2;
+        const int src_v_stride = src.planes[2].stride > 0 ? src.planes[2].stride : src.width / 2;
+        const int dst_y_stride = dst.planes[0].stride > 0 ? dst.planes[0].stride : dst.width;
+        const int dst_u_stride = dst.planes[1].stride > 0 ? dst.planes[1].stride : dst.width / 2;
+        const int dst_v_stride = dst.planes[2].stride > 0 ? dst.planes[2].stride : dst.width / 2;
+        const uint8_t* src_y = src.planes[0].data ? src.planes[0].data : src.data;
+        const uint8_t* src_u = src.planes[1].data ? src.planes[1].data : (src.data + src.width * src.height);
+        const uint8_t* src_v = src.planes[2].data ? src.planes[2].data : (src_u + (src.width / 2) * (src.height / 2));
+        uint8_t* dst_y = dst.planes[0].data ? dst.planes[0].data : dst.data;
+        uint8_t* dst_u = dst.planes[1].data ? dst.planes[1].data : (dst.data + dst.width * dst.height);
+        uint8_t* dst_v = dst.planes[2].data ? dst.planes[2].data : (dst_u + (dst.width / 2) * (dst.height / 2));
+        for (int y = 0; y < src.height; ++y) {
+            std::memcpy(dst_y + static_cast<size_t>(y + pad_y) * dst_y_stride + pad_x,
+                        src_y + static_cast<size_t>(y) * src_y_stride,
+                        static_cast<size_t>(src.width));
+        }
+
+        const int uv_rows = src.height / 2;
+        const int uv_pad_x = pad_x / 2;
+        const int uv_pad_y = pad_y / 2;
+        const int uv_cols = src.width / 2;
+        for (int y = 0; y < uv_rows; ++y) {
+            std::memcpy(dst_u + static_cast<size_t>(y + uv_pad_y) * dst_u_stride + uv_pad_x,
+                        src_u + static_cast<size_t>(y) * src_u_stride,
+                        static_cast<size_t>(uv_cols));
+            std::memcpy(dst_v + static_cast<size_t>(y + uv_pad_y) * dst_v_stride + uv_pad_x,
+                        src_v + static_cast<size_t>(y) * src_v_stride,
+                        static_cast<size_t>(uv_cols));
+        }
+    }
+
+    if (dst.DmaFd() >= 0) DmaSyncEndFd(dst.DmaFd());
+    if (src.DmaFd() >= 0) src.SyncEnd();
+    return true;
+}
+
+// Maps `r` in place via x' = x * sx + tx, y' = y * sy + ty (extent scaled by sx / sy) and clips the result to
+// [0, out_w] x [0, out_h]. A non-positive output size resets `r` to Rect{}.
+void TransformRect(Rect& r, float sx, float sy, float tx, float ty, int out_w, int out_h) {
+    if (out_w <= 0 || out_h <= 0) {
+        r = Rect{};
+        return;
+    }
+    const float fw = static_cast<float>(out_w);
+    const float fh = static_cast<float>(out_h);
+    const float raw_x = r.x * sx + tx;
+    const float raw_y = r.y * sy + ty;
+    // Area left of / above the frame is cut from the extent; clamping only the corner would shift the box inward.
+    float w = std::max(0.0f, r.w * sx + std::min(0.0f, raw_x));
+    float h = std::max(0.0f, r.h * sy + std::min(0.0f, raw_y));
+    r.x = ClampFloat(raw_x, 0.0f, fw);
+    r.y = ClampFloat(raw_y, 0.0f, fh);
+    if (r.x + w > fw) w = std::max(0.0f, fw - r.x);
+    if (r.y + h > fh) h = std::max(0.0f, fh - r.y);
+    r.w = w;
+    r.h = h;
+}
+
+// Maps `p` in place via (x * sx + tx, y * sy + ty), clamped to [0, out_w] x [0, out_h]; bad sizes reset it.
+void TransformPoint(Point2f& p, float sx, float sy, float tx, float ty, int out_w, int out_h) {
+    if (!(out_w > 0 && out_h > 0)) { p = Point2f{}; return; }
+    const float max_x = static_cast<float>(out_w);
+    const float max_y = static_cast<float>(out_h);
+    p.x = ClampFloat(p.x * sx + tx, 0.0f, max_x);
+    p.y = ClampFloat(p.y * sy + ty, 0.0f, max_y);
+}
+
 }  // namespace
 
 class PreprocessNode : public INode {
@@ -31,6 +265,7 @@ public:
         dst_w_ = config.ValueOr<int>("dst_w", 640);
         dst_h_ = config.ValueOr<int>("dst_h", 640);
         keep_ratio_ = config.ValueOr<bool>("keep_ratio", false);
+        resize_mode_ = ParseResizeMode(config.ValueOr<std::string>("resize_mode", ""), keep_ratio_);
 
         std::string fmt_str = config.ValueOr<std::string>("dst_format", "");
         if (!fmt_str.empty()) {
@@ -81,8 +316,11 @@ public:
     }
 
     bool Start() override {
+        std::string mode = "stretch";
+        if (resize_mode_ == ResizeMode::KeepRatio) mode = "keep_ratio";
+        if (resize_mode_ == ResizeMode::Letterbox) mode = "letterbox";
         LogInfo("[preprocess] start id=" + id_ + " dst=" + std::to_string(dst_w_) + "x" +
-                std::to_string(dst_h_) + (use_rga_ ? " (rga)" : " (swscale)"));
+                std::to_string(dst_h_) + " mode=" + mode + (use_rga_ ? " (rga)" : " (swscale)"));
         return true;
     }
 
@@ -95,52 +333,80 @@ public:
         PixelFormat out_fmt = (dst_fmt_ != PixelFormat::UNKNOWN) ? dst_fmt_ : frame->format;
         int out_w = dst_w_;
         int out_h = dst_h_;
-
         if (out_w <= 0) out_w = frame->width;
         if (out_h <= 0) out_h = frame->height;
 
-        if (keep_ratio_ && dst_w_ > 0 && dst_h_ > 0 && frame->width > 0 && frame->height > 0) {
-            float scale = std::min(static_cast<float>(dst_w_) / frame->width,
-                                   static_cast<float>(dst_h_) / frame->height);
-            out_w = static_cast<int>(frame->width * scale);
-            out_h = static_cast<int>(frame->height * scale);
-            out_w = (out_w + 1) & ~1;
-            out_h = (out_h + 1) & ~1;
-        }
-
-        const bool need_resize = (frame->width != out_w || frame->height != out_h);
-        const bool need_cvt = (frame->format != out_fmt);
-
-        if (need_resize) {
-            WarnMetaResizeOnce(frame, out_w, out_h);
-        }
-
-        if (!need_resize && !need_cvt) {
-            ProcessPassthrough(frame);
-            return NodeStatus::OK;
-        }
+        FrameTransformMeta tx{};
+        tx.valid = true;
+        tx.src_w = frame->width;
+        tx.src_h = frame->height;
 
         Frame out;
-        out.width = out_w;
-        out.height = out_h;
-        out.format = out_fmt;
+        if (resize_mode_ == ResizeMode::Letterbox && dst_w_ > 0 && dst_h_ > 0 &&
+            frame->width > 0 && frame->height > 0) {
+            Status lb = BuildLetterbox(*frame, out_fmt, out_w, out_h, out, tx);
+            if (lb.Failed()) {
+                LogError("[preprocess] letterbox failed: " + lb.ErrMessage());
+                return NodeStatus::ERROR;
+            }
+        } else {
+            if (resize_mode_ == ResizeMode::KeepRatio && dst_w_ > 0 && dst_h_ > 0 &&
+                frame->width > 0 && frame->height > 0) {
+                float scale = std::min(static_cast<float>(dst_w_) / frame->width,
+                                       static_cast<float>(dst_h_) / frame->height);
+                out_w = static_cast<int>(std::round(frame->width * scale));
+                out_h = static_cast<int>(std::round(frame->height * scale));
+                if (IsYuvFormat(out_fmt)) {
+                    out_w = std::max(2, MakeEvenFloor(out_w));
+                    out_h = std::max(2, MakeEvenFloor(out_h));
+                }
+                if (out_w <= 0) out_w = frame->width;
+                if (out_h <= 0) out_h = frame->height;
+            }
 
-        Status st = image_processor_->Resize(*frame, out);
-        if (st.Failed()) {
-            if (!use_rga_ && st.ErrMessage().find("unsupported format") != std::string::npos) {
+            const bool need_resize = (frame->width != out_w || frame->height != out_h);
+            const bool need_cvt = (frame->format != out_fmt);
+
+            tx.letterbox = false;
+            tx.dst_w = out_w;
+            tx.dst_h = out_h;
+            tx.scale_x = frame->width > 0 ? static_cast<float>(out_w) / frame->width : 1.0f;
+            tx.scale_y = frame->height > 0 ? static_cast<float>(out_h) / frame->height : 1.0f;
+            tx.pad_x = 0.0f;
+            tx.pad_y = 0.0f;
+
+            if (!need_resize && !need_cvt) {
+                auto t = std::make_shared<FrameTransformMeta>(tx);
+                frame->transform_meta = t;
                 ProcessPassthrough(frame);
                 return NodeStatus::OK;
             }
-            LogError("[preprocess] " + st.ErrMessage());
-            return NodeStatus::ERROR;
+
+            if (need_resize) {
+                WarnMetaResizeOnce(frame, out_w, out_h);
+            }
+
+            out.width = out_w;
+            out.height = out_h;
+            out.format = out_fmt;
+            Status st = image_processor_->Resize(*frame, out);
+            if (st.Failed()) {
+                if (!use_rga_ && st.ErrMessage().find("unsupported format") != std::string::npos) {
+                    auto t = std::make_shared<FrameTransformMeta>(tx);
+                    frame->transform_meta = t;
+                    ProcessPassthrough(frame);
+                    return NodeStatus::OK;
+                }
+                LogError("[preprocess] " + st.ErrMessage());
+                return NodeStatus::ERROR;
+            }
         }
 
         auto out_frame = std::make_shared<Frame>(out);
         out_frame->pts = frame->pts;
         out_frame->frame_id = frame->frame_id;
-        out_frame->det = frame->det;
-        out_frame->face_det = frame->face_det;
-        out_frame->face_recog = frame->face_recog;
+        out_frame->transform_meta = std::make_shared<FrameTransformMeta>(tx);
+        ScaleMeta(*frame, *out_frame, tx);
         out_frame->user_meta = frame->user_meta;
 
         PushToDownstream(out_frame);
@@ -150,7 +416,7 @@ public:
             LogInfo("[preprocess] " + std::string(use_rga_ ? "rga" : "swscale") +
                     " frame=" + std::to_string(out_frame->frame_id) +
                     " " + std::to_string(frame->width) + "x" + std::to_string(frame->height) +
-                    " -> " + std::to_string(out_w) + "x" + std::to_string(out_h) +
+                    " -> " + std::to_string(out_frame->width) + "x" + std::to_string(out_frame->height) +
                     " id=" + id_);
         }
 
@@ -158,6 +424,142 @@ public:
     }
 
 private:
+    // Letterboxes `input` into a dst_w x dst_h frame of `out_fmt`: aspect-preserving resize, centred on a canvas
+    // cleared by FillBlack. Writes `tx` (but never tx.valid) only on success; returns the first failing Status.
+    Status BuildLetterbox(const Frame& input, PixelFormat out_fmt, int dst_w, int dst_h, Frame& out,
+                          FrameTransformMeta& tx) const {
+        Frame src = input;
+        if (input.format != out_fmt) {  // convert first so the resize below runs entirely in out_fmt
+            src.width = input.width;
+            src.height = input.height;
+            src.format = out_fmt;
+            Status cvt = image_processor_->CvtColor(input, src, out_fmt);
+            if (cvt.Failed()) return cvt;
+        }
+        if (src.width <= 0 || src.height <= 0 || dst_w <= 0 || dst_h <= 0) {
+            return FailStatus("invalid letterbox size");
+        }
+        float scale = std::min(static_cast<float>(dst_w) / static_cast<float>(src.width),
+                               static_cast<float>(dst_h) / static_cast<float>(src.height));  // largest uniform fit
+        int inner_w = std::max(1, static_cast<int>(std::round(src.width * scale)));
+        int inner_h = std::max(1, static_cast<int>(std::round(src.height * scale)));
+        if (IsYuvFormat(out_fmt)) {
+            inner_w = std::max(2, MakeEvenFloor(inner_w));
+            inner_h = std::max(2, MakeEvenFloor(inner_h));
+            // Keep the pad offset (dst - inner) / 2 even too -- presumably for subsampled chroma in BlitLetterbox.
+            // Test the offset itself: shrinking inner by 2 shifts it by 1 but never changes parity of (dst - inner).
+            if ((((dst_w - inner_w) / 2) & 1) != 0) inner_w = std::max(2, inner_w - 2);
+            if ((((dst_h - inner_h) / 2) & 1) != 0) inner_h = std::max(2, inner_h - 2);
+        }
+        if (inner_w <= 0 || inner_h <= 0 || inner_w > dst_w || inner_h > dst_h) {
+            return FailStatus("invalid inner letterbox size");
+        }
+        Frame resized;
+        resized.width = inner_w;
+        resized.height = inner_h;
+        resized.format = out_fmt;
+        Status st = image_processor_->Resize(src, resized);
+        if (st.Failed()) return st;
+        out.width = dst_w;
+        out.height = dst_h;
+        out.format = out_fmt;
+        if (!InitFrameStorage(out)) {
+            return FailStatus("alloc letterbox output failed");
+        }
+        FillBlack(out);
+        // Centre the resized image; integer division puts any odd leftover pixel on the right / bottom edge.
+        const int pad_x = (dst_w - inner_w) / 2;
+        const int pad_y = (dst_h - inner_h) / 2;
+        if (!BlitLetterbox(resized, out, pad_x, pad_y)) {
+            return FailStatus("blit letterbox failed");
+        }
+        // Scales come from the final inner size, so they already include the even-size adjustments above.
+        tx.letterbox = true;
+        tx.src_w = input.width;
+        tx.src_h = input.height;
+        tx.dst_w = dst_w;
+        tx.dst_h = dst_h;
+        tx.scale_x = static_cast<float>(inner_w) / static_cast<float>(input.width);
+        tx.scale_y = static_cast<float>(inner_h) / static_cast<float>(input.height);
+        tx.pad_x = static_cast<float>(pad_x);
+        tx.pad_y = static_cast<float>(pad_y);
+        return OkStatus();
+    }
+
+    // Carries the det / face_det / face_recog results of `in_frame` over to `out_frame`, remapping each bbox
+    // (and each landmark when has_landmarks is set) with the scale and padding held in `tx`. Every result is
+    // copy-constructed before it is edited, and its img_w / img_h are reset to the output frame size.
+    void ScaleMeta(const Frame& in_frame, Frame& out_frame, const FrameTransformMeta& tx) const {
+        const int frame_w = in_frame.width;
+        const int frame_h = in_frame.height;
+
+        // Per-axis factor handed to TransformRect / TransformPoint together with tx.pad_x / tx.pad_y.
+        // It chains metadata size -> input frame size -> `tx` scale. A result whose recorded image size
+        // is not positive falls back to the input frame size; if that is not positive either, 1.0 is used.
+        const auto axis_factor = [](int meta_dim, int frame_dim, float tx_scale) -> float {
+            const int ref_dim = meta_dim > 0 ? meta_dim : frame_dim;
+            const float to_frame = ref_dim > 0 ? static_cast<float>(frame_dim) / ref_dim : 1.0f;
+            return tx_scale * to_frame;
+        };
+
+        // Detection results: boxes only.
+        if (in_frame.det) {
+            auto scaled = std::make_shared<DetectionResult>(*in_frame.det);
+            const float sx = axis_factor(scaled->img_w, frame_w, tx.scale_x);
+            const float sy = axis_factor(scaled->img_h, frame_h, tx.scale_y);
+            for (auto& obj : scaled->items) {
+                TransformRect(obj.bbox, sx, sy,
+                              tx.pad_x, tx.pad_y,
+                              out_frame.width, out_frame.height);
+            }
+            scaled->img_w = out_frame.width;
+            scaled->img_h = out_frame.height;
+            out_frame.det = std::move(scaled);
+        }
+
+        // Face detection results: boxes plus optional landmarks.
+        if (in_frame.face_det) {
+            auto scaled = std::make_shared<FaceDetResult>(*in_frame.face_det);
+            const float sx = axis_factor(scaled->img_w, frame_w, tx.scale_x);
+            const float sy = axis_factor(scaled->img_h, frame_h, tx.scale_y);
+            for (auto& face : scaled->faces) {
+                TransformRect(face.bbox, sx, sy,
+                              tx.pad_x, tx.pad_y,
+                              out_frame.width, out_frame.height);
+                if (!face.has_landmarks) continue;
+                for (auto& pt : face.landmarks) {
+                    TransformPoint(pt, sx, sy,
+                                   tx.pad_x, tx.pad_y,
+                                   out_frame.width, out_frame.height);
+                }
+            }
+            scaled->img_w = out_frame.width;
+            scaled->img_h = out_frame.height;
+            out_frame.face_det = std::move(scaled);
+        }
+
+        // Face recognition results: same layout as face detection, stored under `items`.
+        if (in_frame.face_recog) {
+            auto scaled = std::make_shared<FaceRecogResult>(*in_frame.face_recog);
+            const float sx = axis_factor(scaled->img_w, frame_w, tx.scale_x);
+            const float sy = axis_factor(scaled->img_h, frame_h, tx.scale_y);
+            for (auto& face : scaled->items) {
+                TransformRect(face.bbox, sx, sy,
+                              tx.pad_x, tx.pad_y,
+                              out_frame.width, out_frame.height);
+                if (!face.has_landmarks) continue;
+                for (auto& pt : face.landmarks) {
+                    TransformPoint(pt, sx, sy,
+                                   tx.pad_x, tx.pad_y,
+                                   out_frame.width, out_frame.height);
+                }
+            }
+            scaled->img_w = out_frame.width;
+            scaled->img_h = out_frame.height;
+            out_frame.face_recog = std::move(scaled);
+        }
+    }
+
     void PushToDownstream(FramePtr frame) {
         for (auto& q : output_queues_) {
             q->Push(frame);
@@ -170,7 +572,7 @@ private:
         if (frame->width == out_w && frame->height == out_h) return;
         if (!frame->det && !frame->face_det && !frame->face_recog) return;
         warned_meta_resize_ = true;
-        LogWarn("[preprocess] resized frame but forwarded det/face meta without coordinate scaling; ensure det/recog/osd use same resolution (id=" + id_ + ")");
+        LogInfo("[preprocess] resized frame and scaled det/face meta to destination resolution (id=" + id_ + ")");
     }
 
     void ProcessPassthrough(FramePtr frame) {
@@ -185,12 +587,12 @@ private:
     int dst_w_ = 640;
     int dst_h_ = 640;
     bool keep_ratio_ = false;
+    ResizeMode resize_mode_ = ResizeMode::Stretch;
     PixelFormat dst_fmt_ = PixelFormat::UNKNOWN;
     bool use_rga_ = true;
 
     bool stats_log_ = false;
     uint64_t stats_interval_ = 100;
-
     bool warned_meta_resize_ = false;
 
     std::shared_ptr<SpscQueue<FramePtr>> input_queue_;