OrangePi3588Media/plugins/ai_face_recog/ai_face_recog_node.cpp

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

// For test image loading
#define STB_IMAGE_IMPLEMENTATION
#define STBI_ONLY_PNG
#define STBI_NO_FAILURE_STRINGS
#include "../../third_party/rknpu2/examples/3rdparty/stb/stb_image.h"

#include "ai_scheduler.h"
#include "face/face_result.h"
#include "node.h"
#include "utils/logger.h"

#if defined(RK3588_ENABLE_SQLITE3)
#include <sqlite3.h>
#endif

namespace rk3588 {

namespace {

// Limits `v` to the interval [lo, hi]. The lower bound is tested first, so
// when lo > hi any value below `lo` maps to `lo` and everything else to `hi`.
inline int ClampInt(int v, int lo, int hi) {
    if (v < lo) return lo;
    if (v > hi) return hi;
    return v;
}

// Reads the complete contents of the file at `path` into `out` as raw bytes.
//
// Returns false when the file cannot be opened, its size cannot be queried,
// or the read does not complete. `out` is left untouched if the file cannot be
// opened or sized; otherwise it is resized to the file size before reading.
bool ReadFileToString(const std::string& path, std::string& out) {
    std::ifstream in(path, std::ios::binary);
    if (in.fail()) return false;

    // Measure the file by seeking to its end.
    in.seekg(0, std::ios::end);
    const std::streamsize length = in.tellg();
    if (length < 0) return false;
    in.seekg(0, std::ios::beg);

    out.resize(static_cast<size_t>(length));
    if (length > 0) {
        in.read(&out[0], length);
        return in.good();
    }
    return true;  // an empty file is a successful read
}

// One gallery record: an embedding together with the person it belongs to.
struct GalleryEntry {
    int person_id{-1};         // -1 until a real id is assigned
    std::string name{};        // person name; empty by default
    std::vector<float> emb{};  // embedding, L2 normalized
};

// Converts an IEEE 754 binary16 bit pattern to a 32-bit float.
// Signed zero, subnormals, infinities and NaN payload bits are all carried over.
inline float HalfToFloat(uint16_t h) {
    // Difference between the float32 (127) and float16 (15) exponent biases.
    constexpr uint32_t kBiasDelta = 127u - 15u;

    const uint32_t sign_bit = static_cast<uint32_t>(h & 0x8000u) << 16;
    const uint32_t half_exp = (h >> 10) & 0x1Fu;
    uint32_t frac = h & 0x03FFu;

    uint32_t bits = sign_bit;  // already correct for +/-0
    if (half_exp == 31u) {
        // Inf (frac == 0) or NaN (frac != 0): all-ones float exponent.
        bits |= 0x7F800000u | (frac << 13);
    } else if (half_exp != 0u) {
        // Normal number: re-bias the exponent and widen the mantissa.
        bits |= ((half_exp + kBiasDelta) << 23) | (frac << 13);
    } else if (frac != 0u) {
        // Subnormal half: shift the mantissa up until its implicit leading one
        // appears, lowering the exponent by one for every shift.
        uint32_t shifts = 0;
        while ((frac & 0x0400u) == 0u) {
            frac <<= 1;
            ++shifts;
        }
        frac &= 0x03FFu;  // drop the now-implicit leading one
        bits |= ((kBiasDelta + 1u - shifts) << 23) | (frac << 13);
    }

    float result;
    std::memcpy(&result, &bits, sizeof(result));
    return result;
}

class FaceGallery {
public:
    void SetExpectedDim(int dim) { expected_dim_ = dim; }
    void SetPreferredDtype(std::string dtype) {
        for (auto& c : dtype) c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
        preferred_dtype_ = std::move(dtype);
    }

    bool LoadSqliteBackend(const std::string& db_path, std::string& err) {
#if defined(RK3588_ENABLE_SQLITE3)
        entries_.clear();
        dim_ = 0;

        sqlite3* db = nullptr;
        if (sqlite3_open_v2(db_path.c_str(), &db, SQLITE_OPEN_READONLY, nullptr) != SQLITE_OK || !db) {
            err = "failed to open sqlite db: " + db_path;
            if (db) sqlite3_close(db);
            return false;
        }

        const char* sql =
            "SELECT p.id, p.name, e.emb "
            "FROM embedding e JOIN person p ON e.person_id = p.id";

        sqlite3_stmt* stmt = nullptr;
        if (sqlite3_prepare_v2(db, sql, -1, &stmt, nullptr) != SQLITE_OK || !stmt) {
            err = "sqlite prepare failed";
            if (stmt) sqlite3_finalize(stmt);
            sqlite3_close(db);
            return false;
        }

        while (sqlite3_step(stmt) == SQLITE_ROW) {
            const int person_id = sqlite3_column_int(stmt, 0);
            const unsigned char* name_u8 = sqlite3_column_text(stmt, 1);
            const void* blob = sqlite3_column_blob(stmt, 2);
            const int blob_sz = sqlite3_column_bytes(stmt, 2);
            if (!blob || blob_sz <= 0) continue;

            const int expected_dim = expected_dim_;
            int dim = 0;
            enum class BlobType { F16, F32, F64 } blob_type = BlobType::F32;

            if (expected_dim > 0) {
                if (blob_sz == expected_dim * 4) {
                    dim = expected_dim;
                    blob_type = BlobType::F32;
                } else if (blob_sz == expected_dim * 2) {
                    dim = expected_dim;
                    blob_type = BlobType::F16;
                } else if (blob_sz == expected_dim * 8) {
                    dim = expected_dim;
                    blob_type = BlobType::F64;
                } else {
                    continue;
                }
            } else {
                if ((blob_sz % 4) != 0) continue;
                dim = blob_sz / 4;
                blob_type = BlobType::F32;
            }

            // Optional dtype preference (only affects ambiguous cases when expected_dim==0).
            if (expected_dim <= 0) {
                if (preferred_dtype_ == "f16" && (blob_sz % 2) == 0) {
                    blob_type = BlobType::F16;
                    dim = blob_sz / 2;
                } else if (preferred_dtype_ == "f64" && (blob_sz % 8) == 0) {
                    blob_type = BlobType::F64;
                    dim = blob_sz / 8;
                }
            }

            if (dim_ == 0) dim_ = dim;
            if (dim != dim_) continue;

            GalleryEntry e;
            e.person_id = person_id;
            e.name = name_u8 ? reinterpret_cast<const char*>(name_u8) : std::string{};
            e.emb.resize(static_cast<size_t>(dim_));
            if (blob_type == BlobType::F32) {
                memcpy(e.emb.data(), blob, static_cast<size_t>(dim_) * sizeof(float));
            } else if (blob_type == BlobType::F16) {
                const uint16_t* hp = reinterpret_cast<const uint16_t*>(blob);
                for (int i = 0; i < dim_; ++i) e.emb[static_cast<size_t>(i)] = HalfToFloat(hp[i]);
            } else {
                const double* dp = reinterpret_cast<const double*>(blob);
                for (int i = 0; i < dim_; ++i) e.emb[static_cast<size_t>(i)] = static_cast<float>(dp[i]);
            }
            L2Normalize(e.emb);

            // DEBUG: 打印数据库中 embedding 的前 8 个值
            {
                std::string dbg = "[FaceGallery] db emb[0:8] for " + e.name + ": ";
                for (int di = 0; di < 8 && di < static_cast<int>(e.emb.size()); ++di) {
                    dbg += std::to_string(e.emb[di]) + " ";
                }
                std::cerr << dbg << "\n";
            }

            entries_.push_back(std::move(e));
        }

        sqlite3_finalize(stmt);
        sqlite3_close(db);
        return true;
#else
        (void)db_path;
        err = "sqlite3 support not enabled at build time";
        return false;
#endif
    }

    bool LoadFileBackend(const std::string& base_path, std::string& err) {
        entries_.clear();
        dim_ = 0;

        const std::string json_path = base_path + ".json";
        const std::string bin_path = base_path + ".bin";

        std::string json_text;
        if (!ReadFileToString(json_path, json_text)) {
            err = "failed to read " + json_path;
            return false;
        }

        SimpleJson root;
        std::string jerr;
        if (!ParseSimpleJson(json_text, root, jerr) || !root.IsObject()) {
            err = "invalid json: " + jerr;
            return false;
        }

        dim_ = root.ValueOr<int>("dim", 0);
        if (dim_ <= 0) {
            err = "gallery dim missing";
            return false;
        }

        const SimpleJson* persons = root.Find("persons");
        if (!persons || !persons->IsArray()) {
            err = "gallery persons missing";
            return false;
        }

        const size_t n = persons->AsArray().size();
        if (n == 0) {
            // Empty gallery is valid.
            return true;
        }

        std::ifstream ifs(bin_path, std::ios::binary);
        if (!ifs) {
            err = "failed to open " + bin_path;
            return false;
        }

        const size_t total_floats = n * static_cast<size_t>(dim_);
        std::vector<float> buf(total_floats);
        ifs.read(reinterpret_cast<char*>(buf.data()), static_cast<std::streamsize>(total_floats * sizeof(float)));
        if (!ifs.good()) {
            err = "failed to read embeddings from " + bin_path;
            return false;
        }

        entries_.reserve(n);
        for (size_t i = 0; i < n; ++i) {
            const SimpleJson& p = persons->AsArray()[i];
            GalleryEntry e;
            e.person_id = p.ValueOr<int>("id", -1);
            e.name = p.ValueOr<std::string>("name", "");
            e.emb.resize(static_cast<size_t>(dim_));
            memcpy(e.emb.data(), buf.data() + i * static_cast<size_t>(dim_), static_cast<size_t>(dim_) * sizeof(float));
            L2Normalize(e.emb);
            entries_.push_back(std::move(e));
        }

        return true;
    }

    int Dim() const { return dim_; }
    size_t Size() const { return entries_.size(); }

    struct SearchResult {
        int best_person_id = -1;
        std::string best_name;
        float best_sim = 0.0f;
        float second_sim = 0.0f;
    };

    SearchResult SearchTop2(const std::vector<float>& emb_normed) const {
        SearchResult r;
        if (entries_.empty() || dim_ <= 0) return r;
        if (static_cast<int>(emb_normed.size()) != dim_) return r;

        float best = -std::numeric_limits<float>::infinity();
        float second = -std::numeric_limits<float>::infinity();
        int best_idx = -1;

        for (size_t i = 0; i < entries_.size(); ++i) {
            const float sim = Dot(emb_normed, entries_[i].emb);
            if (sim > best) {
                second = best;
                best = sim;
                best_idx = static_cast<int>(i);
            } else if (sim > second) {
                second = sim;
            }
        }

        if (best_idx >= 0) {
            r.best_person_id = entries_[static_cast<size_t>(best_idx)].person_id;
            r.best_name = entries_[static_cast<size_t>(best_idx)].name;
            r.best_sim = best;
            r.second_sim = std::isfinite(second) ? second : 0.0f;
        }
        return r;
    }

private:
    static float Dot(const std::vector<float>& a, const std::vector<float>& b) {
        float s = 0.0f;
        for (size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
        return s;
    }

    static void L2Normalize(std::vector<float>& v) {
        double ss = 0.0;
        for (float x : v) ss += static_cast<double>(x) * static_cast<double>(x);
        const double norm = std::sqrt(ss);
        if (norm <= 0.0) return;
        const float inv = static_cast<float>(1.0 / norm);
        for (float& x : v) x *= inv;
    }

    int dim_ = 0;
    int expected_dim_ = 512;
    std::string preferred_dtype_ = "auto";
    std::vector<GalleryEntry> entries_;
};

// 2D similarity transform (rotation + uniform scale + translation):
//   x' = a*x - b*y + c
//   y' = b*x + a*y + d
// Default-constructed as the identity.
struct SimilarityTransform {
    float a{1.0f};
    float b{0.0f};
    float c{0.0f};
    float d{0.0f};
};

// Solves the 4x4 linear system A * x = b by Gauss-Jordan elimination with
// partial pivoting.
//
// `A` and `b` are used as scratch space and are overwritten. Returns false,
// leaving `x` untouched, when a pivot smaller than 1e-8 in magnitude is met
// (the system is treated as singular).
bool Solve4x4(float A[4][4], float b[4], float x[4]) {
    constexpr int kN = 4;
    constexpr float kTiny = 1e-8f;

    for (int col = 0; col < kN; ++col) {
        // Choose the row at or below the diagonal with the largest entry in this column.
        int pivot_row = col;
        float pivot_mag = std::fabs(A[col][col]);
        for (int row = col + 1; row < kN; ++row) {
            const float mag = std::fabs(A[row][col]);
            if (mag > pivot_mag) {
                pivot_mag = mag;
                pivot_row = row;
            }
        }
        if (pivot_mag < kTiny) return false;

        // Bring the pivot row onto the diagonal. Columns left of `col` are
        // already zero in both rows, so only the tail needs swapping.
        if (pivot_row != col) {
            for (int k = col; k < kN; ++k) std::swap(A[col][k], A[pivot_row][k]);
            std::swap(b[col], b[pivot_row]);
        }

        // Scale the pivot row so its diagonal entry becomes 1.
        const float pivot = A[col][col];
        for (int k = col; k < kN; ++k) A[col][k] /= pivot;
        b[col] /= pivot;

        // Clear this column in every other row, above and below the pivot.
        for (int row = 0; row < kN; ++row) {
            if (row == col) continue;
            const float factor = A[row][col];
            if (std::fabs(factor) < kTiny) continue;
            for (int k = col; k < kN; ++k) A[row][k] -= factor * A[col][k];
            b[row] -= factor * b[col];
        }
    }

    // A is now the identity, so b holds the solution.
    for (int k = 0; k < kN; ++k) x[k] = b[k];
    return true;
}

// Fits, in the least-squares sense, the similarity transform that maps the
// five `src` points onto the five `dst` points.
//
// Every correspondence (x, y) -> (u, v) contributes two equations in the
// unknowns (a, b, c, d):
//     a*x - b*y + c = u
//     b*x + a*y + d = v
// The resulting 4x4 normal equations are handed to Solve4x4(). Returns false,
// leaving `out` untouched, when that solve fails.
bool ComputeSimilarity(const std::array<Point2f, 5>& src,
                       const std::array<Point2f, 5>& dst,
                       SimilarityTransform& out) {
    float normal[4][4] = {};  // A^T * A
    float rhs[4] = {};        // A^T * b

    for (size_t k = 0; k < 5; ++k) {
        const float px = src[k].x;
        const float py = src[k].y;
        const float tx = dst[k].x;
        const float ty = dst[k].y;

        // The two design-matrix rows for this point and their targets.
        const float rows[2][4] = {
            {px, -py, 1.0f, 0.0f},
            {py,  px, 0.0f, 1.0f},
        };
        const float targets[2] = {tx, ty};

        for (int e = 0; e < 2; ++e) {
            for (int i = 0; i < 4; ++i) {
                rhs[i] += rows[e][i] * targets[e];
                for (int j = 0; j < 4; ++j) {
                    normal[i][j] += rows[e][i] * rows[e][j];
                }
            }
        }
    }

    // Solve4x4 overwrites its inputs; the accumulators are not needed afterwards.
    float params[4];
    if (!Solve4x4(normal, rhs, params)) return false;

    out.a = params[0];
    out.b = params[1];
    out.c = params[2];
    out.d = params[3];
    return true;
}

// 2x3 affine matrix, stored row by row:
//   [ m00 m01 m02 ]
//   [ m10 m11 m12 ]
// Default-constructed as the identity.
struct InvTransform {
    float m00{1.0f}, m01{0.0f}, m02{0.0f};
    float m10{0.0f}, m11{1.0f}, m12{0.0f};
};

// Computes the inverse of similarity transform `t` as a 2x3 affine matrix.
//
// The linear part [[a, -b], [b, a]] has determinant a^2 + b^2. When that is
// below 1e-12 the transform is treated as non-invertible: the function returns
// false and `inv` is not modified.
bool InvertSimilarity(const SimilarityTransform& t, InvTransform& inv) {
    const float scale_sq = t.a * t.a + t.b * t.b;
    if (scale_sq < 1e-12f) return false;

    const float k = 1.0f / scale_sq;

    // Inverse of the linear part: (1 / det) * [[a, b], [-b, a]].
    inv.m00 =  t.a * k;
    inv.m01 =  t.b * k;
    inv.m10 = -t.b * k;
    inv.m11 =  t.a * k;

    // Inverse translation: minus the inverse linear part applied to (c, d).
    inv.m02 = -(t.a * t.c + t.b * t.d) * k;
    inv.m12 =  (t.b * t.c - t.a * t.d) * k;
    return true;
}

// Samples channel `c` of a packed 3-bytes-per-pixel image at the fractional
// position (x, y) using bilinear interpolation.
//
//   src    - first byte of the image
//   w, h   - image size in pixels
//   stride - bytes between the starts of consecutive rows
//   c      - channel index within the pixel (0..2)
//
// Returns 0 for any position outside [0, w-1] x [0, h-1], including NaN
// coordinates; otherwise the interpolated value rounded to the nearest integer.
inline uint8_t BilinearAt(const uint8_t* src, int w, int h, int stride, float x, float y, int c) {
    // Written as a positive "inside" test so that NaN coordinates (for which
    // every comparison is false) are rejected instead of reaching the
    // float-to-int conversions below.
    const bool inside = x >= 0.0f && y >= 0.0f &&
                        x <= static_cast<float>(w - 1) && y <= static_cast<float>(h - 1);
    if (!inside) return 0;

    // The bounds check guarantees x0 in [0, w-1] and y0 in [0, h-1]; only the
    // +1 neighbours need clamping at the right/bottom edge.
    const int x0 = static_cast<int>(std::floor(x));
    const int y0 = static_cast<int>(std::floor(y));
    const int x1 = std::min(x0 + 1, w - 1);
    const int y1 = std::min(y0 + 1, h - 1);

    const float wx = x - static_cast<float>(x0);
    const float wy = y - static_cast<float>(y0);
    const float w00 = (1.0f - wx) * (1.0f - wy);
    const float w01 = wx * (1.0f - wy);
    const float w10 = (1.0f - wx) * wy;
    const float w11 = wx * wy;

    // Offsets are computed in ptrdiff_t so large images cannot overflow int.
    const std::ptrdiff_t row0 = static_cast<std::ptrdiff_t>(y0) * stride;
    const std::ptrdiff_t row1 = static_cast<std::ptrdiff_t>(y1) * stride;
    const std::ptrdiff_t col0 = static_cast<std::ptrdiff_t>(x0) * 3;
    const std::ptrdiff_t col1 = static_cast<std::ptrdiff_t>(x1) * 3;

    const uint8_t* p00 = src + row0 + col0;
    const uint8_t* p01 = src + row0 + col1;
    const uint8_t* p10 = src + row1 + col0;
    const uint8_t* p11 = src + row1 + col1;

    const float v =
        static_cast<float>(p00[c]) * w00 +
        static_cast<float>(p01[c]) * w01 +
        static_cast<float>(p10[c]) * w10 +
        static_cast<float>(p11[c]) * w11;
    return static_cast<uint8_t>(std::clamp(static_cast<int>(v + 0.5f), 0, 255));
}

// Fills the tightly packed dst_w x dst_h, 3-bytes-per-pixel image `dst` by
// inverse mapping: each destination pixel (x, y) is sent through `inv` to a
// source position, which is sampled from `src` with BilinearAt().
// When `swap_rb` is set, the first and third channels are exchanged on output.
void WarpFace(const uint8_t* src, int w, int h, int stride,
              const InvTransform& inv, uint8_t* dst, int dst_w, int dst_h, bool swap_rb) {
    for (int v = 0; v < dst_h; ++v) {
        const float fv = static_cast<float>(v);
        uint8_t* px = dst + static_cast<size_t>(v) * static_cast<size_t>(dst_w) * 3;

        for (int u = 0; u < dst_w; ++u, px += 3) {
            const float fu = static_cast<float>(u);
            const float sx = inv.m00 * fu + inv.m01 * fv + inv.m02;
            const float sy = inv.m10 * fu + inv.m11 * fv + inv.m12;

            const uint8_t ch0 = BilinearAt(src, w, h, stride, sx, sy, 0);
            const uint8_t ch1 = BilinearAt(src, w, h, stride, sx, sy, 1);
            const uint8_t ch2 = BilinearAt(src, w, h, stride, sx, sy, 2);

            px[0] = swap_rb ? ch2 : ch0;
            px[1] = ch1;
            px[2] = swap_rb ? ch0 : ch2;
        }
    }
}

// Crops the region `bbox` out of `src` and resizes it into the tightly packed
// dst_w x dst_h, 3-bytes-per-pixel image `dst`, sampling with BilinearAt().
//
// The box width and height are raised to at least 1. Destination pixel centres
// are mapped to source coordinates (hence the +0.5 / -0.5 offsets). When
// `swap_rb` is set, the first and third channels are exchanged on output.
void CropResize(const uint8_t* src, int w, int h, int stride,
                const Rect& bbox, uint8_t* dst, int dst_w, int dst_h, bool swap_rb) {
    const float left = bbox.x;
    const float top = bbox.y;
    const float box_w = std::max(1.0f, bbox.w);
    const float box_h = std::max(1.0f, bbox.h);

    for (int v = 0; v < dst_h; ++v) {
        const float sy = top + (static_cast<float>(v) + 0.5f) * (box_h / static_cast<float>(dst_h)) - 0.5f;
        uint8_t* px = dst + static_cast<size_t>(v) * static_cast<size_t>(dst_w) * 3;

        for (int u = 0; u < dst_w; ++u, px += 3) {
            const float sx = left + (static_cast<float>(u) + 0.5f) * (box_w / static_cast<float>(dst_w)) - 0.5f;

            const uint8_t ch0 = BilinearAt(src, w, h, stride, sx, sy, 0);
            const uint8_t ch1 = BilinearAt(src, w, h, stride, sx, sy, 1);
            const uint8_t ch2 = BilinearAt(src, w, h, stride, sx, sy, 2);

            px[0] = swap_rb ? ch2 : ch0;
            px[1] = ch1;
            px[2] = swap_rb ? ch0 : ch2;
        }
    }
}

#if defined(RK3588_ENABLE_RKNN)
// Affine dequantization: maps the quantized value `q` to (q - zp) * scale.
template <typename T>
inline float Dequant(T q, int32_t zp, float scale) {
    const float centered = static_cast<float>(q) - static_cast<float>(zp);
    return centered * scale;
}

// Converts a raw model output tensor into a float embedding.
//
// Supported tensor types are FLOAT32 and FLOAT16 (copied / widened) and
// INT8 / UINT8 (dequantized with the tensor's zero point and scale).
// Returns false, without touching `emb`, when the output is empty, holds no
// complete element, or has any other tensor type. Other types used to be
// reinterpreted byte-by-byte as uint8, which produced a garbage embedding
// while still reporting success.
bool DecodeEmbedding(const AiScheduler::BorrowedOutput& o, std::vector<float>& emb) {
    if (!o.data || o.size == 0) return false;

    if (o.type == RKNN_TENSOR_FLOAT32) {
        const size_t elem_cnt = o.size / sizeof(float);
        if (elem_cnt == 0) return false;
        const float* fp = reinterpret_cast<const float*>(o.data);
        emb.resize(elem_cnt);
        for (size_t i = 0; i < elem_cnt; ++i) emb[i] = fp[i];
        return true;
    }

    if (o.type == RKNN_TENSOR_FLOAT16) {
        const size_t elem_cnt = o.size / sizeof(uint16_t);
        if (elem_cnt == 0) return false;
        const uint16_t* hp = reinterpret_cast<const uint16_t*>(o.data);
        emb.resize(elem_cnt);
        for (size_t i = 0; i < elem_cnt; ++i) emb[i] = HalfToFloat(hp[i]);
        return true;
    }

    if (o.type == RKNN_TENSOR_INT8) {
        const size_t elem_cnt = o.size;
        const int8_t* p = reinterpret_cast<const int8_t*>(o.data);
        emb.resize(elem_cnt);
        for (size_t i = 0; i < elem_cnt; ++i) emb[i] = Dequant(p[i], o.zp, o.scale);
        return true;
    }

    if (o.type == RKNN_TENSOR_UINT8) {
        const size_t elem_cnt = o.size;
        const uint8_t* p = reinterpret_cast<const uint8_t*>(o.data);
        emb.resize(elem_cnt);
        for (size_t i = 0; i < elem_cnt; ++i) emb[i] = Dequant(p[i], o.zp, o.scale);
        return true;
    }

    // Unsupported element type (e.g. 16/32-bit integers): refuse rather than
    // misread the buffer.
    return false;
}

// Rescales `v` in place to unit Euclidean length.
// The squared sum is accumulated in double precision. A vector whose norm is
// zero (including the empty vector) is left unchanged.
void L2Normalize(std::vector<float>& v) {
    double sum_sq = 0.0;
    for (size_t i = 0; i < v.size(); ++i) {
        const double e = static_cast<double>(v[i]);
        sum_sq += e * e;
    }

    const double length = std::sqrt(sum_sq);
    if (length <= 0.0) return;

    const float scale = static_cast<float>(1.0 / length);
    for (size_t i = 0; i < v.size(); ++i) v[i] *= scale;
}
#else
// Fallback used when RK3588_ENABLE_RKNN is not defined: there is no model
// output to decode, so this ignores both arguments and always returns false.
bool DecodeEmbedding(const AiScheduler::BorrowedOutput& /*o*/, std::vector<float>& /*emb*/) {
    return false;
}

// Fallback used when RK3588_ENABLE_RKNN is not defined: does nothing and
// leaves the vector as it is.
void L2Normalize(std::vector<float>& /*v*/) {}
#endif

// Plain value holding the node's tunable settings. The values below are the
// defaults used when a key is absent from the configuration.
struct FaceRecogConfigSnapshot {
    // --- Per-face processing ---
    bool align{true};
    bool emit_embedding{false};
    int max_faces{10};

    // --- Matching thresholds ---
    float thr_accept{0.45f};
    float thr_margin{0.05f};

    // --- Model input description (stored lowercased by the builder) ---
    std::string model_input_format{"rgb"};
    std::string input_dtype{"uint8"};

    // --- Input normalization ---
    float norm_scale{1.0f};
    float norm_bias{0.0f};
    bool norm_use_mean_std{false};
    std::array<float, 3> norm_mean = {0.0f, 0.0f, 0.0f};
    std::array<float, 3> norm_std = {1.0f, 1.0f, 1.0f};

    // --- Gallery source ---
    std::string gallery_backend{"file"};
    std::string gallery_path{};
    bool gallery_load_on_start{true};
    int gallery_expected_dim{512};
    std::string gallery_dtype{"auto"};
};

// Builds a new configuration snapshot from `config`.
//
// The snapshot starts as a copy of `base` (or of the struct defaults when
// `base` is null); every key present in `config` then overrides the matching
// field. `max_faces` is raised to at least 1 and `gallery.expected_dim` to at
// least 0. The input format, input dtype, gallery backend and gallery dtype
// are stored lowercased. The result is written to `out`; always returns true.
static bool BuildFaceRecogConfigSnapshot(const SimpleJson& config,
                                        const std::shared_ptr<const FaceRecogConfigSnapshot>& base,
                                        std::shared_ptr<const FaceRecogConfigSnapshot>& out) {
    // Lowercases `s` in place.
    const auto to_lower = [](std::string& s) {
        for (char& ch : s) ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    };
    // Copies the first three numbers of `arr` into `dst`, keeping the current
    // value as the fallback for each slot. Returns false when `arr` is missing
    // or is not an array of at least three elements.
    const auto read_triplet = [](const SimpleJson* arr, std::array<float, 3>& dst) -> bool {
        if (!arr || !arr->IsArray() || arr->AsArray().size() < 3) return false;
        for (size_t k = 0; k < 3; ++k) {
            dst[k] = static_cast<float>(arr->AsArray()[k].AsNumber(dst[k]));
        }
        return true;
    };

    auto snap = std::make_shared<FaceRecogConfigSnapshot>();
    if (base) *snap = *base;

    snap->align = config.ValueOr<bool>("align", snap->align);
    snap->emit_embedding = config.ValueOr<bool>("emit_embedding", snap->emit_embedding);
    snap->max_faces = std::max(1, config.ValueOr<int>("max_faces", snap->max_faces));

    const SimpleJson* thresholds = config.Find("threshold");
    if (thresholds && thresholds->IsObject()) {
        snap->thr_accept = thresholds->ValueOr<float>("accept", snap->thr_accept);
        snap->thr_margin = thresholds->ValueOr<float>("margin", snap->thr_margin);
    }

    std::string input_format = config.ValueOr<std::string>("input_format", snap->model_input_format);
    to_lower(input_format);
    snap->model_input_format = std::move(input_format);

    std::string input_dtype = config.ValueOr<std::string>("input_dtype", snap->input_dtype);
    to_lower(input_dtype);
    snap->input_dtype = std::move(input_dtype);

    const SimpleJson* normalize = config.Find("normalize");
    if (normalize && normalize->IsObject()) {
        // Mean/std mode is on when this "normalize" object supplies either array.
        const bool has_mean = read_triplet(normalize->Find("mean"), snap->norm_mean);
        const bool has_std = read_triplet(normalize->Find("std"), snap->norm_std);
        snap->norm_use_mean_std = has_mean || has_std;
        snap->norm_scale = normalize->ValueOr<float>("scale", snap->norm_scale);
        snap->norm_bias = normalize->ValueOr<float>("bias", snap->norm_bias);
    }

    const SimpleJson* gallery = config.Find("gallery");
    if (gallery && gallery->IsObject()) {
        snap->gallery_backend = gallery->ValueOr<std::string>("backend", snap->gallery_backend);
        snap->gallery_path = gallery->ValueOr<std::string>("path", snap->gallery_path);
        snap->gallery_load_on_start = gallery->ValueOr<bool>("load_on_start", snap->gallery_load_on_start);
        snap->gallery_expected_dim = std::max(0, gallery->ValueOr<int>("expected_dim", snap->gallery_expected_dim));
        snap->gallery_dtype = gallery->ValueOr<std::string>("dtype", snap->gallery_dtype);
    }
    // Applied even when no "gallery" object is present.
    to_lower(snap->gallery_backend);
    to_lower(snap->gallery_dtype);

    out = std::move(snap);
    return true;
}

}  // namespace

class AiFaceRecogNode : public INode {
public:
    // Returns the node id read from the configuration in Init().
    std::string Id() const override {
        return id_;
    }
    // Returns the fixed type name of this node.
    std::string Type() const override {
        return "ai_face_recog";
    }

    // Reads the node configuration, wires up the input/output queues and, when
    // RKNN support is compiled in, loads the recognition model.
    //
    // Returns false when the input queue or every output queue is missing, or
    // (RKNN builds) when `model_path` is empty or the model fails to load.
    bool Init(const SimpleJson& config, const NodeContext& ctx) override {
        id_ = config.ValueOr<std::string>("id", "face_recog");
        model_path_ = config.ValueOr<std::string>("model_path", "");

        // Publish a fresh snapshot and drop any gallery held so far.
        std::shared_ptr<const FaceRecogConfigSnapshot> initial_cfg;
        BuildFaceRecogConfigSnapshot(config, nullptr, initial_cfg);
        {
            std::lock_guard<std::mutex> guard(mu_);
            gallery_.reset();
            cfg_ = std::move(initial_cfg);
        }

        input_queue_ = ctx.input_queue;
        output_queues_ = ctx.output_queues;
        if (!input_queue_) {
            std::cerr << "[ai_face_recog] no input queue for node " << id_ << "\n";
            return false;
        }
        if (output_queues_.empty()) {
            std::cerr << "[ai_face_recog] no output queue for node " << id_ << "\n";
            return false;
        }

#if defined(RK3588_ENABLE_RKNN)
        if (model_path_.empty()) {
            std::cerr << "[ai_face_recog] model_path is required\n";
            return false;
        }

        std::string load_err;
        model_handle_ = AiScheduler::Instance().LoadModel(model_path_, load_err);
        if (model_handle_ == kInvalidModelHandle) {
            std::cerr << "[ai_face_recog] failed to load model: " << load_err << "\n";
            return false;
        }

        // Model input size; left at the previous values if the query fails.
        ModelInfo model_info;
        if (AiScheduler::Instance().GetModelInfo(model_handle_, model_info)) {
            model_w_ = model_info.input_width;
            model_h_ = model_info.input_height;
        }

        const std::string dims = std::to_string(model_w_) + "x" + std::to_string(model_h_);
        LogInfo("[ai_face_recog] model loaded: " + model_path_ + " (" + dims + ")");
#else
        LogWarn("[ai_face_recog] RKNN disabled, will passthrough frames");
#endif
        return true;
    }

    bool Start() override {
        std::shared_ptr<const FaceRecogConfigSnapshot> cfg;
        {
            std::lock_guard<std::mutex> lock(mu_);
            cfg = cfg_;
        }
        if (cfg && cfg->gallery_load_on_start) {
            ReloadGallery(*cfg);
        }
        const bool align = cfg ? cfg->align : false;
        const float thr_accept = cfg ? cfg->thr_accept : 0.0f;
        const float thr_margin = cfg ? cfg->thr_margin : 0.0f;
        LogInfo("[ai_face_recog] start id=" + id_ + " align=" + std::string(align ? "true" : "false") +
                " thr_accept=" + std::to_string(thr_accept) + " thr_margin=" + std::to_string(thr_margin));

        // ========== TEST: Load aligned image and run inference ==========
#if defined(RK3588_ENABLE_RKNN)
        {
            const char* test_img_path = "./003_aligned.png";
            int img_w = 0, img_h = 0, img_c = 0;
            unsigned char* img_data = stbi_load(test_img_path, &img_w, &img_h, &img_c, 3);
            if (img_data && img_w == 112 && img_h == 112) {
                std::cerr << "[TEST] Loaded " << test_img_path << " (" << img_w << "x" << img_h << "x" << img_c << ")\n";

                InferInput in;
                in.width = 112;
                in.height = 112;
                in.is_nhwc = true;
                in.data = img_data;
                in.size = 112 * 112 * 3;
                in.type = RKNN_TENSOR_UINT8;

                auto r = AiScheduler::Instance().InferBorrowed(model_handle_, in);
                if (r.success && !r.outputs.empty()) {
                    std::vector<float> emb;
                    if (DecodeEmbedding(r.outputs[0], emb)) {
                        L2Normalize(emb);
                        std::cerr << "[TEST] RKNN embedding[0:8]: ";
                        for (int i = 0; i < 8 && i < static_cast<int>(emb.size()); ++i) {
                            std::cerr << emb[i] << " ";
                        }
                        std::cerr << "\n";
                    } else {
                        std::cerr << "[TEST] DecodeEmbedding failed\n";
                    }
                } else {
                    std::cerr << "[TEST] Inference failed: " << r.error << "\n";
                }
                stbi_image_free(img_data);
            } else {
                if (img_data) stbi_image_free(img_data);
                std::cerr << "[TEST] Skip: " << test_img_path << " not found or wrong size\n";
            }
        }
#endif
        // ========== END TEST ==========

        return true;
    }

    // Applies a configuration update at runtime.
    //
    // Returns false, changing nothing, when the update names a different
    // non-empty node id or a different model path. Otherwise a new snapshot
    // layered on top of the current one is published, and the gallery is
    // reloaded if any gallery-related setting changed.
    bool UpdateConfig(const SimpleJson& new_config) override {
        const std::string requested_id = new_config.ValueOr<std::string>("id", id_);
        const bool id_changed = !requested_id.empty() && requested_id != id_;
        if (id_changed) return false;

        // Changing model requires graph rebuild.
        const std::string requested_model = new_config.ValueOr<std::string>("model_path", model_path_);
        if (requested_model != model_path_) return false;

        std::shared_ptr<const FaceRecogConfigSnapshot> previous;
        {
            std::lock_guard<std::mutex> guard(mu_);
            previous = cfg_;
        }

        std::shared_ptr<const FaceRecogConfigSnapshot> updated;
        BuildFaceRecogConfigSnapshot(new_config, previous, updated);

        // The gallery needs reloading only if one of its source settings differs.
        bool gallery_changed = false;
        if (previous && updated) {
            const bool same_source = updated->gallery_backend == previous->gallery_backend &&
                                     updated->gallery_path == previous->gallery_path &&
                                     updated->gallery_expected_dim == previous->gallery_expected_dim &&
                                     updated->gallery_dtype == previous->gallery_dtype;
            gallery_changed = !same_source;
        }

        {
            std::lock_guard<std::mutex> guard(mu_);
            cfg_ = updated;
        }

        if (gallery_changed && updated) {
            ReloadGallery(*updated);
        }
        return true;
    }

    // Stops the node. In RKNN builds the model is unloaded if one is loaded;
    // the handle is then reset so a second call does nothing further.
    void Stop() override {
#if defined(RK3588_ENABLE_RKNN)
        const bool model_loaded = (model_handle_ != kInvalidModelHandle);
        if (model_loaded) {
            AiScheduler::Instance().UnloadModel(model_handle_);
            model_handle_ = kInvalidModelHandle;
        }
#endif
        LogInfo("[ai_face_recog] stop id=" + id_);
    }

    // Handles one frame: runs recognition on it (RKNN builds only) and then
    // forwards it to every output queue. A null frame is dropped.
    NodeStatus Process(FramePtr frame) override {
        if (frame) {
#if defined(RK3588_ENABLE_RKNN)
            Run(frame);
#endif
            Push(frame);
            return NodeStatus::OK;
        }
        return NodeStatus::DROP;
    }

private:
    // Hands `frame` to every configured output queue.
    void Push(FramePtr frame) {
        for (auto& queue : output_queues_) {
            queue->Push(frame);
        }
    }

    void ReloadGallery(const FaceRecogConfigSnapshot& cfg) {
        if (cfg.gallery_path.empty()) return;

        std::string err;
        FaceGallery g;
        g.SetExpectedDim(cfg.gallery_expected_dim);
        g.SetPreferredDtype(cfg.gallery_dtype);
        bool ok = false;

        if (cfg.gallery_backend == "sqlite") {
            ok = g.LoadSqliteBackend(cfg.gallery_path, err);
        } else if (cfg.gallery_backend == "file") {
            ok = g.LoadFileBackend(cfg.gallery_path, err);
        } else {
            err = "unknown gallery backend: " + cfg.gallery_backend;
        }

        if (!ok) {
            if (!err.empty()) LogWarn("[ai_face_recog] gallery load failed: " + err);
            return;
        }

        auto sp = std::make_shared<FaceGallery>(std::move(g));
        {
            std::lock_guard<std::mutex> lock(mu_);
            gallery_ = sp;
        }
        LogInfo("[ai_face_recog] gallery loaded: n=" + std::to_string(sp->Size()) +
                " dim=" + std::to_string(sp->Dim()));
    }

#if defined(RK3588_ENABLE_RKNN)
    // Runs face recognition on the faces already detected in `frame`.
    //
    // For at most cfg->max_faces detections this builds a model-sized face
    // patch (landmark-aligned warp when alignment is enabled, landmarks are
    // present and the model input is 112x112; otherwise a bbox crop/resize),
    // runs the embedding model through AiScheduler, L2-normalizes the
    // embedding and matches it against the current gallery snapshot. The
    // per-face outcomes are attached to the frame as `frame->face_recog`.
    //
    // Returns without touching the frame when there are no detections, no
    // pixel data, an unsupported pixel format, unusable geometry, or no
    // config snapshot. Faces whose inference or embedding decode fails are
    // skipped and produce no item.
    void Run(FramePtr frame) {
        if (!frame->face_det || frame->face_det->faces.empty()) return;
        if (!frame->data || frame->data_size == 0) return;
        if (frame->format != PixelFormat::RGB && frame->format != PixelFormat::BGR) {
            LogWarn("[ai_face_recog] input must be RGB/BGR");
            return;
        }

        const uint8_t* src = frame->planes[0].data ? frame->planes[0].data : frame->data;
        const int w = frame->width;
        const int h = frame->height;
        // Prefer the plane stride, then the frame stride, then tightly packed rows.
        const int stride = frame->planes[0].stride > 0 ? frame->planes[0].stride
                        : (frame->stride > 0 ? frame->stride : w * 3);
        // Without a source pointer and positive geometry the image cannot be sampled.
        if (!src || stride <= 0 || w <= 0 || h <= 0) return;

        // Take consistent snapshots of config and gallery; everything below
        // works on these copies without holding the lock.
        std::shared_ptr<const FaceRecogConfigSnapshot> cfg;
        std::shared_ptr<const FaceGallery> gallery;
        {
            std::lock_guard<std::mutex> lock(mu_);
            cfg = cfg_;
            gallery = gallery_;
        }
        if (!cfg) return;

        // Swap R and B only when the frame's channel order differs from the model's.
        const bool need_swap = (frame->format == PixelFormat::BGR && cfg->model_input_format == "rgb") ||
                               (frame->format == PixelFormat::RGB && cfg->model_input_format == "bgr");

        FaceRecogResult rr;
        rr.img_w = w;
        rr.img_h = h;
        rr.model_name = "arcface";

        // Clamp to >= 0: a negative max_faces cast to size_t would make
        // reserve() request an enormous capacity and throw.
        const int detected = static_cast<int>(frame->face_det->faces.size());
        const int limit = std::max<int>(0, std::min<int>(cfg->max_faces, detected));
        rr.items.reserve(static_cast<size_t>(limit));

        // Loop-invariant setup: buffer sizes, input dtype and normalization
        // divisors do not change between faces.
        const size_t pix = static_cast<size_t>(model_w_) * static_cast<size_t>(model_h_);
        const size_t samples = pix * 3;
        face_buf_.resize(samples);

        const bool float_input = cfg->input_dtype == "float" || cfg->input_dtype == "f32" ||
                                 cfg->input_dtype == "float32";
        // Per-channel divisor; a (near-)zero std falls back to 1 to avoid dividing by zero.
        float safe_std[3] = {1.0f, 1.0f, 1.0f};
        if (float_input) {
            float_input_buf_.resize(samples);
            if (cfg->norm_use_mean_std) {
                for (size_t c = 0; c < 3; ++c) {
                    safe_std[c] = std::fabs(cfg->norm_std[c]) < 1e-6f ? 1.0f : cfg->norm_std[c];
                }
            }
        }

        // Landmark alignment is only attempted for a 112x112 model input,
        // because the destination landmark template below is expressed in
        // that coordinate space.
        const bool align_enabled = cfg->align && model_w_ == 112 && model_h_ == 112;
        const std::array<Point2f, 5> dst = {
            Point2f{38.2946f, 51.6963f},
            Point2f{73.5318f, 51.5014f},
            Point2f{56.0252f, 71.7366f},
            Point2f{41.5493f, 92.3655f},
            Point2f{70.7299f, 92.2041f},
        };

        for (int i = 0; i < limit; ++i) {
            const FaceDetItem& face = frame->face_det->faces[static_cast<size_t>(i)];

            // Build the uint8 face patch: aligned warp if possible, else bbox crop.
            bool warped = false;
            if (align_enabled && face.has_landmarks) {
                SimilarityTransform t;
                InvTransform inv;
                if (ComputeSimilarity(face.landmarks, dst, t) && InvertSimilarity(t, inv)) {
                    WarpFace(src, w, h, stride, inv, face_buf_.data(), model_w_, model_h_, need_swap);
                    warped = true;
                }
            }
            if (!warped) {
                CropResize(src, w, h, stride, face.bbox, face_buf_.data(), model_w_, model_h_, need_swap);
            }

            InferInput in;
            in.width = model_w_;
            in.height = model_h_;
            in.is_nhwc = true;

            if (float_input) {
                // Convert the patch to float, applying either (x - mean) / std
                // per channel or the affine x * scale + bias.
                const uint8_t* p = face_buf_.data();
                float* out = float_input_buf_.data();
                for (size_t ii = 0; ii < pix; ++ii) {
                    for (size_t c = 0; c < 3; ++c) {
                        const size_t idx = ii * 3 + c;
                        float x = static_cast<float>(p[idx]);
                        if (cfg->norm_use_mean_std) {
                            x = (x - cfg->norm_mean[c]) / safe_std[c];
                        } else {
                            x = x * cfg->norm_scale + cfg->norm_bias;
                        }
                        out[idx] = x;
                    }
                }

                in.data = float_input_buf_.data();
                in.size = float_input_buf_.size() * sizeof(float);
                in.type = RKNN_TENSOR_FLOAT32;
            } else {
                in.data = face_buf_.data();
                in.size = face_buf_.size();
                in.type = RKNN_TENSOR_UINT8;
            }

            auto r = AiScheduler::Instance().InferBorrowed(model_handle_, in);
            if (!r.success || r.outputs.empty()) {
                std::string msg = "[ai_face_recog] inference failed: ";
                if (r.error.empty()) {
                    msg += "unknown";
                } else {
                    msg += r.error;
                }
                LogWarn(msg);
                continue;
            }

            std::vector<float> emb;
            if (!DecodeEmbedding(r.outputs[0], emb)) {
                continue;
            }
            L2Normalize(emb);

            // DEBUG: log the first 8 values of the inferred embedding.
            // NOTE(review): this logs at info level for every face on every
            // frame; consider gating or removing it once debugging is done.
            {
                std::string dbg = "[ai_face_recog] infer emb[0:8]: ";
                const size_t shown = std::min<size_t>(8, emb.size());
                for (size_t di = 0; di < shown; ++di) {
                    dbg += std::to_string(emb[di]);
                    dbg += ' ';
                }
                LogInfo(dbg);
            }

            // With no gallery (or an empty one) `sr` keeps its default values.
            FaceGallery::SearchResult sr;
            if (gallery && gallery->Size() > 0) {
                sr = gallery->SearchTop2(emb);
            }

            // Accept only a valid best match that clears the similarity
            // threshold and, when a margin is configured, beats the runner-up
            // by at least that margin.
            const bool accept = (sr.best_person_id >= 0) &&
                                (sr.best_sim >= cfg->thr_accept) &&
                                ((cfg->thr_margin <= 0.0f) || ((sr.best_sim - sr.second_sim) >= cfg->thr_margin));

            FaceRecogItem item;
            item.bbox = face.bbox;
            item.has_landmarks = face.has_landmarks;
            item.landmarks = face.landmarks;

            item.best_person_id = accept ? sr.best_person_id : -1;
            item.best_name = accept ? sr.best_name : "unknown";
            item.best_sim = sr.best_sim;
            item.second_sim = sr.second_sim;
            item.unknown = !accept;

            // `emb` is not used after this point, so hand its storage over
            // instead of copying it.
            if (cfg->emit_embedding) item.embedding = std::move(emb);
            rr.items.push_back(std::move(item));
        }

        frame->face_recog = std::make_shared<FaceRecogResult>(std::move(rr));
    }
#endif

    // Node instance id; appears in this node's log messages.
    std::string id_;
    // Path of the model file. NOTE(review): not referenced in this part of the
    // file; presumably consumed when the model is loaded — confirm.
    std::string model_path_;

    // Guards cfg_ and gallery_. Writers replace the shared_ptr while holding
    // the lock; Run() copies both pointers under the lock and then works on
    // those snapshots without holding it.
    mutable std::mutex mu_;
    // Current configuration snapshot; Run() does nothing while this is null.
    std::shared_ptr<const FaceRecogConfigSnapshot> cfg_;
    // Currently published gallery; may be null if no gallery has been loaded.
    std::shared_ptr<const FaceGallery> gallery_;

    // NOTE(review): input queue is not used in this part of the file —
    // presumably wired up by the pipeline; confirm.
    std::shared_ptr<SpscQueue<FramePtr>> input_queue_;
    // Downstream queues; Push() forwards each processed frame to all of them.
    std::vector<std::shared_ptr<SpscQueue<FramePtr>>> output_queues_;

    // Scratch buffer for one face patch: model_w_ * model_h_ * 3 bytes.
    std::vector<uint8_t> face_buf_;
    // Scratch buffer for the normalized float version of the face patch; only
    // used when the configured input dtype is a float variant.
    std::vector<float> float_input_buf_;

    // Handle passed to AiScheduler for inference; reset to
    // kInvalidModelHandle after the model is unloaded in Stop().
    ModelHandle model_handle_ = kInvalidModelHandle;
    // Model input width/height in pixels. Landmark alignment in Run() is only
    // attempted when both are 112.
    int model_w_ = 112;
    int model_h_ = 112;
};

// Associates AiFaceRecogNode with the type name "ai_face_recog".
// NOTE(review): REGISTER_NODE is defined outside this part of the file;
// presumably it registers the class with the node factory — confirm.
REGISTER_NODE(AiFaceRecogNode, "ai_face_recog");

}  // namespace rk3588