// AiScheduler — RKNN model loading and inference scheduling for RK3588.
#pragma once

#include <atomic>
#include <cstdint>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#if defined(RK3588_ENABLE_RKNN)
#include "rknn_api.h"
#endif

namespace rk3588 {
// Opaque identifier for a loaded model, handed out by AiScheduler::LoadModel.
using ModelHandle = uint64_t;
// Sentinel value returned when loading fails (see LoadModel).
constexpr ModelHandle kInvalidModelHandle = 0;
// Basic metadata describing a loaded model's input geometry and tensor counts.
struct ModelInfo {
  // Expected input tensor geometry; 0 until populated by the loader.
  int input_width = 0;
  int input_height = 0;
  int input_channels = 0;
  // Number of input / output tensors the model exposes.
  uint32_t n_input = 0;
  uint32_t n_output = 0;
  // Human-readable model name.
  std::string name;
};
// A single inference request's input tensor.
struct InferInput {
  // Non-owning pointer to the input data; must stay valid for the duration
  // of the Infer/InferBorrowed call.
  const void* data = nullptr;
  size_t size = 0;  // byte size of `data`
  int width = 0;
  int height = 0;
  bool is_nhwc = true;  // true: NHWC, false: NCHW

  // Optional DMA-BUF input for RKNN zero-copy (best-effort).
  // When dma_fd >= 0, AiScheduler will try rknn_create_mem_from_fd + rknn_set_io_mem.
  int dma_fd = -1;
  int dma_offset = 0;

#if defined(RK3588_ENABLE_RKNN)
  // The actual data type of `data` passed to RKNN. Default preserves existing behavior.
  rknn_tensor_type type = RKNN_TENSOR_UINT8;
#endif
};
// One output tensor produced by inference; `data` owns a copy of the bytes.
struct InferOutput {
  std::vector<uint8_t> data;  // raw output bytes (owned)
  size_t size = 0;            // byte size of valid data in `data`
  int index = 0;              // output tensor index within the model
#if defined(RK3588_ENABLE_RKNN)
  rknn_tensor_type type = RKNN_TENSOR_UINT8;  // element type reported by RKNN
  int32_t zp = 0;      // quantization zero point (per RKNN tensor attrs)
  float scale = 1.0f;  // quantization scale (per RKNN tensor attrs)
  std::vector<uint32_t> dims;  // tensor dimensions
#endif
};
// Result of a synchronous inference call.
struct InferResult {
  bool success = false;              // true when inference completed without error
  std::string error;                 // human-readable description when !success
  std::vector<InferOutput> outputs;  // one entry per model output tensor
};
// Callback for async inference (future use).
// Invoked with the completed result; see AiScheduler::InferAsync.
using InferCallback = std::function<void(const InferResult& result)>;
class AiScheduler {
|
|
public:
|
|
static AiScheduler& Instance();
|
|
|
|
// Prevent copy/move
|
|
AiScheduler(const AiScheduler&) = delete;
|
|
AiScheduler& operator=(const AiScheduler&) = delete;
|
|
|
|
// Load a model from file, returns handle (0 = invalid)
|
|
ModelHandle LoadModel(const std::string& model_path, std::string& err);
|
|
|
|
// Unload a model by handle
|
|
void UnloadModel(ModelHandle handle);
|
|
|
|
// Get model information
|
|
bool GetModelInfo(ModelHandle handle, ModelInfo& info) const;
|
|
|
|
// Synchronous inference
|
|
InferResult Infer(ModelHandle handle, const InferInput& input);
|
|
|
|
struct BorrowedOutput {
|
|
const uint8_t* data = nullptr;
|
|
size_t size = 0;
|
|
int index = 0;
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
rknn_tensor_type type = RKNN_TENSOR_UINT8;
|
|
int32_t zp = 0;
|
|
float scale = 1.0f;
|
|
std::vector<uint32_t> dims;
|
|
#endif
|
|
};
|
|
|
|
// Borrowed inference avoids per-call output allocations/copies by using per-context preallocated buffers.
|
|
// The returned result holds the selected context's inference lock until it is destroyed.
|
|
struct BorrowedInferResult {
|
|
bool success = false;
|
|
std::string error;
|
|
std::vector<BorrowedOutput> outputs;
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
std::shared_ptr<void> keepalive; // keeps the selected ModelContext alive
|
|
std::unique_lock<std::mutex> infer_lock; // holds ModelContext::infer_mutex for that context
|
|
#endif
|
|
|
|
BorrowedInferResult() = default;
|
|
BorrowedInferResult(BorrowedInferResult&&) noexcept = default;
|
|
BorrowedInferResult& operator=(BorrowedInferResult&&) noexcept = default;
|
|
BorrowedInferResult(const BorrowedInferResult&) = delete;
|
|
BorrowedInferResult& operator=(const BorrowedInferResult&) = delete;
|
|
};
|
|
|
|
BorrowedInferResult InferBorrowed(ModelHandle handle, const InferInput& input);
|
|
|
|
// Async inference (submits to internal queue, calls callback when done)
|
|
// For now, this is a simple wrapper around sync Infer
|
|
void InferAsync(ModelHandle handle, const InferInput& input, InferCallback callback);
|
|
|
|
// Get statistics
|
|
uint64_t GetTotalInferences() const { return total_inferences_.load(); }
|
|
uint64_t GetTotalErrors() const { return total_errors_.load(); }
|
|
|
|
// Shutdown scheduler (unload all models)
|
|
void Shutdown();
|
|
|
|
private:
|
|
AiScheduler();
|
|
~AiScheduler();
|
|
|
|
#if defined(RK3588_ENABLE_RKNN)
|
|
struct ModelContext {
|
|
rknn_context ctx = 0;
|
|
// Shared model blob kept alive for RKNN runtime. Multiple contexts can share the same data.
|
|
std::shared_ptr<std::vector<uint8_t>> model_data;
|
|
std::vector<rknn_tensor_attr> input_attrs;
|
|
std::vector<rknn_tensor_attr> output_attrs;
|
|
std::vector<std::vector<uint8_t>> output_buffers; // preallocated output buffers
|
|
uint32_t n_input = 0;
|
|
uint32_t n_output = 0;
|
|
int input_w = 0;
|
|
int input_h = 0;
|
|
int input_c = 0;
|
|
std::string path;
|
|
std::mutex infer_mutex; // Per-context lock for inference
|
|
|
|
~ModelContext() {
|
|
if (ctx) {
|
|
rknn_destroy(ctx);
|
|
ctx = 0;
|
|
}
|
|
}
|
|
};
|
|
|
|
struct ModelGroup {
|
|
std::string path;
|
|
std::vector<std::shared_ptr<ModelContext>> contexts;
|
|
std::atomic<uint32_t> rr{0}; // round-robin context selection
|
|
};
|
|
|
|
std::unordered_map<ModelHandle, std::shared_ptr<ModelGroup>> models_by_handle_;
|
|
std::unordered_map<std::string, std::weak_ptr<ModelGroup>> models_by_path_;
|
|
#endif
|
|
|
|
mutable std::mutex models_mutex_; // Protects models_ map
|
|
std::atomic<ModelHandle> next_handle_{1};
|
|
std::atomic<uint64_t> total_inferences_{0};
|
|
std::atomic<uint64_t> total_errors_{0};
|
|
};
|
|
|
|
}  // namespace rk3588