#include "inference/tensorrt_engine.hpp"
#include "common/logger.hpp"
#include <fstream>
#include <algorithm>      // std::sort in postprocess(), std::max/std::min
#include <cuda_runtime.h>
#include <NvOnnxParser.h>
#include <dlfcn.h>        // for dynamic-library load checks
#include <filesystem>
#include "common/cuda_helper.hpp"

// Named TRTLogger to avoid clashing with the project's Logger class.
class TRTLogger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                Logger::error(std::string("TensorRT Internal Error: ") + msg);
                break;
            case Severity::kERROR:
                Logger::error(std::string("TensorRT Error: ") + msg);
                break;
            case Severity::kWARNING:
                // The project Logger only exposes info/error here, so
                // warnings are routed through info with an explicit prefix.
                Logger::info(std::string("TensorRT Warning: ") + msg);
                break;
            default:
                Logger::info(std::string("TensorRT Info: ") + msg);
                break;
        }
    }
};

static TRTLogger gLogger;  // global logger instance shared by builder and runtime

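// Helper added for illustration: TensorRT reports shapes as nvinfer1::Dims,
// and this file logs them in several places with the same hand-rolled loop.
// A shared formatter like this would collapse those loops; it is a sketch
// and is not wired into the code below.
[[maybe_unused]] static std::string dimsToString(const nvinfer1::Dims& dims) {
    std::string s = "(";
    for (int i = 0; i < dims.nbDims; i++) {
        s += std::to_string(dims.d[i]);
        if (i < dims.nbDims - 1) s += ", ";
    }
    return s + ")";
}
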
class TensorRTEngine::Impl {
public:
    nvinfer1::IRuntime* runtime = nullptr;
    nvinfer1::ICudaEngine* engine = nullptr;
    nvinfer1::IExecutionContext* context = nullptr;
    cudaStream_t stream = nullptr;

    void* buffers[2] = {nullptr, nullptr};  // device-side input/output bindings
    int inputIndex;
    int outputIndex;

    int inputH = 640;
    int inputW = 640;
    int maxBatchSize = 1;

    float* hostInput = nullptr;
    float* hostOutput = nullptr;

    // GPU buffers
    void* input_buffer = nullptr;   // device input buffer
    void* output_buffer = nullptr;  // device output buffer

    ~Impl() {
        // TensorRT objects must be destroyed in reverse order of creation:
        // the context depends on the engine, and the engine on the runtime.
        if (context) delete context;
        if (engine) delete engine;
        if (runtime) delete runtime;
        if (stream) cudaStreamDestroy(stream);
        if (hostInput) delete[] hostInput;
        if (hostOutput) delete[] hostOutput;
        if (input_buffer) cudaFree(input_buffer);
        if (output_buffer) cudaFree(output_buffer);
    }
};

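// The raw owning pointers above (and in convertONNX2TRT below) are freed
// manually, so an exception thrown mid-function leaks them. A compiled-out
// sketch of a smart-pointer alternative (assumes #include <memory>; TensorRT
// 8+ objects are plain-deletable, as the manual deletes in this file imply):
#if 0
template <typename T>
using TRTUniquePtr = std::unique_ptr<T>;

TRTUniquePtr<nvinfer1::IBuilder> builder{nvinfer1::createInferBuilder(gLogger)};
#endif
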
TensorRTEngine::TensorRTEngine(const std::string& model_path, int gpu_id) {
    try {
        Logger::info("TensorRTEngine constructor start");

        // Validate arguments
        if (model_path.empty()) {
            throw std::runtime_error("Model path is empty");
        }
        if (gpu_id < 0) {
            throw std::runtime_error("Invalid GPU ID: " + std::to_string(gpu_id));
        }

        // Keep a local copy of the path
        model_path_ = model_path;

        Logger::info("Parameters:");
        Logger::info("  Model path: " + model_path_);
        Logger::info("  GPU ID: " + std::to_string(gpu_id));

        // Create the implementation object
        pImpl = std::make_unique<Impl>();

        // Check the model file
        if (!std::filesystem::exists(model_path_)) {
            throw std::runtime_error("Model file not found: " + model_path_);
        }
        auto file_size = std::filesystem::file_size(model_path_);
        Logger::info("Model file exists, size: " + std::to_string(file_size) + " bytes");

        // Initialize CUDA
        cudaError_t error = cudaSetDevice(gpu_id);
        if (error != cudaSuccess) {
            throw std::runtime_error("Failed to set CUDA device: " +
                                     std::string(cudaGetErrorString(error)));
        }

        // Create the CUDA stream
        error = cudaStreamCreate(&pImpl->stream);
        if (error != cudaSuccess) {
            throw std::runtime_error("Failed to create CUDA stream: " +
                                     std::string(cudaGetErrorString(error)));
        }

        // Load (and, if necessary, build) the model
        if (!loadModel()) {
            throw std::runtime_error("Failed to load model");
        }

        Logger::info("TensorRTEngine constructor completed successfully");
    }
    catch (const std::exception& e) {
        Logger::error("Error in TensorRTEngine constructor: " + std::string(e.what()));
        throw;
    }
}

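// Usage sketch (hypothetical caller, compiled out; the paths and names here
// are illustrative, not part of this project):
#if 0
TensorRTEngine engine("models/model.onnx", /*gpu_id=*/0);  // throws on failure
cv::Mat frame = cv::imread("frame.jpg");
std::vector<DetectionResult> detections;
if (engine.infer(frame, detections)) {
    for (const auto& det : detections) { /* consume boxes */ }
}
#endif
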
TensorRTEngine::~TensorRTEngine() = default;

bool TensorRTEngine::convertONNX2TRT(const std::string& onnx_file) {
    try {
        Logger::info("=== Starting ONNX to TensorRT Conversion ===");
        Logger::info("ONNX file: " + onnx_file);

        // Use the model file's own directory as the base directory
        std::filesystem::path model_dir = std::filesystem::path(onnx_file).parent_path();
        std::filesystem::path engine_path = model_dir / "model.engine";

        // If an engine file already exists, reuse it
        if (std::filesystem::exists(engine_path)) {
            Logger::info("Found existing engine file: " + engine_path.string());
            Logger::info("Size: " + std::to_string(std::filesystem::file_size(engine_path)) + " bytes");
            return true;
        }

        Logger::info("Converting ONNX to TensorRT engine...");

        // Check that the ONNX file exists
        if (!std::filesystem::exists(onnx_file)) {
            Logger::error("ONNX file does not exist: " + onnx_file);
            return false;
        }

        // Log the file size
        std::filesystem::path p(onnx_file);
        auto file_size = std::filesystem::file_size(p);
        Logger::info("ONNX file size: " + std::to_string(file_size) + " bytes");

        // Create the builder
        nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
        if (!builder) {
            throw std::runtime_error("Failed to create builder");
        }

        // Create the network definition with explicit batch enabled
        const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
        nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
        if (!network) {
            throw std::runtime_error("Failed to create network");
        }

        // Create the ONNX parser
        auto parser = nvonnxparser::createParser(*network, gLogger);
        if (!parser) {
            throw std::runtime_error("Failed to create parser");
        }

        // Parse the ONNX file
        if (!parser->parseFromFile(onnx_file.c_str(),
                                   static_cast<int>(nvinfer1::ILogger::Severity::kWARNING))) {
            throw std::runtime_error("Failed to parse ONNX file");
        }

        // The network must have at least one input
        if (network->getNbInputs() == 0) {
            throw std::runtime_error("Network has no inputs");
        }

        // Get the input tensor
        nvinfer1::ITensor* input = network->getInput(0);
        if (!input) {
            throw std::runtime_error("Failed to get input tensor");
        }

        // Input name and dimensions
        std::string inputName = input->getName();
        Logger::info("Input tensor name: " + inputName);

        nvinfer1::Dims inputDims = input->getDimensions();
        std::string dimStr = "Input dimensions: (";
        for (int i = 0; i < inputDims.nbDims; i++) {
            dimStr += std::to_string(inputDims.d[i]);
            if (i < inputDims.nbDims - 1) dimStr += ", ";
        }
        dimStr += ")";
        Logger::info(dimStr);

        // Log the network layers and their input dimensions
        Logger::info("Network layers:");
        for (int i = 0; i < network->getNbLayers(); i++) {
            auto layer = network->getLayer(i);
            Logger::info("Layer " + std::to_string(i) + ": " + layer->getName());

            for (int j = 0; j < layer->getNbInputs(); j++) {
                auto layerInput = layer->getInput(j);
                if (layerInput) {
                    auto dims = layerInput->getDimensions();
                    std::string layerDimStr = "  Input " + std::to_string(j) + " dims: (";
                    for (int k = 0; k < dims.nbDims; k++) {
                        layerDimStr += std::to_string(dims.d[k]);
                        if (k < dims.nbDims - 1) layerDimStr += ", ";
                    }
                    layerDimStr += ")";
                    Logger::info(layerDimStr);
                }
            }
        }

        // Create the builder configuration
        nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
        if (!config) {
            throw std::runtime_error("Failed to create builder config");
        }

        // Configure TensorRT
        config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1 << 30);  // 1 GiB workspace
        config->setFlag(nvinfer1::BuilderFlag::kFP16);  // enable FP16 precision
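        // Note: kFP16 is a hint; on GPUs without fast FP16 support the builder
        // silently falls back to FP32. Guarding the flag makes that explicit,
        // e.g. `if (builder->platformHasFastFp16()) config->setFlag(...);`.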

        // Add an optimization profile
        nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
        if (!profile) {
            throw std::runtime_error("Failed to create optimization profile");
        }

        // Resolve dynamic dimensions into concrete min/opt/max shapes
        nvinfer1::Dims minDims = inputDims;
        nvinfer1::Dims optDims = inputDims;
        nvinfer1::Dims maxDims = inputDims;

        // Log the original dimensions
        Logger::info("Original dimensions:");
        for (int i = 0; i < inputDims.nbDims; i++) {
            Logger::info("  dim[" + std::to_string(i) + "] = " + std::to_string(inputDims.d[i]));
        }

        // Make sure every dimension is positive: a -1 (dynamic) dimension is
        // replaced by a concrete value based on its position (NCHW layout).
        for (int i = 0; i < inputDims.nbDims; i++) {
            if (inputDims.d[i] == -1) {
                if (i == 0) {  // batch
                    minDims.d[i] = 1;
                    optDims.d[i] = pImpl->maxBatchSize;
                    maxDims.d[i] = pImpl->maxBatchSize;
                } else if (i == 1) {  // channels
                    minDims.d[i] = 3;  // RGB
                    optDims.d[i] = 3;
                    maxDims.d[i] = 3;
                } else if (i == 2) {  // height
                    minDims.d[i] = pImpl->inputH;
                    optDims.d[i] = pImpl->inputH;
                    maxDims.d[i] = pImpl->inputH;
                } else if (i == 3) {  // width
                    minDims.d[i] = pImpl->inputW;
                    optDims.d[i] = pImpl->inputW;
                    maxDims.d[i] = pImpl->inputW;
                } else {
                    minDims.d[i] = 1;
                    optDims.d[i] = 1;
                    maxDims.d[i] = 1;
                }
            } else {
                // Static dimensions keep their original value
                minDims.d[i] = inputDims.d[i];
                optDims.d[i] = inputDims.d[i];
                maxDims.d[i] = inputDims.d[i];
            }
        }

        // Log the dimensions being set
        Logger::info("Setting optimization profile dimensions:");
        Logger::info("Min dimensions:");
        for (int i = 0; i < minDims.nbDims; i++) {
            Logger::info("  dim[" + std::to_string(i) + "] = " + std::to_string(minDims.d[i]));
        }
        Logger::info("Opt dimensions:");
        for (int i = 0; i < optDims.nbDims; i++) {
            Logger::info("  dim[" + std::to_string(i) + "] = " + std::to_string(optDims.d[i]));
        }
        Logger::info("Max dimensions:");
        for (int i = 0; i < maxDims.nbDims; i++) {
            Logger::info("  dim[" + std::to_string(i) + "] = " + std::to_string(maxDims.d[i]));
        }

        // Apply the profile
        if (!profile->setDimensions(inputName.c_str(), nvinfer1::OptProfileSelector::kMIN, minDims)) {
            throw std::runtime_error("Failed to set minimum dimensions");
        }
        if (!profile->setDimensions(inputName.c_str(), nvinfer1::OptProfileSelector::kOPT, optDims)) {
            throw std::runtime_error("Failed to set optimal dimensions");
        }
        if (!profile->setDimensions(inputName.c_str(), nvinfer1::OptProfileSelector::kMAX, maxDims)) {
            throw std::runtime_error("Failed to set maximum dimensions");
        }

        config->addOptimizationProfile(profile);

        // Build the engine
        Logger::info("Building TensorRT engine...");
        nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
        if (!engine) {
            throw std::runtime_error("Failed to build TensorRT engine");
        }

        // Serialize the engine to disk, checking that the write succeeded
        nvinfer1::IHostMemory* serializedEngine = engine->serialize();
        std::ofstream engine_file(engine_path, std::ios::binary);
        engine_file.write(static_cast<const char*>(serializedEngine->data()),
                          serializedEngine->size());
        if (!engine_file) {
            throw std::runtime_error("Failed to write engine file: " + engine_path.string());
        }
        engine_file.close();

        // Release resources in reverse order of creation
        delete serializedEngine;
        delete engine;
        delete config;
        delete parser;
        delete network;
        delete builder;

        Logger::info("Successfully converted ONNX to TensorRT engine");
        Logger::info("=== Engine Conversion Completed ===");
        Logger::info("Engine file saved to: " + engine_path.string());
        Logger::info("Engine file size: " + std::to_string(std::filesystem::file_size(engine_path)) + " bytes");
        return true;
    }
    catch (const std::exception& e) {
        Logger::error("Error in convertONNX2TRT: " + std::string(e.what()));
        return false;
    }
}

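// The same conversion can be done offline with NVIDIA's trtexec tool, which
// avoids the first-run build cost inside the application, e.g.:
//   trtexec --onnx=model.onnx --saveEngine=model.engine --fp16
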
bool TensorRTEngine::loadModel() {
    try {
        if (!pImpl) {
            throw std::runtime_error("Implementation is null");
        }

        Logger::info("Loading model...");

        // Use the model file's own directory as the base directory
        std::filesystem::path model_dir = std::filesystem::path(model_path_).parent_path();
        std::filesystem::path engine_path = model_dir / "model.engine";

        // Build the engine from ONNX if it does not exist yet
        if (!std::filesystem::exists(engine_path)) {
            Logger::info("Engine file not found, converting from ONNX...");
            if (!convertONNX2TRT(model_path_)) {
                throw std::runtime_error("Failed to convert ONNX model");
            }
        }

        // Read the serialized engine
        std::ifstream file(engine_path, std::ios::binary);
        if (!file.good()) {
            throw std::runtime_error("Cannot open engine file: " + engine_path.string());
        }

        file.seekg(0, std::ios::end);
        size_t size = file.tellg();
        file.seekg(0, std::ios::beg);

        std::vector<char> engineData(size);
        file.read(engineData.data(), size);

        // Create the runtime and deserialize the engine
        Logger::info("Creating TensorRT runtime...");
        pImpl->runtime = nvinfer1::createInferRuntime(gLogger);
        if (!pImpl->runtime) {
            Logger::error("Failed to create TensorRT runtime");
            return false;
        }

        Logger::info("Deserializing CUDA engine...");
        pImpl->engine = pImpl->runtime->deserializeCudaEngine(engineData.data(), size);
        if (!pImpl->engine) {
            Logger::error("Failed to deserialize CUDA engine");
            return false;
        }

        Logger::info("Creating execution context...");
        pImpl->context = pImpl->engine->createExecutionContext();
        if (!pImpl->context) {
            Logger::error("Failed to create execution context");
            return false;
        }

        // Query I/O tensor information
        Logger::info("Getting tensor information...");

        int32_t nbIOTensors = pImpl->engine->getNbIOTensors();
        Logger::info("Number of I/O tensors: " + std::to_string(nbIOTensors));

        // Assume binding 0 is the input and binding 1 the output
        pImpl->inputIndex = 0;
        pImpl->outputIndex = 1;
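        // The fixed 0/1 indices above hold for a single-input single-output
        // YOLO export. A shape-agnostic lookup (compiled-out sketch, using the
        // TensorRT >= 8.5 tensor-mode API) would classify tensors instead:
#if 0
        for (int32_t i = 0; i < nbIOTensors; ++i) {
            const char* name = pImpl->engine->getIOTensorName(i);
            if (pImpl->engine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kINPUT)
                pImpl->inputIndex = i;
            else
                pImpl->outputIndex = i;
        }
#endif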

        // Input tensor name
        const char* inputName = pImpl->engine->getIOTensorName(pImpl->inputIndex);
        if (!inputName) {
            Logger::error("Failed to get input tensor name");
            return false;
        }
        Logger::info("Input tensor name: " + std::string(inputName));

        // Input dimensions
        auto dims = pImpl->engine->getTensorShape(inputName);
        Logger::info("Input tensor dimensions: " + std::to_string(dims.nbDims) + " dimensions");

        // A YOLO-style input must be at least 4-D (NCHW)
        if (dims.nbDims < 4) {
            Logger::error("Invalid input dimensions: expected at least 4, got " +
                          std::to_string(dims.nbDims));
            return false;
        }

        std::string dimStr = "Input dimensions: (";
        for (int i = 0; i < dims.nbDims; i++) {
            dimStr += std::to_string(dims.d[i]);
            if (i < dims.nbDims - 1) dimStr += ", ";
        }
        dimStr += ")";
        Logger::info(dimStr);

        // Record the network input size
        pImpl->inputH = dims.d[2];
        pImpl->inputW = dims.d[3];
        Logger::info("Input HxW: " + std::to_string(pImpl->inputH) + "x" +
                     std::to_string(pImpl->inputW));

        // Output tensor name
        const char* outputName = pImpl->engine->getIOTensorName(pImpl->outputIndex);
        if (!outputName) {
            Logger::error("Failed to get output tensor name");
            return false;
        }
        Logger::info("Output tensor name: " + std::string(outputName));

        // Output dimensions
        auto outputDims = pImpl->engine->getTensorShape(outputName);
        Logger::info("Output tensor dimensions: " + std::to_string(outputDims.nbDims) +
                     " dimensions");

        std::string outDimStr = "Output dimensions: (";
        for (int i = 0; i < outputDims.nbDims; i++) {
            outDimStr += std::to_string(outputDims.d[i]);
            if (i < outputDims.nbDims - 1) outDimStr += ", ";
        }
        outDimStr += ")";
        Logger::info(outDimStr);

        // Compute buffer sizes in bytes; dynamic (-1) dimensions count as 1
        // (the engine built above uses a fixed profile, so shapes are concrete)
        size_t inputSize = sizeof(float);
        for (int i = 0; i < dims.nbDims; i++) {
            inputSize *= (dims.d[i] > 0) ? static_cast<size_t>(dims.d[i]) : 1;
        }

        size_t outputSize = sizeof(float);
        size_t total_output_size = 1;  // output element count, for the host buffer
        for (int i = 0; i < outputDims.nbDims; i++) {
            size_t d = (outputDims.d[i] > 0) ? static_cast<size_t>(outputDims.d[i]) : 1;
            outputSize *= d;
            total_output_size *= d;
        }
        Logger::info("Total output size: " + std::to_string(total_output_size) + " elements");

        // Allocate the host output buffer
        pImpl->hostOutput = new float[total_output_size];

        Logger::info("Allocating device memory...");
        Logger::info("Input buffer size: " + std::to_string(inputSize) + " bytes");
        Logger::info("Output buffer size: " + std::to_string(outputSize) + " bytes");

        // Allocate the GPU buffers once and mirror them into buffers[], so
        // infer() and inferGPU() operate on the same device memory.
        cudaError_t error = cudaMalloc(&pImpl->input_buffer, inputSize);
        if (error != cudaSuccess) {
            Logger::error("Failed to allocate input buffer: " +
                          std::string(cudaGetErrorString(error)));
            return false;
        }

        error = cudaMalloc(&pImpl->output_buffer, outputSize);
        if (error != cudaSuccess) {
            Logger::error("Failed to allocate output buffer: " +
                          std::string(cudaGetErrorString(error)));
            cudaFree(pImpl->input_buffer);
            pImpl->input_buffer = nullptr;
            return false;
        }

        pImpl->buffers[pImpl->inputIndex] = pImpl->input_buffer;
        pImpl->buffers[pImpl->outputIndex] = pImpl->output_buffer;

        Logger::info("Model loaded successfully");
        return true;
    }
    catch (const std::exception& e) {
        Logger::error("Error in loadModel: " + std::string(e.what()));
        return false;
    }
}

void TensorRTEngine::preprocess(const cv::Mat& input_image, float* gpu_input) {
    try {
        Logger::info("Starting preprocessing...");

        // Validate inputs
        if (input_image.empty()) {
            throw std::runtime_error("Input image is empty");
        }
        if (!gpu_input) {
            throw std::runtime_error("GPU input buffer is null");
        }

        // Resize to the network input size
        cv::Mat resized;
        cv::resize(input_image, resized, cv::Size(pImpl->inputW, pImpl->inputH));
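        // Note: this plain resize stretches the frame, so objects lose their
        // aspect ratio. YOLOv5-style pipelines usually letterbox instead:
        // scale uniformly and pad with gray. A compiled-out sketch:
#if 0
        float scale = std::min(pImpl->inputW / (float)input_image.cols,
                               pImpl->inputH / (float)input_image.rows);
        cv::Mat scaled;
        cv::resize(input_image, scaled,
                   cv::Size(int(input_image.cols * scale), int(input_image.rows * scale)));
        cv::Mat canvas(pImpl->inputH, pImpl->inputW, CV_8UC3, cv::Scalar(114, 114, 114));
        scaled.copyTo(canvas(cv::Rect(0, 0, scaled.cols, scaled.rows)));
#endif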

        // BGR to RGB
        cv::Mat rgb;
        cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);

        // Convert to float and normalize to [0, 1]
        cv::Mat float_img;
        rgb.convertTo(float_img, CV_32F, 1.0 / 255.0);

        // Split into planar channels (HWC -> CHW)
        std::vector<cv::Mat> channels;
        cv::split(float_img, channels);

        if (channels.size() != 3) {
            throw std::runtime_error("Expected 3 channels, got " +
                                     std::to_string(channels.size()));
        }

        // Per-channel size in bytes
        size_t channel_size = pImpl->inputH * pImpl->inputW * sizeof(float);

        // Copy each channel plane to the GPU
        for (int i = 0; i < 3; i++) {
            cudaError_t error = cudaMemcpyAsync(
                gpu_input + i * pImpl->inputH * pImpl->inputW,
                channels[i].data,
                channel_size,
                cudaMemcpyHostToDevice,
                pImpl->stream
            );
            if (error != cudaSuccess) {
                throw std::runtime_error("Failed to copy channel " + std::to_string(i) +
                                         " to GPU: " + cudaGetErrorString(error));
            }
        }

        // Synchronize so the copies are complete before inference starts
        cudaError_t error = cudaStreamSynchronize(pImpl->stream);
        if (error != cudaSuccess) {
            throw std::runtime_error("Failed to synchronize CUDA stream: " +
                                     std::string(cudaGetErrorString(error)));
        }

        Logger::info("Preprocessing completed successfully");
    }
    catch (const std::exception& e) {
        Logger::error("Error in preprocessing: " + std::string(e.what()));
        throw;
    }
}

bool TensorRTEngine::infer(const cv::Mat& input_image, std::vector<DetectionResult>& detections) {
    try {
        Logger::info("=== Starting Inference ===");

        if (!pImpl->context || !pImpl->engine) {
            Logger::error("TensorRT engine or context is null");
            return false;
        }
        if (input_image.empty()) {
            Logger::error("Input image is empty");
            return false;
        }

        // Log the input image geometry
        Logger::info("Input image: " + std::to_string(input_image.cols) + "x" +
                     std::to_string(input_image.rows) + " channels: " +
                     std::to_string(input_image.channels()));

        // Preprocess into the device input buffer
        try {
            preprocess(input_image, (float*)pImpl->buffers[pImpl->inputIndex]);
        } catch (const std::exception& e) {
            Logger::error("Error in preprocessing: " + std::string(e.what()));
            return false;
        }

        // Run inference (executeV2 is synchronous; the stream syncs fence
        // the async copies queued on pImpl->stream)
        bool status = false;
        try {
            cudaStreamSynchronize(pImpl->stream);  // make sure prior work is done
            status = pImpl->context->executeV2(pImpl->buffers);
            cudaStreamSynchronize(pImpl->stream);  // wait for inference to finish
        } catch (const std::exception& e) {
            Logger::error("Error during inference execution: " + std::string(e.what()));
            return false;
        }

        if (!status) {
            Logger::error("Failed to execute inference");
            return false;
        }

        // Compute the output element count (index 1 is the output binding)
        nvinfer1::Dims output_dims = pImpl->engine->getTensorShape(pImpl->engine->getIOTensorName(1));
        size_t output_size = 1;
        for (int i = 0; i < output_dims.nbDims; i++) {
            output_size *= output_dims.d[i];
        }

        // Copy the result back to the host
        cudaError_t error = cudaMemcpyAsync(
            pImpl->hostOutput,
            pImpl->buffers[pImpl->outputIndex],
            output_size * sizeof(float),
            cudaMemcpyDeviceToHost,
            pImpl->stream
        );
        if (error != cudaSuccess) {
            Logger::error("Failed to copy output data: " + std::string(cudaGetErrorString(error)) +
                          " (size: " + std::to_string(output_size) + ")");
            return false;
        }

        // Wait for the copy to complete
        error = cudaStreamSynchronize(pImpl->stream);
        if (error != cudaSuccess) {
            Logger::error("Failed to synchronize CUDA stream: " +
                          std::string(cudaGetErrorString(error)));
            return false;
        }

        // Postprocess
        try {
            detections = postprocess(pImpl->hostOutput, 1);
        } catch (const std::exception& e) {
            Logger::error("Error in postprocessing: " + std::string(e.what()));
            return false;
        }

        // Log the detection results
        if (!detections.empty()) {
            Logger::info("=== Detection Results ===");
            Logger::info("Found " + std::to_string(detections.size()) + " objects");
            for (const auto& det : detections) {
                Logger::info("  Object: shoe");
                Logger::info("  Confidence: " + std::to_string(det.confidence));
                Logger::info("  Box: (" + std::to_string(det.x1) + ", " +
                             std::to_string(det.y1) + ", " +
                             std::to_string(det.x2) + ", " +
                             std::to_string(det.y2) + ")");
            }
        }

        // Periodically save annotated frames
        if (!detections.empty()) {
            static int frame_count = 0;
            frame_count++;

            // Save one result every 100 frames
            if (frame_count % 100 == 0) {
                cv::Mat output = input_image.clone();

                // Draw the detection boxes (note: coordinates are in
                // network-input space; see the rescaling note after postprocess)
                for (const auto& det : detections) {
                    cv::rectangle(output,
                                  cv::Point(det.x1, det.y1),
                                  cv::Point(det.x2, det.y2),
                                  cv::Scalar(0, 255, 0), 2);

                    std::string label = "shoe: " + std::to_string(det.confidence);
                    cv::putText(output, label,
                                cv::Point(det.x1, det.y1 - 10),
                                cv::FONT_HERSHEY_SIMPLEX, 0.5,
                                cv::Scalar(0, 255, 0), 2);
                }

                // Save the image (the results/ directory must already exist,
                // otherwise cv::imwrite fails)
                std::string filename = "results/detection_" +
                                       std::to_string(frame_count) + ".jpg";
                cv::imwrite(filename, output);
                Logger::info("Saved detection result to: " + filename);
            }
        }

        Logger::info("=== Inference Completed ===");
        return true;
    }
    catch (const std::exception& e) {
        Logger::error("Error during inference: " + std::string(e.what()));
        return false;
    }
}

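// The raw YOLOv5 head output for a 640x640 input is a [1, 25200, 5 + num_classes]
// tensor: 25200 = 3 anchors x (80^2 + 40^2 + 20^2) grid cells, and each row is
// (cx, cy, w, h, objectness, per-class scores...). postprocess() below assumes
// that layout, with normalized coordinates and a single class.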
std::vector<DetectionResult> TensorRTEngine::postprocess(float* output, int batch_size) {
    std::vector<DetectionResult> results;
    (void)batch_size;  // single-image pipeline; parameter kept for API symmetry

    // Thresholds
    const float conf_threshold = 0.6f;  // confidence threshold (raised to cut false positives)
    const float nms_threshold = 0.4f;   // NMS IoU threshold (lowered to merge more overlaps)
    const float min_box_size = 20.0f;   // minimum box size in pixels
    const float max_box_size = 416.0f;  // maximum box size in pixels
    const int num_classes = 1;          // single class (shoe)
    const int num_boxes = 25200;        // number of boxes in the YOLOv5 output

    Logger::info("Post-processing parameters:");
    Logger::info("  Confidence threshold: " + std::to_string(conf_threshold));
    Logger::info("  NMS threshold: " + std::to_string(nms_threshold));
    Logger::info("  Min box size: " + std::to_string(min_box_size));
    Logger::info("  Max box size: " + std::to_string(max_box_size));

    // Per-class detection buckets
    std::vector<std::vector<DetectionResult>> class_detections(num_classes);
    int total_boxes = 0;
    int filtered_by_conf = 0;
    int filtered_by_size = 0;
    int filtered_by_nms = 0;

    // Walk every predicted box
    for (int i = 0; i < num_boxes; i++) {
        float* box = output + i * (5 + num_classes);
        float confidence = box[4];  // objectness

        if (confidence < conf_threshold) {
            filtered_by_conf++;
            continue;
        }

        float class_score = box[5];
        float final_score = confidence * class_score;

        if (final_score > conf_threshold) {
            DetectionResult det;
            // Normalized (cx, cy, w, h) -> pixels in network-input space
            float cx = box[0] * pImpl->inputW;
            float cy = box[1] * pImpl->inputH;
            float w = box[2] * pImpl->inputW;
            float h = box[3] * pImpl->inputH;

            // Size gate
            if (w < min_box_size || h < min_box_size ||
                w > max_box_size || h > max_box_size) {
                filtered_by_size++;
                continue;
            }

            det.x1 = std::max(0.0f, cx - w / 2);
            det.y1 = std::max(0.0f, cy - h / 2);
            det.x2 = std::min(float(pImpl->inputW), cx + w / 2);
            det.y2 = std::min(float(pImpl->inputH), cy + h / 2);
            det.confidence = final_score;
            det.class_id = 0;

            class_detections[0].push_back(det);
            total_boxes++;
        }
    }

    Logger::info("Detection statistics:");
    Logger::info("  Total boxes processed: " + std::to_string(num_boxes));
    Logger::info("  Filtered by confidence: " + std::to_string(filtered_by_conf));
    Logger::info("  Filtered by size: " + std::to_string(filtered_by_size));
    Logger::info("  Remaining after initial filtering: " + std::to_string(total_boxes));

    // Greedy per-class NMS: keep the highest-confidence box, drop overlaps
    for (int c = 0; c < num_classes; c++) {
        auto& dets = class_detections[c];
        if (dets.empty()) continue;

        std::sort(dets.begin(), dets.end(),
                  [](const DetectionResult& a, const DetectionResult& b) {
                      return a.confidence > b.confidence;
                  });

        std::vector<bool> keep(dets.size(), true);
        for (size_t i = 0; i < dets.size(); i++) {
            if (!keep[i]) continue;

            for (size_t j = i + 1; j < dets.size(); j++) {
                if (!keep[j]) continue;

                float iou = calculateIoU(dets[i], dets[j]);
                if (iou > nms_threshold) {
                    keep[j] = false;
                    filtered_by_nms++;
                }
            }
        }

        for (size_t i = 0; i < dets.size(); i++) {
            if (keep[i]) {
                results.push_back(dets[i]);
            }
        }
    }

    Logger::info("  Filtered by NMS: " + std::to_string(filtered_by_nms));
    Logger::info("  Final detection count: " + std::to_string(results.size()));

    return results;
}

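// Boxes returned by postprocess() are clipped to network-input coordinates
// (inputW x inputH), not to the original frame. A caller drawing on the
// original frame would rescale first (compiled-out sketch, assuming the
// plain-resize preprocessing above):
#if 0
float sx = float(frame.cols) / pImpl->inputW;
float sy = float(frame.rows) / pImpl->inputH;
det.x1 *= sx; det.x2 *= sx;
det.y1 *= sy; det.y2 *= sy;
#endif
// OpenCV also ships a reference NMS (cv::dnn::NMSBoxes in <opencv2/dnn.hpp>)
// that could replace the hand-rolled loop above.
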
// IoU (intersection over union) between two axis-aligned boxes
float TensorRTEngine::calculateIoU(const DetectionResult& a, const DetectionResult& b) {
    // Intersection rectangle
    float x1 = std::max(a.x1, b.x1);
    float y1 = std::max(a.y1, b.y1);
    float x2 = std::min(a.x2, b.x2);
    float y2 = std::min(a.y2, b.y2);

    if (x2 < x1 || y2 < y1) return 0.0f;  // no overlap

    float intersection = (x2 - x1) * (y2 - y1);
    float area_a = (a.x2 - a.x1) * (a.y2 - a.y1);
    float area_b = (b.x2 - b.x1) * (b.y2 - b.y1);

    return intersection / (area_a + area_b - intersection);
}

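// Worked example: boxes (0,0,10,10) and (5,5,15,15) intersect in (5,5,10,10),
// area 25; the union is 100 + 100 - 25 = 175, so IoU = 25/175 ≈ 0.143 — well
// under the 0.4 NMS threshold, so both boxes would survive.
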
bool TensorRTEngine::inferGPU(float* gpu_input, std::vector<DetectionResult>& detections) {
    try {
        // Run inference directly on data already resident in GPU memory
        void* buffers[2] = {gpu_input, pImpl->output_buffer};

        // Execute inference
        bool status = false;
        try {
            cudaStreamSynchronize(pImpl->stream);  // make sure prior work is done
            status = pImpl->context->executeV2(buffers);
            cudaStreamSynchronize(pImpl->stream);  // wait for inference to finish
        } catch (const std::exception& e) {
            Logger::error("Error during inference execution: " + std::string(e.what()));
            return false;
        }

        if (!status) {
            Logger::error("Failed to execute inference");
            return false;
        }

        // Compute the output element count
        nvinfer1::Dims output_dims = pImpl->engine->getTensorShape(pImpl->engine->getIOTensorName(1));
        size_t output_size = 1;
        for (int i = 0; i < output_dims.nbDims; i++) {
            output_size *= output_dims.d[i];
        }

        // Copy the result back to the host
        cudaError_t error = cudaMemcpyAsync(
            pImpl->hostOutput,
            pImpl->output_buffer,
            output_size * sizeof(float),
            cudaMemcpyDeviceToHost,
            pImpl->stream
        );
        if (error != cudaSuccess) {
            Logger::error("Failed to copy output data: " + std::string(cudaGetErrorString(error)));
            return false;
        }

        // Wait for the copy to complete
        error = cudaStreamSynchronize(pImpl->stream);
        if (error != cudaSuccess) {
            Logger::error("Failed to synchronize CUDA stream: " +
                          std::string(cudaGetErrorString(error)));
            return false;
        }

        // Postprocess
        detections = postprocess(pImpl->hostOutput, 1);
        return true;
    }
    catch (const std::exception& e) {
        Logger::error("Error during GPU inference: " + std::string(e.what()));
        return false;
    }
}