#include "inference/tensorrt_engine.hpp" #include "common/logger.hpp" #include #include #include #include // 用于动态库加载检查 #include #include "common/cuda_helper.hpp" // 重命名为 TRTLogger 避免冲突 class TRTLogger : public nvinfer1::ILogger { void log(Severity severity, const char* msg) noexcept override { switch (severity) { case Severity::kINTERNAL_ERROR: Logger::error(std::string("TensorRT Internal Error: ") + msg); break; case Severity::kERROR: Logger::error(std::string("TensorRT Error: ") + msg); break; case Severity::kWARNING: Logger::info(std::string("TensorRT Warning: ") + msg); break; default: Logger::info(std::string("TensorRT Info: ") + msg); break; } } }; static TRTLogger gLogger; // 全局 logger 实例 class TensorRTEngine::Impl { public: nvinfer1::IRuntime* runtime = nullptr; nvinfer1::ICudaEngine* engine = nullptr; nvinfer1::IExecutionContext* context = nullptr; cudaStream_t stream = nullptr; void* buffers[2]; // 输入和输出缓冲区 int inputIndex; int outputIndex; int inputH = 640; int inputW = 640; int maxBatchSize = 1; float* hostInput = nullptr; float* hostOutput = nullptr; // 添加 GPU 缓冲区 void* input_buffer = nullptr; // GPU 输入缓冲区 void* output_buffer = nullptr; // GPU 输出缓冲区 ~Impl() { if (runtime) delete runtime; if (engine) delete engine; if (context) delete context; if (stream) cudaStreamDestroy(stream); if (hostInput) delete[] hostInput; if (hostOutput) delete[] hostOutput; if (input_buffer) cudaFree(input_buffer); if (output_buffer) cudaFree(output_buffer); } }; TensorRTEngine::TensorRTEngine(const std::string& model_path, int gpu_id) { try { Logger::info("TensorRTEngine constructor start"); // 验证参数 if (model_path.empty()) { throw std::runtime_error("Model path is empty"); } if (gpu_id < 0) { throw std::runtime_error("Invalid GPU ID: " + std::to_string(gpu_id)); } // 创建本地副本 model_path_ = std::string(model_path); Logger::info("Parameters:"); Logger::info(" Model path: " + model_path_); Logger::info(" GPU ID: " + std::to_string(gpu_id)); // 创建实现 pImpl = std::make_unique(); if (!pImpl) { throw std::runtime_error("Failed to create implementation"); } // 检查模型文件 if (!std::filesystem::exists(model_path_)) { throw std::runtime_error("Model file not found: " + model_path_); } auto file_size = std::filesystem::file_size(model_path_); Logger::info("Model file exists, size: " + std::to_string(file_size) + " bytes"); // 初始化 CUDA cudaError_t error = cudaSetDevice(gpu_id); if (error != cudaSuccess) { throw std::runtime_error("Failed to set CUDA device: " + std::string(cudaGetErrorString(error))); } // 创建 CUDA 流 error = cudaStreamCreate(&pImpl->stream); if (error != cudaSuccess) { throw std::runtime_error("Failed to create CUDA stream: " + std::string(cudaGetErrorString(error))); } // 加载模型 if (!loadModel()) { throw std::runtime_error("Failed to load model"); } Logger::info("TensorRTEngine constructor completed successfully"); } catch (const std::exception& e) { Logger::error("Error in TensorRTEngine constructor: " + std::string(e.what())); throw; } } TensorRTEngine::~TensorRTEngine() = default; bool TensorRTEngine::convertONNX2TRT(const std::string& onnx_file) { try { Logger::info("=== Starting ONNX to TensorRT Conversion ==="); Logger::info("ONNX file: " + onnx_file); // 使用与模型文件相同目录作为基准目录 std::filesystem::path model_dir = std::filesystem::path(onnx_file).parent_path(); std::filesystem::path engine_path = model_dir / "model.engine"; // 检查 engine 文件是否已存在 if (std::filesystem::exists(engine_path)) { Logger::info("Found existing engine file: " + engine_path.string()); Logger::info("Size: " + std::to_string(std::filesystem::file_size(engine_path)) + " bytes"); return true; } Logger::info("Converting ONNX to TensorRT engine..."); // 检查文件是否存在 if (!std::filesystem::exists(onnx_file)) { Logger::error("ONNX file does not exist: " + onnx_file); return false; } // 获取文件大小 std::filesystem::path p(onnx_file); auto file_size = std::filesystem::file_size(p); Logger::info("ONNX file size: " + std::to_string(file_size) + " bytes"); // 创建 builder nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger); if (!builder) { throw std::runtime_error("Failed to create builder"); } // 创建网络定义,启用显式批处理 const auto explicitBatch = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch); if (!network) { throw std::runtime_error("Failed to create network"); } // 创建 ONNX 解析器 auto parser = nvonnxparser::createParser(*network, gLogger); if (!parser) { throw std::runtime_error("Failed to create parser"); } // 解析 ONNX 文件 if (!parser->parseFromFile(onnx_file.c_str(), static_cast(nvinfer1::ILogger::Severity::kWARNING))) { throw std::runtime_error("Failed to parse ONNX file"); } // 获取网络输入 if (network->getNbInputs() == 0) { throw std::runtime_error("Network has no inputs"); } // 获取输入张量 nvinfer1::ITensor* input = network->getInput(0); if (!input) { throw std::runtime_error("Failed to get input tensor"); } // 获取输入名称和维度 std::string inputName = input->getName(); Logger::info("Input tensor name: " + inputName); // 获取输入维度 nvinfer1::Dims inputDims = input->getDimensions(); std::string dimStr = "Input dimensions: ("; for (int i = 0; i < inputDims.nbDims; i++) { dimStr += std::to_string(inputDims.d[i]); if (i < inputDims.nbDims - 1) dimStr += ", "; } dimStr += ")"; Logger::info(dimStr); // 打印网络信息 Logger::info("Network layers:"); for (int i = 0; i < network->getNbLayers(); i++) { auto layer = network->getLayer(i); Logger::info("Layer " + std::to_string(i) + ": " + layer->getName()); // 打印每个层的输入维度 for (int j = 0; j < layer->getNbInputs(); j++) { auto layerInput = layer->getInput(j); if (layerInput) { auto dims = layerInput->getDimensions(); std::string layerDimStr = " Input " + std::to_string(j) + " dims: ("; for (int k = 0; k < dims.nbDims; k++) { layerDimStr += std::to_string(dims.d[k]); if (k < dims.nbDims - 1) layerDimStr += ", "; } layerDimStr += ")"; Logger::info(layerDimStr); } } } // 创建构建配置 nvinfer1::IBuilderConfig* config = builder->createBuilderConfig(); if (!config) { throw std::runtime_error("Failed to create builder config"); } // 设置 TensorRT 配置 config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1 << 30); // 1GB config->setFlag(nvinfer1::BuilderFlag::kFP16); // 启用 FP16 精度 // 添加优化配置文件 nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile(); if (!profile) { throw std::runtime_error("Failed to create optimization profile"); } // 设置动态维度 nvinfer1::Dims minDims = inputDims; nvinfer1::Dims optDims = inputDims; nvinfer1::Dims maxDims = inputDims; // 打印原始维度 Logger::info("Original dimensions:"); for (int i = 0; i < inputDims.nbDims; i++) { Logger::info(" dim[" + std::to_string(i) + "] = " + std::to_string(inputDims.d[i])); } // 确保所有维度都是正数 for (int i = 0; i < inputDims.nbDims; i++) { // 如果维度是 -1(动态维度),设置为合适的值 if (inputDims.d[i] == -1) { if (i == 0) { // batch 维度 minDims.d[i] = 1; optDims.d[i] = pImpl->maxBatchSize; maxDims.d[i] = pImpl->maxBatchSize; } else if (i == 1) { // channel 维度 minDims.d[i] = 3; // RGB optDims.d[i] = 3; maxDims.d[i] = 3; } else if (i == 2) { // height 维度 minDims.d[i] = pImpl->inputH; optDims.d[i] = pImpl->inputH; maxDims.d[i] = pImpl->inputH; } else if (i == 3) { // width 维度 minDims.d[i] = pImpl->inputW; optDims.d[i] = pImpl->inputW; maxDims.d[i] = pImpl->inputW; } else { minDims.d[i] = 1; optDims.d[i] = 1; maxDims.d[i] = 1; } } else { // 如果不是动态维度,保持原值 minDims.d[i] = inputDims.d[i]; optDims.d[i] = inputDims.d[i]; maxDims.d[i] = inputDims.d[i]; } } // 打印设置的维度 Logger::info("Setting optimization profile dimensions:"); Logger::info("Min dimensions:"); for (int i = 0; i < minDims.nbDims; i++) { Logger::info(" dim[" + std::to_string(i) + "] = " + std::to_string(minDims.d[i])); } Logger::info("Opt dimensions:"); for (int i = 0; i < optDims.nbDims; i++) { Logger::info(" dim[" + std::to_string(i) + "] = " + std::to_string(optDims.d[i])); } Logger::info("Max dimensions:"); for (int i = 0; i < maxDims.nbDims; i++) { Logger::info(" dim[" + std::to_string(i) + "] = " + std::to_string(maxDims.d[i])); } // 设置优化配置 if (!profile->setDimensions(inputName.c_str(), nvinfer1::OptProfileSelector::kMIN, minDims)) { throw std::runtime_error("Failed to set minimum dimensions"); } if (!profile->setDimensions(inputName.c_str(), nvinfer1::OptProfileSelector::kOPT, optDims)) { throw std::runtime_error("Failed to set optimal dimensions"); } if (!profile->setDimensions(inputName.c_str(), nvinfer1::OptProfileSelector::kMAX, maxDims)) { throw std::runtime_error("Failed to set maximum dimensions"); } config->addOptimizationProfile(profile); // 构建引擎 Logger::info("Building TensorRT engine..."); nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); if (!engine) { throw std::runtime_error("Failed to build TensorRT engine"); } // 序列化引擎 nvinfer1::IHostMemory* serializedEngine = engine->serialize(); std::ofstream engine_file(engine_path, std::ios::binary); engine_file.write(static_cast(serializedEngine->data()), serializedEngine->size()); // 清理资源 delete serializedEngine; delete engine; delete config; delete network; delete parser; delete builder; Logger::info("Successfully converted ONNX to TensorRT engine"); Logger::info("=== Engine Conversion Completed ==="); Logger::info("Engine file saved to: " + engine_path.string()); Logger::info("Engine file size: " + std::to_string(std::filesystem::file_size(engine_path)) + " bytes"); return true; } catch (const std::exception& e) { Logger::error("Error in convertONNX2TRT: " + std::string(e.what())); return false; } } bool TensorRTEngine::loadModel() { try { if (!pImpl) { throw std::runtime_error("Implementation is null"); } Logger::info("Loading model..."); // 使用与模型文件相同目录作为基准目录 std::filesystem::path model_dir = std::filesystem::path(model_path_).parent_path(); std::filesystem::path engine_path = model_dir / "model.engine"; // 检查引擎文件 if (!std::filesystem::exists(engine_path)) { Logger::info("Engine file not found, converting from ONNX..."); if (!convertONNX2TRT(model_path_)) { throw std::runtime_error("Failed to convert ONNX model"); } } // 加载 engine 文件 std::ifstream engine_file(engine_path, std::ios::binary); if (!engine_file) { throw std::runtime_error("Cannot open engine file: " + engine_path.string()); } // 读取序列化的引擎文件 std::ifstream file(engine_path, std::ios::binary); if (!file.good()) { Logger::error("Failed to open engine file"); return false; } file.seekg(0, std::ios::end); size_t size = file.tellg(); file.seekg(0, std::ios::beg); std::vector engineData(size); file.read(engineData.data(), size); // 创建推理引擎 Logger::info("Creating TensorRT runtime..."); pImpl->runtime = nvinfer1::createInferRuntime(gLogger); if (!pImpl->runtime) { Logger::error("Failed to create TensorRT runtime"); return false; } Logger::info("Deserializing CUDA engine..."); pImpl->engine = pImpl->runtime->deserializeCudaEngine(engineData.data(), size); if (!pImpl->engine) { Logger::error("Failed to deserialize CUDA engine"); return false; } Logger::info("Creating execution context..."); pImpl->context = pImpl->engine->createExecutionContext(); if (!pImpl->context) { Logger::error("Failed to create execution context"); return false; } // 获取输入输出张量信息 Logger::info("Getting tensor information..."); // 获取网络输入输出数量 int32_t nbIOTensors = pImpl->engine->getNbIOTensors(); Logger::info("Number of I/O tensors: " + std::to_string(nbIOTensors)); // 设置输入输出索引 pImpl->inputIndex = 0; pImpl->outputIndex = 1; // 获取输入张量信息 const char* inputName = pImpl->engine->getIOTensorName(pImpl->inputIndex); if (!inputName) { Logger::error("Failed to get input tensor name"); return false; } Logger::info("Input tensor name: " + std::string(inputName)); // 获取输入维度 auto dims = pImpl->engine->getTensorShape(inputName); Logger::info("Input tensor dimensions: " + std::to_string(dims.nbDims) + " dimensions"); // 验证维度 if (dims.nbDims < 4) { Logger::error("Invalid input dimensions: expected at least 4, got " + std::to_string(dims.nbDims)); return false; } // 打印维度信息 std::string dimStr = "Input dimensions: ("; for (int i = 0; i < dims.nbDims; i++) { dimStr += std::to_string(dims.d[i]); if (i < dims.nbDims - 1) dimStr += ", "; } dimStr += ")"; Logger::info(dimStr); // 设置输入尺寸 pImpl->inputH = dims.d[2]; pImpl->inputW = dims.d[3]; Logger::info("Input HxW: " + std::to_string(pImpl->inputH) + "x" + std::to_string(pImpl->inputW)); // 获取输出张量信息 const char* outputName = pImpl->engine->getIOTensorName(pImpl->outputIndex); if (!outputName) { Logger::error("Failed to get output tensor name"); return false; } Logger::info("Output tensor name: " + std::string(outputName)); // 获取输出维度 auto outputDims = pImpl->engine->getTensorShape(outputName); Logger::info("Output tensor dimensions: " + std::to_string(outputDims.nbDims) + " dimensions"); std::string outDimStr = "Output dimensions: ("; for (int i = 0; i < outputDims.nbDims; i++) { outDimStr += std::to_string(outputDims.d[i]); if (i < outputDims.nbDims - 1) outDimStr += ", "; } outDimStr += ")"; Logger::info(outDimStr); // 计算缓冲区大小 size_t inputSize = sizeof(float); for (int i = 0; i < dims.nbDims; i++) { inputSize *= (dims.d[i] > 0) ? static_cast(dims.d[i]) : 1; } size_t outputSize = sizeof(float); for (int i = 0; i < outputDims.nbDims; i++) { outputSize *= (outputDims.d[i] > 0) ? static_cast(outputDims.d[i]) : 1; } Logger::info("Allocating device memory..."); Logger::info("Input buffer size: " + std::to_string(inputSize) + " bytes"); Logger::info("Output buffer size: " + std::to_string(outputSize) + " bytes"); // 分配设备内存 cudaError_t error; error = cudaMalloc(&pImpl->buffers[pImpl->inputIndex], inputSize); if (error != cudaSuccess) { Logger::error("Failed to allocate input buffer: " + std::string(cudaGetErrorString(error))); return false; } error = cudaMalloc(&pImpl->buffers[pImpl->outputIndex], outputSize); if (error != cudaSuccess) { Logger::error("Failed to allocate output buffer: " + std::string(cudaGetErrorString(error))); return false; } // 获取输出维度 nvinfer1::Dims output_dims = pImpl->engine->getTensorShape(pImpl->engine->getIOTensorName(1)); size_t total_output_size = 1; for (int i = 0; i < output_dims.nbDims; i++) { total_output_size *= output_dims.d[i]; } Logger::info("Output tensor dimensions:"); Logger::info(" Number of dimensions: " + std::to_string(output_dims.nbDims)); for (int i = 0; i < output_dims.nbDims; i++) { Logger::info(" Dimension " + std::to_string(i) + ": " + std::to_string(output_dims.d[i])); } Logger::info("Total output size: " + std::to_string(total_output_size)); // 分配主机输出缓冲区 pImpl->hostOutput = new float[total_output_size]; // 分配 GPU 缓冲区 size_t input_size = kMaxBatchSize * 3 * kInputH * kInputW * sizeof(float); size_t output_size = kMaxBatchSize * total_output_size * sizeof(float); // 使用 cudaMalloc 而不是 CUDA_CHECK 宏 error = cudaMalloc(&pImpl->input_buffer, input_size); if (error != cudaSuccess) { Logger::error("Failed to allocate input buffer: " + std::string(cudaGetErrorString(error))); return false; } error = cudaMalloc(&pImpl->output_buffer, output_size); if (error != cudaSuccess) { Logger::error("Failed to allocate output buffer: " + std::string(cudaGetErrorString(error))); cudaFree(pImpl->input_buffer); pImpl->input_buffer = nullptr; return false; } // 设置缓冲区指针 pImpl->buffers[pImpl->inputIndex] = pImpl->input_buffer; pImpl->buffers[pImpl->outputIndex] = pImpl->output_buffer; Logger::info("Model loaded successfully"); return true; } catch (const std::exception& e) { Logger::error("Error in loadModel: " + std::string(e.what())); return false; } } void TensorRTEngine::preprocess(const cv::Mat& input_image, float* gpu_input) { try { Logger::info("Starting preprocessing..."); // 检查输入 if (input_image.empty()) { throw std::runtime_error("Input image is empty"); } if (!gpu_input) { throw std::runtime_error("GPU input buffer is null"); } // 调整图像大小 cv::Mat resized; cv::resize(input_image, resized, cv::Size(pImpl->inputW, pImpl->inputH)); // BGR to RGB cv::Mat rgb; cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB); // 转换为浮点型并归一化 cv::Mat float_img; rgb.convertTo(float_img, CV_32F, 1.0/255.0); // 分离通道 std::vector channels; cv::split(float_img, channels); // 检查通道数 if (channels.size() != 3) { throw std::runtime_error("Expected 3 channels, got " + std::to_string(channels.size())); } // 计算每通道的大小 size_t channel_size = pImpl->inputH * pImpl->inputW * sizeof(float); // 复制数据到 GPU for (int i = 0; i < 3; i++) { cudaError_t error = cudaMemcpyAsync( gpu_input + i * pImpl->inputH * pImpl->inputW, channels[i].data, channel_size, cudaMemcpyHostToDevice, pImpl->stream ); if (error != cudaSuccess) { throw std::runtime_error("Failed to copy channel " + std::to_string(i) + " to GPU: " + cudaGetErrorString(error)); } } // 同步确保数据复制完成 cudaError_t error = cudaStreamSynchronize(pImpl->stream); if (error != cudaSuccess) { throw std::runtime_error("Failed to synchronize CUDA stream: " + std::string(cudaGetErrorString(error))); } Logger::info("Preprocessing completed successfully"); } catch (const std::exception& e) { Logger::error("Error in preprocessing: " + std::string(e.what())); throw; } } bool TensorRTEngine::infer(const cv::Mat& input_image, std::vector& detections) { try { Logger::info("=== Starting Inference ==="); if (!pImpl->context || !pImpl->engine) { Logger::error("TensorRT engine or context is null"); return false; } if (input_image.empty()) { Logger::error("Input image is empty"); return false; } // 打印输入图像信息 Logger::info("Input image: " + std::to_string(input_image.cols) + "x" + std::to_string(input_image.rows) + " channels: " + std::to_string(input_image.channels())); // 预处理 try { preprocess(input_image, (float*)pImpl->buffers[pImpl->inputIndex]); } catch (const std::exception& e) { Logger::error("Error in preprocessing: " + std::string(e.what())); return false; } // 执行推理 bool status = false; try { cudaStreamSynchronize(pImpl->stream); // 确保之前的操作完成 status = pImpl->context->executeV2(pImpl->buffers); cudaStreamSynchronize(pImpl->stream); // 等待推理完成 } catch (const std::exception& e) { Logger::error("Error during inference execution: " + std::string(e.what())); return false; } if (!status) { Logger::error("Failed to execute inference"); return false; } // 获取输出大小 nvinfer1::Dims output_dims = pImpl->engine->getTensorShape(pImpl->engine->getIOTensorName(1)); // 1 是输出索引 size_t output_size = 1; for (int i = 0; i < output_dims.nbDims; i++) { output_size *= output_dims.d[i]; } // 复制结果回主机 cudaError_t error = cudaMemcpyAsync( pImpl->hostOutput, pImpl->buffers[pImpl->outputIndex], output_size * sizeof(float), // 使用计算出的大小 cudaMemcpyDeviceToHost, pImpl->stream ); if (error != cudaSuccess) { Logger::error("Failed to copy output data: " + std::string(cudaGetErrorString(error)) + " (size: " + std::to_string(output_size) + ")"); return false; } // 同步等待结果 error = cudaStreamSynchronize(pImpl->stream); if (error != cudaSuccess) { Logger::error("Failed to synchronize CUDA stream: " + std::string(cudaGetErrorString(error))); return false; } // 后处理 try { detections = postprocess(pImpl->hostOutput, 1); } catch (const std::exception& e) { Logger::error("Error in postprocessing: " + std::string(e.what())); return false; } // 在检测结果后添加日志 if (!detections.empty()) { Logger::info("=== Detection Results ==="); Logger::info("Found " + std::to_string(detections.size()) + " objects"); for (const auto& det : detections) { Logger::info(" Object: shoe"); Logger::info(" Confidence: " + std::to_string(det.confidence)); Logger::info(" Box: (" + std::to_string(det.x1) + ", " + std::to_string(det.y1) + ", " + std::to_string(det.x2) + ", " + std::to_string(det.y2) + ")"); } } // 保存检测结果 if (!detections.empty()) { static int frame_count = 0; frame_count++; // 每100帧保存一次结果 if (frame_count % 100 == 0) { cv::Mat output = input_image.clone(); // 绘制检测框 for (const auto& det : detections) { cv::rectangle(output, cv::Point(det.x1, det.y1), cv::Point(det.x2, det.y2), cv::Scalar(0, 255, 0), 2); std::string label = "shoe: " + std::to_string(det.confidence); cv::putText(output, label, cv::Point(det.x1, det.y1 - 10), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 2); } // 保存图像 std::string filename = "results/detection_" + std::to_string(frame_count) + ".jpg"; cv::imwrite(filename, output); Logger::info("Saved detection result to: " + filename); } } Logger::info("=== Inference Completed ==="); return true; } catch (const std::exception& e) { Logger::error("Error during inference: " + std::string(e.what())); return false; } } std::vector TensorRTEngine::postprocess(float* output, int batch_size) { std::vector results; // 设置阈值 const float conf_threshold = 0.6f; // 提高置信度阈值 const float nms_threshold = 0.4f; // 降低 NMS 阈值 const float min_box_size = 20.0f; // 最小框尺寸(像素) const float max_box_size = 416.0f; // 最大框尺寸(像素) const int num_classes = 1; // 只有一个类别 (shoe) const int num_boxes = 25200; // YOLOv5 输出的框数量 Logger::info("Post-processing parameters:"); Logger::info(" Confidence threshold: " + std::to_string(conf_threshold)); Logger::info(" NMS threshold: " + std::to_string(nms_threshold)); Logger::info(" Min box size: " + std::to_string(min_box_size)); Logger::info(" Max box size: " + std::to_string(max_box_size)); // 存储所有检测结果 std::vector> class_detections(num_classes); int total_boxes = 0; int filtered_by_conf = 0; int filtered_by_size = 0; int filtered_by_nms = 0; // 遍历所有预测框 for (int i = 0; i < num_boxes; i++) { float* box = output + i * (5 + num_classes); float confidence = box[4]; // 检查置信度 if (confidence < conf_threshold) { filtered_by_conf++; continue; } float class_score = box[5]; float final_score = confidence * class_score; if (final_score > conf_threshold) { DetectionResult det; float cx = box[0] * pImpl->inputW; float cy = box[1] * pImpl->inputH; float w = box[2] * pImpl->inputW; float h = box[3] * pImpl->inputH; // 检查框的尺寸 if (w < min_box_size || h < min_box_size || w > max_box_size || h > max_box_size) { filtered_by_size++; continue; } det.x1 = std::max(0.0f, cx - w/2); det.y1 = std::max(0.0f, cy - h/2); det.x2 = std::min(float(pImpl->inputW), cx + w/2); det.y2 = std::min(float(pImpl->inputH), cy + h/2); det.confidence = final_score; det.class_id = 0; class_detections[0].push_back(det); total_boxes++; } } Logger::info("Detection statistics:"); Logger::info(" Total boxes processed: " + std::to_string(num_boxes)); Logger::info(" Filtered by confidence: " + std::to_string(filtered_by_conf)); Logger::info(" Filtered by size: " + std::to_string(filtered_by_size)); Logger::info(" Remaining after initial filtering: " + std::to_string(total_boxes)); // NMS for (int c = 0; c < num_classes; c++) { auto& dets = class_detections[c]; if (dets.empty()) continue; std::sort(dets.begin(), dets.end(), [](const DetectionResult& a, const DetectionResult& b) { return a.confidence > b.confidence; }); std::vector keep(dets.size(), true); for (size_t i = 0; i < dets.size(); i++) { if (!keep[i]) continue; for (size_t j = i + 1; j < dets.size(); j++) { if (!keep[j]) continue; float iou = calculateIoU(dets[i], dets[j]); if (iou > nms_threshold) { keep[j] = false; filtered_by_nms++; } } } for (size_t i = 0; i < dets.size(); i++) { if (keep[i]) { results.push_back(dets[i]); } } } Logger::info(" Filtered by NMS: " + std::to_string(filtered_by_nms)); Logger::info(" Final detection count: " + std::to_string(results.size())); return results; } // 添加 IoU 计算函数 float TensorRTEngine::calculateIoU(const DetectionResult& a, const DetectionResult& b) { float x1 = std::max(a.x1, b.x1); float y1 = std::max(a.y1, b.y1); float x2 = std::min(a.x2, b.x2); float y2 = std::min(a.y2, b.y2); if (x2 < x1 || y2 < y1) return 0.0f; float intersection = (x2 - x1) * (y2 - y1); float area_a = (a.x2 - a.x1) * (a.y2 - a.y1); float area_b = (b.x2 - b.x1) * (b.y2 - b.y1); return intersection / (area_a + area_b - intersection); } bool TensorRTEngine::inferGPU(float* gpu_input, std::vector& detections) { try { // 直接使用 GPU 内存中的数据进行推理 void* buffers[2] = {gpu_input, pImpl->output_buffer}; // 执行推理 bool status = false; try { cudaStreamSynchronize(pImpl->stream); // 确保之前的操作完成 status = pImpl->context->executeV2(buffers); cudaStreamSynchronize(pImpl->stream); // 等待推理完成 } catch (const std::exception& e) { Logger::error("Error during inference execution: " + std::string(e.what())); return false; } if (!status) { Logger::error("Failed to execute inference"); return false; } // 获取输出大小 nvinfer1::Dims output_dims = pImpl->engine->getTensorShape(pImpl->engine->getIOTensorName(1)); size_t output_size = 1; for (int i = 0; i < output_dims.nbDims; i++) { output_size *= output_dims.d[i]; } // 复制结果回主机 cudaError_t error = cudaMemcpyAsync( pImpl->hostOutput, pImpl->output_buffer, output_size * sizeof(float), cudaMemcpyDeviceToHost, pImpl->stream ); if (error != cudaSuccess) { Logger::error("Failed to copy output data: " + std::string(cudaGetErrorString(error))); return false; } // 同步等待结果 error = cudaStreamSynchronize(pImpl->stream); if (error != cudaSuccess) { Logger::error("Failed to synchronize CUDA stream: " + std::string(cudaGetErrorString(error))); return false; } // 后处理 detections = postprocess(pImpl->hostOutput, 1); return true; } catch (const std::exception& e) { Logger::error("Error during GPU inference: " + std::string(e.what())); return false; } }