#include "inference/tensorrt_engine.hpp"
#include "common/logger.hpp"
#include <fstream>
#include <algorithm>      // std::sort in postprocess(), std::max/std::min
#include <cuda_runtime.h>
#include <NvOnnxParser.h>
#include <dlfcn.h>        // for dynamic-library load checks
#include <filesystem>
#include "common/cuda_helper.hpp"

// Named TRTLogger to avoid clashing with the project's Logger class.
class TRTLogger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                Logger::error(std::string("TensorRT Internal Error: ") + msg);
                break;
            case Severity::kERROR:
                Logger::error(std::string("TensorRT Error: ") + msg);
                break;
            case Severity::kWARNING:
                // The project Logger only exposes info/error here, so
                // warnings are routed through info with an explicit prefix.
                Logger::info(std::string("TensorRT Warning: ") + msg);
                break;
            default:
                Logger::info(std::string("TensorRT Info: ") + msg);
                break;
        }
    }
};

static TRTLogger gLogger;  // global logger instance shared by builder and runtime

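// Helper added for illustration: TensorRT reports shapes as nvinfer1::Dims,
// and this file logs them in several places with the same hand-rolled loop.
// A shared formatter like this would collapse those loops; it is a sketch
// and is not wired into the code below.
[[maybe_unused]] static std::string dimsToString(const nvinfer1::Dims& dims) {
    std::string s = "(";
    for (int i = 0; i < dims.nbDims; i++) {
        s += std::to_string(dims.d[i]);
        if (i < dims.nbDims - 1) s += ", ";
    }
    return s + ")";
}
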
class TensorRTEngine::Impl {
public:
    nvinfer1::IRuntime* runtime = nullptr;
    nvinfer1::ICudaEngine* engine = nullptr;
    nvinfer1::IExecutionContext* context = nullptr;
    cudaStream_t stream = nullptr;

    void* buffers[2] = {nullptr, nullptr};  // device-side input/output bindings
    int inputIndex;
    int outputIndex;

    int inputH = 640;
    int inputW = 640;
    int maxBatchSize = 1;

    float* hostInput = nullptr;
    float* hostOutput = nullptr;

    // GPU buffers
    void* input_buffer = nullptr;   // device input buffer
    void* output_buffer = nullptr;  // device output buffer

    ~Impl() {
        // TensorRT objects must be destroyed in reverse order of creation:
        // the context depends on the engine, and the engine on the runtime.
        if (context) delete context;
        if (engine) delete engine;
        if (runtime) delete runtime;
        if (stream) cudaStreamDestroy(stream);
        if (hostInput) delete[] hostInput;
        if (hostOutput) delete[] hostOutput;
        if (input_buffer) cudaFree(input_buffer);
        if (output_buffer) cudaFree(output_buffer);
    }
};

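// The raw owning pointers above (and in convertONNX2TRT below) are freed
// manually, so an exception thrown mid-function leaks them. A compiled-out
// sketch of a smart-pointer alternative (assumes #include <memory>; TensorRT
// 8+ objects are plain-deletable, as the manual deletes in this file imply):
#if 0
template <typename T>
using TRTUniquePtr = std::unique_ptr<T>;

TRTUniquePtr<nvinfer1::IBuilder> builder{nvinfer1::createInferBuilder(gLogger)};
#endif
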
TensorRTEngine::TensorRTEngine(const std::string& model_path, int gpu_id) {
    try {
        Logger::info("TensorRTEngine constructor start");

        // Validate arguments
        if (model_path.empty()) {
            throw std::runtime_error("Model path is empty");
        }
        if (gpu_id < 0) {
            throw std::runtime_error("Invalid GPU ID: " + std::to_string(gpu_id));
        }

        // Keep a local copy of the path
        model_path_ = model_path;

        Logger::info("Parameters:");
        Logger::info("  Model path: " + model_path_);
        Logger::info("  GPU ID: " + std::to_string(gpu_id));

        // Create the implementation object
        pImpl = std::make_unique<Impl>();

        // Check the model file
        if (!std::filesystem::exists(model_path_)) {
            throw std::runtime_error("Model file not found: " + model_path_);
        }
        auto file_size = std::filesystem::file_size(model_path_);
        Logger::info("Model file exists, size: " + std::to_string(file_size) + " bytes");

        // Initialize CUDA
        cudaError_t error = cudaSetDevice(gpu_id);
        if (error != cudaSuccess) {
            throw std::runtime_error("Failed to set CUDA device: " +
                                     std::string(cudaGetErrorString(error)));
        }

        // Create the CUDA stream
        error = cudaStreamCreate(&pImpl->stream);
        if (error != cudaSuccess) {
            throw std::runtime_error("Failed to create CUDA stream: " +
                                     std::string(cudaGetErrorString(error)));
        }

        // Load (and, if necessary, build) the model
        if (!loadModel()) {
            throw std::runtime_error("Failed to load model");
        }

        Logger::info("TensorRTEngine constructor completed successfully");
    }
    catch (const std::exception& e) {
        Logger::error("Error in TensorRTEngine constructor: " + std::string(e.what()));
        throw;
    }
}

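// Usage sketch (hypothetical caller, compiled out; the paths and names here
// are illustrative, not part of this project):
#if 0
TensorRTEngine engine("models/model.onnx", /*gpu_id=*/0);  // throws on failure
cv::Mat frame = cv::imread("frame.jpg");
std::vector<DetectionResult> detections;
if (engine.infer(frame, detections)) {
    for (const auto& det : detections) { /* consume boxes */ }
}
#endif
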
TensorRTEngine::~TensorRTEngine() = default;

bool TensorRTEngine::convertONNX2TRT(const std::string& onnx_file) {
    try {
        Logger::info("=== Starting ONNX to TensorRT Conversion ===");
        Logger::info("ONNX file: " + onnx_file);

        // Use the model file's own directory as the base directory
        std::filesystem::path model_dir = std::filesystem::path(onnx_file).parent_path();
        std::filesystem::path engine_path = model_dir / "model.engine";

        // If an engine file already exists, reuse it
        if (std::filesystem::exists(engine_path)) {
            Logger::info("Found existing engine file: " + engine_path.string());
            Logger::info("Size: " + std::to_string(std::filesystem::file_size(engine_path)) + " bytes");
            return true;
        }

        Logger::info("Converting ONNX to TensorRT engine...");

        // Check that the ONNX file exists
        if (!std::filesystem::exists(onnx_file)) {
            Logger::error("ONNX file does not exist: " + onnx_file);
            return false;
        }

        // Log the file size
        std::filesystem::path p(onnx_file);
        auto file_size = std::filesystem::file_size(p);
        Logger::info("ONNX file size: " + std::to_string(file_size) + " bytes");

        // Create the builder
        nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
        if (!builder) {
            throw std::runtime_error("Failed to create builder");
        }

        // Create the network definition with explicit batch enabled
        const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
        nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
        if (!network) {
            throw std::runtime_error("Failed to create network");
        }

        // Create the ONNX parser
        auto parser = nvonnxparser::createParser(*network, gLogger);
        if (!parser) {
            throw std::runtime_error("Failed to create parser");
        }

        // Parse the ONNX file
        if (!parser->parseFromFile(onnx_file.c_str(),
                                   static_cast<int>(nvinfer1::ILogger::Severity::kWARNING))) {
            throw std::runtime_error("Failed to parse ONNX file");
        }

        // The network must have at least one input
        if (network->getNbInputs() == 0) {
            throw std::runtime_error("Network has no inputs");
        }

        // Get the input tensor
        nvinfer1::ITensor* input = network->getInput(0);
        if (!input) {
            throw std::runtime_error("Failed to get input tensor");
        }

        // Input name and dimensions
        std::string inputName = input->getName();
        Logger::info("Input tensor name: " + inputName);

        nvinfer1::Dims inputDims = input->getDimensions();
        std::string dimStr = "Input dimensions: (";
        for (int i = 0; i < inputDims.nbDims; i++) {
            dimStr += std::to_string(inputDims.d[i]);
            if (i < inputDims.nbDims - 1) dimStr += ", ";
        }
        dimStr += ")";
        Logger::info(dimStr);

        // Log the network layers and their input dimensions
        Logger::info("Network layers:");
        for (int i = 0; i < network->getNbLayers(); i++) {
            auto layer = network->getLayer(i);
            Logger::info("Layer " + std::to_string(i) + ": " + layer->getName());

            for (int j = 0; j < layer->getNbInputs(); j++) {
                auto layerInput = layer->getInput(j);
                if (layerInput) {
                    auto dims = layerInput->getDimensions();
                    std::string layerDimStr = "  Input " + std::to_string(j) + " dims: (";
                    for (int k = 0; k < dims.nbDims; k++) {
                        layerDimStr += std::to_string(dims.d[k]);
                        if (k < dims.nbDims - 1) layerDimStr += ", ";
                    }
                    layerDimStr += ")";
                    Logger::info(layerDimStr);
                }
            }
        }

        // Create the builder configuration
        nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
        if (!config) {
            throw std::runtime_error("Failed to create builder config");
        }

        // Configure TensorRT
        config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1 << 30);  // 1 GiB workspace
        config->setFlag(nvinfer1::BuilderFlag::kFP16);  // enable FP16 precision
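        // Note: kFP16 is a hint; on GPUs without fast FP16 support the builder
        // silently falls back to FP32. Guarding the flag makes that explicit,
        // e.g. `if (builder->platformHasFastFp16()) config->setFlag(...);`.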

        // Add an optimization profile
        nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
        if (!profile) {
            throw std::runtime_error("Failed to create optimization profile");
        }

        // Resolve dynamic dimensions into concrete min/opt/max shapes
        nvinfer1::Dims minDims = inputDims;
        nvinfer1::Dims optDims = inputDims;
        nvinfer1::Dims maxDims = inputDims;

        // Log the original dimensions
        Logger::info("Original dimensions:");
        for (int i = 0; i < inputDims.nbDims; i++) {
            Logger::info("  dim[" + std::to_string(i) + "] = " + std::to_string(inputDims.d[i]));
        }

        // Make sure every dimension is positive: a -1 (dynamic) dimension is
        // replaced by a concrete value based on its position (NCHW layout).
        for (int i = 0; i < inputDims.nbDims; i++) {
            if (inputDims.d[i] == -1) {
                if (i == 0) {  // batch
                    minDims.d[i] = 1;
                    optDims.d[i] = pImpl->maxBatchSize;
                    maxDims.d[i] = pImpl->maxBatchSize;
                } else if (i == 1) {  // channels
                    minDims.d[i] = 3;  // RGB
                    optDims.d[i] = 3;
                    maxDims.d[i] = 3;
                } else if (i == 2) {  // height
                    minDims.d[i] = pImpl->inputH;
                    optDims.d[i] = pImpl->inputH;
                    maxDims.d[i] = pImpl->inputH;
                } else if (i == 3) {  // width
                    minDims.d[i] = pImpl->inputW;
                    optDims.d[i] = pImpl->inputW;
                    maxDims.d[i] = pImpl->inputW;
                } else {
                    minDims.d[i] = 1;
                    optDims.d[i] = 1;
                    maxDims.d[i] = 1;
                }
            } else {
                // Static dimensions keep their original value
                minDims.d[i] = inputDims.d[i];
                optDims.d[i] = inputDims.d[i];
                maxDims.d[i] = inputDims.d[i];
            }
        }

        // Log the dimensions being set
        Logger::info("Setting optimization profile dimensions:");
        Logger::info("Min dimensions:");
        for (int i = 0; i < minDims.nbDims; i++) {
            Logger::info("  dim[" + std::to_string(i) + "] = " + std::to_string(minDims.d[i]));
        }
        Logger::info("Opt dimensions:");
        for (int i = 0; i < optDims.nbDims; i++) {
            Logger::info("  dim[" + std::to_string(i) + "] = " + std::to_string(optDims.d[i]));
        }
        Logger::info("Max dimensions:");
        for (int i = 0; i < maxDims.nbDims; i++) {
            Logger::info("  dim[" + std::to_string(i) + "] = " + std::to_string(maxDims.d[i]));
        }

        // Apply the profile
        if (!profile->setDimensions(inputName.c_str(), nvinfer1::OptProfileSelector::kMIN, minDims)) {
            throw std::runtime_error("Failed to set minimum dimensions");
        }
        if (!profile->setDimensions(inputName.c_str(), nvinfer1::OptProfileSelector::kOPT, optDims)) {
            throw std::runtime_error("Failed to set optimal dimensions");
        }
        if (!profile->setDimensions(inputName.c_str(), nvinfer1::OptProfileSelector::kMAX, maxDims)) {
            throw std::runtime_error("Failed to set maximum dimensions");
        }

        config->addOptimizationProfile(profile);

        // Build the engine
        Logger::info("Building TensorRT engine...");
        nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
        if (!engine) {
            throw std::runtime_error("Failed to build TensorRT engine");
        }

        // Serialize the engine to disk, checking that the write succeeded
        nvinfer1::IHostMemory* serializedEngine = engine->serialize();
        std::ofstream engine_file(engine_path, std::ios::binary);
        engine_file.write(static_cast<const char*>(serializedEngine->data()),
                          serializedEngine->size());
        if (!engine_file) {
            throw std::runtime_error("Failed to write engine file: " + engine_path.string());
        }
        engine_file.close();

        // Release resources in reverse order of creation
        delete serializedEngine;
        delete engine;
        delete config;
        delete parser;
        delete network;
        delete builder;

        Logger::info("Successfully converted ONNX to TensorRT engine");
        Logger::info("=== Engine Conversion Completed ===");
        Logger::info("Engine file saved to: " + engine_path.string());
        Logger::info("Engine file size: " + std::to_string(std::filesystem::file_size(engine_path)) + " bytes");
        return true;
    }
    catch (const std::exception& e) {
        Logger::error("Error in convertONNX2TRT: " + std::string(e.what()));
        return false;
    }
}

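// The same conversion can be done offline with NVIDIA's trtexec tool, which
// avoids the first-run build cost inside the application, e.g.:
//   trtexec --onnx=model.onnx --saveEngine=model.engine --fp16
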
bool TensorRTEngine::loadModel() {
    try {
        if (!pImpl) {
            throw std::runtime_error("Implementation is null");
        }

        Logger::info("Loading model...");

        // Use the model file's own directory as the base directory
        std::filesystem::path model_dir = std::filesystem::path(model_path_).parent_path();
        std::filesystem::path engine_path = model_dir / "model.engine";

        // Build the engine from ONNX if it does not exist yet
        if (!std::filesystem::exists(engine_path)) {
            Logger::info("Engine file not found, converting from ONNX...");
            if (!convertONNX2TRT(model_path_)) {
                throw std::runtime_error("Failed to convert ONNX model");
            }
        }

        // Read the serialized engine
        std::ifstream file(engine_path, std::ios::binary);
        if (!file.good()) {
            throw std::runtime_error("Cannot open engine file: " + engine_path.string());
        }

        file.seekg(0, std::ios::end);
        size_t size = file.tellg();
        file.seekg(0, std::ios::beg);

        std::vector<char> engineData(size);
        file.read(engineData.data(), size);

        // Create the runtime and deserialize the engine
        Logger::info("Creating TensorRT runtime...");
        pImpl->runtime = nvinfer1::createInferRuntime(gLogger);
        if (!pImpl->runtime) {
            Logger::error("Failed to create TensorRT runtime");
            return false;
        }

        Logger::info("Deserializing CUDA engine...");
        pImpl->engine = pImpl->runtime->deserializeCudaEngine(engineData.data(), size);
        if (!pImpl->engine) {
            Logger::error("Failed to deserialize CUDA engine");
            return false;
        }

        Logger::info("Creating execution context...");
        pImpl->context = pImpl->engine->createExecutionContext();
        if (!pImpl->context) {
            Logger::error("Failed to create execution context");
            return false;
        }

        // Query I/O tensor information
        Logger::info("Getting tensor information...");

        int32_t nbIOTensors = pImpl->engine->getNbIOTensors();
        Logger::info("Number of I/O tensors: " + std::to_string(nbIOTensors));

        // Assume binding 0 is the input and binding 1 the output
        pImpl->inputIndex = 0;
        pImpl->outputIndex = 1;
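        // The fixed 0/1 indices above hold for a single-input single-output
        // YOLO export. A shape-agnostic lookup (compiled-out sketch, using the
        // TensorRT >= 8.5 tensor-mode API) would classify tensors instead:
#if 0
        for (int32_t i = 0; i < nbIOTensors; ++i) {
            const char* name = pImpl->engine->getIOTensorName(i);
            if (pImpl->engine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kINPUT)
                pImpl->inputIndex = i;
            else
                pImpl->outputIndex = i;
        }
#endif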

        // Input tensor name
        const char* inputName = pImpl->engine->getIOTensorName(pImpl->inputIndex);
        if (!inputName) {
            Logger::error("Failed to get input tensor name");
            return false;
        }
        Logger::info("Input tensor name: " + std::string(inputName));

        // Input dimensions
        auto dims = pImpl->engine->getTensorShape(inputName);
        Logger::info("Input tensor dimensions: " + std::to_string(dims.nbDims) + " dimensions");

        // A YOLO-style input must be at least 4-D (NCHW)
        if (dims.nbDims < 4) {
            Logger::error("Invalid input dimensions: expected at least 4, got " +
                          std::to_string(dims.nbDims));
            return false;
        }

        std::string dimStr = "Input dimensions: (";
        for (int i = 0; i < dims.nbDims; i++) {
            dimStr += std::to_string(dims.d[i]);
            if (i < dims.nbDims - 1) dimStr += ", ";
        }
        dimStr += ")";
        Logger::info(dimStr);

        // Record the network input size
        pImpl->inputH = dims.d[2];
        pImpl->inputW = dims.d[3];
        Logger::info("Input HxW: " + std::to_string(pImpl->inputH) + "x" +
                     std::to_string(pImpl->inputW));

        // Output tensor name
        const char* outputName = pImpl->engine->getIOTensorName(pImpl->outputIndex);
        if (!outputName) {
            Logger::error("Failed to get output tensor name");
            return false;
        }
        Logger::info("Output tensor name: " + std::string(outputName));

        // Output dimensions
        auto outputDims = pImpl->engine->getTensorShape(outputName);
        Logger::info("Output tensor dimensions: " + std::to_string(outputDims.nbDims) +
                     " dimensions");

        std::string outDimStr = "Output dimensions: (";
        for (int i = 0; i < outputDims.nbDims; i++) {
            outDimStr += std::to_string(outputDims.d[i]);
            if (i < outputDims.nbDims - 1) outDimStr += ", ";
        }
        outDimStr += ")";
        Logger::info(outDimStr);

        // Compute buffer sizes in bytes; dynamic (-1) dimensions count as 1
        // (the engine built above uses a fixed profile, so shapes are concrete)
        size_t inputSize = sizeof(float);
        for (int i = 0; i < dims.nbDims; i++) {
            inputSize *= (dims.d[i] > 0) ? static_cast<size_t>(dims.d[i]) : 1;
        }

        size_t outputSize = sizeof(float);
        size_t total_output_size = 1;  // output element count, for the host buffer
        for (int i = 0; i < outputDims.nbDims; i++) {
            size_t d = (outputDims.d[i] > 0) ? static_cast<size_t>(outputDims.d[i]) : 1;
            outputSize *= d;
            total_output_size *= d;
        }
        Logger::info("Total output size: " + std::to_string(total_output_size) + " elements");

        // Allocate the host output buffer
        pImpl->hostOutput = new float[total_output_size];

        Logger::info("Allocating device memory...");
        Logger::info("Input buffer size: " + std::to_string(inputSize) + " bytes");
        Logger::info("Output buffer size: " + std::to_string(outputSize) + " bytes");

        // Allocate the GPU buffers once and mirror them into buffers[], so
        // infer() and inferGPU() operate on the same device memory.
        cudaError_t error = cudaMalloc(&pImpl->input_buffer, inputSize);
        if (error != cudaSuccess) {
            Logger::error("Failed to allocate input buffer: " +
                          std::string(cudaGetErrorString(error)));
            return false;
        }

        error = cudaMalloc(&pImpl->output_buffer, outputSize);
        if (error != cudaSuccess) {
            Logger::error("Failed to allocate output buffer: " +
                          std::string(cudaGetErrorString(error)));
            cudaFree(pImpl->input_buffer);
            pImpl->input_buffer = nullptr;
            return false;
        }

        pImpl->buffers[pImpl->inputIndex] = pImpl->input_buffer;
        pImpl->buffers[pImpl->outputIndex] = pImpl->output_buffer;

        Logger::info("Model loaded successfully");
        return true;
    }
    catch (const std::exception& e) {
        Logger::error("Error in loadModel: " + std::string(e.what()));
        return false;
    }
}

void TensorRTEngine::preprocess(const cv::Mat& input_image, float* gpu_input) {
    try {
        Logger::info("Starting preprocessing...");

        // Validate inputs
        if (input_image.empty()) {
            throw std::runtime_error("Input image is empty");
        }
        if (!gpu_input) {
            throw std::runtime_error("GPU input buffer is null");
        }

        // Resize to the network input size
        cv::Mat resized;
        cv::resize(input_image, resized, cv::Size(pImpl->inputW, pImpl->inputH));
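        // Note: this plain resize stretches the frame, so objects lose their
        // aspect ratio. YOLOv5-style pipelines usually letterbox instead:
        // scale uniformly and pad with gray. A compiled-out sketch:
#if 0
        float scale = std::min(pImpl->inputW / (float)input_image.cols,
                               pImpl->inputH / (float)input_image.rows);
        cv::Mat scaled;
        cv::resize(input_image, scaled,
                   cv::Size(int(input_image.cols * scale), int(input_image.rows * scale)));
        cv::Mat canvas(pImpl->inputH, pImpl->inputW, CV_8UC3, cv::Scalar(114, 114, 114));
        scaled.copyTo(canvas(cv::Rect(0, 0, scaled.cols, scaled.rows)));
#endif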

        // BGR to RGB
        cv::Mat rgb;
        cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);

        // Convert to float and normalize to [0, 1]
        cv::Mat float_img;
        rgb.convertTo(float_img, CV_32F, 1.0 / 255.0);

        // Split into planar channels (HWC -> CHW)
        std::vector<cv::Mat> channels;
        cv::split(float_img, channels);

        if (channels.size() != 3) {
            throw std::runtime_error("Expected 3 channels, got " +
                                     std::to_string(channels.size()));
        }

        // Per-channel size in bytes
        size_t channel_size = pImpl->inputH * pImpl->inputW * sizeof(float);

        // Copy each channel plane to the GPU
        for (int i = 0; i < 3; i++) {
            cudaError_t error = cudaMemcpyAsync(
                gpu_input + i * pImpl->inputH * pImpl->inputW,
                channels[i].data,
                channel_size,
                cudaMemcpyHostToDevice,
                pImpl->stream
            );
            if (error != cudaSuccess) {
                throw std::runtime_error("Failed to copy channel " + std::to_string(i) +
                                         " to GPU: " + cudaGetErrorString(error));
            }
        }

        // Synchronize so the copies are complete before inference starts
        cudaError_t error = cudaStreamSynchronize(pImpl->stream);
        if (error != cudaSuccess) {
            throw std::runtime_error("Failed to synchronize CUDA stream: " +
                                     std::string(cudaGetErrorString(error)));
        }

        Logger::info("Preprocessing completed successfully");
    }
    catch (const std::exception& e) {
        Logger::error("Error in preprocessing: " + std::string(e.what()));
        throw;
    }
}

bool TensorRTEngine::infer(const cv::Mat& input_image, std::vector<DetectionResult>& detections) {
    try {
        Logger::info("=== Starting Inference ===");

        if (!pImpl->context || !pImpl->engine) {
            Logger::error("TensorRT engine or context is null");
            return false;
        }
        if (input_image.empty()) {
            Logger::error("Input image is empty");
            return false;
        }

        // Log the input image geometry
        Logger::info("Input image: " + std::to_string(input_image.cols) + "x" +
                     std::to_string(input_image.rows) + " channels: " +
                     std::to_string(input_image.channels()));

        // Preprocess into the device input buffer
        try {
            preprocess(input_image, (float*)pImpl->buffers[pImpl->inputIndex]);
        } catch (const std::exception& e) {
            Logger::error("Error in preprocessing: " + std::string(e.what()));
            return false;
        }

        // Run inference (executeV2 is synchronous; the stream syncs fence
        // the async copies queued on pImpl->stream)
        bool status = false;
        try {
            cudaStreamSynchronize(pImpl->stream);  // make sure prior work is done
            status = pImpl->context->executeV2(pImpl->buffers);
            cudaStreamSynchronize(pImpl->stream);  // wait for inference to finish
        } catch (const std::exception& e) {
            Logger::error("Error during inference execution: " + std::string(e.what()));
            return false;
        }

        if (!status) {
            Logger::error("Failed to execute inference");
            return false;
        }

        // Compute the output element count (index 1 is the output binding)
        nvinfer1::Dims output_dims = pImpl->engine->getTensorShape(pImpl->engine->getIOTensorName(1));
        size_t output_size = 1;
        for (int i = 0; i < output_dims.nbDims; i++) {
            output_size *= output_dims.d[i];
        }

        // Copy the result back to the host
        cudaError_t error = cudaMemcpyAsync(
            pImpl->hostOutput,
            pImpl->buffers[pImpl->outputIndex],
            output_size * sizeof(float),
            cudaMemcpyDeviceToHost,
            pImpl->stream
        );
        if (error != cudaSuccess) {
            Logger::error("Failed to copy output data: " + std::string(cudaGetErrorString(error)) +
                          " (size: " + std::to_string(output_size) + ")");
            return false;
        }

        // Wait for the copy to complete
        error = cudaStreamSynchronize(pImpl->stream);
        if (error != cudaSuccess) {
            Logger::error("Failed to synchronize CUDA stream: " +
                          std::string(cudaGetErrorString(error)));
            return false;
        }

        // Postprocess
        try {
            detections = postprocess(pImpl->hostOutput, 1);
        } catch (const std::exception& e) {
            Logger::error("Error in postprocessing: " + std::string(e.what()));
            return false;
        }

        // Log the detection results
        if (!detections.empty()) {
            Logger::info("=== Detection Results ===");
            Logger::info("Found " + std::to_string(detections.size()) + " objects");
            for (const auto& det : detections) {
                Logger::info("  Object: shoe");
                Logger::info("  Confidence: " + std::to_string(det.confidence));
                Logger::info("  Box: (" + std::to_string(det.x1) + ", " +
                             std::to_string(det.y1) + ", " +
                             std::to_string(det.x2) + ", " +
                             std::to_string(det.y2) + ")");
            }
        }

        // Periodically save annotated frames
        if (!detections.empty()) {
            static int frame_count = 0;
            frame_count++;

            // Save one result every 100 frames
            if (frame_count % 100 == 0) {
                cv::Mat output = input_image.clone();

                // Draw the detection boxes (note: coordinates are in
                // network-input space; see the rescaling note after postprocess)
                for (const auto& det : detections) {
                    cv::rectangle(output,
                                  cv::Point(det.x1, det.y1),
                                  cv::Point(det.x2, det.y2),
                                  cv::Scalar(0, 255, 0), 2);

                    std::string label = "shoe: " + std::to_string(det.confidence);
                    cv::putText(output, label,
                                cv::Point(det.x1, det.y1 - 10),
                                cv::FONT_HERSHEY_SIMPLEX, 0.5,
                                cv::Scalar(0, 255, 0), 2);
                }

                // Save the image (the results/ directory must already exist,
                // otherwise cv::imwrite fails)
                std::string filename = "results/detection_" +
                                       std::to_string(frame_count) + ".jpg";
                cv::imwrite(filename, output);
                Logger::info("Saved detection result to: " + filename);
            }
        }

        Logger::info("=== Inference Completed ===");
        return true;
    }
    catch (const std::exception& e) {
        Logger::error("Error during inference: " + std::string(e.what()));
        return false;
    }
}

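// The raw YOLOv5 head output for a 640x640 input is a [1, 25200, 5 + num_classes]
// tensor: 25200 = 3 anchors x (80^2 + 40^2 + 20^2) grid cells, and each row is
// (cx, cy, w, h, objectness, per-class scores...). postprocess() below assumes
// that layout, with normalized coordinates and a single class.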
std::vector<DetectionResult> TensorRTEngine::postprocess(float* output, int batch_size) {
    std::vector<DetectionResult> results;
    (void)batch_size;  // single-image pipeline; parameter kept for API symmetry

    // Thresholds
    const float conf_threshold = 0.6f;  // confidence threshold (raised to cut false positives)
    const float nms_threshold = 0.4f;   // NMS IoU threshold (lowered to merge more overlaps)
    const float min_box_size = 20.0f;   // minimum box size in pixels
    const float max_box_size = 416.0f;  // maximum box size in pixels
    const int num_classes = 1;          // single class (shoe)
    const int num_boxes = 25200;        // number of boxes in the YOLOv5 output

    Logger::info("Post-processing parameters:");
    Logger::info("  Confidence threshold: " + std::to_string(conf_threshold));
    Logger::info("  NMS threshold: " + std::to_string(nms_threshold));
    Logger::info("  Min box size: " + std::to_string(min_box_size));
    Logger::info("  Max box size: " + std::to_string(max_box_size));

    // Per-class detection buckets
    std::vector<std::vector<DetectionResult>> class_detections(num_classes);
    int total_boxes = 0;
    int filtered_by_conf = 0;
    int filtered_by_size = 0;
    int filtered_by_nms = 0;

    // Walk every predicted box
    for (int i = 0; i < num_boxes; i++) {
        float* box = output + i * (5 + num_classes);
        float confidence = box[4];  // objectness

        if (confidence < conf_threshold) {
            filtered_by_conf++;
            continue;
        }

        float class_score = box[5];
        float final_score = confidence * class_score;

        if (final_score > conf_threshold) {
            DetectionResult det;
            // Normalized (cx, cy, w, h) -> pixels in network-input space
            float cx = box[0] * pImpl->inputW;
            float cy = box[1] * pImpl->inputH;
            float w = box[2] * pImpl->inputW;
            float h = box[3] * pImpl->inputH;

            // Size gate
            if (w < min_box_size || h < min_box_size ||
                w > max_box_size || h > max_box_size) {
                filtered_by_size++;
                continue;
            }

            det.x1 = std::max(0.0f, cx - w / 2);
            det.y1 = std::max(0.0f, cy - h / 2);
            det.x2 = std::min(float(pImpl->inputW), cx + w / 2);
            det.y2 = std::min(float(pImpl->inputH), cy + h / 2);
            det.confidence = final_score;
            det.class_id = 0;

            class_detections[0].push_back(det);
            total_boxes++;
        }
    }

    Logger::info("Detection statistics:");
    Logger::info("  Total boxes processed: " + std::to_string(num_boxes));
    Logger::info("  Filtered by confidence: " + std::to_string(filtered_by_conf));
    Logger::info("  Filtered by size: " + std::to_string(filtered_by_size));
    Logger::info("  Remaining after initial filtering: " + std::to_string(total_boxes));

    // Greedy per-class NMS: keep the highest-confidence box, drop overlaps
    for (int c = 0; c < num_classes; c++) {
        auto& dets = class_detections[c];
        if (dets.empty()) continue;

        std::sort(dets.begin(), dets.end(),
                  [](const DetectionResult& a, const DetectionResult& b) {
                      return a.confidence > b.confidence;
                  });

        std::vector<bool> keep(dets.size(), true);
        for (size_t i = 0; i < dets.size(); i++) {
            if (!keep[i]) continue;

            for (size_t j = i + 1; j < dets.size(); j++) {
                if (!keep[j]) continue;

                float iou = calculateIoU(dets[i], dets[j]);
                if (iou > nms_threshold) {
                    keep[j] = false;
                    filtered_by_nms++;
                }
            }
        }

        for (size_t i = 0; i < dets.size(); i++) {
            if (keep[i]) {
                results.push_back(dets[i]);
            }
        }
    }

    Logger::info("  Filtered by NMS: " + std::to_string(filtered_by_nms));
    Logger::info("  Final detection count: " + std::to_string(results.size()));

    return results;
}

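// Boxes returned by postprocess() are clipped to network-input coordinates
// (inputW x inputH), not to the original frame. A caller drawing on the
// original frame would rescale first (compiled-out sketch, assuming the
// plain-resize preprocessing above):
#if 0
float sx = float(frame.cols) / pImpl->inputW;
float sy = float(frame.rows) / pImpl->inputH;
det.x1 *= sx; det.x2 *= sx;
det.y1 *= sy; det.y2 *= sy;
#endif
// OpenCV also ships a reference NMS (cv::dnn::NMSBoxes in <opencv2/dnn.hpp>)
// that could replace the hand-rolled loop above.
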
// IoU (intersection over union) between two axis-aligned boxes
float TensorRTEngine::calculateIoU(const DetectionResult& a, const DetectionResult& b) {
    // Intersection rectangle
    float x1 = std::max(a.x1, b.x1);
    float y1 = std::max(a.y1, b.y1);
    float x2 = std::min(a.x2, b.x2);
    float y2 = std::min(a.y2, b.y2);

    if (x2 < x1 || y2 < y1) return 0.0f;  // no overlap

    float intersection = (x2 - x1) * (y2 - y1);
    float area_a = (a.x2 - a.x1) * (a.y2 - a.y1);
    float area_b = (b.x2 - b.x1) * (b.y2 - b.y1);

    return intersection / (area_a + area_b - intersection);
}

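// Worked example: boxes (0,0,10,10) and (5,5,15,15) intersect in (5,5,10,10),
// area 25; the union is 100 + 100 - 25 = 175, so IoU = 25/175 ≈ 0.143 — well
// under the 0.4 NMS threshold, so both boxes would survive.
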
bool TensorRTEngine::inferGPU(float* gpu_input, std::vector<DetectionResult>& detections) {
    try {
        // Run inference directly on data already resident in GPU memory
        void* buffers[2] = {gpu_input, pImpl->output_buffer};

        // Execute inference
        bool status = false;
        try {
            cudaStreamSynchronize(pImpl->stream);  // make sure prior work is done
            status = pImpl->context->executeV2(buffers);
            cudaStreamSynchronize(pImpl->stream);  // wait for inference to finish
        } catch (const std::exception& e) {
            Logger::error("Error during inference execution: " + std::string(e.what()));
            return false;
        }

        if (!status) {
            Logger::error("Failed to execute inference");
            return false;
        }

        // Compute the output element count
        nvinfer1::Dims output_dims = pImpl->engine->getTensorShape(pImpl->engine->getIOTensorName(1));
        size_t output_size = 1;
        for (int i = 0; i < output_dims.nbDims; i++) {
            output_size *= output_dims.d[i];
        }

        // Copy the result back to the host
        cudaError_t error = cudaMemcpyAsync(
            pImpl->hostOutput,
            pImpl->output_buffer,
            output_size * sizeof(float),
            cudaMemcpyDeviceToHost,
            pImpl->stream
        );
        if (error != cudaSuccess) {
            Logger::error("Failed to copy output data: " + std::string(cudaGetErrorString(error)));
            return false;
        }

        // Wait for the copy to complete
        error = cudaStreamSynchronize(pImpl->stream);
        if (error != cudaSuccess) {
            Logger::error("Failed to synchronize CUDA stream: " +
                          std::string(cudaGetErrorString(error)));
            return false;
        }

        // Postprocess
        detections = postprocess(pImpl->hostOutput, 1);
        return true;
    }
    catch (const std::exception& e) {
        Logger::error("Error during GPU inference: " + std::string(e.what()));
        return false;
    }
}