yolo_standard_libray/tensorrtx-master/efficientnet/efficientnet.cpp
2025-03-07 11:35:40 +08:00

281 lines
9.8 KiB
C++

#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "utils.hpp"
// Precision selection. Only USE_FP16 is tested in this file (createEngine sets
// BuilderFlag::kFP16 under #ifdef USE_FP16); USE_FP32 itself is not referenced
// anywhere in the visible code. Define USE_FP16 instead to enable the FP16 flag.
#define USE_FP32 //USE_FP16
// Names assigned to the network's input and output tensors in createEngine and
// used again in doInference to look up the engine binding indices.
#define INPUT_NAME "data"
#define OUTPUT_NAME "prob"
// Maximum batch size that main passes to APIToModel when building the engine.
#define MAX_BATCH_SIZE 8
using namespace nvinfer1;
// Logger instance handed to createInferBuilder and createInferRuntime.
static Logger gLogger;
// Base configuration of the network stages, one entry per stage. createEngine
// walks this list in order, rescales input_filters / output_filters with
// roundFilters and num_repeat with roundRepeats, and emits the resulting
// MBConvBlock calls.
// NOTE(review): the field order is presumably {num_repeat, kernel_size, stride,
// expand_ratio, input_filters, output_filters, se_ratio, id_skip} -- confirm
// against the BlockArgs definition, which is not visible in this file.
static std::vector<BlockArgs>
block_args_list = {
BlockArgs{1, 3, 1, 1, 32, 16, 0.25, true},
BlockArgs{2, 3, 2, 6, 16, 24, 0.25, true},
BlockArgs{2, 5, 2, 6, 24, 40, 0.25, true},
BlockArgs{3, 3, 2, 6, 40, 80, 0.25, true},
BlockArgs{3, 5, 1, 6, 80, 112, 0.25, true},
BlockArgs{4, 5, 2, 6, 112, 192, 0.25, true},
BlockArgs{1, 3, 1, 6, 192, 320, 0.25, true}};
// Per-variant model parameters, keyed by the backbone name that main receives
// as its last command-line argument ("b0" ... "b8", "l2").
static std::map<std::string, GlobalParams>
global_params_map = {
// Field order of each GlobalParams initializer:
//   input_h, input_w, num_classes, batch_norm_epsilon,
//   width_coefficient, depth_coefficient, depth_divisor, min_depth
{"b0", GlobalParams{224, 224, 1000, 0.001, 1.0, 1.0, 8, -1}},
{"b1", GlobalParams{240, 240, 1000, 0.001, 1.0, 1.1, 8, -1}},
{"b2", GlobalParams{260, 260, 1000, 0.001, 1.1, 1.2, 8, -1}},
{"b3", GlobalParams{300, 300, 1000, 0.001, 1.2, 1.4, 8, -1}},
{"b4", GlobalParams{380, 380, 1000, 0.001, 1.4, 1.8, 8, -1}},
{"b5", GlobalParams{456, 456, 1000, 0.001, 1.6, 2.2, 8, -1}},
{"b6", GlobalParams{528, 528, 1000, 0.001, 1.8, 2.6, 8, -1}},
{"b7", GlobalParams{600, 600, 1000, 0.001, 2.0, 3.1, 8, -1}},
{"b8", GlobalParams{672, 672, 1000, 0.001, 2.2, 3.6, 8, -1}},
{"l2", GlobalParams{800, 800, 1000, 0.001, 4.3, 5.3, 8, -1}},
};
// Builds a TensorRT engine for one EfficientNet variant with the network-definition API.
//
// Parameters:
//   maxBatchSize    - forwarded to IBuilder::setMaxBatchSize.
//   builder, config - builder and builder configuration owned by the caller.
//   dt              - data type of the network input tensor.
//   path_wts        - path handed to loadWeights to obtain the name -> Weights map.
//   block_args_list - base per-stage block configuration (taken by value; each
//                     entry is rescaled on a local copy).
//   global_params   - variant parameters (input size, class count, BN epsilon, ...).
//
// Returns the built engine; the caller owns it and must destroy it. The network
// definition and the host weight buffers are released before returning.
ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt, std::string path_wts, std::vector<BlockArgs> block_args_list, GlobalParams global_params)
{
    const float bn_eps = global_params.batch_norm_epsilon;
    DimsHW image_size = DimsHW{global_params.input_h, global_params.input_w};
    std::map<std::string, Weights> weightMap = loadWeights(path_wts);

    INetworkDefinition *network = builder->createNetworkV2(0U);
    assert(network);
    ITensor *data = network->addInput(INPUT_NAME, dt, Dims3{3, global_params.input_h, global_params.input_w});
    assert(data);

    // Stem: convolution -> batch norm -> swish.
    int out_channels = roundFilters(32, global_params);
    auto conv_stem = addSamePaddingConv2d(network, weightMap, *data, out_channels, 3, 2, 1, 1, image_size, "_conv_stem");
    auto bn0 = addBatchNorm2d(network, weightMap, *conv_stem->getOutput(0), "_bn0", bn_eps);
    auto swish0 = addSwish(network, *bn0->getOutput(0));
    ITensor *x = swish0->getOutput(0);
    // Track the spatial size after the stem so later layers receive the current size.
    image_size = calculateOutputImageSize(image_size, 2);

    // Block stages. block_id numbers every emitted block consecutively because
    // the weight names are "_blocks.<id>".
    int block_id = 0;
    for (BlockArgs block_args : block_args_list) // deliberate copy: scaled per variant below
    {
        block_args.input_filters = roundFilters(block_args.input_filters, global_params);
        block_args.output_filters = roundFilters(block_args.output_filters, global_params);
        block_args.num_repeat = roundRepeats(block_args.num_repeat, global_params);

        // The first block of a stage uses the configured stride and input filters.
        x = MBConvBlock(network, weightMap, *x, "_blocks." + std::to_string(block_id), block_args, global_params, image_size);
        assert(x);
        block_id++;
        image_size = calculateOutputImageSize(image_size, block_args.stride);

        // Remaining repeats keep the resolution and take the stage's output width as input.
        if (block_args.num_repeat > 1)
        {
            block_args.input_filters = block_args.output_filters;
            block_args.stride = 1;
        }
        for (int r = 0; r < block_args.num_repeat - 1; r++)
        {
            x = MBConvBlock(network, weightMap, *x, "_blocks." + std::to_string(block_id), block_args, global_params, image_size);
            block_id++;
        }
    }

    // Head: convolution -> batch norm -> swish -> average pool over the whole
    // remaining feature map -> fully connected classifier.
    out_channels = roundFilters(1280, global_params);
    auto conv_head = addSamePaddingConv2d(network, weightMap, *x, out_channels, 1, 1, 1, 1, image_size, "_conv_head", false);
    auto bn1 = addBatchNorm2d(network, weightMap, *conv_head->getOutput(0), "_bn1", bn_eps);
    auto swish1 = addSwish(network, *bn1->getOutput(0));
    auto avg_pool = network->addPoolingNd(*swish1->getOutput(0), PoolingType::kAVERAGE, image_size);
    assert(avg_pool);
    IFullyConnectedLayer *final = network->addFullyConnected(*avg_pool->getOutput(0), global_params.num_classes, weightMap["_fc.weight"], weightMap["_fc.bias"]);
    assert(final);
    final->getOutput(0)->setName(OUTPUT_NAME);
    network->markOutput(*final->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "build engine ..." << std::endl;
    ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
    assert(engine != nullptr);
    std::cout << "build finished" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release the host buffers referenced by the weight map.
    for (auto &mem : weightMap)
    {
        free(const_cast<void *>(mem.second.values));
    }
    return engine;
}
// Builds the engine for the given variant and serializes it.
//
// Parameters:
//   maxBatchSize    - forwarded to createEngine.
//   modelStream     - out-parameter; receives the serialized engine. The caller
//                     owns the returned IHostMemory and must destroy it.
//   wtsPath         - weights file path forwarded to createEngine.
//   block_args_list - base block configuration forwarded to createEngine.
//   global_params   - variant parameters forwarded to createEngine.
void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream, std::string wtsPath, std::vector<BlockArgs> block_args_list, GlobalParams global_params)
{
    assert(modelStream != nullptr);

    // Create builder
    IBuilder *builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig *config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine *engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wtsPath, block_args_list, global_params);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down. The config was created by the builder, so it is
    // destroyed before the builder (a factory object must outlive what it creates).
    engine->destroy();
    config->destroy();
    builder->destroy();
}
// Runs one batch through the engine behind `context`.
//
// Parameters:
//   context       - execution context to enqueue on.
//   input         - host buffer holding batchSize * 3 * input_h * input_w floats.
//   output        - host buffer receiving batchSize * num_classes floats.
//   batchSize     - number of samples in `input`.
//   global_params - supplies input_h, input_w and num_classes for buffer sizing.
//
// Device buffers and the CUDA stream are created and released on every call.
// If enqueue reports failure, a message is written to stderr and `output` is
// left untouched.
void doInference(IExecutionContext &context, float *input, float *output, int batchSize, GlobalParams global_params)
{
    const ICudaEngine &engine = context.getEngine();
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void *buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // A name that is not a binding yields a negative index, which must not be used
    // to index `buffers`.
    const int inputIndex = engine.getBindingIndex(INPUT_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_NAME);
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    // Compute the transfer sizes once, in size_t, so the products are not evaluated in int.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * global_params.input_h * global_params.input_w * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * global_params.num_classes * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (context.enqueue(batchSize, buffers, stream, nullptr))
    {
        CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    }
    else
    {
        std::cerr << "inference enqueue failed" << std::endl;
    }
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
// Parses the command line into the output strings.
//
// Accepted forms:
//   <prog> -s <wts> <engine> <backbone>   sets wts, engine and backbone
//   <prog> -d <engine> <backbone>         sets engine and backbone (wts untouched)
//
// Returns true when one of the forms matched; returns false, leaving all
// outputs untouched, for anything else -- including a command line with no
// arguments, where argv[1] is null and must not be read.
bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, std::string &backbone)
{
    // Validate the argument count before touching argv[1].
    if (argc < 2 || argv == nullptr || argv[1] == nullptr)
    {
        return false;
    }

    const std::string mode(argv[1]);
    if (mode == "-s" && argc == 5)
    {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        backbone = std::string(argv[4]);
        return true;
    }
    if (mode == "-d" && argc == 4)
    {
        engine = std::string(argv[2]);
        backbone = std::string(argv[3]);
        return true;
    }
    return false;
}
int main(int argc, char **argv)
{
std::string wtsPath = "";
std::string engine_name = "";
std::string backbone = "";
if (!parse_args(argc, argv, wtsPath, engine_name, backbone))
{
std::cerr << "arguments not right!" << std::endl;
std::cerr << "./efficientnet -s [.wts] [.engine] [b0 b1 b2 b3 ... b7] // serialize model to engine file" << std::endl;
std::cerr << "./efficientnet -d [.engine] [b0 b1 b2 b3 ... b7] // deserialize engine file and run inference" << std::endl;
return -1;
}
GlobalParams global_params = global_params_map[backbone];
// create a model using the API directly and serialize it to a stream
if (!wtsPath.empty())
{
IHostMemory *modelStream{nullptr};
APIToModel(MAX_BATCH_SIZE, &modelStream, wtsPath, block_args_list, global_params);
assert(modelStream != nullptr);
std::ofstream p(engine_name, std::ios::binary);
if (!p)
{
std::cerr << "could not open plan output file" << std::endl;
return -1;
}
p.write(reinterpret_cast<const char *>(modelStream->data()), modelStream->size());
modelStream->destroy();
return 1;
}
char *trtModelStream{nullptr};
size_t size{0};
std::ifstream file(engine_name, std::ios::binary);
if (file.good())
{
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size);
file.close();
}
else
{
std::cerr << "could not open plan file" << std::endl;
return -1;
}
// dummy input
float *data = new float[3 * global_params.input_h * global_params.input_w];
for (int i = 0; i < 3 * global_params.input_h * global_params.input_w; i++)
data[i] = 0.1;
IRuntime *runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
assert(engine != nullptr);
IExecutionContext *context = engine->createExecutionContext();
assert(context != nullptr);
delete[] trtModelStream;
// Run inference
float *prob = new float[global_params.num_classes];
for (int i = 0; i < 100; i++)
{
auto start = std::chrono::system_clock::now();
doInference(*context, data, prob, 1, global_params);
auto end = std::chrono::system_clock::now();
std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
}
for (unsigned int i = 0; i < 20; i++)
{
std::cout << prob[i] << ", ";
}
std::cout << std::endl;
// Destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
delete data;
delete prob;
return 0;
}