#include "NvInfer.h"
|
|
#include "cuda_runtime_api.h"
|
|
#include "logging.h"
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <map>
|
|
#include <sstream>
|
|
#include <vector>
|
|
#include <chrono>
|
|
#include "utils.hpp"
|
|
|
|
#define USE_FP32 //USE_FP16
|
|
#define INPUT_NAME "data"
|
|
#define OUTPUT_NAME "prob"
|
|
#define MAX_BATCH_SIZE 8
|
|
|
|
using namespace nvinfer1;
|
|
static Logger gLogger;
|
|
|
|
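// MBConv stage settings of the EfficientNet-B0 baseline (scaled per backbone below).
// Field order assumed to match BlockArgs in utils.hpp:
// {num_repeat, kernel_size, stride, expand_ratio, input_filters, output_filters, se_ratio, id_skip}.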
static std::vector<BlockArgs> block_args_list = {
    BlockArgs{1, 3, 1, 1, 32, 16, 0.25, true},
    BlockArgs{2, 3, 2, 6, 16, 24, 0.25, true},
    BlockArgs{2, 5, 2, 6, 24, 40, 0.25, true},
    BlockArgs{3, 3, 2, 6, 40, 80, 0.25, true},
    BlockArgs{3, 5, 1, 6, 80, 112, 0.25, true},
    BlockArgs{4, 5, 2, 6, 112, 192, 0.25, true},
    BlockArgs{1, 3, 1, 6, 192, 320, 0.25, true}};

static std::map<std::string, GlobalParams> global_params_map = {
    // input_h, input_w, num_classes, batch_norm_epsilon,
    // width_coefficient, depth_coefficient, depth_divisor, min_depth
    {"b0", GlobalParams{224, 224, 1000, 0.001, 1.0, 1.0, 8, -1}},
    {"b1", GlobalParams{240, 240, 1000, 0.001, 1.0, 1.1, 8, -1}},
    {"b2", GlobalParams{260, 260, 1000, 0.001, 1.1, 1.2, 8, -1}},
    {"b3", GlobalParams{300, 300, 1000, 0.001, 1.2, 1.4, 8, -1}},
    {"b4", GlobalParams{380, 380, 1000, 0.001, 1.4, 1.8, 8, -1}},
    {"b5", GlobalParams{456, 456, 1000, 0.001, 1.6, 2.2, 8, -1}},
    {"b6", GlobalParams{528, 528, 1000, 0.001, 1.8, 2.6, 8, -1}},
    {"b7", GlobalParams{600, 600, 1000, 0.001, 2.0, 3.1, 8, -1}},
    {"b8", GlobalParams{672, 672, 1000, 0.001, 2.2, 3.6, 8, -1}},
    {"l2", GlobalParams{800, 800, 1000, 0.001, 4.3, 5.3, 8, -1}},
};

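// Builds the EfficientNet network layer by layer with the TensorRT network-definition API,
// loading weights from the .wts file at path_wts, and returns the optimized engine.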
ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt, std::string path_wts, std::vector<BlockArgs> block_args_list, GlobalParams global_params)
{
    float bn_eps = global_params.batch_norm_epsilon;
    DimsHW image_size = DimsHW{global_params.input_h, global_params.input_w};

    std::map<std::string, Weights> weightMap = loadWeights(path_wts);
    Weights emptywts{DataType::kFLOAT, nullptr, 0};
    INetworkDefinition *network = builder->createNetworkV2(0U);
    ITensor *data = network->addInput(INPUT_NAME, dt, Dims3{3, global_params.input_h, global_params.input_w});
    assert(data);

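    // Stem: same-padding convolution (stride 2) + batch norm + swish.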
    int out_channels = roundFilters(32, global_params);
    auto conv_stem = addSamePaddingConv2d(network, weightMap, *data, out_channels, 3, 2, 1, 1, image_size, "_conv_stem");
    auto bn0 = addBatchNorm2d(network, weightMap, *conv_stem->getOutput(0), "_bn0", bn_eps);
    auto swish0 = addSwish(network, *bn0->getOutput(0));
    ITensor *x = swish0->getOutput(0);
    image_size = calculateOutputImageSize(image_size, 2);
    int block_id = 0;
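    // MBConv stages: the first block of each stage may change stride and filter count;
    // the remaining repeats run with stride 1 and equal input/output filters.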
    for (int i = 0; i < block_args_list.size(); i++)
    {
        BlockArgs block_args = block_args_list[i];

        block_args.input_filters = roundFilters(block_args.input_filters, global_params);
        block_args.output_filters = roundFilters(block_args.output_filters, global_params);
        block_args.num_repeat = roundRepeats(block_args.num_repeat, global_params);
        x = MBConvBlock(network, weightMap, *x, "_blocks." + std::to_string(block_id), block_args, global_params, image_size);

        assert(x);
        block_id++;
        image_size = calculateOutputImageSize(image_size, block_args.stride);
        if (block_args.num_repeat > 1)
        {
            block_args.input_filters = block_args.output_filters;
            block_args.stride = 1;
        }
        for (int r = 0; r < block_args.num_repeat - 1; r++)
        {
            x = MBConvBlock(network, weightMap, *x, "_blocks." + std::to_string(block_id), block_args, global_params, image_size);
            block_id++;
        }
    }
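    // Head: pointwise convolution + batch norm + swish, global average pooling, fully connected classifier.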
    out_channels = roundFilters(1280, global_params);
    auto conv_head = addSamePaddingConv2d(network, weightMap, *x, out_channels, 1, 1, 1, 1, image_size, "_conv_head", false);
    auto bn1 = addBatchNorm2d(network, weightMap, *conv_head->getOutput(0), "_bn1", bn_eps);
    auto swish1 = addSwish(network, *bn1->getOutput(0));
    auto avg_pool = network->addPoolingNd(*swish1->getOutput(0), PoolingType::kAVERAGE, image_size);

    IFullyConnectedLayer *final = network->addFullyConnected(*avg_pool->getOutput(0), global_params.num_classes, weightMap["_fc.weight"], weightMap["_fc.bias"]);
    assert(final);

    final->getOutput(0)->setName(OUTPUT_NAME);
    network->markOutput(*final->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "build engine ..." << std::endl;

    ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
    assert(engine != nullptr);

    std::cout << "build finished" << std::endl;
    // Don't need the network any more
    network->destroy();
    // Release host memory
    for (auto &mem : weightMap)
    {
        free((void *)(mem.second.values));
    }

    return engine;
}

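// Builds the network with the TensorRT API, serializes the resulting engine into modelStream,
// and releases the builder objects.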
void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream, std::string wtsPath, std::vector<BlockArgs> block_args_list, GlobalParams global_params)
{
    // Create builder
    IBuilder *builder = createInferBuilder(gLogger);
    IBuilderConfig *config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine *engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wtsPath, block_args_list, global_params);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

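// Copies one batch of input to the GPU, runs inference asynchronously on a CUDA stream,
// and copies the class scores back to the host.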
void doInference(IExecutionContext &context, float *input, float *output, int batchSize, GlobalParams global_params)
{
    const ICudaEngine &engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void *buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_NAME);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * global_params.input_h * global_params.input_w * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * global_params.num_classes * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * global_params.input_h * global_params.input_w * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * global_params.num_classes * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

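// Parses "-s <.wts> <.engine> <backbone>" (serialize) or "-d <.engine> <backbone>" (deserialize and run).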
bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, std::string &backbone)
{
    if (argc < 2)
        return false;
    if (std::string(argv[1]) == "-s" && argc == 5)
    {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        backbone = std::string(argv[4]);
    }
    else if (std::string(argv[1]) == "-d" && argc == 4)
    {
        engine = std::string(argv[2]);
        backbone = std::string(argv[3]);
    }
    else
    {
        return false;
    }
    return true;
}

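// Entry point: either builds and serializes an engine from a .wts file (-s),
// or loads an existing engine and times 100 inference passes on dummy input (-d).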
int main(int argc, char **argv)
{
    std::string wtsPath = "";
    std::string engine_name = "";
    std::string backbone = "";
    if (!parse_args(argc, argv, wtsPath, engine_name, backbone))
    {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./efficientnet -s [.wts] [.engine] [b0 b1 b2 b3 ... b7] // serialize model to engine file" << std::endl;
        std::cerr << "./efficientnet -d [.engine] [b0 b1 b2 b3 ... b7] // deserialize engine file and run inference" << std::endl;
        return -1;
    }
    if (global_params_map.find(backbone) == global_params_map.end())
    {
        std::cerr << "unknown backbone: " << backbone << std::endl;
        return -1;
    }
    GlobalParams global_params = global_params_map[backbone];
    // create a model using the API directly and serialize it to a stream
    if (!wtsPath.empty())
    {
        IHostMemory *modelStream{nullptr};
        APIToModel(MAX_BATCH_SIZE, &modelStream, wtsPath, block_args_list, global_params);
        assert(modelStream != nullptr);

        std::ofstream p(engine_name, std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char *>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    }

    char *trtModelStream{nullptr};
    size_t size{0};

    std::ifstream file(engine_name, std::ios::binary);
    if (file.good())
    {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }
    else
    {
        std::cerr << "could not open plan file" << std::endl;
        return -1;
    }

    // dummy input
    float *data = new float[3 * global_params.input_h * global_params.input_w];
    for (int i = 0; i < 3 * global_params.input_h * global_params.input_w; i++)
        data[i] = 0.1;

    IRuntime *runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext *context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference
    float *prob = new float[global_params.num_classes];
    for (int i = 0; i < 100; i++)
    {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1, global_params);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }
    for (unsigned int i = 0; i < 20; i++)
    {
        std::cout << prob[i] << ", ";
    }
    std::cout << std::endl;
    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    delete[] data;
    delete[] prob;

    return 0;
}