#pragma once #include #include #include #include "common.hpp" /* when stride>1, whether to put stride in the first 1x1 convolution or the bottleneck 3x3 convolution. set false when use backbone from torchvision*/ #define STRIDE_IN_1X1 true enum RESNETTYPE { R18 = 0, R34, R50, R101, R152 }; const std::map> num_blocks_per_stage = { {R18, {2, 2, 2, 2}}, {R34, {3, 4, 6, 3}}, {R50, {3, 4, 6, 3}}, {R101, {3, 4, 23, 3}}, {R152, {3, 8, 36, 3}} }; ILayer* BasicStem(INetworkDefinition *network, std::map& weightMap, const std::string& lname, ITensor& input, int out_channels, int group_num = 1) { // conv1 IConvolutionLayer* conv1 = network->addConvolutionNd(input, out_channels, DimsHW{ 7, 7 }, weightMap[lname + ".conv1.weight"], weightMap[lname + ".conv1.bias"]); assert(conv1); conv1->setStrideNd(DimsHW{ 2, 2 }); conv1->setPaddingNd(DimsHW{ 3, 3 }); conv1->setNbGroups(group_num); auto r1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(r1); auto max_pool2d = network->addPoolingNd(*r1->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 }); max_pool2d->setStrideNd(DimsHW{ 2, 2 }); max_pool2d->setPaddingNd(DimsHW{ 1, 1 }); // auto mp_dim = max_pool2d->getOutput(0)->getDimensions(); return max_pool2d; } ITensor* BasicBlock(INetworkDefinition *network, std::map& weightMap, const std::string& lname, ITensor& input, int in_channels, int out_channels, int stride = 1) { // conv1 IConvolutionLayer* conv1 = network->addConvolutionNd(input, out_channels, DimsHW{ 3, 3 }, weightMap[lname + ".conv1.weight"], weightMap[lname + ".conv1.bias"]); assert(conv1); conv1->setStrideNd(DimsHW{ stride, stride }); conv1->setPaddingNd(DimsHW{ 1, 1 }); auto r1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(r1); // conv2 IConvolutionLayer* conv2 = network->addConvolutionNd(*r1->getOutput(0), out_channels, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], weightMap[lname + ".conv2.bias"]); assert(conv2); conv2->setStrideNd(DimsHW{ 1, 1 }); conv2->setPaddingNd(DimsHW{ 1, 1 }); // shortcut ITensor* shortcut_value = nullptr; if (in_channels != out_channels) { auto shortcut = network->addConvolutionNd(input, out_channels, DimsHW{ 1, 1 }, weightMap[lname + ".shortcut.weight"], weightMap[lname + ".shortcut.bias"]); assert(shortcut); shortcut->setStrideNd(DimsHW{ stride, stride }); shortcut_value = shortcut->getOutput(0); } else { shortcut_value = &input; } // add auto ew = network->addElementWise(*conv2->getOutput(0), *shortcut_value, ElementWiseOperation::kSUM); assert(ew); auto r3 = network->addActivation(*ew->getOutput(0), ActivationType::kRELU); assert(r3); return r3->getOutput(0); } ITensor* BottleneckBlock(INetworkDefinition *network, std::map& weightMap, const std::string& lname, ITensor& input, int in_channels, int bottleneck_channels, int out_channels, int stride = 1, int dilation = 1, int group_num = 1) { int stride_1x1 = STRIDE_IN_1X1 ? stride : 1; int stride_3x3 = STRIDE_IN_1X1 ? 1 : stride; // conv1 IConvolutionLayer* conv1 = network->addConvolutionNd(input, bottleneck_channels, DimsHW{ 1, 1 }, weightMap[lname + ".conv1.weight"], weightMap[lname + ".conv1.bias"]); assert(conv1); conv1->setStrideNd(DimsHW{ stride_1x1, stride_1x1 }); conv1->setNbGroups(group_num); auto r1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(r1); // conv2 IConvolutionLayer* conv2 = network->addConvolutionNd(*r1->getOutput(0), bottleneck_channels, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], weightMap[lname + ".conv2.bias"]); assert(conv2); conv2->setStrideNd(DimsHW{ stride_3x3, stride_3x3 }); conv2->setPaddingNd(DimsHW{ 1 * dilation, 1 * dilation }); conv2->setDilationNd(DimsHW{ dilation, dilation }); conv2->setNbGroups(group_num); auto r2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU); assert(r2); // conv3 IConvolutionLayer* conv3 = network->addConvolutionNd(*r2->getOutput(0), out_channels, DimsHW{ 1, 1 }, weightMap[lname + ".conv3.weight"], weightMap[lname + ".conv3.bias"]); assert(conv3); conv3->setStrideNd(DimsHW{ 1, 1 }); conv3->setNbGroups(group_num); // shortcut ITensor* shortcut_value = nullptr; if (in_channels != out_channels) { auto shortcut = network->addConvolutionNd(input, out_channels, DimsHW{ 1, 1 }, weightMap[lname + ".shortcut.weight"], weightMap[lname + ".shortcut.bias"]); assert(shortcut); shortcut->setStrideNd(DimsHW{stride, stride}); shortcut->setNbGroups(group_num); shortcut_value = shortcut->getOutput(0); } else { shortcut_value = &input; } // add auto ew = network->addElementWise(*conv3->getOutput(0), *shortcut_value, ElementWiseOperation::kSUM); assert(ew); auto r3 = network->addActivation(*ew->getOutput(0), ActivationType::kRELU); assert(r3); return r3->getOutput(0); } ITensor* MakeStage(INetworkDefinition *network, std::map& weightMap, const std::string& lname, ITensor& input, int stage, RESNETTYPE resnet_type, int in_channels, int bottleneck_channels, int out_channels, int first_stride = 1, int dilation = 1) { ITensor* out = &input; for (int i = 0; i < stage; i++) { std::string layerName = lname + "." + std::to_string(i); int stride = i == 0 ? first_stride : 1; if (resnet_type == R18 || resnet_type == R34) out = BasicBlock(network, weightMap, layerName, *out, in_channels, out_channels, stride); else out = BottleneckBlock(network, weightMap, layerName, *out, in_channels, bottleneck_channels, out_channels, stride, dilation); in_channels = out_channels; } return out; } ITensor* BuildResNet(INetworkDefinition *network, std::map& weightMap, ITensor& input, RESNETTYPE resnet_type, int stem_out_channels, int bottleneck_channels, int res2_out_channels, int res5_dilation = 1) { assert(res5_dilation == 1 || res5_dilation == 2); // "res5_dilation must be 1 or 2" if (resnet_type == R18 || resnet_type == R34) { assert(res2_out_channels == 64); // "res2_out_channels must be 64 for R18/R34" assert(res5_dilation == 1); // "res5_dilation must be 1 for R18/R34" } int out_channels = res2_out_channels; ITensor* out = nullptr; // stem auto stem = BasicStem(network, weightMap, "backbone.stem", input, stem_out_channels); out = stem->getOutput(0); // res for (int i = 0; i < 3; i++) { int dilation = (i == 3) ? res5_dilation : 1; int first_stride = (i == 0 || (i == 3 && dilation == 2)) ? 1 : 2; out = MakeStage(network, weightMap, "backbone.res" + std::to_string(i + 2), *out, num_blocks_per_stage.at(resnet_type)[i], resnet_type, stem_out_channels, bottleneck_channels, out_channels, first_stride, dilation); stem_out_channels = out_channels; bottleneck_channels *= 2; out_channels *= 2; } return out; }