First commit
Commit 7f5d8a10a3
.gitignore (vendored, Normal file, 81 lines)
@@ -0,0 +1,81 @@
############################
# Video files (ignore all)
############################
*.mp4
*.avi
*.mov
*.mkv
*.flv
*.wmv
*.webm
*.rm
*.rmvb
*.ts
*.m3u8

############################
# Image files (ignore all)
############################
*.jpg
*.jpeg
*.png
*.bmp
*.tiff
*.tif
*.gif
*.webp

############################
# YOLO / TensorRT model files
############################
*.pt
*.onnx
*.wts
*.engine

# Keep only best.engine
!best.engine

############################
# CMake / build artifacts (ignored)
############################
CMakeFiles/
CMakeCache.txt
cmake_install.cmake
Makefile
*.o
*.a
*.so
*.dll
*.exe

############################
# build directory rules
############################

# Ignore the default build directory
build/

# Keep the designated build_XXXXXX directories
!build_*/

############################
# Python caches (ignored)
############################
__pycache__/
*.pyc
*.pyo

############################
# Logs / temporary files
############################
*.log
*.tmp
*.swp
*.bak

############################
# System files
############################
.DS_Store
Thumbs.db
001测试拼接列表.py (Normal file, 5 lines)
@@ -0,0 +1,5 @@


l = ["a", "b", "c", "d"]

print('_'.join(l))
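A quick note on what this small test script does: `str.join` places the separator between consecutive items, so the file prints the four letters as one underscore-delimited string. A minimal sketch (same code, with the expected output added as a comment):

```
l = ["a", "b", "c", "d"]
print('_'.join(l))  # -> a_b_c_d
```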
CMakeLists.txt (Normal file, 65 lines)
@@ -0,0 +1,65 @@
cmake_minimum_required(VERSION 3.10)

project(yolov8)

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

set(CMAKE_CUDA_COMPILER /usr/bin/nvcc)
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# include and link dirs of cuda and tensorrt; you may need to adapt them if yours are different
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message("embed_platform on")
    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
    message("embed_platform off")
    # cuda
    include_directories(/usr/local/cuda-12.4/include)
    link_directories(/usr/local/cuda-12.4/lib64)

    # tensorrt
    include_directories(/home/admin-root/software/TensorRT-8.6.1.6/include)
    link_directories(/home/admin-root/software/TensorRT-8.6.1.6/lib)
    # include_directories(/home/lindsay/TensorRT-7.2.3.4/include)
    # link_directories(/home/lindsay/TensorRT-7.2.3.4/lib)

endif()

add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
add_executable(yolov8_det ${PROJECT_SOURCE_DIR}/yolov8_det.cpp ${SRCS})

target_link_libraries(yolov8_det nvinfer)
target_link_libraries(yolov8_det cudart)
target_link_libraries(yolov8_det myplugins)
target_link_libraries(yolov8_det ${OpenCV_LIBS})

add_executable(yolov8_seg ${PROJECT_SOURCE_DIR}/yolov8_seg.cpp ${SRCS})
target_link_libraries(yolov8_seg nvinfer cudart myplugins ${OpenCV_LIBS})

add_executable(yolov8_pose ${PROJECT_SOURCE_DIR}/yolov8_pose.cpp ${SRCS})
target_link_libraries(yolov8_pose nvinfer cudart myplugins ${OpenCV_LIBS})

add_executable(yolov8_cls ${PROJECT_SOURCE_DIR}/yolov8_cls.cpp ${SRCS})
target_link_libraries(yolov8_cls nvinfer cudart myplugins ${OpenCV_LIBS})

add_executable(yolov8_5u_det ${PROJECT_SOURCE_DIR}/yolov8_5u_det.cpp ${SRCS})
target_link_libraries(yolov8_5u_det nvinfer cudart myplugins ${OpenCV_LIBS})

add_executable(yolov8_obb ${PROJECT_SOURCE_DIR}/yolov8_obb.cpp ${SRCS})
target_link_libraries(yolov8_obb nvinfer cudart myplugins ${OpenCV_LIBS})
README.md (Normal file, 200 lines)
@@ -0,0 +1,200 @@
# YOLOv8

The PyTorch implementation is [ultralytics/yolov8](https://github.com/ultralytics/ultralytics/tree/main/ultralytics).

The TensorRT code is derived from [xiaocao-tian/yolov8_tensorrt](https://github.com/xiaocao-tian/yolov8_tensorrt).

## Contributors

<a href="https://github.com/xiaocao-tian"><img src="https://avatars.githubusercontent.com/u/65889782?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/lindsayshuo"><img src="https://avatars.githubusercontent.com/u/45239466?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/xinsuinizhuan"><img src="https://avatars.githubusercontent.com/u/40679769?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/Rex-LK"><img src="https://avatars.githubusercontent.com/u/74702576?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/emptysoal"><img src="https://avatars.githubusercontent.com/u/57931586?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/ChangjunDAI"><img src="https://avatars.githubusercontent.com/u/65420228?s=48&v=4" width="40px;" alt=""/></a>

## Requirements

- TensorRT 8.0+
- OpenCV 3.4.0+
- ultralytics<=8.2.103

## Different versions of yolov8

Currently, we support yolov8.

- For yolov8, download the .pt file from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow the how-to-run steps on this page.

## Config

- Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from the command line arguments.
- Check more configs in [include/config.h](./include/config.h).

## How to Run, yolov8n as example

1. Generate .wts from a PyTorch .pt file, or download .wts from the model zoo.

```
// download https://github.com/ultralytics/assets/releases/yolov8n.pt
// download https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model)
cp {tensorrtx}/yolov8/gen_wts.py {ultralytics}/ultralytics
cd {ultralytics}/ultralytics
python gen_wts.py -w yolov8n.pt -o yolov8n.wts -t detect
// a file 'yolov8n.wts' will be generated.

// For p2 model
// download https://github.com/lindsayshuo/yolov8_p2_tensorrtx/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model)
cd {ultralytics}/ultralytics
python gen_wts.py -w VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt -o VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts -t detect (only for 10 cls p2 model)
// a file 'VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts' will be generated.

// For yolov8_5u_det model
// download https://github.com/ultralytics/assets/releases/yolov5nu.pt
cd {ultralytics}/ultralytics
python gen_wts.py -w yolov5nu.pt -o yolov5nu.wts -t detect
// a file 'yolov5nu.wts' will be generated.
```

2. Build tensorrtx/yolov8 and run.

### Detection
```
cd {tensorrtx}/yolov8/
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]  // serialize model to plan file
sudo ./yolov8_det -d [.engine] [image folder] [c/g]  // deserialize and run inference; the images in [image folder] will be processed

// For example, yolov8n
sudo ./yolov8_det -s yolov8n.wts yolov8n.engine n
sudo ./yolov8_det -d yolov8n.engine ../images c  // cpu postprocess
sudo ./yolov8_det -d yolov8n.engine ../images g  // gpu postprocess

// For p2 model:
// change the "const static int kNumClass" in config.h to 10;
sudo ./yolov8_det -s VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine x2
wget https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/0000008_01999_d_0000040.jpg
cp -r 0000008_01999_d_0000040.jpg ../images
sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images c  // cpu postprocess
sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images g  // gpu postprocess

// For yolov8_5u_det (YOLOv5u with the anchor-free, objectness-free split head structure based on YOLOv8 features) model:
sudo ./yolov8_5u_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6]
sudo ./yolov8_5u_det -d yolov5xu.engine ../images c  // cpu postprocess
sudo ./yolov8_5u_det -d yolov5xu.engine ../images g  // gpu postprocess
```

### Instance Segmentation
```
# Build and serialize TensorRT engine
./yolov8_seg -s yolov8s-seg.wts yolov8s-seg.engine s

# Download the labels file
wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt

# Run inference with labels file
./yolov8_seg -d yolov8s-seg.engine ../images c coco.txt
```

### Classification
```
cd {tensorrtx}/yolov8/
// Download inference images
wget https://github.com/lindsayshuo/infer_pic/releases/download/pics/1709970363.6990473rescls.jpg
mkdir samples
cp -r 1709970363.6990473rescls.jpg samples
// Download ImageNet labels
wget https://github.com/joannzhang00/ImageNet-dataset-classes-labels/blob/main/imagenet_classes.txt

// update kClsNumClass in config.h if your model is trained on a custom dataset
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8n-cls.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_cls -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file
sudo ./yolov8_cls -d [.engine] [image folder]  // deserialize and run inference; the images in [image folder] will be processed

// For example, yolov8n
sudo ./yolov8_cls -s yolov8n-cls.wts yolov8n-cls.engine n
sudo ./yolov8_cls -d yolov8n-cls.engine ../samples
```

### Pose Estimation
```
cd {tensorrtx}/yolov8/
// update "kPoseNumClass = 1" in config.h
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8-pose.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_pose -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]  // serialize model to plan file
sudo ./yolov8_pose -d [.engine] [image folder] [c/g]  // deserialize and run inference; the images in [image folder] will be processed

// For example, yolov8-pose
sudo ./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n
sudo ./yolov8_pose -d yolov8n-pose.engine ../images c  // cpu postprocess
sudo ./yolov8_pose -d yolov8n-pose.engine ../images g  // gpu postprocess
```

### Oriented Bounding Boxes (OBB) Estimation
```
cd {tensorrtx}/yolov8/
// update "kObbNumClass = 15", "kInputH = 1024" and "kInputW = 1024" in config.h
wget https://github.com/lindsayshuo/infer_pic/releases/download/pics/obb.png
mkdir images
mv obb.png ./images
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8-obb.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_obb -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]  // serialize model to plan file
sudo ./yolov8_obb -d [.engine] [image folder] [c/g]  // deserialize and run inference; the images in [image folder] will be processed

// For example, yolov8-obb
sudo ./yolov8_obb -s yolov8n-obb.wts yolov8n-obb.engine n
sudo ./yolov8_obb -d yolov8n-obb.engine ../images c  // cpu postprocess
sudo ./yolov8_obb -d yolov8n-obb.engine ../images g  // gpu postprocess
```

3. Optional: load and run the TensorRT model in Python.

```
// install python-tensorrt, pycuda, etc.
// ensure the yolov8n.engine and libmyplugins.so have been built
python yolov8_det_trt.py     # Detection
python yolov8_seg_trt.py     # Segmentation
python yolov8_cls_trt.py     # Classification
python yolov8_pose_trt.py    # Pose Estimation
python yolov8_5u_det_trt.py  # yolov8_5u_det (YOLOv5u with the anchor-free, objectness-free split head structure based on YOLOv8 features) model
python yolov8_obb_trt.py     # Oriented Bounding Boxes (OBB) Estimation
```

# INT8 Quantization

1. Prepare calibration images; you can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh.

2. Unzip it in yolov8/build.

3. Set the macro `USE_INT8` in config.h, change `kInputQuantizationFolder` to your image folder path, and run make.

4. Serialize the model and test.

<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247927-4d9fac00-751e-11ea-8b1b-704a0aeb3fcf.jpg" height="360px;">
</p>

## More Information

See the readme on the [home page](https://github.com/wang-xinyu/tensorrtx).
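As a companion to step 3 of the README, here is a minimal, hedged sketch of what loading the serialized engine from Python with TensorRT 8.x and pycuda generally looks like. It is not the repo's yolov8_*_trt.py script; the file paths, buffer handling and binding names ("images" / "output", taken from include/config.h) are illustrative assumptions.

```
import ctypes

import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.driver as cuda
import tensorrt as trt

# The custom YOLO layer plugin must be loaded before the engine is deserialized.
ctypes.CDLL("./build/libmyplugins.so")
logger = trt.Logger(trt.Logger.WARNING)

with open("./build/yolov8n.engine", "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# One pinned host buffer and one device buffer per binding.
host_bufs, dev_bufs, bindings = {}, {}, []
for i in range(engine.num_bindings):
    name = engine.get_binding_name(i)
    dtype = trt.nptype(engine.get_binding_dtype(i))
    size = trt.volume(engine.get_binding_shape(i))
    host_bufs[name] = cuda.pagelocked_empty(size, dtype)
    dev_bufs[name] = cuda.mem_alloc(host_bufs[name].nbytes)
    bindings.append(int(dev_bufs[name]))

# After writing a preprocessed 1x3x640x640 image into host_bufs["images"]:
stream = cuda.Stream()
cuda.memcpy_htod_async(dev_bufs["images"], host_bufs["images"], stream)
context.execute_async_v2(bindings=bindings, stream_handle=stream.stream)
cuda.memcpy_dtoh_async(host_bufs["output"], dev_bufs["output"], stream)
stream.synchronize()  # host_bufs["output"] now holds the raw detections to post-process
```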
attendance/2025-01-08.txt (Normal file, 12 lines)
@@ -0,0 +1,12 @@
New check-in, employee: haotian, similarity: 0.99852, check-in time: 2025-01-08 14:38:01.425799
New check-in, employee: 胡同同, similarity: 0.96767, check-in time: 2025-01-08 14:41:52.114084
New check-in, employee: 杨威, similarity: 0.98566, check-in time: 2025-01-08 14:57:25.434180
New check-in, employee: 张建峰, similarity: 0.9608, check-in time: 2025-01-08 14:57:42.841012
New check-in, employee: 郑俊, similarity: 0.8994, check-in time: 2025-01-08 15:17:13.444042
New check-in, employee: 马可义, similarity: 0.90012, check-in time: 2025-01-08 15:17:15.850005
New check-in, employee: 李同同, similarity: 0.9586, check-in time: 2025-01-08 15:18:17.529510
New check-in, employee: 白景辰(1), similarity: 0.99092, check-in time: 2025-01-08 15:18:34.346216
New check-in, employee: 焦军红(1), similarity: 0.97479, check-in time: 2025-01-08 15:18:39.877829
New check-in, employee: 林时波, similarity: 0.98172, check-in time: 2025-01-08 15:19:11.575465
New check-in, employee: 林凯, similarity: 0.9983, check-in time: 2025-01-08 15:27:33.302375
New check-in, employee: 于波, similarity: 0.92919, check-in time: 2025-01-08 15:27:34.106271
build_20250603/best.engine (BIN, Normal file): Binary file not shown.
build_20250603/build.sh (Normal file, 5 lines)
@@ -0,0 +1,5 @@
# conda activate trt
python gen_wts.py -w best.pt -o best.wts -t detect
cmake ..
make
./yolov8_det -s best.wts best.engine n
build_20250603/gen_wts.py (Normal file, 57 lines)
@@ -0,0 +1,57 @@
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True,
                        help='Input weights (.pt) file path (required)')
    parser.add_argument(
        '-o', '--output', help='Output (.wts) file path (optional)')
    parser.add_argument(
        '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose', 'obb'],
        help='determines the model is detection/classification')
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid input file')
    if not args.output:
        args.output = os.path.splitext(args.weights)[0] + '.wts'
    elif os.path.isdir(args.output):
        args.output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
    return args.weights, args.output, args.type


pt_file, wts_file, m_type = parse_args()

print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')

# Initialize
device = 'cpu'

# Load model
model = torch.load(pt_file, map_location=device, weights_only=False)['model'].float()  # load to FP32

if m_type in ['detect', 'seg', 'pose', 'obb']:
    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]

    delattr(model.model[-1], 'anchors')

model.to(device).eval()

with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(model.state_dict().keys())))
    for k, v in model.state_dict().items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')
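gen_wts.py writes a plain-text weight dump: the first line is the tensor count, then one line per tensor with its name, its element count, and the values as big-endian float32 hex words. A small hedged sketch of a reader that inverts this format can be handy for sanity-checking a generated file; the 'best.wts' file name is only an example.

```
import struct

import numpy as np


def read_wts(path):
    """Parse a .wts file written by gen_wts.py back into {name: float32 array}."""
    weights = {}
    with open(path) as f:
        count = int(f.readline())
        for _ in range(count):
            parts = f.readline().split()
            name, length = parts[0], int(parts[1])
            # Each value is 8 hex chars encoding one big-endian float32.
            vals = [struct.unpack('>f', bytes.fromhex(h))[0] for h in parts[2:2 + length]]
            weights[name] = np.asarray(vals, dtype=np.float32)
    return weights


w = read_wts('best.wts')
print(len(w), 'tensors,', sum(v.size for v in w.values()), 'parameters')
```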
build_20250603/yolov8_5u_det (BIN, Normal file): Binary file not shown.
build_20250603/yolov8_cls (BIN, Normal file): Binary file not shown.
build_20250603/yolov8_det (BIN, Normal file): Binary file not shown.
build_20250603/yolov8_obb (BIN, Normal file): Binary file not shown.
build_20250603/yolov8_pose (BIN, Normal file): Binary file not shown.
build_20250603/yolov8_seg (BIN, Normal file): Binary file not shown.
build_20250630/best.engine (BIN, Normal file): Binary file not shown.
build_20250630/build.sh (Normal file, 5 lines)
@@ -0,0 +1,5 @@
# conda activate trt
python gen_wts.py -w best.pt -o best.wts -t detect
cmake ..
make
./yolov8_det -s best.wts best.engine n
build_20250630/gen_wts.py (Normal file, 57 lines)
@@ -0,0 +1,57 @@
(identical to build_20250603/gen_wts.py shown above)
build_20250630/yolov8_5u_det (BIN, Normal file): Binary file not shown.
build_20250630/yolov8_cls (BIN, Normal file): Binary file not shown.
build_20250630/yolov8_det (BIN, Normal file): Binary file not shown.
build_20250630/yolov8_obb (BIN, Normal file): Binary file not shown.
build_20250630/yolov8_pose (BIN, Normal file): Binary file not shown.
build_20250630/yolov8_seg (BIN, Normal file): Binary file not shown.
build_20251226/best.engine (BIN, Normal file): Binary file not shown.
build_20251226/yolov8_5u_det (BIN, Normal file): Binary file not shown.
build_20251226/yolov8_cls (BIN, Normal file): Binary file not shown.
build_20251226/yolov8_det (BIN, Normal file): Binary file not shown.
build_20251226/yolov8_obb (BIN, Normal file): Binary file not shown.
build_20251226/yolov8_pose (BIN, Normal file): Binary file not shown.
build_20251226/yolov8_seg (BIN, Normal file): Binary file not shown.
build_250306/best.engine (BIN, Normal file): Binary file not shown.
build_250306/build.sh (Normal file, 5 lines)
@@ -0,0 +1,5 @@
# conda activate trt
python gen_wts.py -w best.pt -o best.wts -t detect
cmake ..
make
./yolov8_det -s best.wts best.engine n
build_250306/cuda-keyring_1.1-1_all.deb (BIN, Normal file): Binary file not shown.
build_250306/gen_wts.py (Normal file, 57 lines)
@@ -0,0 +1,57 @@
(identical to build_20250603/gen_wts.py shown above, except that the model is loaded with
 model = torch.load(pt_file, map_location=device)['model'].float()  # load to FP32
i.e. without the weights_only=False argument)
build_250306/yolov8_5u_det (BIN, Normal file): Binary file not shown.
build_250306/yolov8_cls (BIN, Normal file): Binary file not shown.
build_250306/yolov8_det (BIN, Normal file): Binary file not shown.
build_250306/yolov8_obb (BIN, Normal file): Binary file not shown.
build_250306/yolov8_pose (BIN, Normal file): Binary file not shown.
build_250306/yolov8_seg (BIN, Normal file): Binary file not shown.
build_250423_yolov11/gen_wts.py (Normal file, 57 lines)
@@ -0,0 +1,57 @@
(identical to build_20250603/gen_wts.py shown above)
build_250423_yolov11/yolov8_5u_det (BIN, Normal file): Binary file not shown.
build_250423_yolov11/yolov8_cls (BIN, Normal file): Binary file not shown.
build_250423_yolov11/yolov8_det (BIN, Normal file): Binary file not shown.
build_250423_yolov11/yolov8_obb (BIN, Normal file): Binary file not shown.
build_250423_yolov11/yolov8_pose (BIN, Normal file): Binary file not shown.
build_250423_yolov11/yolov8_seg (BIN, Normal file): Binary file not shown.
build_250506_helmet_head/best.engine (BIN, Normal file): Binary file not shown.
build_250506_helmet_head/build.sh (Normal file, 5 lines)
@@ -0,0 +1,5 @@
# conda activate trt
python gen_wts.py -w best.pt -o best.wts -t detect
cmake ..
make
./yolov8_det -s best.wts best.engine n
build_250506_helmet_head/gen_wts.py (Normal file, 57 lines)
@@ -0,0 +1,57 @@
(identical to build_20250603/gen_wts.py shown above)
build_250506_helmet_head/yolov8_5u_det (BIN, Normal file): Binary file not shown.
build_250506_helmet_head/yolov8_cls (BIN, Normal file): Binary file not shown.
build_250506_helmet_head/yolov8_det (BIN, Normal file): Binary file not shown.
build_250506_helmet_head/yolov8_obb (BIN, Normal file): Binary file not shown.
build_250506_helmet_head/yolov8_pose (BIN, Normal file): Binary file not shown.
build_250506_helmet_head/yolov8_seg (BIN, Normal file): Binary file not shown.
config.yaml (Normal file, 92 lines)
@@ -0,0 +1,92 @@
# ===============================
# TensorRT engine configuration
# ===============================
engine_path: 'build_20251226/'


# ===============================
# Video configuration (only v0, a local video, is enabled)
# ===============================
video_config:

  # Output paths
  m3u8_path: 'output/'
  save_path: 'mp4/'
  people_save_path: 'attendance/'

  # Detection classes (must match the engine)
  categories: ["helmet", "non-Helmet", "shoe"]

  # Field kept for compatibility with old code
  m3u8_path_0: 'output/'

  # ===== v0: local video (the only enabled channel) =====
  v0_ip: 'local_video'
  v0_channelNo: '0#'
  v0_testclasses: [0, 1, 2]
  v0_path: '/home/admin-root/haotian/锻8/tensorrtx/yolov8/video/场景1.mp4'

  # ===== All other channels disabled (do not delete them; this prevents KeyError) =====
  v1_ip: ''
  v1_path: ''

  v2_ip: ''
  v2_path: ''

  v3_ip: ''
  v3_path: ''

  v4_ip: ''
  v4_path: ''

  v5_ip: ''
  v5_path: ''

  v6_ip: ''
  v6_path: ''

  v7_ip: ''
  v7_path: ''

  v8_ip: ''
  v8_path: ''

  v9_ip: ''
  v9_path: ''

  v10_ip: ''
  v10_path: ''

  v11_ip: ''
  v11_path: ''


# ===============================
# MinIO (kept for now; does not affect video processing)
# ===============================
minioConfig:
  endpoint: '180.50.12.100:9000/'
  access_key: 'admin'
  secret_key: '12345678aA'
  secure: false
  bucket_name: 'vi-attachment'


# ===============================
# Data reporting (does not affect local runs)
# ===============================
dataConfig:
  getTokenUrl: 'http://180.50.12.100/api/appsys/sso/httpheader/login/v1?username_=szls'
  putMessageUrl: 'http://180.50.12.100/api/edge/edgecallmanages/vi-alarm/v1'
  timeInterval: 600


# ===============================
# Face recognition service (must be present even if unused)
# ===============================
compreface_service:
  domain: 'http://180.50.12.104'
  port: '8000'
  api_key: '88f43f2f-1483-4ad0-ae6c-8f1a800c3acd'
  det_prob_threshold: 0.99
  limit: 0
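The d8_*.py pipeline scripts consume this file (their diffs are suppressed later in this commit, so the actual loading code is not shown here). A minimal, hedged sketch of how such a config is typically read with PyYAML, and of why the empty v1..v11 entries avoid a KeyError, using only key names visible above:

```
import yaml

with open('config.yaml', 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

engine_dir = cfg['engine_path']        # e.g. 'build_20251226/'
video = cfg['video_config']
categories = video['categories']       # ['helmet', 'non-Helmet', 'shoe']

# Iterate the fixed v0..v11 channel slots; an empty string marks a disabled channel,
# so the keys can stay in the file without being used.
for i in range(12):
    path = video[f'v{i}_path']
    if not path:
        continue
    print(f"channel v{i} ({video[f'v{i}_ip']}): classes {video.get(f'v{i}_testclasses')} from {path}")
```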
config.yaml.back_d8_1_mp4 (Normal file, 135 lines)
@@ -0,0 +1,135 @@
# engine_path: 'build_250306/'
engine_path: 'build_20251226/'

video_config:

  # Path for saving m3u8 files
  # m3u8_path: '/home/pro/hls/mid/'
  # m3u8_path: '/workspace/hls_data/mid/'
  m3u8_path: 'output/'
  # Path for saving mp4 files
  # save_path: '/home/pro/tensorrtx-master/yolov8/mp4/'
  save_path: 'mp4/'

  people_save_path: 'attendance/'

  # categories: ["face", "shoe", "phone", "e-bike"]
  categories: ["helmet", "non-Helmet", "shoe"]

  # m3u8_path_0: '/workspace/hls_data/mid/'
  m3u8_path_0: 'output/'

  v0_ip: 'test243'
  v0_channelNo: '0#'
  v0_testclasses: [0, 1, 2]
  # v0_path: 'rtsp://180.50.12.106:8554/camera_test/2'
  v0_path: '/home/admin-root/haotian/锻8/tensorrtx/yolov8/video/场景1.mp4'

  v1_ip: '180.50.13.20'
  v1_channelNo: 'D19'
  v1_testclasses: [1, 2]
  v1_path: 'rtsp://admin:sy12345678@180.50.13.20:554/Streaming/Channels/102'

  v2_ip: '180.50.13.21'
  # v2_path: 'rtsp://10.0.0.17:8554/camera_test/2'
  v2_channelNo: 'D20'
  v2_testclasses: [1, 2]
  v2_path: 'rtsp://admin:sy12345678@180.50.13.21:554/Streaming/Channels/102'

  v3_ip: '180.50.13.22'
  # v3_path: 'rtsp://10.0.0.17:8554/camera_test/2'
  v3_channelNo: 'D21'
  v3_testclasses: [1, 2]
  v3_path: 'rtsp://admin:sy12345678@180.50.13.22:554/Streaming/Channels/102'

  v4_ip: '180.50.13.23'
  # v4_path: 'rtsp://10.0.0.17:8554/camera_test/2'
  v4_channelNo: 'D22'
  v4_testclasses: [1, 2]
  v4_path: 'rtsp://admin:sy12345678@180.50.13.23:554/Streaming/Channels/102'

  v5_ip: '180.50.13.24'
  # v5_path: 'rtsp://10.0.0.17:8554/camera_test/2'
  v5_channelNo: 'D23'
  v5_testclasses: [1, 2]
  v5_path: 'rtsp://admin:sy12345678@180.50.13.24:554/Streaming/Channels/102'

  v6_ip: '192.168.21.30'
  # v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
  v6_channelNo: '6#'
  v6_testclasses: [0]
  v6_path: 'rtsp://admin:12345678a@192.168.21.30:554/Streaming/Channels/101'

  v7_ip: '192.168.21.37'
  # v7_path: 'rtsp://10.0.0.17:8554/camera_test/2'
  v7_channelNo: '7#'
  v7_testclasses: [0]
  v7_path: 'rtsp://admin:12345678a@192.168.21.37:554/Streaming/Channels/101'

  v8_ip: '192.168.21.50'
  # v8_path: 'rtsp://10.0.0.17:8554/camera_test/2'
  v8_channelNo: '8#'
  v8_testclasses: [0]
  v8_path: 'rtsp://admin:12345678a@192.168.21.50:554/Streaming/Channels/101'

  v9_ip: '192.168.21.51'
  # v9_path: 'rtsp://10.0.0.17:8554/camera_test/2'
  v9_channelNo: '9#'
  v9_testclasses: [0]
  v9_path: 'rtsp://admin:12345678a@192.168.21.51:554/Streaming/Channels/101'

  v10_ip: '192.168.21.18'
  # v10_path: 'rtsp://10.0.0.17:8554/camera_test/2'
  v10_channelNo: '10#'
  v10_testclasses: [0]
  v10_path: 'rtsp://admin:12345678a@192.168.21.18:554/Streaming/Channels/101'

  v11_ip: '192.168.21.55'
  # v11_path: 'rtsp://10.0.0.17:8554/camera_test/2'
  v11_channelNo: '11#'
  v11_testclasses: [0]
  v11_path: 'rtsp://admin:12345678a@192.168.21.55:554/Streaming/Channels/101'


minioConfig:
  # endpoint: '10.0.0.58:9000/'
  # access_key: 'root'
  # secret_key: '@root123456'
  # secure: False
  # bucket_name: 'miniotest'

  # bucketName: vi
  endpoint: '180.50.12.100:9000/'
  access_key: 'admin'
  secret_key: '12345678aA'
  secure: False
  bucket_name: 'vi-attachment'


dataConfig:
  # getTokenUrl: 'http://192.168.220.202/api/appsys/sso/httpheader/login/v1?username_=digital'
  getTokenUrl: 'http://180.50.12.100/api/appsys/sso/httpheader/login/v1?username_=szls'
  # putMessageUrl: 'http://192.168.220.200/api/edge/edgecallmanages/vi-alarm/v1'
  putMessageUrl: 'http://180.50.12.100/api/edge/edgecallmanages/vi-alarm/v1'
  timeInterval: 600
  # getTokenUrl: 'http://192.168.220.202/api/appsys/sso/httpheader/login/v1/username_=digital'
  # putMessageUrl: 'http://192.168.220.202/api/edge/edgecallmanages/vi-alarm/v1'

compreface_service:
  domain: 'http://180.50.12.104'
  port: '8000'
  api_key: '88f43f2f-1483-4ad0-ae6c-8f1a800c3acd'
  # api_key: 'ab77978a-cc2b-4fa0-8959-6294e856721a'
  # api_key: '6d89a2ce-b71a-4894-96bb-03c6712e86d0'
  # Face confidence; above 0.9 a detection is treated as a face
  det_prob_threshold: 0.99
  # Number of faces to recognize in an image; 0 means no limit.
  limit: 0
config.yaml.back_first (Normal file, 131 lines)
@@ -0,0 +1,131 @@
engine_path: 'build250306/'

video_config:

  # Path for saving m3u8 files
  m3u8_path: '/home/pro/hls/mid/'
  # m3u8_path: '/workspace/hls_data/mid/'
  # Path for saving mp4 files
  # save_path: '/home/pro/tensorrtx-master/yolov8/mp4/'
  save_path: 'mp4/'

  people_save_path: 'attendance/'

  # categories: ["face", "shoe", "phone", "e-bike"]
  categories: ["helmet", "non-Helmet", "shoe"]

  m3u8_path_0: '/workspace/hls_data/mid/'

  v0_ip: 'test243'
  v0_channelNo: '0#'
  v0_testclasses: [0, 1, 2]
  v0_path: 'rtsp://180.50.12.106:8554/camera_test/2'

  (the remaining channels v1 through v11 and the minioConfig, dataConfig and compreface_service
  sections are identical to config.yaml.back_d8_1_mp4 shown above)
d8_1_mp4 copy.py (Normal file, 1205 lines): File diff suppressed because it is too large.
d8_1_mp4.py (Normal file, 1205 lines): File diff suppressed because it is too large.
d8_2_new copy 2.py (Normal file, 1203 lines): File diff suppressed because it is too large.
d8_2_new copy.py (Normal file, 1203 lines): File diff suppressed because it is too large.
d8_2_new.py (Normal file, 1256 lines): File diff suppressed because it is too large.
gen_wts.py (Normal file, 57 lines)
@@ -0,0 +1,57 @@
(identical to build_20250603/gen_wts.py shown above, except that the model is loaded with
 model = torch.load(pt_file, map_location=device)['model'].float()  # load to FP32
i.e. without the weights_only=False argument)
images.zip (BIN, Normal file): Binary file not shown.
include/block.h (Normal file, 36 lines)
@@ -0,0 +1,36 @@
#pragma once
#include <map>
#include <string>
#include <vector>
#include "NvInfer.h"

int calculateP(int ksize);

std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, int k, int s, int p, std::string lname);

nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                 int c2, int n, bool shortcut, float e, std::string lname);

nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname);

nvinfer1::IElementWiseLayer* C3(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname);

nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname);

nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname);

nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, int num_class, bool is_segmentation, bool is_pose, bool is_obb);
include/calibrator.h (Normal file, 39 lines)
@@ -0,0 +1,39 @@
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include <NvInfer.h>
#include <string>
#include <vector>
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
{
public:
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
    virtual ~Int8EntropyCalibrator2();
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

private:
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;
    std::string img_dir_;
    std::vector<std::string> img_files_;
    size_t input_count_;
    std::string calib_table_name_;
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;
    std::vector<char> calib_cache_;
};

#endif  // ENTROPY_CALIBRATOR_H
include/config.h (Normal file, 31 lines)
@@ -0,0 +1,31 @@
#define USE_FP16
//#define USE_FP32
//#define USE_INT8

const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";
const static int kNumClass = 3;
const static int kBatchSize = 1;
const static int kGpuId = 0;
const static int kInputH = 640;
const static int kInputW = 640;
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
const static int kMaxInputImageSize = 3000 * 3000;
const static int kMaxNumOutputBbox = 1000;
// Quantization input image folder path
const static char* kInputQuantizationFolder = "./coco_calib";

// Classification model's number of classes
constexpr static int kClsNumClass = 1000;
// Classification model's input shape
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;

// Pose model's number of classes
constexpr static int kPoseNumClass = 1;
const static int kNumberOfPoints = 17;  // total number of keypoints

// OBB model's number of classes
constexpr static int kObbNumClass = 15;
include/cuda_utils.h (Normal file, 18 lines)
@@ -0,0 +1,18 @@
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>

#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = callstr;\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
            assert(0);\
        }\
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_
include/logging.h (Normal file, 504 lines)
@@ -0,0 +1,504 @@
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"

using Severity = nvinfer1::ILogger::Severity;

class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase
{
public:
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
//!
|
||||
enum class TestResult
|
||||
{
|
||||
kRUNNING, //!< The test is running
|
||||
kPASSED, //!< The test passed
|
||||
kFAILED, //!< The test failed
|
||||
kWAIVED //!< The test was waived
|
||||
};
|
||||
|
||||
//!
|
||||
//! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
|
||||
//! \return The nvinfer1::ILogger associated with this Logger
|
||||
//!
|
||||
//! TODO Once all samples are updated to use this method to register the logger with TensorRT,
|
||||
//! we can eliminate the inheritance of Logger from ILogger
|
||||
//!
|
||||
nvinfer1::ILogger& getTRTLogger()
|
||||
{
|
||||
return *this;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief Implementation of the nvinfer1::ILogger::log() virtual method
|
||||
//!
|
||||
//! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
|
||||
//! inheritance from nvinfer1::ILogger
|
||||
//!
|
||||
void log(Severity severity, const char* msg) TRT_NOEXCEPT override
|
||||
{
|
||||
LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief Method for controlling the verbosity of logging output
|
||||
//!
|
||||
//! \param severity The logger will only emit messages that have severity of this level or higher.
|
||||
//!
|
||||
void setReportableSeverity(Severity severity)
|
||||
{
|
||||
mReportableSeverity = severity;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief Opaque handle that holds logging information for a particular test
|
||||
//!
|
||||
//! This object is an opaque handle to information used by the Logger to print test results.
|
||||
//! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
|
||||
//! with Logger::reportTest{Start,End}().
|
||||
//!
|
||||
class TestAtom
|
||||
{
|
||||
public:
|
||||
TestAtom(TestAtom&&) = default;
|
||||
|
||||
private:
|
||||
friend class Logger;
|
||||
|
||||
TestAtom(bool started, const std::string& name, const std::string& cmdline)
|
||||
: mStarted(started)
|
||||
, mName(name)
|
||||
, mCmdline(cmdline)
|
||||
{
|
||||
}
|
||||
|
||||
bool mStarted;
|
||||
std::string mName;
|
||||
std::string mCmdline;
|
||||
};
|
||||
|
||||
//!
|
||||
//! \brief Define a test for logging
|
||||
//!
|
||||
//! \param[in] name The name of the test. This should be a string starting with
|
||||
//! "TensorRT" and containing dot-separated strings containing
|
||||
//! the characters [A-Za-z0-9_].
|
||||
//! For example, "TensorRT.sample_googlenet"
|
||||
//! \param[in] cmdline The command line used to reproduce the test
|
||||
//!
|
||||
//! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
|
||||
//!
|
||||
static TestAtom defineTest(const std::string& name, const std::string& cmdline)
|
||||
{
|
||||
return TestAtom(false, name, cmdline);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
|
||||
//! as input
|
||||
//!
|
||||
//! \param[in] name The name of the test
|
||||
//! \param[in] argc The number of command-line arguments
|
||||
//! \param[in] argv The array of command-line arguments (given as C strings)
|
||||
//!
|
||||
//! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
|
||||
static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
|
||||
{
|
||||
auto cmdline = genCmdlineString(argc, argv);
|
||||
return defineTest(name, cmdline);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief Report that a test has started.
|
||||
//!
|
||||
//! \pre reportTestStart() has not been called yet for the given testAtom
|
||||
//!
|
||||
//! \param[in] testAtom The handle to the test that has started
|
||||
//!
|
||||
static void reportTestStart(TestAtom& testAtom)
|
||||
{
|
||||
reportTestResult(testAtom, TestResult::kRUNNING);
|
||||
assert(!testAtom.mStarted);
|
||||
testAtom.mStarted = true;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief Report that a test has ended.
|
||||
//!
|
||||
//! \pre reportTestStart() has been called for the given testAtom
|
||||
//!
|
||||
//! \param[in] testAtom The handle to the test that has ended
|
||||
//! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
|
||||
//! TestResult::kFAILED, TestResult::kWAIVED
|
||||
//!
|
||||
static void reportTestEnd(const TestAtom& testAtom, TestResult result)
|
||||
{
|
||||
assert(result != TestResult::kRUNNING);
|
||||
assert(testAtom.mStarted);
|
||||
reportTestResult(testAtom, result);
|
||||
}
|
||||
|
||||
static int reportPass(const TestAtom& testAtom)
|
||||
{
|
||||
reportTestEnd(testAtom, TestResult::kPASSED);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
static int reportFail(const TestAtom& testAtom)
|
||||
{
|
||||
reportTestEnd(testAtom, TestResult::kFAILED);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
static int reportWaive(const TestAtom& testAtom)
|
||||
{
|
||||
reportTestEnd(testAtom, TestResult::kWAIVED);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
static int reportTest(const TestAtom& testAtom, bool pass)
|
||||
{
|
||||
return pass ? reportPass(testAtom) : reportFail(testAtom);
|
||||
}
|
||||
|
||||
Severity getReportableSeverity() const
|
||||
{
|
||||
return mReportableSeverity;
|
||||
}
|
||||
|
||||
private:
|
||||
//!
|
||||
//! \brief returns an appropriate string for prefixing a log message with the given severity
|
||||
//!
|
||||
static const char* severityPrefix(Severity severity)
|
||||
{
|
||||
switch (severity)
|
||||
{
|
||||
case Severity::kINTERNAL_ERROR: return "[F] ";
|
||||
case Severity::kERROR: return "[E] ";
|
||||
case Severity::kWARNING: return "[W] ";
|
||||
case Severity::kINFO: return "[I] ";
|
||||
case Severity::kVERBOSE: return "[V] ";
|
||||
default: assert(0); return "";
|
||||
}
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief returns an appropriate string for prefixing a test result message with the given result
|
||||
//!
|
||||
static const char* testResultString(TestResult result)
|
||||
{
|
||||
switch (result)
|
||||
{
|
||||
case TestResult::kRUNNING: return "RUNNING";
|
||||
case TestResult::kPASSED: return "PASSED";
|
||||
case TestResult::kFAILED: return "FAILED";
|
||||
case TestResult::kWAIVED: return "WAIVED";
|
||||
default: assert(0); return "";
|
||||
}
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
|
||||
//!
|
||||
static std::ostream& severityOstream(Severity severity)
|
||||
{
|
||||
return severity >= Severity::kINFO ? std::cout : std::cerr;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief method that implements logging test results
|
||||
//!
|
||||
static void reportTestResult(const TestAtom& testAtom, TestResult result)
|
||||
{
|
||||
severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
|
||||
<< testAtom.mCmdline << std::endl;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief generate a command line string from the given (argc, argv) values
|
||||
//!
|
||||
static std::string genCmdlineString(int argc, char const* const* argv)
|
||||
{
|
||||
std::stringstream ss;
|
||||
for (int i = 0; i < argc; i++)
|
||||
{
|
||||
if (i > 0)
|
||||
ss << " ";
|
||||
ss << argv[i];
|
||||
}
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
Severity mReportableSeverity;
|
||||
};
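
// A minimal usage sketch of the test-reporting API above (added comment, not part of the original
// header; the global logger name `gLogger` is an assumption matching the samples' convention):
//
//     static Logger gLogger(nvinfer1::ILogger::Severity::kINFO);
//     auto atom = Logger::defineTest("TensorRT.yolov8_det", argc, argv);
//     Logger::reportTestStart(atom);
//     nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger.getTRTLogger());
//     // ... build the engine, run inference ...
//     return Logger::reportTest(atom, builder != nullptr);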
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
//!
|
||||
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
|
||||
//!
|
||||
//! Example usage:
|
||||
//!
|
||||
//! LOG_VERBOSE(logger) << "hello world" << std::endl;
|
||||
//!
|
||||
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
|
||||
{
|
||||
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
|
||||
//!
|
||||
//! Example usage:
|
||||
//!
|
||||
//! LOG_INFO(logger) << "hello world" << std::endl;
|
||||
//!
|
||||
inline LogStreamConsumer LOG_INFO(const Logger& logger)
|
||||
{
|
||||
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
|
||||
//!
|
||||
//! Example usage:
|
||||
//!
|
||||
//! LOG_WARN(logger) << "hello world" << std::endl;
|
||||
//!
|
||||
inline LogStreamConsumer LOG_WARN(const Logger& logger)
|
||||
{
|
||||
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
|
||||
//!
|
||||
//! Example usage:
|
||||
//!
|
||||
//! LOG_ERROR(logger) << "hello world" << std::endl;
|
||||
//!
|
||||
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
|
||||
{
|
||||
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
|
||||
//! ("fatal" severity)
|
||||
//!
|
||||
//! Example usage:
|
||||
//!
|
||||
//! LOG_FATAL(logger) << "hello world" << std::endl;
|
||||
//!
|
||||
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
|
||||
{
|
||||
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
#endif // TENSORRT_LOGGING_H
|
||||
29
include/macros.h
Normal file
29
include/macros.h
Normal file
@ -0,0 +1,29 @@
#ifndef __MACROS_H
#define __MACROS_H

#include "NvInfer.h"

#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H
43
include/model.h
Normal file
43
include/model.h
Normal file
@ -0,0 +1,43 @@
|
||||
#pragma once
|
||||
#include <assert.h>
|
||||
#include <string>
|
||||
#include "NvInfer.h"
|
||||
|
||||
nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
|
||||
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
|
||||
int& max_channels);
|
||||
|
||||
nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
|
||||
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
|
||||
int& max_channels);
|
||||
|
||||
nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
|
||||
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
|
||||
int& max_channels);
|
||||
|
||||
nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
|
||||
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw);
|
||||
|
||||
nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
|
||||
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
|
||||
int& max_channels);
|
||||
|
||||
nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
|
||||
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
|
||||
int& max_channels);
|
||||
|
||||
nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
|
||||
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
|
||||
int& max_channels);
|
||||
|
||||
nvinfer1::IHostMemory* buildEngineYolov8_5uDet(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
|
||||
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
|
||||
int& max_channels);
|
||||
|
||||
nvinfer1::IHostMemory* buildEngineYolov8_5uDetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
|
||||
nvinfer1::DataType dt, const std::string& wts_path, float& gd,
|
||||
float& gw, int& max_channels);
|
||||
|
||||
nvinfer1::IHostMemory* buildEngineYolov8Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
|
||||
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
|
||||
int& max_channels);
|
||||
41
include/postprocess.h
Normal file
41
include/postprocess.h
Normal file
@ -0,0 +1,41 @@
|
||||
#pragma once
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include "NvInfer.h"
|
||||
#include "types.h"
|
||||
|
||||
// Preprocessing functions
|
||||
cv::Rect get_rect(cv::Mat& img, float bbox[4]);
|
||||
|
||||
// Processing functions
|
||||
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
|
||||
int bbox_element, const std::vector<cv::Mat>& img_batch);
|
||||
void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
|
||||
int bbox_element, const std::vector<cv::Mat>& img_batch);
|
||||
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
|
||||
int count);
|
||||
void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
|
||||
cv::Mat& img, int count);
|
||||
|
||||
// NMS functions
|
||||
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);
|
||||
void batch_nms(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
|
||||
float conf_thresh, float nms_thresh = 0.5);
|
||||
void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);
|
||||
void batch_nms_obb(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
|
||||
float conf_thresh, float nms_thresh = 0.5);
|
||||
|
||||
// CUDA-related functions
|
||||
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
|
||||
cudaStream_t stream);
|
||||
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
|
||||
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
|
||||
cudaStream_t stream);
|
||||
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
|
||||
|
||||
// Drawing functions
|
||||
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
|
||||
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
|
||||
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
|
||||
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
|
||||
std::unordered_map<int, std::string>& labels_map);
|
||||
16
include/preprocess.h
Normal file
16
include/preprocess.h
Normal file
@ -0,0 +1,16 @@
#pragma once

#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"
#include <map>


void cuda_preprocess_init(int max_image_size);

void cuda_preprocess_destroy();

void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, int dst_width, int dst_height, cudaStream_t stream);

void cuda_batch_preprocess(std::vector<cv::Mat> &img_batch, float *dst, int dst_width, int dst_height, cudaStream_t stream);
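
// Typical call order (a sketch; `device_input` and `kMaxInputImageSize` are assumed caller-side names):
//   cuda_preprocess_init(kMaxInputImageSize);                                  // once: allocate staging buffers
//   cuda_batch_preprocess(img_batch, device_input, kInputW, kInputH, stream);  // per batch: GPU-side resize/normalize
//   cuda_preprocess_destroy();                                                 // once: release buffers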
19
include/types.h
Normal file
19
include/types.h
Normal file
@ -0,0 +1,19 @@
#pragma once
#include "config.h"

struct alignas(float) Detection {
    //center_x center_y w h
    float bbox[4];
    float conf;  // bbox_conf * cls_conf
    float class_id;
    float mask[32];
    float keypoints[kNumberOfPoints * 3];  // keypoints array with dynamic size based on kNumberOfPoints
    float angle;  // obb angle
};

struct AffineMatrix {
    float value[6];
};

const int bbox_element =
        sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag
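
// Note (added for clarity): bbox_element evaluates to 7 floats per decoded box
// (left, top, right, bottom, confidence, class id, keep flag), which is the stride the
// decode/NMS post-processing uses when walking decode_ptr_host.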
86
include/utils.h
Normal file
86
include/utils.h
Normal file
@ -0,0 +1,86 @@
|
||||
#pragma once
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include <dirent.h>
|
||||
#include <fstream>
|
||||
|
||||
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
|
||||
int w, h, x, y;
|
||||
float r_w = input_w / (img.cols*1.0);
|
||||
float r_h = input_h / (img.rows*1.0);
|
||||
if (r_h > r_w) {
|
||||
w = input_w;
|
||||
h = r_w * img.rows;
|
||||
x = 0;
|
||||
y = (input_h - h) / 2;
|
||||
} else {
|
||||
w = r_h * img.cols;
|
||||
h = input_h;
|
||||
x = (input_w - w) / 2;
|
||||
y = 0;
|
||||
}
|
||||
cv::Mat re(h, w, CV_8UC3);
|
||||
cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
|
||||
cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
|
||||
re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
|
||||
return out;
|
||||
}
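
// Usage sketch (hedged): preprocess_img letterboxes an image to the network input size while keeping
// the aspect ratio, padding with gray (128,128,128). kInputW/kInputH are assumed to come from config.h.
//   cv::Mat img = cv::imread(img_path);
//   cv::Mat padded = preprocess_img(img, kInputW, kInputH);  // padded.size() == {kInputW, kInputH}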
|
||||
|
||||
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
|
||||
DIR *p_dir = opendir(p_dir_name);
|
||||
if (p_dir == nullptr) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct dirent* p_file = nullptr;
|
||||
while ((p_file = readdir(p_dir)) != nullptr) {
|
||||
if (strcmp(p_file->d_name, ".") != 0 &&
|
||||
strcmp(p_file->d_name, "..") != 0) {
|
||||
//std::string cur_file_name(p_dir_name);
|
||||
//cur_file_name += "/";
|
||||
//cur_file_name += p_file->d_name;
|
||||
std::string cur_file_name(p_file->d_name);
|
||||
file_names.push_back(cur_file_name);
|
||||
}
|
||||
}
|
||||
|
||||
closedir(p_dir);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Function to trim leading and trailing whitespace from a string
|
||||
static inline std::string trim_leading_whitespace(const std::string& str) {
|
||||
size_t first = str.find_first_not_of(' ');
|
||||
if (std::string::npos == first) {
|
||||
return str;
|
||||
}
|
||||
size_t last = str.find_last_not_of(' ');
|
||||
return str.substr(first, (last - first + 1));
|
||||
}
|
||||
|
||||
// Src: https://stackoverflow.com/questions/16605967
|
||||
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
|
||||
std::ostringstream out;
|
||||
out.precision(n);
|
||||
out << std::fixed << a_value;
|
||||
return out.str();
|
||||
}
|
||||
|
||||
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
|
||||
std::ifstream file(labels_filename);
if (!file.is_open()) {
    return -1;  // could not open the labels file; mirror read_files_in_dir's error convention
}
// Read each line of the file
|
||||
std::string line;
|
||||
int index = 0;
|
||||
while (std::getline(file, line)) {
|
||||
// Strip the line of any leading or trailing whitespace
|
||||
line = trim_leading_whitespace(line);
|
||||
|
||||
// Add the stripped line to the labels_map, using the loop index as the key
|
||||
labels_map[index] = line;
|
||||
index++;
|
||||
}
|
||||
// Close the file
|
||||
file.close();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
371
plugin/yololayer.cu
Normal file
371
plugin/yololayer.cu
Normal file
@ -0,0 +1,371 @@
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include "cuda_utils.h"
|
||||
#include "types.h"
|
||||
#include "yololayer.h"
|
||||
|
||||
namespace Tn {
|
||||
template <typename T>
|
||||
void write(char*& buffer, const T& val) {
|
||||
*reinterpret_cast<T*>(buffer) = val;
|
||||
buffer += sizeof(T);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void read(const char*& buffer, T& val) {
|
||||
val = *reinterpret_cast<const T*>(buffer);
|
||||
buffer += sizeof(T);
|
||||
}
|
||||
} // namespace Tn
|
||||
|
||||
__device__ float sigmoid(float x) {
|
||||
return 1.0f / (1.0f + exp(-x));
|
||||
}
|
||||
|
||||
namespace nvinfer1 {
|
||||
YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth,
|
||||
int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb,
|
||||
const int* strides, int stridesLength) {
|
||||
|
||||
mClassCount = classCount;
|
||||
mNumberofpoints = numberofpoints;
|
||||
mConfthreshkeypoints = confthreshkeypoints;
|
||||
mYoloV8NetWidth = netWidth;
|
||||
mYoloV8netHeight = netHeight;
|
||||
mMaxOutObject = maxOut;
|
||||
mStridesLength = stridesLength;
|
||||
mStrides = new int[stridesLength];
|
||||
memcpy(mStrides, strides, stridesLength * sizeof(int));
|
||||
is_segmentation_ = is_segmentation;
|
||||
is_pose_ = is_pose;
|
||||
is_obb_ = is_obb;
|
||||
}
|
||||
|
||||
YoloLayerPlugin::~YoloLayerPlugin() {
|
||||
if (mStrides != nullptr) {
|
||||
delete[] mStrides;
|
||||
mStrides = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
|
||||
using namespace Tn;
|
||||
const char *d = reinterpret_cast<const char*>(data), *a = d;
|
||||
read(d, mClassCount);
|
||||
read(d, mNumberofpoints);
|
||||
read(d, mConfthreshkeypoints);
|
||||
read(d, mThreadCount);
|
||||
read(d, mYoloV8NetWidth);
|
||||
read(d, mYoloV8netHeight);
|
||||
read(d, mMaxOutObject);
|
||||
read(d, mStridesLength);
|
||||
mStrides = new int[mStridesLength];
|
||||
for (int i = 0; i < mStridesLength; ++i) {
|
||||
read(d, mStrides[i]);
|
||||
}
|
||||
read(d, is_segmentation_);
|
||||
read(d, is_pose_);
|
||||
read(d, is_obb_);
|
||||
|
||||
assert(d == a + length);
|
||||
}
|
||||
|
||||
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
|
||||
|
||||
using namespace Tn;
|
||||
char *d = static_cast<char*>(buffer), *a = d;
|
||||
write(d, mClassCount);
|
||||
write(d, mNumberofpoints);
|
||||
write(d, mConfthreshkeypoints);
|
||||
write(d, mThreadCount);
|
||||
write(d, mYoloV8NetWidth);
|
||||
write(d, mYoloV8netHeight);
|
||||
write(d, mMaxOutObject);
|
||||
write(d, mStridesLength);
|
||||
for (int i = 0; i < mStridesLength; ++i) {
|
||||
write(d, mStrides[i]);
|
||||
}
|
||||
write(d, is_segmentation_);
|
||||
write(d, is_pose_);
|
||||
write(d, is_obb_);
|
||||
|
||||
assert(d == a + getSerializationSize());
|
||||
}
|
||||
|
||||
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
|
||||
return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) +
|
||||
sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) +
|
||||
sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_) + sizeof(is_obb_);
|
||||
}
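
// Note: the fields accounted for here must stay in the same order as the write() calls in
// serialize() and the read() calls in the deserializing constructor above; changing one without
// the others breaks deserialization of engines built with the old layout.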
|
||||
|
||||
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
|
||||
return 0;
|
||||
}
|
||||
|
||||
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
|
||||
int nbInputDims) TRT_NOEXCEPT {
|
||||
int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float);
|
||||
return nvinfer1::Dims3(total_size + 1, 1, 1);
|
||||
}
|
||||
|
||||
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
|
||||
mPluginNamespace = pluginNamespace;
|
||||
}
|
||||
|
||||
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
|
||||
return mPluginNamespace;
|
||||
}
|
||||
|
||||
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
|
||||
int nbInputs) const TRT_NOEXCEPT {
|
||||
return nvinfer1::DataType::kFLOAT;
|
||||
}
|
||||
|
||||
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
|
||||
int nbInputs) const TRT_NOEXCEPT {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
|
||||
nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{};
|
||||
|
||||
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
|
||||
IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{};
|
||||
|
||||
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}
|
||||
|
||||
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
|
||||
|
||||
return "YoloLayer_TRT";
|
||||
}
|
||||
|
||||
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
|
||||
return "1";
|
||||
}
|
||||
|
||||
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
|
||||
delete this;
|
||||
}
|
||||
|
||||
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
|
||||
|
||||
YoloLayerPlugin* p =
|
||||
new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight,
|
||||
mMaxOutObject, is_segmentation_, is_pose_, is_obb_, mStrides, mStridesLength);
|
||||
p->setPluginNamespace(mPluginNamespace);
|
||||
return p;
|
||||
}
|
||||
|
||||
int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs,
|
||||
void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
|
||||
|
||||
forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize);
|
||||
return 0;
|
||||
}
|
||||
|
||||
__device__ float Logist(float data) {
|
||||
return 1.0f / (1.0f + expf(-data));
|
||||
};
|
||||
|
||||
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
|
||||
int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem,
|
||||
bool is_segmentation, bool is_pose, bool is_obb) {
|
||||
int idx = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if (idx >= numElements)
|
||||
return;
|
||||
|
||||
const int N_kpts = nk;
|
||||
int total_grid = grid_h * grid_w;
|
||||
int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0);
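// Layout reminder (added comment): per grid cell the channels are 4 box distances (l, t, r, b),
// then `classes` class scores, then the task-specific tail: 32 mask coefficients (seg), 3 * nk
// keypoint values (pose), or 1 angle (obb). Channels are stored plane-by-plane over the grid,
// hence the `elemIdx + channel * total_grid` indexing below.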
|
||||
int batchIdx = idx / total_grid;
|
||||
int elemIdx = idx % total_grid;
|
||||
const float* curInput = input + batchIdx * total_grid * info_len;
|
||||
int outputIdx = batchIdx * outputElem;
|
||||
|
||||
int class_id = 0;
|
||||
float max_cls_prob = 0.0;
|
||||
for (int i = 4; i < 4 + classes; i++) {
|
||||
float p = Logist(curInput[elemIdx + i * total_grid]);
|
||||
if (p > max_cls_prob) {
|
||||
max_cls_prob = p;
|
||||
class_id = i - 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (max_cls_prob < 0.1)
|
||||
return;
|
||||
|
||||
int count = (int)atomicAdd(output + outputIdx, 1);
|
||||
if (count >= maxoutobject)
|
||||
return;
|
||||
char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
|
||||
Detection* det = (Detection*)(data);
|
||||
|
||||
int row = elemIdx / grid_w;
|
||||
int col = elemIdx % grid_w;
|
||||
|
||||
det->conf = max_cls_prob;
|
||||
det->class_id = class_id;
|
||||
det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
|
||||
det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
|
||||
det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
|
||||
det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;
|
||||
|
||||
if (is_segmentation) {
|
||||
for (int k = 0; k < 32; ++k) {
|
||||
det->mask[k] =
|
||||
curInput[elemIdx + (4 + classes + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0) + k) * total_grid];
|
||||
}
|
||||
}
|
||||
|
||||
if (is_pose) {
|
||||
for (int kpt = 0; kpt < N_kpts; kpt++) {
|
||||
int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3) * total_grid;
|
||||
int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 1) * total_grid;
|
||||
int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 2) * total_grid;
|
||||
|
||||
float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]);
|
||||
|
||||
float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride;
|
||||
float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride;
|
||||
|
||||
bool is_within_bbox =
|
||||
kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3];
|
||||
|
||||
if (kpt_confidence < confkeypoints || !is_within_bbox) {
|
||||
det->keypoints[kpt * 3] = -1;
|
||||
det->keypoints[kpt * 3 + 1] = -1;
|
||||
det->keypoints[kpt * 3 + 2] = -1;
|
||||
} else {
|
||||
det->keypoints[kpt * 3] = kpt_x;
|
||||
det->keypoints[kpt * 3 + 1] = kpt_y;
|
||||
det->keypoints[kpt * 3 + 2] = kpt_confidence;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (is_obb) {
|
||||
double pi = M_PI;
|
||||
auto angle_inx = curInput[elemIdx + (4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) +
|
||||
0) * total_grid];
|
||||
auto angle = (sigmoid(angle_inx) - 0.25f) * pi;
|
||||
|
||||
auto cos1 = cos(angle);
|
||||
auto sin1 = sin(angle);
|
||||
auto xf = (curInput[elemIdx + 2 * total_grid] - curInput[elemIdx + 0 * total_grid]) / 2;
|
||||
auto yf = (curInput[elemIdx + 3 * total_grid] - curInput[elemIdx + 1 * total_grid]) / 2;
|
||||
|
||||
auto x = xf * cos1 - yf * sin1;
|
||||
auto y = xf * sin1 + yf * cos1;
|
||||
|
||||
float cx = (col + 0.5f + x) * stride;
|
||||
float cy = (row + 0.5f + y) * stride;
|
||||
|
||||
float w1 = (curInput[elemIdx + 0 * total_grid] + curInput[elemIdx + 2 * total_grid]) * stride;
|
||||
float h1 = (curInput[elemIdx + 1 * total_grid] + curInput[elemIdx + 3 * total_grid]) * stride;
|
||||
det->bbox[0] = cx;
|
||||
det->bbox[1] = cy;
|
||||
det->bbox[2] = w1;
|
||||
det->bbox[3] = h1;
|
||||
det->angle = angle;
|
||||
}
|
||||
}
|
||||
|
||||
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
|
||||
int mYoloV8NetWidth, int batchSize) {
|
||||
|
||||
int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
|
||||
cudaMemsetAsync(output, 0, sizeof(float), stream);
|
||||
for (int idx = 0; idx < batchSize; ++idx) {
|
||||
CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
|
||||
}
|
||||
int numElem = 0;
|
||||
int maxGrids = mStridesLength;
|
||||
int flatGridsLen = 2 * maxGrids;
|
||||
int* flatGrids = new int[flatGridsLen];
|
||||
|
||||
for (int i = 0; i < maxGrids; ++i) {
|
||||
flatGrids[2 * i] = mYoloV8netHeight / mStrides[i];
|
||||
flatGrids[2 * i + 1] = mYoloV8NetWidth / mStrides[i];
|
||||
}
|
||||
|
||||
for (int i = 0; i < maxGrids; i++) {  // int matches maxGrids and avoids a signed/unsigned comparison
|
||||
// Access the elements of the original 2D array from the flattened 1D array
|
||||
int grid_h = flatGrids[2 * i]; // Corresponds to the access of grids[i][0]
|
||||
int grid_w = flatGrids[2 * i + 1]; // Corresponds to the access of grids[i][1]
|
||||
int stride = mStrides[i];
|
||||
numElem = grid_h * grid_w * batchSize; // Calculate the total number of elements
|
||||
if (numElem < mThreadCount) // Adjust the thread count if needed
|
||||
mThreadCount = numElem;
|
||||
|
||||
// The CUDA kernel call remains unchanged
|
||||
CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>(
|
||||
inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints,
|
||||
mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_, is_obb_);
|
||||
}
|
||||
|
||||
delete[] flatGrids;
|
||||
}
|
||||
|
||||
PluginFieldCollection YoloPluginCreator::mFC{};
|
||||
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
|
||||
|
||||
YoloPluginCreator::YoloPluginCreator() {
|
||||
mPluginAttributes.clear();
|
||||
mFC.nbFields = mPluginAttributes.size();
|
||||
mFC.fields = mPluginAttributes.data();
|
||||
}
|
||||
|
||||
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
|
||||
return "YoloLayer_TRT";
|
||||
}
|
||||
|
||||
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
|
||||
return "1";
|
||||
}
|
||||
|
||||
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
|
||||
return &mFC;
|
||||
}
|
||||
|
||||
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
|
||||
assert(fc->nbFields == 1);
|
||||
assert(strcmp(fc->fields[0].name, "combinedInfo") == 0);
|
||||
const int* combinedInfo = static_cast<const int*>(fc->fields[0].data);
|
||||
int netinfo_count = 9;
|
||||
int class_count = combinedInfo[0];
|
||||
int numberofpoints = combinedInfo[1];
|
||||
float confthreshkeypoints = combinedInfo[2];
|
||||
int input_w = combinedInfo[3];
|
||||
int input_h = combinedInfo[4];
|
||||
int max_output_object_count = combinedInfo[5];
|
||||
bool is_segmentation = combinedInfo[6];
|
||||
bool is_pose = combinedInfo[7];
|
||||
bool is_obb = combinedInfo[8];
|
||||
const int* px_arry = combinedInfo + netinfo_count;
|
||||
int px_arry_length = fc->fields[0].length - netinfo_count;
|
||||
YoloLayerPlugin* obj =
|
||||
new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h,
|
||||
max_output_object_count, is_segmentation, is_pose, is_obb, px_arry, px_arry_length);
|
||||
obj->setPluginNamespace(mNamespace.c_str());
|
||||
return obj;
|
||||
}
|
||||
|
||||
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
|
||||
size_t serialLength) TRT_NOEXCEPT {
|
||||
// This object will be deleted when the network is destroyed, which will
|
||||
// call YoloLayerPlugin::destroy()
|
||||
YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
|
||||
obj->setPluginNamespace(mNamespace.c_str());
|
||||
return obj;
|
||||
}
|
||||
|
||||
} // namespace nvinfer1
|
||||
110
plugin/yololayer.h
Normal file
110
plugin/yololayer.h
Normal file
@ -0,0 +1,110 @@
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "NvInfer.h"
|
||||
#include "macros.h"
|
||||
namespace nvinfer1 {
|
||||
class API YoloLayerPlugin : public IPluginV2IOExt {
|
||||
public:
|
||||
YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight,
|
||||
int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength);
|
||||
|
||||
YoloLayerPlugin(const void* data, size_t length);
|
||||
~YoloLayerPlugin();
|
||||
|
||||
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
|
||||
|
||||
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;
|
||||
|
||||
int initialize() TRT_NOEXCEPT override;
|
||||
|
||||
virtual void terminate() TRT_NOEXCEPT override {}
|
||||
|
||||
virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }
|
||||
|
||||
virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace,
|
||||
cudaStream_t stream) TRT_NOEXCEPT override;
|
||||
|
||||
virtual size_t getSerializationSize() const TRT_NOEXCEPT override;
|
||||
|
||||
virtual void serialize(void* buffer) const TRT_NOEXCEPT override;
|
||||
|
||||
bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs,
|
||||
int nbOutputs) const TRT_NOEXCEPT override {
|
||||
return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
|
||||
}
|
||||
|
||||
const char* getPluginType() const TRT_NOEXCEPT override;
|
||||
|
||||
const char* getPluginVersion() const TRT_NOEXCEPT override;
|
||||
|
||||
void destroy() TRT_NOEXCEPT override;
|
||||
|
||||
IPluginV2IOExt* clone() const TRT_NOEXCEPT override;
|
||||
|
||||
void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;
|
||||
|
||||
const char* getPluginNamespace() const TRT_NOEXCEPT override;
|
||||
|
||||
nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
|
||||
int32_t nbInputs) const TRT_NOEXCEPT;
|
||||
|
||||
bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
|
||||
int nbInputs) const TRT_NOEXCEPT override;
|
||||
|
||||
bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;
|
||||
|
||||
void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
|
||||
IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;
|
||||
|
||||
void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out,
|
||||
int32_t nbOutput) TRT_NOEXCEPT override;
|
||||
|
||||
void detachFromContext() TRT_NOEXCEPT override;
|
||||
|
||||
private:
|
||||
void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
|
||||
int mYoloV8NetWidth, int batchSize);
|
||||
int mThreadCount = 256;
|
||||
const char* mPluginNamespace;
|
||||
int mClassCount;
|
||||
int mNumberofpoints;
|
||||
float mConfthreshkeypoints;
|
||||
int mYoloV8NetWidth;
|
||||
int mYoloV8netHeight;
|
||||
int mMaxOutObject;
|
||||
bool is_segmentation_;
|
||||
bool is_pose_;
|
||||
bool is_obb_;
|
||||
int* mStrides;
|
||||
int mStridesLength;
|
||||
};
|
||||
|
||||
class API YoloPluginCreator : public IPluginCreator {
|
||||
public:
|
||||
YoloPluginCreator();
|
||||
~YoloPluginCreator() override = default;
|
||||
|
||||
const char* getPluginName() const TRT_NOEXCEPT override;
|
||||
|
||||
const char* getPluginVersion() const TRT_NOEXCEPT override;
|
||||
|
||||
const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;
|
||||
|
||||
nvinfer1::IPluginV2IOExt* createPlugin(const char* name,
|
||||
const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;
|
||||
|
||||
nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData,
|
||||
size_t serialLength) TRT_NOEXCEPT override;
|
||||
|
||||
void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; }
|
||||
|
||||
const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }
|
||||
|
||||
private:
|
||||
std::string mNamespace;
|
||||
static PluginFieldCollection mFC;
|
||||
static std::vector<PluginField> mPluginAttributes;
|
||||
};
|
||||
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
|
||||
} // namespace nvinfer1
|
||||
309
src/block.cpp
Normal file
309
src/block.cpp
Normal file
@ -0,0 +1,309 @@
|
||||
#include "block.h"
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include "config.h"
|
||||
#include "yololayer.h"
|
||||
|
||||
int calculateP(int ksize) {
|
||||
return ksize / 3;
|
||||
}
|
||||
|
||||
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
|
||||
std::cout << "Loading weights: " << file << std::endl;
|
||||
std::map<std::string, nvinfer1::Weights> WeightMap;
|
||||
|
||||
std::ifstream input(file);
|
||||
assert(input.is_open() && "Unable to load weight file. Please check that the .wts file path is correct.");
|
||||
|
||||
int32_t count;
|
||||
input >> count;
|
||||
assert(count > 0 && "Invalid weight map file.");
|
||||
|
||||
while (count--) {
|
||||
nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
|
||||
uint32_t size;
|
||||
|
||||
std::string name;
|
||||
input >> name >> std::dec >> size;
|
||||
wt.type = nvinfer1::DataType::kFLOAT;
|
||||
|
||||
uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));  // sizeof(uint32_t), not sizeof(val): val is a pointer
|
||||
for (uint32_t x = 0, y = size; x < y; x++) {
|
||||
input >> std::hex >> val[x];
|
||||
}
|
||||
wt.values = val;
|
||||
wt.count = size;
|
||||
WeightMap[name] = wt;
|
||||
}
|
||||
return WeightMap;
|
||||
}
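
// The parser above expects the plain-text .wts layout produced by the usual gen_wts-style exporters
// (an assumption based on the reads above): first a line with the tensor count, then one line per
// tensor of the form "<name> <element_count> <hex-encoded float32 values...>".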
|
||||
|
||||
static nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
|
||||
std::map<std::string, nvinfer1::Weights> weightMap,
|
||||
nvinfer1::ITensor& input, std::string lname, float eps) {
|
||||
float* gamma = (float*)weightMap[lname + ".weight"].values;
|
||||
float* beta = (float*)weightMap[lname + ".bias"].values;
|
||||
float* mean = (float*)weightMap[lname + ".running_mean"].values;
|
||||
float* var = (float*)weightMap[lname + ".running_var"].values;
|
||||
int len = weightMap[lname + ".running_var"].count;
|
||||
|
||||
float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
|
||||
for (int i = 0; i < len; i++) {
|
||||
scval[i] = gamma[i] / sqrt(var[i] + eps);
|
||||
}
|
||||
nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len};
|
||||
|
||||
float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
|
||||
for (int i = 0; i < len; i++) {
|
||||
shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
|
||||
}
|
||||
nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len};
|
||||
|
||||
float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
|
||||
for (int i = 0; i < len; i++) {
|
||||
pval[i] = 1.0;
|
||||
}
|
||||
nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len};
|
||||
weightMap[lname + ".scale"] = scale;
|
||||
weightMap[lname + ".shift"] = shift;
|
||||
weightMap[lname + ".power"] = power;
|
||||
nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
|
||||
assert(output);
|
||||
return output;
|
||||
}
|
||||
|
||||
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
|
||||
std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
|
||||
int ch, int k, int s, int p, std::string lname) {
|
||||
nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0};
|
||||
nvinfer1::IConvolutionLayer* conv =
|
||||
network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty);
|
||||
assert(conv);
|
||||
conv->setStrideNd(nvinfer1::DimsHW{s, s});
|
||||
conv->setPaddingNd(nvinfer1::DimsHW{p, p});
|
||||
|
||||
nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
|
||||
|
||||
nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
|
||||
nvinfer1::IElementWiseLayer* ew =
|
||||
network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
|
||||
assert(ew);
|
||||
return ew;
|
||||
}
|
||||
|
||||
nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
|
||||
nvinfer1::ITensor& input, int c1, int c2, bool shortcut, float e, std::string lname) {
|
||||
nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname + ".cv1");
|
||||
nvinfer1::IElementWiseLayer* conv2 =
|
||||
convBnSiLU(network, weightMap, *conv1->getOutput(0), c2, 3, 1, 1, lname + ".cv2");
|
||||
|
||||
if (shortcut && c1 == c2) {
|
||||
nvinfer1::IElementWiseLayer* ew =
|
||||
network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
|
||||
return ew;
|
||||
}
|
||||
return conv2;
|
||||
}
|
||||
|
||||
static nvinfer1::ILayer* bottleneck_c3(nvinfer1::INetworkDefinition* network,
|
||||
std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
|
||||
int c1, int c2, bool shortcut, float e, std::string lname) {
|
||||
nvinfer1::IElementWiseLayer* cv1 =
|
||||
convBnSiLU(network, weightMap, input, (int)((float)c2 * e), 1, 1, calculateP(1), lname + ".cv1");
|
||||
nvinfer1::IElementWiseLayer* cv2 =
|
||||
convBnSiLU(network, weightMap, *cv1->getOutput(0), c2, 3, 1, calculateP(3), lname + ".cv2");
|
||||
if (shortcut && c1 == c2) {
|
||||
auto ew = network->addElementWise(input, *cv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
|
||||
return ew;
|
||||
}
|
||||
return cv2;
|
||||
}
|
||||
|
||||
nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
|
||||
std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
|
||||
int c2, int n, bool shortcut, float e, std::string lname) {
|
||||
int c_ = (float)c2 * e;
|
||||
|
||||
nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2 * c_, 1, 1, 0, lname + ".cv1");
|
||||
nvinfer1::Dims d = conv1->getOutput(0)->getDimensions();
|
||||
|
||||
nvinfer1::ISliceLayer* split1 =
|
||||
network->addSlice(*conv1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
|
||||
nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1});
|
||||
nvinfer1::ISliceLayer* split2 =
|
||||
network->addSlice(*conv1->getOutput(0), nvinfer1::Dims3{d.d[0] / 2, 0, 0},
|
||||
nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1});
|
||||
nvinfer1::ITensor* inputTensor0[] = {split1->getOutput(0), split2->getOutput(0)};
|
||||
nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor0, 2);
|
||||
nvinfer1::ITensor* y1 = split2->getOutput(0);
|
||||
for (int i = 0; i < n; i++) {
|
||||
auto* b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, 1.0, lname + ".m." + std::to_string(i));
|
||||
y1 = b->getOutput(0);
|
||||
|
||||
nvinfer1::ITensor* inputTensors[] = {cat->getOutput(0), b->getOutput(0)};
|
||||
cat = network->addConcatenation(inputTensors, 2);
|
||||
}
|
||||
|
||||
nvinfer1::IElementWiseLayer* conv2 =
|
||||
convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
|
||||
|
||||
return conv2;
|
||||
}
|
||||
|
||||
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
|
||||
std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
|
||||
int c2, int n, bool shortcut, float e, std::string lname) {
|
||||
assert(network != nullptr);
|
||||
int hidden_channels = static_cast<int>(c2 * e);
|
||||
|
||||
// cv1 branch
|
||||
nvinfer1::IElementWiseLayer* conv1 =
|
||||
convBnSiLU(network, weightMap, input, 2 * hidden_channels, 1, 1, 0, lname + ".cv1");
|
||||
nvinfer1::ITensor* cv1_out = conv1->getOutput(0);
|
||||
|
||||
// Split the output of cv1 into two tensors
|
||||
nvinfer1::Dims dims = cv1_out->getDimensions();
|
||||
nvinfer1::ISliceLayer* split1 =
|
||||
network->addSlice(*cv1_out, nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{dims.d[0] / 2, dims.d[1], dims.d[2]},
|
||||
nvinfer1::Dims3{1, 1, 1});
|
||||
nvinfer1::ISliceLayer* split2 =
|
||||
network->addSlice(*cv1_out, nvinfer1::Dims3{dims.d[0] / 2, 0, 0},
|
||||
nvinfer1::Dims3{dims.d[0] / 2, dims.d[1], dims.d[2]}, nvinfer1::Dims3{1, 1, 1});
|
||||
|
||||
// Create y1 bottleneck sequence
|
||||
nvinfer1::ITensor* y1 = split1->getOutput(0);
|
||||
for (int i = 0; i < n; ++i) {
|
||||
auto* bottleneck_layer = bottleneck(network, weightMap, *y1, hidden_channels, hidden_channels, shortcut, 1.0,
|
||||
lname + ".m." + std::to_string(i));
|
||||
y1 = bottleneck_layer->getOutput(0); // update 'y1' to be the output of the current bottleneck
|
||||
}
|
||||
|
||||
// Concatenate y1 with the second split of cv1
|
||||
nvinfer1::ITensor* concatInputs[2] = {y1, split2->getOutput(0)};
|
||||
nvinfer1::IConcatenationLayer* cat = network->addConcatenation(concatInputs, 2);
|
||||
|
||||
// cv2 to produce the final output
|
||||
nvinfer1::IElementWiseLayer* conv2 =
|
||||
convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
|
||||
|
||||
return conv2;
|
||||
}
|
||||
|
||||
nvinfer1::IElementWiseLayer* C3(nvinfer1::INetworkDefinition* network,
|
||||
std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
|
||||
int c2, int n, bool shortcut, float e, std::string lname) {
|
||||
int c_ = (float)c2 * e;
|
||||
nvinfer1::IElementWiseLayer* cv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, calculateP(1), lname + ".cv1");
|
||||
nvinfer1::IElementWiseLayer* cv2 = convBnSiLU(network, weightMap, input, c_, 1, 1, calculateP(1), lname + ".cv2");
|
||||
nvinfer1::ITensor* y1 = cv1->getOutput(0);
|
||||
for (int i = 0; i < n; i++) {
|
||||
auto b = bottleneck_c3(network, weightMap, *y1, c_, c_, shortcut, 1.0, lname + ".m." + std::to_string(i));
|
||||
y1 = b->getOutput(0);
|
||||
}
|
||||
nvinfer1::ITensor* inputTensors[] = {y1, cv2->getOutput(0)};
|
||||
nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 2);
|
||||
nvinfer1::IElementWiseLayer* conv3 =
|
||||
convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, calculateP(1), lname + ".cv3");
|
||||
return conv3;
|
||||
}
|
||||
|
||||
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
|
||||
std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
|
||||
int c2, int k, std::string lname) {
|
||||
int c_ = c1 / 2;
|
||||
nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, 0, lname + ".cv1");
|
||||
nvinfer1::IPoolingLayer* pool1 =
|
||||
network->addPoolingNd(*conv1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
|
||||
pool1->setStrideNd(nvinfer1::DimsHW{1, 1});
|
||||
pool1->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
|
||||
nvinfer1::IPoolingLayer* pool2 =
|
||||
network->addPoolingNd(*pool1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
|
||||
pool2->setStrideNd(nvinfer1::DimsHW{1, 1});
|
||||
pool2->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
|
||||
nvinfer1::IPoolingLayer* pool3 =
|
||||
network->addPoolingNd(*pool2->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
|
||||
pool3->setStrideNd(nvinfer1::DimsHW{1, 1});
|
||||
pool3->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
|
||||
nvinfer1::ITensor* inputTensors[] = {conv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0),
|
||||
pool3->getOutput(0)};
|
||||
nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 4);
|
||||
nvinfer1::IElementWiseLayer* conv2 =
|
||||
convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
|
||||
return conv2;
|
||||
}
|
||||
|
||||
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
|
||||
nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) {
|
||||
|
||||
nvinfer1::IShuffleLayer* shuffle1 = network->addShuffle(input);
|
||||
shuffle1->setReshapeDimensions(nvinfer1::Dims3{4, 16, grid});
|
||||
shuffle1->setSecondTranspose(nvinfer1::Permutation{1, 0, 2});
|
||||
nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*shuffle1->getOutput(0));
|
||||
|
||||
nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0};
|
||||
nvinfer1::IConvolutionLayer* conv =
|
||||
network->addConvolutionNd(*softmax->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], bias_empty);
|
||||
conv->setStrideNd(nvinfer1::DimsHW{s, s});
|
||||
conv->setPaddingNd(nvinfer1::DimsHW{p, p});
|
||||
|
||||
nvinfer1::IShuffleLayer* shuffle2 = network->addShuffle(*conv->getOutput(0));
|
||||
shuffle2->setReshapeDimensions(nvinfer1::Dims2{4, grid});
|
||||
|
||||
return shuffle2;
|
||||
}
|
||||
|
||||
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
|
||||
std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
|
||||
int px_arry_num, int num_class, bool is_segmentation, bool is_pose,
|
||||
bool is_obb) {
|
||||
auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
|
||||
const int netinfo_count = 9;  // The first 9 elements of combinedInfo carry the network info.
|
||||
const int total_count = netinfo_count + px_arry_num; // Total number of elements for netinfo and px_arry combined.
|
||||
|
||||
std::vector<int> combinedInfo(total_count);
|
||||
// Fill in the first 9 elements with the network info.
|
||||
combinedInfo[0] = num_class;
|
||||
combinedInfo[1] = kNumberOfPoints;
|
||||
combinedInfo[2] = kConfThreshKeypoints;
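// Note: combinedInfo is a vector of int, so a fractional kConfThreshKeypoints is truncated here
// (and read back as an int in YoloPluginCreator::createPlugin).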
|
||||
combinedInfo[3] = kInputW;
|
||||
combinedInfo[4] = kInputH;
|
||||
combinedInfo[5] = kMaxNumOutputBbox;
|
||||
combinedInfo[6] = is_segmentation;
|
||||
combinedInfo[7] = is_pose;
|
||||
combinedInfo[8] = is_obb;
|
||||
|
||||
// Copy the contents of px_arry into the combinedInfo vector after the initial 9 netinfo elements.
|
||||
std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);
|
||||
|
||||
// Now let's create the PluginField object to hold this combined information.
|
||||
nvinfer1::PluginField pluginField;
|
||||
pluginField.name = "combinedInfo"; // This can be any name that the plugin will recognize
|
||||
pluginField.data = combinedInfo.data();
|
||||
pluginField.type = nvinfer1::PluginFieldType::kINT32;
|
||||
pluginField.length = combinedInfo.size();
|
||||
|
||||
// Create the PluginFieldCollection to hold the PluginField object.
|
||||
nvinfer1::PluginFieldCollection pluginFieldCollection;
|
||||
pluginFieldCollection.nbFields = 1; // We have just one field, but it's a combined array
|
||||
pluginFieldCollection.fields = &pluginField;
|
||||
|
||||
// Create the plugin object using the PluginFieldCollection.
|
||||
nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection);
|
||||
|
||||
// We assume that the plugin is to be added onto the network.
|
||||
// Prepare input tensors for the YOLO Layer.
|
||||
std::vector<nvinfer1::ITensor*> inputTensors;
|
||||
for (auto det : dets) {
|
||||
inputTensors.push_back(det->getOutput(0)); // Assuming each IConcatenationLayer has one output tensor.
|
||||
}
|
||||
|
||||
// Add the plugin to the network using the prepared input tensors.
|
||||
nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject);
|
||||
|
||||
return yoloLayer; // Return the added YOLO layer.
|
||||
}
|
||||
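For reference, a minimal sketch of how addYoLoLayer would be invoked from the engine-building code; the layer names, strides and class count below are illustrative assumptions, not part of this commit.

static nvinfer1::IPluginV2Layer* attach_yolo_head(nvinfer1::INetworkDefinition* network,
                                                  nvinfer1::IConcatenationLayer* cat_det0,
                                                  nvinfer1::IConcatenationLayer* cat_det1,
                                                  nvinfer1::IConcatenationLayer* cat_det2) {
    int strides[] = {8, 16, 32};  // assumed downsampling factors of the three detection heads
    std::vector<nvinfer1::IConcatenationLayer*> dets{cat_det0, cat_det1, cat_det2};
    // Plain detection head: 80 classes (e.g. COCO), no segmentation / pose / OBB branch.
    return addYoLoLayer(network, dets, strides, 3, 80, false, false, false);
}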
80
src/calibrator.cpp
Normal file
@ -0,0 +1,80 @@
#include <iostream>
#include <iterator>
#include <fstream>
#include <opencv2/dnn/dnn.hpp>
#include "calibrator.h"
#include "cuda_utils.h"
#include "utils.h"

Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
                                               const char* calib_table_name, const char* input_blob_name,
                                               bool read_cache)
    : batchsize_(batchsize)
    , input_w_(input_w)
    , input_h_(input_h)
    , img_idx_(0)
    , img_dir_(img_dir)
    , calib_table_name_(calib_table_name)
    , input_blob_name_(input_blob_name)
    , read_cache_(read_cache)
{
    input_count_ = 3 * input_w * input_h * batchsize;
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    read_files_in_dir(img_dir, img_files_);
}

Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
    CUDA_CHECK(cudaFree(device_input_));
}

int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT
{
    return batchsize_;
}

bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT
{
    if (img_idx_ + batchsize_ > (int)img_files_.size()) {
        return false;
    }

    std::vector<cv::Mat> input_imgs_;
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << " " << i << std::endl;
        cv::Mat temp = cv::imread(img_dir_ + img_files_[i]);
        if (temp.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_);
        input_imgs_.push_back(pr_img);
    }
    img_idx_ += batchsize_;
    cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT
{
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();
    std::ifstream input(calib_table_name_, std::ios::binary);
    input >> std::noskipws;
    if (read_cache_ && input.good())
    {
        std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(calib_cache_));
    }
    length = calib_cache_.size();
    return length ? calib_cache_.data() : nullptr;
}

void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT
{
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
    std::ofstream output(calib_table_name_, std::ios::binary);
    output.write(reinterpret_cast<const char*>(cache), length);
}
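A minimal sketch (not part of this commit) of how Int8EntropyCalibrator2 would be wired into the builder when serializing an INT8 engine; it assumes an engine-building function that already holds an nvinfer1::IBuilderConfig* named config, and the calibration image directory and table name are placeholders.

    auto* calibrator = new Int8EntropyCalibrator2(kBatchSize, kInputW, kInputH,
                                                  "./coco_calib/", "int8calib.table", kInputTensorName);
    config->setFlag(nvinfer1::BuilderFlag::kINT8);   // request INT8 precision
    config->setInt8Calibrator(calibrator);           // TensorRT will call getBatch()/read/writeCalibrationCache()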
2750
src/model.cpp
Normal file
File diff suppressed because it is too large
507
src/postprocess.cpp
Normal file
@ -0,0 +1,507 @@
#include "postprocess.h"
|
||||
#include <algorithm>
|
||||
#include <iostream> // Include this header for printing
|
||||
#include "utils.h"
|
||||
|
||||
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
|
||||
float l, r, t, b;
|
||||
float r_w = kInputW / (img.cols * 1.0);
|
||||
float r_h = kInputH / (img.rows * 1.0);
|
||||
|
||||
if (r_h > r_w) {
|
||||
l = bbox[0];
|
||||
r = bbox[2];
|
||||
t = bbox[1] - (kInputH - r_w * img.rows) / 2;
|
||||
b = bbox[3] - (kInputH - r_w * img.rows) / 2;
|
||||
l = l / r_w;
|
||||
r = r / r_w;
|
||||
t = t / r_w;
|
||||
b = b / r_w;
|
||||
} else {
|
||||
l = bbox[0] - (kInputW - r_h * img.cols) / 2;
|
||||
r = bbox[2] - (kInputW - r_h * img.cols) / 2;
|
||||
t = bbox[1];
|
||||
b = bbox[3];
|
||||
l = l / r_h;
|
||||
r = r / r_h;
|
||||
t = t / r_h;
|
||||
b = b / r_h;
|
||||
}
|
||||
l = std::max(0.0f, l);
|
||||
t = std::max(0.0f, t);
|
||||
int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
|
||||
int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));
|
||||
|
||||
return cv::Rect(int(round(l)), int(round(t)), width, height);
|
||||
}
|
||||
|
||||
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) {
|
||||
float l, r, t, b;
|
||||
float r_w = kInputW / (img.cols * 1.0);
|
||||
float r_h = kInputH / (img.rows * 1.0);
|
||||
if (r_h > r_w) {
|
||||
l = bbox[0] / r_w;
|
||||
r = bbox[2] / r_w;
|
||||
t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w;
|
||||
b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w;
|
||||
for (int i = 0; i < kNumberOfPoints * 3; i += 3) {
|
||||
lmk[i] /= r_w;
|
||||
lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w;
|
||||
// lmk[i + 2]
|
||||
}
|
||||
} else {
|
||||
l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h;
|
||||
r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h;
|
||||
t = bbox[1] / r_h;
|
||||
b = bbox[3] / r_h;
|
||||
for (int i = 0; i < kNumberOfPoints * 3; i += 3) {
|
||||
lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h;
|
||||
lmk[i + 1] /= r_h;
|
||||
// lmk[i + 2]
|
||||
}
|
||||
}
|
||||
l = std::max(0.0f, l);
|
||||
t = std::max(0.0f, t);
|
||||
int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
|
||||
int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));
|
||||
|
||||
return cv::Rect(int(round(l)), int(round(t)), width, height);
|
||||
}
|
||||
|
||||
static float iou(float lbox[4], float rbox[4]) {
|
||||
float interBox[] = {
|
||||
(std::max)(lbox[0], rbox[0]),
|
||||
(std::min)(lbox[2], rbox[2]),
|
||||
(std::max)(lbox[1], rbox[1]),
|
||||
(std::min)(lbox[3], rbox[3]),
|
||||
};
|
||||
|
||||
if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
|
||||
return 0.0f;
|
||||
|
||||
float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]);
|
||||
float unionBoxS = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS;
|
||||
return interBoxS / unionBoxS;
|
||||
}
|
||||
|
||||
static bool cmp(const Detection& a, const Detection& b) {
|
||||
if (a.conf == b.conf) {
|
||||
return a.bbox[0] < b.bbox[0];
|
||||
}
|
||||
return a.conf > b.conf;
|
||||
}
|
||||
|
||||
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
|
||||
int det_size = sizeof(Detection) / sizeof(float);
|
||||
std::map<float, std::vector<Detection>> m;
|
||||
|
||||
for (int i = 0; i < output[0]; i++) {
|
||||
if (output[1 + det_size * i + 4] <= conf_thresh || isnan(output[1 + det_size * i + 4]))
|
||||
continue;
|
||||
Detection det;
|
||||
memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
|
||||
if (m.count(det.class_id) == 0)
|
||||
m.emplace(det.class_id, std::vector<Detection>());
|
||||
m[det.class_id].push_back(det);
|
||||
}
|
||||
for (auto it = m.begin(); it != m.end(); it++) {
|
||||
auto& dets = it->second;
|
||||
std::sort(dets.begin(), dets.end(), cmp);
|
||||
for (size_t m = 0; m < dets.size(); ++m) {
|
||||
auto& item = dets[m];
|
||||
res.push_back(item);
|
||||
for (size_t n = m + 1; n < dets.size(); ++n) {
|
||||
if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
|
||||
dets.erase(dets.begin() + n);
|
||||
--n;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
|
||||
float conf_thresh, float nms_thresh) {
|
||||
res_batch.resize(batch_size);
|
||||
for (int i = 0; i < batch_size; i++) {
|
||||
nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh);
|
||||
}
|
||||
}
|
||||
|
||||
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
|
||||
int count) {
|
||||
Detection det;
|
||||
for (int i = 0; i < count; i++) {
|
||||
int basic_pos = 1 + i * bbox_element;
|
||||
int keep_flag = decode_ptr_host[basic_pos + 6];
|
||||
if (keep_flag == 1) {
|
||||
det.bbox[0] = decode_ptr_host[basic_pos + 0];
|
||||
det.bbox[1] = decode_ptr_host[basic_pos + 1];
|
||||
det.bbox[2] = decode_ptr_host[basic_pos + 2];
|
||||
det.bbox[3] = decode_ptr_host[basic_pos + 3];
|
||||
det.conf = decode_ptr_host[basic_pos + 4];
|
||||
det.class_id = decode_ptr_host[basic_pos + 5];
|
||||
res.push_back(det);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
|
||||
int bbox_element, const std::vector<cv::Mat>& img_batch) {
|
||||
res_batch.resize(batch_size);
|
||||
int count = static_cast<int>(*decode_ptr_host);
|
||||
count = std::min(count, kMaxNumOutputBbox);
|
||||
for (int i = 0; i < batch_size; i++) {
|
||||
auto& img = const_cast<cv::Mat&>(img_batch[i]);
|
||||
process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
|
||||
}
|
||||
}
|
||||
|
||||
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
|
||||
for (size_t i = 0; i < img_batch.size(); i++) {
|
||||
auto& res = res_batch[i];
|
||||
cv::Mat img = img_batch[i];
|
||||
for (size_t j = 0; j < res.size(); j++) {
|
||||
cv::Rect r = get_rect(img, res[j].bbox);
|
||||
cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
|
||||
cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2,
|
||||
cv::Scalar(0xFF, 0xFF, 0xFF), 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
|
||||
const std::vector<std::pair<int, int>> skeleton_pairs = {
|
||||
{0, 1}, {0, 2}, {0, 5}, {0, 6}, {1, 2}, {1, 3}, {2, 4}, {5, 6}, {5, 7}, {5, 11},
|
||||
{6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}};
|
||||
|
||||
for (size_t i = 0; i < img_batch.size(); i++) {
|
||||
auto& res = res_batch[i];
|
||||
cv::Mat img = img_batch[i];
|
||||
for (size_t j = 0; j < res.size(); j++) {
|
||||
cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints);
|
||||
cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
|
||||
cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2,
|
||||
cv::Scalar(0xFF, 0xFF, 0xFF), 2);
|
||||
|
||||
for (int k = 0; k < kNumberOfPoints * 3; k += 3) {
|
||||
if (res[j].keypoints[k + 2] > 0.5) {
|
||||
cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3,
|
||||
cv::Scalar(0, 0x27, 0xC1), -1);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& bone : skeleton_pairs) {
|
||||
int kp1_idx = bone.first * 3;
|
||||
int kp2_idx = bone.second * 3;
|
||||
if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) {
|
||||
cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]);
|
||||
cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]);
|
||||
cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
|
||||
int x, y, w, h;
|
||||
float r_w = kInputW / (img.cols * 1.0);
|
||||
float r_h = kInputH / (img.rows * 1.0);
|
||||
if (r_h > r_w) {
|
||||
w = kInputW;
|
||||
h = r_w * img.rows;
|
||||
x = 0;
|
||||
y = (kInputH - h) / 2;
|
||||
} else {
|
||||
w = r_h * img.cols;
|
||||
h = kInputH;
|
||||
x = (kInputW - w) / 2;
|
||||
y = 0;
|
||||
}
|
||||
cv::Rect r(x, y, w, h);
|
||||
cv::Mat res;
|
||||
cv::resize(mask(r), res, img.size());
|
||||
return res;
|
||||
}
|
||||
|
||||
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
|
||||
std::unordered_map<int, std::string>& labels_map) {
|
||||
static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
|
||||
0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
|
||||
0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
|
||||
for (size_t i = 0; i < dets.size(); i++) {
|
||||
cv::Mat img_mask = scale_mask(masks[i], img);
|
||||
auto color = colors[(int)dets[i].class_id % colors.size()];
|
||||
auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);
|
||||
|
||||
cv::Rect r = get_rect(img, dets[i].bbox);
|
||||
for (int x = r.x; x < r.x + r.width; x++) {
|
||||
for (int y = r.y; y < r.y + r.height; y++) {
|
||||
float val = img_mask.at<float>(y, x);
|
||||
if (val <= 0.5)
|
||||
continue;
|
||||
img.at<cv::Vec3b>(y, x)[0] = img.at<cv::Vec3b>(y, x)[0] / 2 + bgr[0] / 2;
|
||||
img.at<cv::Vec3b>(y, x)[1] = img.at<cv::Vec3b>(y, x)[1] / 2 + bgr[1] / 2;
|
||||
img.at<cv::Vec3b>(y, x)[2] = img.at<cv::Vec3b>(y, x)[2] / 2 + bgr[2] / 2;
|
||||
}
|
||||
}
|
||||
|
||||
cv::rectangle(img, r, bgr, 2);
|
||||
|
||||
// Get the size of the text
|
||||
cv::Size textSize =
|
||||
cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf),
|
||||
cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL);
|
||||
// Set the top left corner of the rectangle
|
||||
cv::Point topLeft(r.x, r.y - textSize.height);
|
||||
|
||||
// Set the bottom right corner of the rectangle
|
||||
cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);
|
||||
|
||||
// Set the thickness of the rectangle lines
|
||||
int lineThickness = 2;
|
||||
|
||||
// Draw the rectangle on the image
|
||||
cv::rectangle(img, topLeft, bottomRight, bgr, -1);
|
||||
|
||||
cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf),
|
||||
cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
|
||||
}
|
||||
}
|
||||
|
||||
void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
|
||||
cv::Mat& img, int count) {
|
||||
Detection det;
|
||||
for (int i = 0; i < count; i++) {
|
||||
int basic_pos = 1 + i * bbox_element;
|
||||
int keep_flag = decode_ptr_host[basic_pos + 6];
|
||||
if (keep_flag == 1) {
|
||||
det.bbox[0] = decode_ptr_host[basic_pos + 0];
|
||||
det.bbox[1] = decode_ptr_host[basic_pos + 1];
|
||||
det.bbox[2] = decode_ptr_host[basic_pos + 2];
|
||||
det.bbox[3] = decode_ptr_host[basic_pos + 3];
|
||||
det.conf = decode_ptr_host[basic_pos + 4];
|
||||
det.class_id = decode_ptr_host[basic_pos + 5];
|
||||
det.angle = decode_ptr_host[basic_pos + 7];
|
||||
res.push_back(det);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
|
||||
int bbox_element, const std::vector<cv::Mat>& img_batch) {
|
||||
res_batch.resize(batch_size);
|
||||
int count = static_cast<int>(*decode_ptr_host);
|
||||
count = std::min(count, kMaxNumOutputBbox);
|
||||
for (int i = 0; i < batch_size; i++) {
|
||||
auto& img = const_cast<cv::Mat&>(img_batch[i]);
|
||||
process_decode_ptr_host_obb(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
|
||||
}
|
||||
}
|
||||
|
||||
std::tuple<float, float, float> convariance_matrix(Detection res) {
|
||||
float w = res.bbox[2];
|
||||
float h = res.bbox[3];
|
||||
|
||||
float a = w * w / 12.0;
|
||||
float b = h * h / 12.0;
|
||||
float c = res.angle;
|
||||
|
||||
float cos_r = std::cos(c);
|
||||
float sin_r = std::sin(c);
|
||||
|
||||
float cos_r2 = cos_r * cos_r;
|
||||
float sin_r2 = sin_r * sin_r;
|
||||
|
||||
float a_val = a * cos_r2 + b * sin_r2;
|
||||
float b_val = a * sin_r2 + b * cos_r2;
|
||||
float c_val = (a - b) * cos_r * sin_r;
|
||||
|
||||
return std::make_tuple(a_val, b_val, c_val);
|
||||
}
|
||||
|
||||
static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) {
    // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
    float a1, b1, c1, a2, b2, c2;
    std::tuple<float, float, float> matrix1 = convariance_matrix(res1);
    std::tuple<float, float, float> matrix2 = convariance_matrix(res2);
    a1 = std::get<0>(matrix1);
    b1 = std::get<1>(matrix1);
    c1 = std::get<2>(matrix1);
    a2 = std::get<0>(matrix2);
    b2 = std::get<1>(matrix2);
    c2 = std::get<2>(matrix2);
|
||||
|
||||
float x1 = res1.bbox[0], y1 = res1.bbox[1];
|
||||
float x2 = res2.bbox[0], y2 = res2.bbox[1];
|
||||
|
||||
float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) /
|
||||
((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps);
|
||||
float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps);
|
||||
float t3 = std::log(
|
||||
((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2)) /
|
||||
(4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) +
|
||||
eps) +
|
||||
eps);
|
||||
|
||||
float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
|
||||
bd = std::max(std::min(bd, 100.0f), eps);
|
||||
float hd = std::sqrt(1.0 - std::exp(-bd) + eps);
|
||||
|
||||
return 1 - hd;
|
||||
}
|
||||
|
||||
void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
|
||||
int det_size = sizeof(Detection) / sizeof(float);
|
||||
std::map<float, std::vector<Detection>> m;
|
||||
|
||||
for (int i = 0; i < output[0]; i++) {
|
||||
|
||||
if (output[1 + det_size * i + 4] <= conf_thresh)
|
||||
continue;
|
||||
Detection det;
|
||||
memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
|
||||
if (m.count(det.class_id) == 0)
|
||||
m.emplace(det.class_id, std::vector<Detection>());
|
||||
m[det.class_id].push_back(det);
|
||||
}
|
||||
for (auto it = m.begin(); it != m.end(); it++) {
|
||||
auto& dets = it->second;
|
||||
std::sort(dets.begin(), dets.end(), cmp);
|
||||
for (size_t m = 0; m < dets.size(); ++m) {
|
||||
auto& item = dets[m];
|
||||
res.push_back(item);
|
||||
for (size_t n = m + 1; n < dets.size(); ++n) {
|
||||
if (probiou(item, dets[n]) >= nms_thresh) {
|
||||
dets.erase(dets.begin() + n);
|
||||
--n;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void batch_nms_obb(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
|
||||
float conf_thresh, float nms_thresh) {
|
||||
res_batch.resize(batch_size);
|
||||
for (int i = 0; i < batch_size; i++) {
|
||||
nms_obb(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh);
|
||||
}
|
||||
}
|
||||
|
||||
static std::vector<cv::Point> get_corner(cv::Mat& img, const Detection& box) {
|
||||
float cos_value, sin_value;
|
||||
|
||||
// Calculate center point and width/height
|
||||
float x1 = box.bbox[0];
|
||||
float y1 = box.bbox[1];
|
||||
float w = box.bbox[2];
|
||||
float h = box.bbox[3];
|
||||
float angle = box.angle * 180.0f / CV_PI; // Convert radians to degrees
|
||||
|
||||
// Print original angle
|
||||
std::cout << "Original angle: " << angle << std::endl;
|
||||
|
||||
// Swap width and height if height is greater than or equal to width
|
||||
if (h >= w) {
|
||||
std::swap(w, h);
|
||||
angle = fmod(angle + 90.0f, 180.0f); // Adjust angle to be within [0, 180)
|
||||
}
|
||||
|
||||
// Ensure the angle is between 0 and 180 degrees
|
||||
if (angle < 0) {
|
||||
angle += 360.0f; // Convert to positive value
|
||||
}
|
||||
if (angle > 180.0f) {
|
||||
angle -= 180.0f; // Subtract 180 from angles greater than 180
|
||||
}
|
||||
|
||||
// Print adjusted angle
|
||||
std::cout << "Adjusted angle: " << angle << std::endl;
|
||||
|
||||
// Convert to normal angle value
|
||||
float normal_angle = fmod(angle, 180.0f);
|
||||
if (normal_angle < 0) {
|
||||
normal_angle += 180.0f; // Ensure it's a positive value
|
||||
}
|
||||
|
||||
// Print normal angle value
|
||||
std::cout << "Normal angle: " << normal_angle << std::endl;
|
||||
|
||||
cos_value = std::cos(angle * CV_PI / 180.0f); // Convert to radians
|
||||
sin_value = std::sin(angle * CV_PI / 180.0f);
|
||||
|
||||
// Calculate each corner point
|
||||
float l = x1 - w / 2; // Left boundary
|
||||
float r = x1 + w / 2; // Right boundary
|
||||
float t = y1 - h / 2; // Top boundary
|
||||
float b = y1 + h / 2; // Bottom boundary
|
||||
|
||||
// Use get_rect function to scale the coordinates
|
||||
float bbox[4] = {l, t, r, b};
|
||||
cv::Rect rect = get_rect(img, bbox);
|
||||
|
||||
float x_ = (rect.x + rect.x + rect.width) / 2; // Center x
|
||||
float y_ = (rect.y + rect.y + rect.height) / 2; // Center y
|
||||
float width = rect.width; // Width
|
||||
float height = rect.height; // Height
|
||||
|
||||
// Calculate each corner point
|
||||
std::vector<cv::Point> corner_points(4);
|
||||
float vec1x = width / 2 * cos_value;
|
||||
float vec1y = width / 2 * sin_value;
|
||||
float vec2x = -height / 2 * sin_value;
|
||||
float vec2y = height / 2 * cos_value;
|
||||
|
||||
corner_points[0] = cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y))); // Top-left corner
|
||||
corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y))); // Top-right corner
|
||||
corner_points[2] =
|
||||
cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y))); // Bottom-right corner
|
||||
corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y))); // Bottom-left corner
|
||||
|
||||
// Check and adjust corner points to ensure the rectangle is parallel to image boundaries
|
||||
for (auto& point : corner_points) {
|
||||
point.x = std::max(0, std::min(point.x, img.cols - 1));
|
||||
point.y = std::max(0, std::min(point.y, img.rows - 1));
|
||||
}
|
||||
|
||||
return corner_points;
|
||||
}
|
||||
|
||||
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
|
||||
static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
|
||||
0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
|
||||
0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
|
||||
for (size_t i = 0; i < img_batch.size(); i++) {
|
||||
auto& res = res_batch[i];
|
||||
auto& img = img_batch[i];
|
||||
for (auto& obj : res) {
|
||||
auto color = colors[(int)obj.class_id % colors.size()];
|
||||
auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);
|
||||
auto corner_points = get_corner(img, obj);
|
||||
cv::polylines(img, std::vector<std::vector<cv::Point>>{corner_points}, true, bgr, 1);
|
||||
|
||||
auto text = (std::to_string((int)(obj.class_id)) + ":" + to_string_with_precision(obj.conf));
|
||||
cv::Size textsize = cv::getTextSize(text, 0, 0.3, 1, nullptr);
|
||||
|
||||
int width = textsize.width;
|
||||
int height = textsize.height;
|
||||
bool outside = (corner_points[0].y - height >= 3) ? true : false;
|
||||
cv::Point p1(corner_points[0].x, corner_points[0].y), p2;
|
||||
p2.x = corner_points[0].x + width;
|
||||
if (outside) {
|
||||
p2.y = corner_points[0].y - height - 3;
|
||||
} else {
|
||||
p2.y = corner_points[0].y + height + 3;
|
||||
}
|
||||
cv::rectangle(img, p1, p2, bgr, -1, cv::LINE_AA);
|
||||
cv::putText(
|
||||
img, text,
|
||||
cv::Point(corner_points[0].x, (outside ? corner_points[0].y - 2 : corner_points[0].y + height + 2)),
|
||||
0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA);
|
||||
}
|
||||
}
|
||||
}
|
||||
193
src/postprocess.cu
Normal file
@ -0,0 +1,193 @@
//
// Created by lindsay on 23-7-17.
//
#include "postprocess.h"
#include "types.h"

static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                         int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    float confidence = pitem[4];

    if (confidence < confidence_threshold)
        return;
    //[center_x center_y w h conf class_id mask[32] keypoints[51] angle]
    float cx = pitem[0];
    float cy = pitem[1];
    float width = pitem[2];
    float height = pitem[3];
    float label = pitem[5];
    float angle = pitem[89];

    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = cx;
    *pout_item++ = cy;
    *pout_item++ = width;
    *pout_item++ = height;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
    *pout_item++ = angle;
}

static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                     int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    float left = pitem[0];
    float top = pitem[1];
    float right = pitem[2];
    float bottom = pitem[3];
    float label = pitem[5];

    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
}

static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop,
                                float bright, float bbottom) {
    float cleft = max(aleft, bleft);
    float ctop = max(atop, btop);
    float cright = min(aright, bright);
    float cbottom = min(abottom, bbottom);
    float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
    if (c_area == 0.0f)
        return 0.0f;

    float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return c_area / (a_area + b_area - c_area);
}

static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) {
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    int count = bboxes[0];
    if (position >= count)
        return;

    float* pcurrent = bboxes + 1 + position * bbox_element;
    for (int i = 0; i < count; ++i) {
        float* pitem = bboxes + 1 + i * bbox_element;
        if (i == position || pcurrent[5] != pitem[5])
            continue;
        if (pitem[4] >= pcurrent[4]) {
            if (pitem[4] == pcurrent[4] && i < position)
                continue;
            float iou =
                    box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1], pitem[2], pitem[3]);
            if (iou > threshold) {
                pcurrent[6] = 0;
                return;
            }
        }
    }
}

static __device__ void convariance_matrix(float w, float h, float r, float& a, float& b, float& c) {
    float a_val = w * w / 12.0f;
    float b_val = h * h / 12.0f;
    float cos_r = cosf(r);
    float sin_r = sinf(r);

    a = a_val * cos_r * cos_r + b_val * sin_r * sin_r;
    b = a_val * sin_r * sin_r + b_val * cos_r * cos_r;
    c = (a_val - b_val) * sin_r * cos_r;
}

static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2,
                                    float h2, float r2, float eps = 1e-7) {

    // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
    float a1, b1, c1, a2, b2, c2;
    convariance_matrix(w1, h1, r1, a1, b1, c1);
    convariance_matrix(w2, h2, r2, a2, b2, c2);

    float t1 = ((a1 + a2) * powf(cy1 - cy2, 2) + (b1 + b2) * powf(cx1 - cx2, 2)) /
               ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);
    float t2 = ((c1 + c2) * (cx2 - cx1) * (cy1 - cy2)) / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);
    float t3 = logf(((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2)) /
                            (4 * sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f)) * sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f)) + eps) +
                    eps);
    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    bd = fmaxf(fminf(bd, 100.0f), eps);
    float hd = sqrtf(1.0f - expf(-bd) + eps);
    return 1 - hd;
}

static __global__ void nms_kernel_obb(float* bboxes, int max_objects, float threshold) {
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    int count = bboxes[0];
    if (position >= count)
        return;

    float* pcurrent = bboxes + 1 + position * bbox_element;
    for (int i = 0; i < count; ++i) {
        float* pitem = bboxes + 1 + i * bbox_element;
        if (i == position || pcurrent[5] != pitem[5])
            continue;
        if (pitem[4] >= pcurrent[4]) {
            if (pitem[4] == pcurrent[4] && i < position)
                continue;
            float iou = box_probiou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pcurrent[7], pitem[0], pitem[1],
                                    pitem[2], pitem[3], pitem[7]);
            if (iou > threshold) {
                pcurrent[6] = 0;
                return;
            }
        }
    }
}

void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream) {
    int block = 256;
    int grid = ceil(num_bboxes / (float)block);
    decode_kernel<<<grid, block, 0, stream>>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects);
}

void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    int block = max_objects < 256 ? max_objects : 256;
    int grid = ceil(max_objects / (float)block);
    nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}

void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream) {
    int block = 256;
    int grid = ceil(num_bboxes / (float)block);
    decode_kernel_obb<<<grid, block, 0, stream>>>((float*)predict, num_bboxes, confidence_threshold, parray,
                                                  max_objects);
}

void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    int block = max_objects < 256 ? max_objects : 256;
    int grid = ceil(max_objects / (float)block);
    nms_kernel_obb<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}
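For orientation, a host-side sketch of the GPU decode + NMS round trip implemented above; it assumes decode_ptr_device/decode_ptr_host were sized as (1 + kMaxNumOutputBbox * bbox_element) floats and that model_output_device (hypothetical name) points at the raw engine output. This mirrors the "g" post-processing path in the *_det.cpp entry points below.

    CUDA_CHECK(cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
    cuda_decode(model_output_device, model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
    cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);
    CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                               sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    // decode_ptr_host[0] holds the candidate count; each candidate is bbox_element floats,
    // with the keep flag at offset 6 (consumed by process_decode_ptr_host in postprocess.cpp).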
139
src/preprocess.cu
Normal file
@ -0,0 +1,139 @@
#include "cuda_utils.h"
#include "preprocess.h"

static uint8_t* img_buffer_host = nullptr;
static uint8_t* img_buffer_device = nullptr;

__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;

    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // bgr to rgb
    float t = c2;
    c2 = c0;
    c0 = t;

    // normalization
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    // rgbrgbrgb to rrrgggbbb
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}

void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    int img_size = src_width * src_height * 3;
    // copy data to pinned memory
    memcpy(img_buffer_host, src, img_size);
    // copy data to device memory
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

    memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));

    int jobs = dst_height * dst_width;
    int threads = 256;
    int blocks = ceil(jobs / (float)threads);
    warpaffine_kernel<<<blocks, threads, 0, stream>>>(img_buffer_device, src_width * 3, src_width, src_height, dst,
                                                      dst_width, dst_height, 128, d2s, jobs);
}

void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    int dst_size = dst_width * dst_height * 3;
    for (size_t i = 0; i < img_batch.size(); i++) {
        cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width,
                        dst_height, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}

void cuda_preprocess_init(int max_image_size) {
    // prepare input data in pinned memory
    CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3));
    // prepare input data in device memory
    CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3));
}

void cuda_preprocess_destroy() {
    CUDA_CHECK(cudaFree(img_buffer_device));
    CUDA_CHECK(cudaFreeHost(img_buffer_host));
}
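A minimal usage sketch of this preprocessing path, assuming a single BGR cv::Mat named frame (hypothetical) and a device buffer input_device of kBatchSize * 3 * kInputH * kInputW floats as allocated by prepare_buffer below; the detectors use the batch variant cuda_batch_preprocess.

    cuda_preprocess_init(kMaxInputImageSize);              // allocate pinned + device staging buffers
    cuda_preprocess(frame.ptr(), frame.cols, frame.rows,   // letterbox resize, BGR->RGB, HWC->CHW, /255
                    input_device, kInputW, kInputH, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    cuda_preprocess_destroy();                             // release staging buffers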
278
yolov8_5u_det.cpp
Normal file
@ -0,0 +1,278 @@
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include "cuda_utils.h"
|
||||
#include "logging.h"
|
||||
#include "model.h"
|
||||
#include "postprocess.h"
|
||||
#include "preprocess.h"
|
||||
#include "utils.h"
|
||||
|
||||
Logger gLogger;
|
||||
using namespace nvinfer1;
|
||||
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
|
||||
|
||||
void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
|
||||
float& gw, int& max_channels) {
|
||||
IBuilder* builder = createInferBuilder(gLogger);
|
||||
IBuilderConfig* config = builder->createBuilderConfig();
|
||||
IHostMemory* serialized_engine = nullptr;
|
||||
|
||||
if (is_p == 6) {
|
||||
serialized_engine =
|
||||
buildEngineYolov8_5uDetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
|
||||
} else {
|
||||
serialized_engine = buildEngineYolov8_5uDet(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
|
||||
}
|
||||
|
||||
assert(serialized_engine);
|
||||
std::ofstream p(engine_name, std::ios::binary);
|
||||
if (!p) {
|
||||
std::cout << "could not open plan output file" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
|
||||
|
||||
delete serialized_engine;
|
||||
delete config;
|
||||
delete builder;
|
||||
}
|
||||
|
||||
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
|
||||
IExecutionContext** context) {
|
||||
std::ifstream file(engine_name, std::ios::binary);
|
||||
if (!file.good()) {
|
||||
std::cerr << "read " << engine_name << " error!" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
size_t size = 0;
|
||||
file.seekg(0, file.end);
|
||||
size = file.tellg();
|
||||
file.seekg(0, file.beg);
|
||||
char* serialized_engine = new char[size];
|
||||
assert(serialized_engine);
|
||||
file.read(serialized_engine, size);
|
||||
file.close();
|
||||
|
||||
*runtime = createInferRuntime(gLogger);
|
||||
assert(*runtime);
|
||||
*engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
|
||||
assert(*engine);
|
||||
*context = (*engine)->createExecutionContext();
|
||||
assert(*context);
|
||||
delete[] serialized_engine;
|
||||
}
|
||||
|
||||
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
|
||||
float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
|
||||
std::string cuda_post_process) {
|
||||
assert(engine->getNbBindings() == 2);
|
||||
// In order to bind the buffers, we need to know the names of the input and
|
||||
// output tensors. Note that indices are guaranteed to be less than
|
||||
// IEngine::getNbBindings()
|
||||
const int inputIndex = engine->getBindingIndex(kInputTensorName);
|
||||
const int outputIndex = engine->getBindingIndex(kOutputTensorName);
|
||||
assert(inputIndex == 0);
|
||||
assert(outputIndex == 1);
|
||||
// Create GPU buffers on device
|
||||
CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
|
||||
CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
|
||||
if (cuda_post_process == "c") {
|
||||
*output_buffer_host = new float[kBatchSize * kOutputSize];
|
||||
} else if (cuda_post_process == "g") {
|
||||
if (kBatchSize > 1) {
|
||||
std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
|
||||
exit(0);
|
||||
}
|
||||
// Allocate memory for decode_ptr_host and copy to device
|
||||
*decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
|
||||
CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
|
||||
}
|
||||
}
|
||||
|
||||
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
|
||||
float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
|
||||
// infer on the batch asynchronously, and DMA output back to host
|
||||
auto start = std::chrono::system_clock::now();
|
||||
context.enqueue(batchsize, buffers, stream, nullptr);
|
||||
if (cuda_post_process == "c") {
|
||||
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
|
||||
stream));
|
||||
auto end = std::chrono::system_clock::now();
|
||||
std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
|
||||
<< "ms" << std::endl;
|
||||
} else if (cuda_post_process == "g") {
|
||||
CUDA_CHECK(
|
||||
cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
|
||||
cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
|
||||
cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox,
|
||||
stream); // cuda nms
|
||||
CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
|
||||
sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
|
||||
stream));
|
||||
auto end = std::chrono::system_clock::now();
|
||||
std::cout << "inference and gpu postprocess time: "
|
||||
<< std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
|
||||
}
|
||||
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
|
||||
std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
|
||||
if (argc < 4)
|
||||
return false;
|
||||
if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
|
||||
wts = std::string(argv[2]);
|
||||
engine = std::string(argv[3]);
|
||||
sub_type = std::string(argv[4]);
|
||||
|
||||
if (sub_type[0] == 'n') {
|
||||
gd = 0.33;
|
||||
gw = 0.25;
|
||||
max_channels = 1024;
|
||||
} else if (sub_type[0] == 's') {
|
||||
gd = 0.33;
|
||||
gw = 0.50;
|
||||
max_channels = 1024;
|
||||
} else if (sub_type[0] == 'm') {
|
||||
gd = 0.67;
|
||||
gw = 0.75;
|
||||
max_channels = 576;
|
||||
} else if (sub_type[0] == 'l') {
|
||||
gd = 1.0;
|
||||
gw = 1.0;
|
||||
max_channels = 512;
|
||||
} else if (sub_type[0] == 'x') {
|
||||
gd = 1.33;
|
||||
gw = 1.25;
|
||||
max_channels = 640;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
if (sub_type.size() == 2 && sub_type[1] == '6') {
|
||||
is_p = 6;
|
||||
}
|
||||
} else if (std::string(argv[1]) == "-d" && argc == 5) {
|
||||
engine = std::string(argv[2]);
|
||||
img_dir = std::string(argv[3]);
|
||||
cuda_post_process = std::string(argv[4]);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
cudaSetDevice(kGpuId);
|
||||
std::string wts_name = "";
|
||||
std::string engine_name = "";
|
||||
std::string img_dir;
|
||||
std::string sub_type = "";
|
||||
std::string cuda_post_process = "";
|
||||
int model_bboxes;
|
||||
int is_p = 0;
|
||||
float gd = 0.0f, gw = 0.0f;
|
||||
int max_channels = 0;
|
||||
|
||||
if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
|
||||
max_channels)) {
|
||||
std::cerr << "Arguments not right!" << std::endl;
|
||||
std::cerr << "./yolov8_5u_det -s [.wts] [.engine] "
|
||||
"[n/s/m/l/x//n6/s6/m6/l6/x6] // serialize model to "
|
||||
"plan file"
|
||||
<< std::endl;
|
||||
std::cerr << "./yolov8_5u_det -d [.engine] ../samples [c/g]// deserialize "
|
||||
"plan file and run inference"
|
||||
<< std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Create a model using the API directly and serialize it to a file
|
||||
if (!wts_name.empty()) {
|
||||
serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Deserialize the engine from file
|
||||
IRuntime* runtime = nullptr;
|
||||
ICudaEngine* engine = nullptr;
|
||||
IExecutionContext* context = nullptr;
|
||||
deserialize_engine(engine_name, &runtime, &engine, &context);
|
||||
cudaStream_t stream;
|
||||
CUDA_CHECK(cudaStreamCreate(&stream));
|
||||
cuda_preprocess_init(kMaxInputImageSize);
|
||||
auto out_dims = engine->getBindingDimensions(1);
|
||||
model_bboxes = out_dims.d[0];
|
||||
// Prepare cpu and gpu buffers
|
||||
float* device_buffers[2];
|
||||
float* output_buffer_host = nullptr;
|
||||
float* decode_ptr_host = nullptr;
|
||||
float* decode_ptr_device = nullptr;
|
||||
|
||||
// Read images from directory
|
||||
std::vector<std::string> file_names;
|
||||
if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
|
||||
std::cerr << "read_files_in_dir failed." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
|
||||
&decode_ptr_device, cuda_post_process);
|
||||
|
||||
// batch predict
|
||||
for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
|
||||
// Get a batch of images
|
||||
std::vector<cv::Mat> img_batch;
|
||||
std::vector<std::string> img_name_batch;
|
||||
for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
|
||||
cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
|
||||
img_batch.push_back(img);
|
||||
img_name_batch.push_back(file_names[j]);
|
||||
}
|
||||
// Preprocess
|
||||
cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
|
||||
// Run inference
|
||||
infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
|
||||
decode_ptr_device, model_bboxes, cuda_post_process);
|
||||
std::vector<std::vector<Detection>> res_batch;
|
||||
if (cuda_post_process == "c") {
|
||||
// NMS
|
||||
batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
|
||||
} else if (cuda_post_process == "g") {
|
||||
// Process gpu decode and nms results
|
||||
batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
|
||||
}
|
||||
// Draw bounding boxes
|
||||
draw_bbox(img_batch, res_batch);
|
||||
// Save images
|
||||
for (size_t j = 0; j < img_batch.size(); j++) {
|
||||
cv::imwrite("_" + img_name_batch[j], img_batch[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Release stream and buffers
|
||||
cudaStreamDestroy(stream);
|
||||
CUDA_CHECK(cudaFree(device_buffers[0]));
|
||||
CUDA_CHECK(cudaFree(device_buffers[1]));
|
||||
CUDA_CHECK(cudaFree(decode_ptr_device));
|
||||
delete[] decode_ptr_host;
|
||||
delete[] output_buffer_host;
|
||||
cuda_preprocess_destroy();
|
||||
// Destroy the engine
|
||||
delete context;
|
||||
delete engine;
|
||||
delete runtime;
|
||||
|
||||
// Print histogram of the output distribution
|
||||
// std::cout << "\nOutput:\n\n";
|
||||
// for (unsigned int i = 0; i < kOutputSize; i++)
|
||||
//{
|
||||
// std::cout << prob[i] << ", ";
|
||||
// if (i % 10 == 0) std::cout << std::endl;
|
||||
//}
|
||||
// std::cout << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
462
yolov8_5u_det_trt.py
Normal file
@ -0,0 +1,462 @@
|
||||
"""
|
||||
An example that uses TensorRT's Python api to make inferences.
|
||||
"""
|
||||
import ctypes
|
||||
import os
|
||||
import shutil
|
||||
import random
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pycuda.autoinit # noqa: F401
|
||||
import pycuda.driver as cuda
|
||||
import tensorrt as trt
|
||||
|
||||
CONF_THRESH = 0.5
|
||||
IOU_THRESHOLD = 0.4
|
||||
POSE_NUM = 17 * 3
|
||||
DET_NUM = 6
|
||||
SEG_NUM = 32
|
||||
OBB_NUM = 1
|
||||
|
||||
|
||||
def get_img_path_batches(batch_size, img_dir):
|
||||
ret = []
|
||||
batch = []
|
||||
for root, dirs, files in os.walk(img_dir):
|
||||
for name in files:
|
||||
if len(batch) == batch_size:
|
||||
ret.append(batch)
|
||||
batch = []
|
||||
batch.append(os.path.join(root, name))
|
||||
if len(batch) > 0:
|
||||
ret.append(batch)
|
||||
return ret
|
||||
|
||||
|
||||
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
|
||||
"""
|
||||
description: Plots one bounding box on image img,
|
||||
this function comes from YoLov8 project.
|
||||
param:
|
||||
x: a box likes [x1,y1,x2,y2]
|
||||
img: a opencv image object
|
||||
color: color to draw rectangle, such as (0,255,0)
|
||||
label: str
|
||||
line_thickness: int
|
||||
return:
|
||||
no return
|
||||
|
||||
"""
|
||||
tl = (
|
||||
line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
|
||||
) # line/font thickness
|
||||
color = color or [random.randint(0, 255) for _ in range(3)]
|
||||
c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
|
||||
cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
|
||||
if label:
|
||||
tf = max(tl - 1, 1) # font thickness
|
||||
t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
|
||||
c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
|
||||
cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled
|
||||
cv2.putText(
|
||||
img,
|
||||
label,
|
||||
(c1[0], c1[1] - 2),
|
||||
0,
|
||||
tl / 3,
|
||||
[225, 255, 255],
|
||||
thickness=tf,
|
||||
lineType=cv2.LINE_AA,
|
||||
)
|
||||
|
||||
|
||||
class YoLov8TRT(object):
|
||||
"""
|
||||
description: A YOLOv8 class that wraps TensorRT ops, preprocess and postprocess ops.
|
||||
"""
|
||||
|
||||
def __init__(self, engine_file_path):
|
||||
# Create a Context on this device,
|
||||
self.ctx = cuda.Device(0).make_context()
|
||||
stream = cuda.Stream()
|
||||
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
|
||||
runtime = trt.Runtime(TRT_LOGGER)
|
||||
|
||||
# Deserialize the engine from file
|
||||
with open(engine_file_path, "rb") as f:
|
||||
engine = runtime.deserialize_cuda_engine(f.read())
|
||||
context = engine.create_execution_context()
|
||||
|
||||
host_inputs = []
|
||||
cuda_inputs = []
|
||||
host_outputs = []
|
||||
cuda_outputs = []
|
||||
bindings = []
|
||||
|
||||
for binding in engine:
|
||||
print('binding:', binding, engine.get_binding_shape(binding))
|
||||
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
|
||||
dtype = trt.nptype(engine.get_binding_dtype(binding))
|
||||
# Allocate host and device buffers
|
||||
host_mem = cuda.pagelocked_empty(size, dtype)
|
||||
cuda_mem = cuda.mem_alloc(host_mem.nbytes)
|
||||
# Append the device buffer to device bindings.
|
||||
bindings.append(int(cuda_mem))
|
||||
# Append to the appropriate list.
|
||||
if engine.binding_is_input(binding):
|
||||
self.input_w = engine.get_binding_shape(binding)[-1]
|
||||
self.input_h = engine.get_binding_shape(binding)[-2]
|
||||
host_inputs.append(host_mem)
|
||||
cuda_inputs.append(cuda_mem)
|
||||
else:
|
||||
host_outputs.append(host_mem)
|
||||
cuda_outputs.append(cuda_mem)
|
||||
|
||||
# Store
|
||||
self.stream = stream
|
||||
self.context = context
|
||||
self.engine = engine
|
||||
self.host_inputs = host_inputs
|
||||
self.cuda_inputs = cuda_inputs
|
||||
self.host_outputs = host_outputs
|
||||
self.cuda_outputs = cuda_outputs
|
||||
self.bindings = bindings
|
||||
self.batch_size = engine.max_batch_size
|
||||
self.det_output_length = host_outputs[0].shape[0]
|
||||
|
||||
def infer(self, raw_image_generator):
|
||||
threading.Thread.__init__(self)
|
||||
# Make self the active context, pushing it on top of the context stack.
|
||||
self.ctx.push()
|
||||
# Restore
|
||||
stream = self.stream
|
||||
context = self.context
|
||||
host_inputs = self.host_inputs
|
||||
cuda_inputs = self.cuda_inputs
|
||||
host_outputs = self.host_outputs
|
||||
cuda_outputs = self.cuda_outputs
|
||||
bindings = self.bindings
|
||||
# Do image preprocess
|
||||
batch_image_raw = []
|
||||
batch_origin_h = []
|
||||
batch_origin_w = []
|
||||
batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
|
||||
for i, image_raw in enumerate(raw_image_generator):
|
||||
input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
|
||||
batch_image_raw.append(image_raw)
|
||||
batch_origin_h.append(origin_h)
|
||||
batch_origin_w.append(origin_w)
|
||||
np.copyto(batch_input_image[i], input_image)
|
||||
batch_input_image = np.ascontiguousarray(batch_input_image)
|
||||
|
||||
# Copy input image to host buffer
|
||||
np.copyto(host_inputs[0], batch_input_image.ravel())
|
||||
start = time.time()
|
||||
# Transfer input data to the GPU.
|
||||
cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
|
||||
# Run inference.
|
||||
context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
|
||||
# Transfer predictions back from the GPU.
|
||||
cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
|
||||
# Synchronize the stream
|
||||
stream.synchronize()
|
||||
end = time.time()
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
# Here we use the first row of output in that batch_size = 1
|
||||
output = host_outputs[0]
|
||||
# Do postprocess
|
||||
for i in range(self.batch_size):
|
||||
result_boxes, result_scores, result_classid = self.post_process(
|
||||
output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
|
||||
batch_origin_w[i]
|
||||
)
|
||||
# Draw rectangles and labels on the original image
|
||||
for j in range(len(result_boxes)):
|
||||
box = result_boxes[j]
|
||||
plot_one_box(
|
||||
box,
|
||||
batch_image_raw[i],
|
||||
label="{}:{:.2f}".format(
|
||||
categories[int(result_classid[j])], result_scores[j]
|
||||
),
|
||||
)
|
||||
return batch_image_raw, end - start
|
||||
|
||||
def destroy(self):
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
|
||||
def get_raw_image(self, image_path_batch):
|
||||
"""
|
||||
description: Read an image from image path
|
||||
"""
|
||||
for img_path in image_path_batch:
|
||||
yield cv2.imread(img_path)
|
||||
|
||||
def get_raw_image_zeros(self, image_path_batch=None):
|
||||
"""
|
||||
description: Ready data for warmup
|
||||
"""
|
||||
for _ in range(self.batch_size):
|
||||
yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
|
||||
|
||||
def preprocess_image(self, raw_bgr_image):
|
||||
"""
|
||||
description: Convert BGR image to RGB,
|
||||
resize and pad it to target size, normalize to [0,1],
|
||||
transform to NCHW format.
|
||||
param:
|
||||
input_image_path: str, image path
|
||||
return:
|
||||
image: the processed image
|
||||
image_raw: the original image
|
||||
h: original height
|
||||
w: original width
|
||||
"""
|
||||
image_raw = raw_bgr_image
|
||||
h, w, c = image_raw.shape
|
||||
image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
|
||||
# Calculate width and height and paddings
|
||||
r_w = self.input_w / w
|
||||
r_h = self.input_h / h
|
||||
if r_h > r_w:
|
||||
tw = self.input_w
|
||||
th = int(r_w * h)
|
||||
tx1 = tx2 = 0
|
||||
ty1 = int((self.input_h - th) / 2)
|
||||
ty2 = self.input_h - th - ty1
|
||||
else:
|
||||
tw = int(r_h * w)
|
||||
th = self.input_h
|
||||
tx1 = int((self.input_w - tw) / 2)
|
||||
tx2 = self.input_w - tw - tx1
|
||||
ty1 = ty2 = 0
|
||||
# Resize so the long side matches the input size while maintaining the aspect ratio
|
||||
image = cv2.resize(image, (tw, th))
|
||||
# Pad the short side with (128,128,128)
|
||||
image = cv2.copyMakeBorder(
|
||||
image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
|
||||
)
|
||||
image = image.astype(np.float32)
|
||||
# Normalize to [0,1]
|
||||
image /= 255.0
|
||||
# HWC to CHW format:
|
||||
image = np.transpose(image, [2, 0, 1])
|
||||
# CHW to NCHW format
|
||||
image = np.expand_dims(image, axis=0)
|
||||
# Convert the image to row-major order, also known as "C order":
|
||||
image = np.ascontiguousarray(image)
|
||||
return image, image_raw, h, w
|
||||
|
||||
def xywh2xyxy(self, origin_h, origin_w, x):
|
||||
"""
|
||||
description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
|
||||
param:
|
||||
origin_h: height of original image
|
||||
origin_w: width of original image
|
||||
x: a numpy array of boxes, each row is a box [center_x, center_y, w, h]
|
||||
return:
|
||||
y: a numpy array of boxes, each row is a box [x1, y1, x2, y2]
|
||||
"""
|
||||
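# Undo the letterbox transform: subtract the padding offset on the padded axis,
# then divide by the resize ratio to map back to original-image coordinates.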
y = np.zeros_like(x)
|
||||
r_w = self.input_w / origin_w
|
||||
r_h = self.input_h / origin_h
|
||||
if r_h > r_w:
|
||||
y[:, 0] = x[:, 0]
|
||||
y[:, 2] = x[:, 2]
|
||||
y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
|
||||
y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
|
||||
y /= r_w
|
||||
else:
|
||||
y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
|
||||
y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
|
||||
y[:, 1] = x[:, 1]
|
||||
y[:, 3] = x[:, 3]
|
||||
y /= r_h
|
||||
|
||||
return y
|
||||
|
||||
def post_process(self, output, origin_h, origin_w):
|
||||
"""
|
||||
description: postprocess the prediction
|
||||
param:
|
||||
output: a numpy array like [num_boxes, cx, cy, w, h, conf, cls_id, cx, cy, w, h, conf, cls_id, ...]
|
||||
origin_h: height of original image
|
||||
origin_w: width of original image
|
||||
return:
|
||||
result_boxes: final boxes, a numpy array, each row is a box [x1, y1, x2, y2]
|
||||
result_scores: final scores, a numpy array, each element is the score corresponding to a box
|
||||
result_classid: final class ids, a numpy array, each element is the class id corresponding to a box
|
||||
"""
|
||||
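# Each detection occupies DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM floats in the flattened
# output; only the first DET_NUM values (bbox, confidence, class id) are used here.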
num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
|
||||
# Get the num of boxes detected
|
||||
num = int(output[0])
|
||||
# Reshape to a two-dimensional ndarray
|
||||
# pred = np.reshape(output[1:], (-1, 38))[:num, :]
|
||||
pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
|
||||
# Do nms
|
||||
boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
|
||||
result_boxes = boxes[:, :4] if len(boxes) else np.array([])
|
||||
result_scores = boxes[:, 4] if len(boxes) else np.array([])
|
||||
result_classid = boxes[:, 5] if len(boxes) else np.array([])
|
||||
return result_boxes, result_scores, result_classid
|
||||
|
||||
def bbox_iou(self, box1, box2, x1y1x2y2=True):
|
||||
"""
|
||||
description: compute the IoU of two bounding boxes
|
||||
param:
|
||||
box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
|
||||
box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
|
||||
x1y1x2y2: select the coordinate format
|
||||
return:
|
||||
iou: computed iou
|
||||
"""
|
||||
if not x1y1x2y2:
|
||||
# Transform from center and width to exact coordinates
|
||||
b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
|
||||
b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
|
||||
b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
|
||||
b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
|
||||
else:
|
||||
# Get the coordinates of bounding boxes
|
||||
b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
|
||||
b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
|
||||
|
||||
# Get the coordinates of the intersection rectangle
|
||||
inter_rect_x1 = np.maximum(b1_x1, b2_x1)
|
||||
inter_rect_y1 = np.maximum(b1_y1, b2_y1)
|
||||
inter_rect_x2 = np.minimum(b1_x2, b2_x2)
|
||||
inter_rect_y2 = np.minimum(b1_y2, b2_y2)
|
||||
# Intersection area
|
||||
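# The +1 terms follow the convention of treating box coordinates as inclusive pixel indices.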
inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
|
||||
* np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
|
||||
# Union Area
|
||||
b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
|
||||
b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
|
||||
|
||||
iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
|
||||
|
||||
return iou
|
||||
|
||||
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
|
||||
"""
|
||||
description: Removes detections with lower object confidence score than 'conf_thres' and performs
|
||||
Non-Maximum Suppression to further filter detections.
|
||||
param:
|
||||
prediction: detections, (x1, y1, x2, y2, conf, cls_id)
|
||||
origin_h: original image height
|
||||
origin_w: original image width
|
||||
conf_thres: a confidence threshold to filter detections
|
||||
nms_thres: an IoU threshold to filter detections
|
||||
return:
|
||||
boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
|
||||
"""
|
||||
# Get the boxes that score > CONF_THRESH
|
||||
boxes = prediction[prediction[:, 4] >= conf_thres]
|
||||
# Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
|
||||
boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
|
||||
# clip the coordinates
|
||||
boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
|
||||
boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
|
||||
boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
|
||||
boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
|
||||
# Object confidence
|
||||
confs = boxes[:, 4]
|
||||
# Sort by the confs
|
||||
boxes = boxes[np.argsort(-confs)]
|
||||
# Perform non-maximum suppression
|
||||
keep_boxes = []
|
||||
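# Greedy NMS: boxes are already sorted by confidence, so boxes[0] is always the best
# remaining box; lower-scoring boxes of the same class with large overlap are discarded.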
while boxes.shape[0]:
|
||||
large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
|
||||
label_match = boxes[0, -1] == boxes[:, -1]
|
||||
# Indices of boxes with lower confidence scores, large IOUs and matching labels
|
||||
invalid = large_overlap & label_match
|
||||
keep_boxes += [boxes[0]]
|
||||
boxes = boxes[~invalid]
|
||||
boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
|
||||
return boxes
|
||||
|
||||
|
||||
class inferThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper, image_path_batch):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
self.image_path_batch = image_path_batch
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch))
|
||||
for i, img_path in enumerate(self.image_path_batch):
|
||||
parent, filename = os.path.split(img_path)
|
||||
save_name = os.path.join('output', filename)
|
||||
# Save image
|
||||
cv2.imwrite(save_name, batch_image_raw[i])
|
||||
print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
|
||||
|
||||
|
||||
class warmUpThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros())
|
||||
print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# load custom plugin and engine
|
||||
PLUGIN_LIBRARY = "./build/libmyplugins.so"
|
||||
engine_file_path = "yolov5xu.engine"
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
engine_file_path = sys.argv[1]
|
||||
if len(sys.argv) > 2:
|
||||
PLUGIN_LIBRARY = sys.argv[2]
|
||||
|
||||
ctypes.CDLL(PLUGIN_LIBRARY)
|
||||
|
||||
# load coco labels
|
||||
|
||||
categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
|
||||
"traffic light",
|
||||
"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
|
||||
"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
|
||||
"frisbee",
|
||||
"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
|
||||
"surfboard",
|
||||
"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
|
||||
"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
|
||||
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
|
||||
"cell phone",
|
||||
"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
|
||||
"teddy bear",
|
||||
"hair drier", "toothbrush"]
|
||||
|
||||
if os.path.exists('output/'):
|
||||
shutil.rmtree('output/')
|
||||
os.makedirs('output/')
|
||||
# a YoLov8TRT instance
|
||||
yolov8_wrapper = YoLov8TRT(engine_file_path)
|
||||
try:
|
||||
print('batch size is', yolov8_wrapper.batch_size)
|
||||
|
||||
image_dir = "images/"
|
||||
image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)
|
||||
|
||||
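# Run a few warm-up inferences so the timed runs below are not skewed by lazy CUDA/TensorRT initialization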
for i in range(10):
|
||||
# create a new thread to do warm_up
|
||||
thread1 = warmUpThread(yolov8_wrapper)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
for batch in image_path_batches:
|
||||
# create a new thread to do inference
|
||||
thread1 = inferThread(yolov8_wrapper, batch)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
finally:
|
||||
# destroy the instance
|
||||
yolov8_wrapper.destroy()
|
||||
285
yolov8_cls.cpp
Normal file
@ -0,0 +1,285 @@
|
||||
#include "cuda_utils.h"
|
||||
#include "logging.h"
|
||||
#include "utils.h"
|
||||
#include "model.h"
|
||||
#include "config.h"
|
||||
#include "calibrator.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <numeric>
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
using namespace nvinfer1;
|
||||
|
||||
static Logger gLogger;
|
||||
const static int kOutputSize = kClsNumClass;
|
||||
|
||||
void batch_preprocess(std::vector<cv::Mat>& imgs, float* output, int dst_width=224, int dst_height=224) {
|
||||
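// For each image: center-crop the largest square, resize to the network input size,
// convert BGR->RGB, scale to [0,1] and write the result in CHW order into `output`.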
for (size_t b = 0; b < imgs.size(); b++) {
|
||||
int h = imgs[b].rows;
|
||||
int w = imgs[b].cols;
|
||||
int m = std::min(h, w);
|
||||
int top = (h - m) / 2;
|
||||
int left = (w - m) / 2;
|
||||
cv::Mat img = imgs[b](cv::Rect(left, top, m, m));
|
||||
cv::resize(img, img, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR);
|
||||
cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
|
||||
img.convertTo(img, CV_32F, 1/255.0);
|
||||
|
||||
std::vector<cv::Mat> channels(3);
|
||||
cv::split(img, channels);
|
||||
|
||||
// CHW format
|
||||
for (int c = 0; c < 3; ++c) {
|
||||
int i = 0;
|
||||
for (int row = 0; row < dst_height; ++row) {
|
||||
for (int col = 0; col < dst_width; ++col) {
|
||||
output[b * 3 * dst_height * dst_width + c * dst_height * dst_width + i] =
|
||||
channels[c].at<float>(row, col);
|
||||
++i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<float> softmax(float *prob, int n) {
|
||||
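// Convert raw logits to probabilities with a plain softmax (no max-subtraction is applied).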
std::vector<float> res;
|
||||
float sum = 0.0f;
|
||||
float t;
|
||||
for (int i = 0; i < n; i++) {
|
||||
t = expf(prob[i]);
|
||||
res.push_back(t);
|
||||
sum += t;
|
||||
}
|
||||
for (int i = 0; i < n; i++) {
|
||||
res[i] /= sum;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
std::vector<int> topk(const std::vector<float>& vec, int k) {
|
||||
std::vector<int> topk_index;
|
||||
std::vector<size_t> vec_index(vec.size());
|
||||
std::iota(vec_index.begin(), vec_index.end(), 0);
|
||||
|
||||
std::sort(vec_index.begin(), vec_index.end(), [&vec](size_t index_1, size_t index_2) { return vec[index_1] > vec[index_2]; });
|
||||
|
||||
int k_num = std::min<int>(vec.size(), k);
|
||||
|
||||
for (int i = 0; i < k_num; ++i) {
|
||||
topk_index.push_back(vec_index[i]);
|
||||
}
|
||||
|
||||
return topk_index;
|
||||
}
|
||||
|
||||
std::vector<std::string> read_classes(std::string file_name) {
|
||||
std::vector<std::string> classes;
|
||||
std::ifstream ifs(file_name, std::ios::in);
|
||||
if (!ifs.is_open()) {
|
||||
std::cerr << file_name << " is not found, please refer to the README and download it." << std::endl;
|
||||
assert(0);
|
||||
}
|
||||
std::string s;
|
||||
while (std::getline(ifs, s)) {
|
||||
classes.push_back(s);
|
||||
}
|
||||
ifs.close();
|
||||
return classes;
|
||||
}
|
||||
|
||||
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw, std::string& img_dir) {
|
||||
if (argc < 4) return false;
|
||||
if (std::string(argv[1]) == "-s" && (argc == 5)) {
|
||||
wts = std::string(argv[2]);
|
||||
engine = std::string(argv[3]);
|
||||
auto net = std::string(argv[4]);
|
||||
if (net[0] == 'n') {
|
||||
gd = 0.33;
|
||||
gw = 0.25;
|
||||
} else if (net[0] == 's') {
|
||||
gd = 0.33;
|
||||
gw = 0.50;
|
||||
} else if (net[0] == 'm') {
|
||||
gd = 0.67;
|
||||
gw = 0.75;
|
||||
} else if (net[0] == 'l') {
|
||||
gd = 1.0;
|
||||
gw = 1.0;
|
||||
} else if (net[0] == 'x') {
|
||||
gd = 1.0;
|
||||
gw = 1.25;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (std::string(argv[1]) == "-d" && argc == 4) {
|
||||
engine = std::string(argv[2]);
|
||||
img_dir = std::string(argv[3]);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer, float** cpu_input_buffer, float** output_buffer_host) {
|
||||
assert(engine->getNbBindings() == 2);
|
||||
// In order to bind the buffers, we need to know the names of the input and output tensors.
|
||||
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
|
||||
const int inputIndex = engine->getBindingIndex(kInputTensorName);
|
||||
const int outputIndex = engine->getBindingIndex(kOutputTensorName);
|
||||
assert(inputIndex == 0);
|
||||
assert(outputIndex == 1);
|
||||
// Create GPU buffers on device
|
||||
CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kClsInputH * kClsInputW * sizeof(float)));
|
||||
CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, kBatchSize * kOutputSize * sizeof(float)));
|
||||
|
||||
*cpu_input_buffer = new float[kBatchSize * 3 * kClsInputH * kClsInputW];
|
||||
*output_buffer_host = new float[kBatchSize * kOutputSize];
|
||||
}
|
||||
|
||||
void infer(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float), cudaMemcpyHostToDevice, stream));
|
||||
context.enqueue(batchSize, buffers, stream, nullptr);
|
||||
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
|
||||
cudaStreamSynchronize(stream);
|
||||
}
|
||||
|
||||
void serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name, std::string& engine_name) {
|
||||
// Create builder
|
||||
IBuilder* builder = createInferBuilder(gLogger);
|
||||
IBuilderConfig* config = builder->createBuilderConfig();
|
||||
// Create model to populate the network, then set the outputs and create an engine
|
||||
IHostMemory *serialized_engine = nullptr;
|
||||
//engine = buildEngineYolov8Cls(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
|
||||
serialized_engine = buildEngineYolov8Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw);
|
||||
assert(serialized_engine);
|
||||
// Save engine to file
|
||||
std::ofstream p(engine_name, std::ios::binary);
|
||||
if (!p) {
|
||||
std::cerr << "Could not open plan output file" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
|
||||
|
||||
// Close everything down
|
||||
delete serialized_engine;
|
||||
delete config;
|
||||
delete builder;
|
||||
}
|
||||
|
||||
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) {
|
||||
std::ifstream file(engine_name, std::ios::binary);
|
||||
if (!file.good()) {
|
||||
std::cerr << "read " << engine_name << " error!" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
size_t size = 0;
|
||||
file.seekg(0, file.end);
|
||||
size = file.tellg();
|
||||
file.seekg(0, file.beg);
|
||||
char* serialized_engine = new char[size];
|
||||
assert(serialized_engine);
|
||||
file.read(serialized_engine, size);
|
||||
file.close();
|
||||
|
||||
*runtime = createInferRuntime(gLogger);
|
||||
assert(*runtime);
|
||||
*engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
|
||||
assert(*engine);
|
||||
*context = (*engine)->createExecutionContext();
|
||||
assert(*context);
|
||||
delete[] serialized_engine;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
cudaSetDevice(kGpuId);
|
||||
|
||||
std::string wts_name = "";
|
||||
std::string engine_name = "";
|
||||
float gd = 0.0f, gw = 0.0f;
|
||||
std::string img_dir;
|
||||
|
||||
if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir)) {
|
||||
std::cerr << "arguments not right!" << std::endl;
|
||||
std::cerr << "./yolov8_cls -s [.wts] [.engine] [n/s/m/l/x or c gd gw] // serialize model to plan file" << std::endl;
|
||||
std::cerr << "./yolov8_cls -d [.engine] ../samples // deserialize plan file and run inference" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Create a model using the API directly and serialize it to a file
|
||||
if (!wts_name.empty()) {
|
||||
serialize_engine(kBatchSize, gd, gw, wts_name, engine_name);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Deserialize the engine from file
|
||||
IRuntime* runtime = nullptr;
|
||||
ICudaEngine* engine = nullptr;
|
||||
IExecutionContext* context = nullptr;
|
||||
deserialize_engine(engine_name, &runtime, &engine, &context);
|
||||
cudaStream_t stream;
|
||||
CUDA_CHECK(cudaStreamCreate(&stream));
|
||||
|
||||
// Prepare cpu and gpu buffers
|
||||
float* device_buffers[2];
|
||||
float* cpu_input_buffer = nullptr;
|
||||
float* output_buffer_host = nullptr;
|
||||
prepare_buffers(engine, &device_buffers[0], &device_buffers[1], &cpu_input_buffer, &output_buffer_host);
|
||||
|
||||
// Read images from directory
|
||||
std::vector<std::string> file_names;
|
||||
if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
|
||||
std::cerr << "read_files_in_dir failed." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Read imagenet labels
|
||||
auto classes = read_classes("imagenet_classes.txt");
|
||||
|
||||
// batch predict
|
||||
for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
|
||||
// Get a batch of images
|
||||
std::vector<cv::Mat> img_batch;
|
||||
std::vector<std::string> img_name_batch;
|
||||
for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
|
||||
cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
|
||||
img_batch.push_back(img);
|
||||
img_name_batch.push_back(file_names[j]);
|
||||
}
|
||||
|
||||
// Preprocess
|
||||
batch_preprocess(img_batch, cpu_input_buffer);
|
||||
|
||||
// Run inference
|
||||
auto start = std::chrono::system_clock::now();
|
||||
infer(*context, stream, (void**)device_buffers, cpu_input_buffer, output_buffer_host, kBatchSize);
|
||||
auto end = std::chrono::system_clock::now();
|
||||
std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
|
||||
|
||||
// Postprocess and get top-k result
|
||||
for (size_t b = 0; b < img_name_batch.size(); b++) {
|
||||
float* p = &output_buffer_host[b * kOutputSize];
|
||||
auto res = softmax(p, kOutputSize);
|
||||
auto topk_idx = topk(res, 3);
|
||||
std::cout << img_name_batch[b] << std::endl;
|
||||
for (auto idx: topk_idx) {
|
||||
std::cout << " " << classes[idx] << " " << res[idx] << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Release stream and buffers
|
||||
cudaStreamDestroy(stream);
|
||||
CUDA_CHECK(cudaFree(device_buffers[0]));
|
||||
CUDA_CHECK(cudaFree(device_buffers[1]));
|
||||
delete[] cpu_input_buffer;
|
||||
delete[] output_buffer_host;
|
||||
// Destroy the engine
|
||||
delete context;
|
||||
delete engine;
|
||||
delete runtime;
|
||||
return 0;
|
||||
}
|
||||
283
yolov8_cls_trt.py
Normal file
@ -0,0 +1,283 @@
|
||||
"""
|
||||
An example that uses TensorRT's Python api to make inferences.
|
||||
"""
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
import pycuda.autoinit
|
||||
import pycuda.driver as cuda
|
||||
import tensorrt as trt
|
||||
|
||||
|
||||
def get_img_path_batches(batch_size, img_dir):
|
||||
ret = []
|
||||
batch = []
|
||||
for root, dirs, files in os.walk(img_dir):
|
||||
for name in files:
|
||||
if len(batch) == batch_size:
|
||||
ret.append(batch)
|
||||
batch = []
|
||||
batch.append(os.path.join(root, name))
|
||||
if len(batch) > 0:
|
||||
ret.append(batch)
|
||||
return ret
|
||||
|
||||
|
||||
with open("imagenet_classes.txt") as f:
|
||||
classes = [line.strip() for line in f.readlines()]
|
||||
|
||||
|
||||
class YoLov8TRT(object):
|
||||
"""
|
||||
description: A YOLOv8 classification class that wraps TensorRT ops, preprocess and postprocess ops.
|
||||
"""
|
||||
|
||||
def __init__(self, engine_file_path):
|
||||
# Create a Context on this device,
|
||||
self.ctx = cuda.Device(0).make_context()
|
||||
stream = cuda.Stream()
|
||||
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
|
||||
runtime = trt.Runtime(TRT_LOGGER)
|
||||
|
||||
# Deserialize the engine from file
|
||||
with open(engine_file_path, "rb") as f:
|
||||
engine = runtime.deserialize_cuda_engine(f.read())
|
||||
context = engine.create_execution_context()
|
||||
|
||||
host_inputs = []
|
||||
cuda_inputs = []
|
||||
host_outputs = []
|
||||
cuda_outputs = []
|
||||
bindings = []
|
||||
self.mean = (0.485, 0.456, 0.406)
|
||||
self.std = (0.229, 0.224, 0.225)
|
||||
|
||||
for binding in engine:
|
||||
print('binding:', binding, engine.get_binding_shape(binding))
|
||||
size = trt.volume(engine.get_binding_shape(
|
||||
binding)) * engine.max_batch_size
|
||||
dtype = trt.nptype(engine.get_binding_dtype(binding))
|
||||
# Allocate host and device buffers
|
||||
host_mem = cuda.pagelocked_empty(size, dtype)
|
||||
cuda_mem = cuda.mem_alloc(host_mem.nbytes)
|
||||
# Append the device buffer to device bindings.
|
||||
bindings.append(int(cuda_mem))
|
||||
# Append to the appropriate list.
|
||||
if engine.binding_is_input(binding):
|
||||
self.input_w = engine.get_binding_shape(binding)[-1]
|
||||
self.input_h = engine.get_binding_shape(binding)[-2]
|
||||
host_inputs.append(host_mem)
|
||||
cuda_inputs.append(cuda_mem)
|
||||
else:
|
||||
host_outputs.append(host_mem)
|
||||
cuda_outputs.append(cuda_mem)
|
||||
|
||||
# Store
|
||||
self.stream = stream
|
||||
self.context = context
|
||||
self.engine = engine
|
||||
self.host_inputs = host_inputs
|
||||
self.cuda_inputs = cuda_inputs
|
||||
self.host_outputs = host_outputs
|
||||
self.cuda_outputs = cuda_outputs
|
||||
self.bindings = bindings
|
||||
self.batch_size = engine.max_batch_size
|
||||
|
||||
def infer(self, raw_image_generator):
|
||||
threading.Thread.__init__(self)
|
||||
# Make self the active context, pushing it on top of the context stack.
|
||||
self.ctx.push()
|
||||
# Restore
|
||||
stream = self.stream
|
||||
context = self.context
|
||||
engine = self.engine
|
||||
host_inputs = self.host_inputs
|
||||
cuda_inputs = self.cuda_inputs
|
||||
host_outputs = self.host_outputs
|
||||
cuda_outputs = self.cuda_outputs
|
||||
bindings = self.bindings
|
||||
# Do image preprocess
|
||||
batch_image_raw = []
|
||||
batch_input_image = np.empty(
|
||||
shape=[self.batch_size, 3, self.input_h, self.input_w])
|
||||
for i, image_raw in enumerate(raw_image_generator):
|
||||
batch_image_raw.append(image_raw)
|
||||
input_image = self.preprocess_cls_image(image_raw)
|
||||
np.copyto(batch_input_image[i], input_image)
|
||||
batch_input_image = np.ascontiguousarray(batch_input_image)
|
||||
|
||||
# Copy input image to host buffer
|
||||
np.copyto(host_inputs[0], batch_input_image.ravel())
|
||||
start = time.time()
|
||||
# Transfer input data to the GPU.
|
||||
cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
|
||||
# Run inference.
|
||||
context.execute_async(batch_size=self.batch_size,
|
||||
bindings=bindings, stream_handle=stream.handle)
|
||||
# Transfer predictions back from the GPU.
|
||||
cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
|
||||
# Synchronize the stream
|
||||
stream.synchronize()
|
||||
end = time.time()
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
# Here we use the first row of output since batch_size = 1
|
||||
output = host_outputs[0]
|
||||
# Do postprocess
|
||||
for i in range(self.batch_size):
|
||||
classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls(
|
||||
output)
|
||||
cv2.putText(batch_image_raw[i], str(
|
||||
classes_ls), (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
|
||||
print(classes_ls, predicted_conf_ls)
|
||||
return batch_image_raw, end - start
|
||||
|
||||
def destroy(self):
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
|
||||
def get_raw_image(self, image_path_batch):
|
||||
"""
|
||||
description: Read an image from image path
|
||||
"""
|
||||
for img_path in image_path_batch:
|
||||
yield cv2.imread(img_path)
|
||||
|
||||
def get_raw_image_zeros(self, image_path_batch=None):
|
||||
"""
|
||||
description: Prepare data for warmup
|
||||
"""
|
||||
for _ in range(self.batch_size):
|
||||
yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
|
||||
|
||||
def preprocess_cls_image(self, raw_bgr_image, dst_width=224, dst_height=224):
|
||||
|
||||
"""
|
||||
description: Convert BGR image to RGB,
|
||||
crop the center square frame,
|
||||
resize it to target size, normalize to [0,1],
|
||||
transform to NCHW format.
|
||||
param:
|
||||
raw_bgr_image: numpy array, raw BGR image
|
||||
dst_width: int, target image width
|
||||
dst_height: int, target image height
|
||||
return:
|
||||
image: the processed image, an NCHW float32 numpy array
|
||||
"""
|
||||
image_raw = raw_bgr_image
|
||||
h, w, c = image_raw.shape
|
||||
# Crop the center square frame
|
||||
m = min(h, w)
|
||||
top = (h - m) // 2
|
||||
left = (w - m) // 2
|
||||
image = raw_bgr_image[top:top + m, left:left + m]
|
||||
|
||||
# Resize the image with target size while maintaining ratio
|
||||
image = cv2.resize(image, (dst_width, dst_height), interpolation=cv2.INTER_LINEAR)
|
||||
|
||||
# Convert BGR to RGB
|
||||
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
||||
|
||||
# Normalize to [0,1]
|
||||
image = image.astype(np.float32) / 255.0
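# Note: the mean/std values stored in __init__ are not applied here; the input is only scaled to [0, 1]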
|
||||
|
||||
# HWC to CHW format
|
||||
image = image.transpose(2, 0, 1)
|
||||
|
||||
# CHW to NCHW format (add batch dimension)
|
||||
image = np.expand_dims(image, axis=0)
|
||||
|
||||
# Convert the image to row-major order, also known as "C order"
|
||||
image = np.ascontiguousarray(image)
|
||||
|
||||
return image
|
||||
|
||||
def postprocess_cls(self, output_data):
|
||||
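# The engine returns raw logits; apply softmax, take the top-3 candidates, and record
# the top-1 class name, confidence and category id for each image in the batch.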
classes_ls = []
|
||||
predicted_conf_ls = []
|
||||
category_id_ls = []
|
||||
output_data = output_data.reshape(self.batch_size, -1)
|
||||
output_data = torch.Tensor(output_data)
|
||||
p = torch.nn.functional.softmax(output_data, dim=1)
|
||||
score, index = torch.topk(p, 3)
|
||||
for ind in range(index.shape[0]):
|
||||
input_category_id = index[ind][0].item()  # top-1 class index
|
||||
category_id_ls.append(input_category_id)
|
||||
predicted_confidence = score[ind][0].item()
|
||||
predicted_conf_ls.append(predicted_confidence)
|
||||
classes_ls.append(classes[input_category_id])
|
||||
return classes_ls, predicted_conf_ls, category_id_ls
|
||||
|
||||
|
||||
class inferThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper, image_path_batch):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
self.image_path_batch = image_path_batch
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(
|
||||
self.yolov8_wrapper.get_raw_image(self.image_path_batch))
|
||||
for i, img_path in enumerate(self.image_path_batch):
|
||||
parent, filename = os.path.split(img_path)
|
||||
save_name = os.path.join('output', filename)
|
||||
# Save image
|
||||
cv2.imwrite(save_name, batch_image_raw[i])
|
||||
print('input->{}, time->{:.2f}ms, saving into output/'.format(
|
||||
self.image_path_batch, use_time * 1000))
|
||||
|
||||
|
||||
class warmUpThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(
|
||||
self.yolov8_wrapper.get_raw_image_zeros())
|
||||
print(
|
||||
'warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# load custom plugin and engine
|
||||
engine_file_path = "./yolov8x-cls-fp32.engine"
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
engine_file_path = sys.argv[1]
|
||||
|
||||
if os.path.exists('output/'):
|
||||
shutil.rmtree('output/')
|
||||
os.makedirs('output/')
|
||||
# a YoLov8TRT instance
|
||||
yolov8_wrapper = YoLov8TRT(engine_file_path)
|
||||
try:
|
||||
print('batch size is', yolov8_wrapper.batch_size)
|
||||
|
||||
image_dir = "samples/"
|
||||
image_path_batches = get_img_path_batches(
|
||||
yolov8_wrapper.batch_size, image_dir)
|
||||
|
||||
for i in range(10):
|
||||
# create a new thread to do warm_up
|
||||
thread1 = warmUpThread(yolov8_wrapper)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
for batch in image_path_batches:
|
||||
# create a new thread to do inference
|
||||
thread1 = inferThread(yolov8_wrapper, batch)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
finally:
|
||||
# destroy the instance
|
||||
yolov8_wrapper.destroy()
|
||||
276
yolov8_det.cpp
Normal file
@ -0,0 +1,276 @@
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include "cuda_utils.h"
|
||||
#include "logging.h"
|
||||
#include "model.h"
|
||||
#include "postprocess.h"
|
||||
#include "preprocess.h"
|
||||
#include "utils.h"
|
||||
|
||||
Logger gLogger;
|
||||
using namespace nvinfer1;
|
||||
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
|
||||
|
||||
void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
|
||||
float& gw, int& max_channels) {
|
||||
IBuilder* builder = createInferBuilder(gLogger);
|
||||
IBuilderConfig* config = builder->createBuilderConfig();
|
||||
IHostMemory* serialized_engine = nullptr;
|
||||
|
||||
if (is_p == 6) {
|
||||
serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
|
||||
} else if (is_p == 2) {
|
||||
serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
|
||||
} else {
|
||||
serialized_engine = buildEngineYolov8Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
|
||||
}
|
||||
|
||||
assert(serialized_engine);
|
||||
std::ofstream p(engine_name, std::ios::binary);
|
||||
if (!p) {
|
||||
std::cout << "could not open plan output file" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
|
||||
|
||||
delete serialized_engine;
|
||||
delete config;
|
||||
delete builder;
|
||||
}
|
||||
|
||||
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
|
||||
IExecutionContext** context) {
|
||||
std::ifstream file(engine_name, std::ios::binary);
|
||||
if (!file.good()) {
|
||||
std::cerr << "read " << engine_name << " error!" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
size_t size = 0;
|
||||
file.seekg(0, file.end);
|
||||
size = file.tellg();
|
||||
file.seekg(0, file.beg);
|
||||
char* serialized_engine = new char[size];
|
||||
assert(serialized_engine);
|
||||
file.read(serialized_engine, size);
|
||||
file.close();
|
||||
|
||||
*runtime = createInferRuntime(gLogger);
|
||||
assert(*runtime);
|
||||
*engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
|
||||
assert(*engine);
|
||||
*context = (*engine)->createExecutionContext();
|
||||
assert(*context);
|
||||
delete[] serialized_engine;
|
||||
}
|
||||
|
||||
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
|
||||
float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
|
||||
std::string cuda_post_process) {
|
||||
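// cuda_post_process selects where decode/NMS runs: "c" = CPU postprocess (copy the raw
// output to host), "g" = GPU decode + NMS (allocate the extra decode buffers below).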
assert(engine->getNbBindings() == 2);
|
||||
// In order to bind the buffers, we need to know the names of the input and output tensors.
|
||||
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
|
||||
const int inputIndex = engine->getBindingIndex(kInputTensorName);
|
||||
const int outputIndex = engine->getBindingIndex(kOutputTensorName);
|
||||
assert(inputIndex == 0);
|
||||
assert(outputIndex == 1);
|
||||
// Create GPU buffers on device
|
||||
CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
|
||||
CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
|
||||
if (cuda_post_process == "c") {
|
||||
*output_buffer_host = new float[kBatchSize * kOutputSize];
|
||||
} else if (cuda_post_process == "g") {
|
||||
if (kBatchSize > 1) {
|
||||
std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
|
||||
exit(0);
|
||||
}
|
||||
// Allocate memory for decode_ptr_host and copy to device
|
||||
*decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
|
||||
CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
|
||||
}
|
||||
}
|
||||
|
||||
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
|
||||
float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
|
||||
// infer on the batch asynchronously, and DMA output back to host
|
||||
auto start = std::chrono::system_clock::now();
|
||||
context.enqueue(batchsize, buffers, stream, nullptr);
|
||||
if (cuda_post_process == "c") {
|
||||
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
|
||||
stream));
|
||||
auto end = std::chrono::system_clock::now();
|
||||
std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
|
||||
<< "ms" << std::endl;
|
||||
} else if (cuda_post_process == "g") {
|
||||
CUDA_CHECK(
|
||||
cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
|
||||
cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
|
||||
cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms
|
||||
CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
|
||||
sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
|
||||
stream));
|
||||
auto end = std::chrono::system_clock::now();
|
||||
std::cout << "inference and gpu postprocess time: "
|
||||
<< std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
|
||||
}
|
||||
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
|
||||
std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
|
||||
if (argc < 4)
|
||||
return false;
|
||||
if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
|
||||
wts = std::string(argv[2]);
|
||||
engine = std::string(argv[3]);
|
||||
auto sub_type = std::string(argv[4]);
|
||||
|
||||
if (sub_type[0] == 'n') {
|
||||
gd = 0.33;
|
||||
gw = 0.25;
|
||||
max_channels = 1024;
|
||||
} else if (sub_type[0] == 's') {
|
||||
gd = 0.33;
|
||||
gw = 0.50;
|
||||
max_channels = 1024;
|
||||
} else if (sub_type[0] == 'm') {
|
||||
gd = 0.67;
|
||||
gw = 0.75;
|
||||
max_channels = 576;
|
||||
} else if (sub_type[0] == 'l') {
|
||||
gd = 1.0;
|
||||
gw = 1.0;
|
||||
max_channels = 512;
|
||||
} else if (sub_type[0] == 'x') {
|
||||
gd = 1.0;
|
||||
gw = 1.25;
|
||||
max_channels = 640;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
if (sub_type.size() == 2 && sub_type[1] == '6') {
|
||||
is_p = 6;
|
||||
} else if (sub_type.size() == 2 && sub_type[1] == '2') {
|
||||
is_p = 2;
|
||||
}
|
||||
} else if (std::string(argv[1]) == "-d" && argc == 5) {
|
||||
engine = std::string(argv[2]);
|
||||
img_dir = std::string(argv[3]);
|
||||
cuda_post_process = std::string(argv[4]);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
cudaSetDevice(kGpuId);
|
||||
std::string wts_name = "";
|
||||
std::string engine_name = "";
|
||||
std::string img_dir;
|
||||
std::string sub_type = "";
|
||||
std::string cuda_post_process = "";
|
||||
int model_bboxes;
|
||||
int is_p = 0;
|
||||
float gd = 0.0f, gw = 0.0f;
|
||||
int max_channels = 0;
|
||||
|
||||
if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
|
||||
max_channels)) {
|
||||
std::cerr << "Arguments not right!" << std::endl;
|
||||
std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to "
|
||||
"plan file"
|
||||
<< std::endl;
|
||||
std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Create a model using the API directly and serialize it to a file
|
||||
if (!wts_name.empty()) {
|
||||
serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Deserialize the engine from file
|
||||
IRuntime* runtime = nullptr;
|
||||
ICudaEngine* engine = nullptr;
|
||||
IExecutionContext* context = nullptr;
|
||||
deserialize_engine(engine_name, &runtime, &engine, &context);
|
||||
cudaStream_t stream;
|
||||
CUDA_CHECK(cudaStreamCreate(&stream));
|
||||
cuda_preprocess_init(kMaxInputImageSize);
|
||||
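// The first dimension of the output binding is the number of candidate boxes the model emits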
auto out_dims = engine->getBindingDimensions(1);
|
||||
model_bboxes = out_dims.d[0];
|
||||
// Prepare cpu and gpu buffers
|
||||
float* device_buffers[2];
|
||||
float* output_buffer_host = nullptr;
|
||||
float* decode_ptr_host = nullptr;
|
||||
float* decode_ptr_device = nullptr;
|
||||
|
||||
// Read images from directory
|
||||
std::vector<std::string> file_names;
|
||||
if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
|
||||
std::cerr << "read_files_in_dir failed." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
|
||||
&decode_ptr_device, cuda_post_process);
|
||||
|
||||
// batch predict
|
||||
for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
|
||||
// Get a batch of images
|
||||
std::vector<cv::Mat> img_batch;
|
||||
std::vector<std::string> img_name_batch;
|
||||
for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
|
||||
cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
|
||||
img_batch.push_back(img);
|
||||
img_name_batch.push_back(file_names[j]);
|
||||
}
|
||||
// Preprocess
|
||||
cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
|
||||
// Run inference
|
||||
infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
|
||||
decode_ptr_device, model_bboxes, cuda_post_process);
|
||||
std::vector<std::vector<Detection>> res_batch;
|
||||
if (cuda_post_process == "c") {
|
||||
// NMS
|
||||
batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
|
||||
} else if (cuda_post_process == "g") {
|
||||
//Process gpu decode and nms results
|
||||
batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
|
||||
}
|
||||
// Draw bounding boxes
|
||||
draw_bbox(img_batch, res_batch);
|
||||
// Save images
|
||||
for (size_t j = 0; j < img_batch.size(); j++) {
|
||||
cv::imwrite("_" + img_name_batch[j], img_batch[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Release stream and buffers
|
||||
cudaStreamDestroy(stream);
|
||||
CUDA_CHECK(cudaFree(device_buffers[0]));
|
||||
CUDA_CHECK(cudaFree(device_buffers[1]));
|
||||
CUDA_CHECK(cudaFree(decode_ptr_device));
|
||||
delete[] decode_ptr_host;
|
||||
delete[] output_buffer_host;
|
||||
cuda_preprocess_destroy();
|
||||
// Destroy the engine
|
||||
delete context;
|
||||
delete engine;
|
||||
delete runtime;
|
||||
|
||||
// Print histogram of the output distribution
|
||||
//std::cout << "\nOutput:\n\n";
|
||||
//for (unsigned int i = 0; i < kOutputSize; i++)
|
||||
//{
|
||||
// std::cout << prob[i] << ", ";
|
||||
// if (i % 10 == 0) std::cout << std::endl;
|
||||
//}
|
||||
//std::cout << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
451
yolov8_det_trt.py
Normal file
@ -0,0 +1,451 @@
|
||||
"""
|
||||
An example that uses TensorRT's Python api to make inferences.
|
||||
"""
|
||||
import ctypes
|
||||
import os
|
||||
import shutil
|
||||
import random
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pycuda.autoinit # noqa: F401
|
||||
import pycuda.driver as cuda
|
||||
import tensorrt as trt
|
||||
|
||||
CONF_THRESH = 0.2
|
||||
IOU_THRESHOLD = 0.4
|
||||
POSE_NUM = 17 * 3
|
||||
DET_NUM = 6
|
||||
SEG_NUM = 32
|
||||
OBB_NUM = 1
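# Per-detection layout in the engine output: DET_NUM floats (x, y, w, h, conf, class id)
# followed by slots used by the segmentation, pose and OBB model variants.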
|
||||
|
||||
|
||||
def get_img_path_batches(batch_size, img_dir):
|
||||
ret = []
|
||||
batch = []
|
||||
for root, dirs, files in os.walk(img_dir):
|
||||
for name in files:
|
||||
if len(batch) == batch_size:
|
||||
ret.append(batch)
|
||||
batch = []
|
||||
batch.append(os.path.join(root, name))
|
||||
if len(batch) > 0:
|
||||
ret.append(batch)
|
||||
return ret
|
||||
|
||||
|
||||
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
|
||||
"""
|
||||
description: Plots one bounding box on image img,
|
||||
this function comes from the YOLOv8 project.
|
||||
param:
|
||||
x: a box likes [x1,y1,x2,y2]
|
||||
img: an OpenCV image object
|
||||
color: color to draw rectangle, such as (0,255,0)
|
||||
label: str
|
||||
line_thickness: int
|
||||
return:
|
||||
no return
|
||||
|
||||
"""
|
||||
tl = (
|
||||
line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
|
||||
) # line/font thickness
|
||||
color = color or [random.randint(0, 255) for _ in range(3)]
|
||||
c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
|
||||
cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
|
||||
if label:
|
||||
tf = max(tl - 1, 1) # font thickness
|
||||
t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
|
||||
c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
|
||||
cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled
|
||||
cv2.putText(
|
||||
img,
|
||||
label,
|
||||
(c1[0], c1[1] - 2),
|
||||
0,
|
||||
tl / 3,
|
||||
[225, 255, 255],
|
||||
thickness=tf,
|
||||
lineType=cv2.LINE_AA,
|
||||
)
|
||||
|
||||
|
||||
class YoLov8TRT(object):
|
||||
"""
|
||||
description: A YOLOv8 class that wraps TensorRT ops, preprocess and postprocess ops.
|
||||
"""
|
||||
|
||||
def __init__(self, engine_file_path):
|
||||
# Create a Context on this device,
|
||||
self.ctx = cuda.Device(0).make_context()
|
||||
stream = cuda.Stream()
|
||||
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
|
||||
runtime = trt.Runtime(TRT_LOGGER)
|
||||
|
||||
# Deserialize the engine from file
|
||||
with open(engine_file_path, "rb") as f:
|
||||
engine = runtime.deserialize_cuda_engine(f.read())
|
||||
context = engine.create_execution_context()
|
||||
|
||||
host_inputs = []
|
||||
cuda_inputs = []
|
||||
host_outputs = []
|
||||
cuda_outputs = []
|
||||
bindings = []
|
||||
|
||||
for binding in engine:
|
||||
print('binding:', binding, engine.get_binding_shape(binding))
|
||||
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
|
||||
dtype = trt.nptype(engine.get_binding_dtype(binding))
|
||||
# Allocate host and device buffers
|
||||
host_mem = cuda.pagelocked_empty(size, dtype)
|
||||
cuda_mem = cuda.mem_alloc(host_mem.nbytes)
|
||||
# Append the device buffer to device bindings.
|
||||
bindings.append(int(cuda_mem))
|
||||
# Append to the appropriate list.
|
||||
if engine.binding_is_input(binding):
|
||||
self.input_w = engine.get_binding_shape(binding)[-1]
|
||||
self.input_h = engine.get_binding_shape(binding)[-2]
|
||||
host_inputs.append(host_mem)
|
||||
cuda_inputs.append(cuda_mem)
|
||||
else:
|
||||
host_outputs.append(host_mem)
|
||||
cuda_outputs.append(cuda_mem)
|
||||
|
||||
# Store
|
||||
self.stream = stream
|
||||
self.context = context
|
||||
self.engine = engine
|
||||
self.host_inputs = host_inputs
|
||||
self.cuda_inputs = cuda_inputs
|
||||
self.host_outputs = host_outputs
|
||||
self.cuda_outputs = cuda_outputs
|
||||
self.bindings = bindings
|
||||
self.batch_size = engine.max_batch_size
|
||||
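# Flattened length of the detection output buffer; infer() slices it per image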
self.det_output_length = host_outputs[0].shape[0]
|
||||
|
||||
def infer(self, raw_image_generator):
|
||||
threading.Thread.__init__(self)
|
||||
# Make self the active context, pushing it on top of the context stack.
|
||||
self.ctx.push()
|
||||
# Restore
|
||||
stream = self.stream
|
||||
context = self.context
|
||||
host_inputs = self.host_inputs
|
||||
cuda_inputs = self.cuda_inputs
|
||||
host_outputs = self.host_outputs
|
||||
cuda_outputs = self.cuda_outputs
|
||||
bindings = self.bindings
|
||||
# Do image preprocess
|
||||
batch_image_raw = []
|
||||
batch_origin_h = []
|
||||
batch_origin_w = []
|
||||
batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
|
||||
for i, image_raw in enumerate(raw_image_generator):
|
||||
input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
|
||||
batch_image_raw.append(image_raw)
|
||||
batch_origin_h.append(origin_h)
|
||||
batch_origin_w.append(origin_w)
|
||||
np.copyto(batch_input_image[i], input_image)
|
||||
batch_input_image = np.ascontiguousarray(batch_input_image)
|
||||
|
||||
# Copy input image to host buffer
|
||||
np.copyto(host_inputs[0], batch_input_image.ravel())
|
||||
start = time.time()
|
||||
# Transfer input data to the GPU.
|
||||
cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
|
||||
# Run inference.
|
||||
context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
|
||||
# Transfer predictions back from the GPU.
|
||||
cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
|
||||
# Synchronize the stream
|
||||
stream.synchronize()
|
||||
end = time.time()
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
# Here we use the first row of output since batch_size = 1
|
||||
output = host_outputs[0]
|
||||
# Do postprocess
|
||||
for i in range(self.batch_size):
|
||||
result_boxes, result_scores, result_classid = self.post_process(
|
||||
output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
|
||||
batch_origin_w[i]
|
||||
)
|
||||
# Draw rectangles and labels on the original image
|
||||
for j in range(len(result_boxes)):
|
||||
box = result_boxes[j]
|
||||
plot_one_box(
|
||||
box,
|
||||
batch_image_raw[i],
|
||||
label="{}:{:.2f}".format(
|
||||
categories[int(result_classid[j])], result_scores[j]
|
||||
),
|
||||
)
|
||||
return batch_image_raw, end - start
|
||||
|
||||
def destroy(self):
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
|
||||
def get_raw_image(self, image_path_batch):
|
||||
"""
|
||||
description: Read an image from image path
|
||||
"""
|
||||
for img_path in image_path_batch:
|
||||
yield cv2.imread(img_path)
|
||||
|
||||
def get_raw_image_zeros(self, image_path_batch=None):
|
||||
"""
|
||||
description: Prepare data for warmup
|
||||
"""
|
||||
for _ in range(self.batch_size):
|
||||
yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
|
||||
|
||||
def preprocess_image(self, raw_bgr_image):
|
||||
"""
|
||||
description: Convert BGR image to RGB,
|
||||
resize and pad it to target size, normalize to [0,1],
|
||||
transform to NCHW format.
|
||||
param:
|
||||
raw_bgr_image: numpy array, the raw BGR image
|
||||
return:
|
||||
image: the processed image
|
||||
image_raw: the original image
|
||||
h: original height
|
||||
w: original width
|
||||
"""
|
||||
image_raw = raw_bgr_image
|
||||
h, w, c = image_raw.shape
|
||||
image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
|
||||
# Calculate width, height and paddings
|
||||
r_w = self.input_w / w
|
||||
r_h = self.input_h / h
|
||||
if r_h > r_w:
|
||||
tw = self.input_w
|
||||
th = int(r_w * h)
|
||||
tx1 = tx2 = 0
|
||||
ty1 = int((self.input_h - th) / 2)
|
||||
ty2 = self.input_h - th - ty1
|
||||
else:
|
||||
tw = int(r_h * w)
|
||||
th = self.input_h
|
||||
tx1 = int((self.input_w - tw) / 2)
|
||||
tx2 = self.input_w - tw - tx1
|
||||
ty1 = ty2 = 0
|
||||
# Resize so the long side matches the input size while maintaining the aspect ratio
|
||||
image = cv2.resize(image, (tw, th))
|
||||
# Pad the short side with (128,128,128)
|
||||
image = cv2.copyMakeBorder(
|
||||
image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
|
||||
)
|
||||
image = image.astype(np.float32)
|
||||
# Normalize to [0,1]
|
||||
image /= 255.0
|
||||
# HWC to CHW format:
|
||||
image = np.transpose(image, [2, 0, 1])
|
||||
# CHW to NCHW format
|
||||
image = np.expand_dims(image, axis=0)
|
||||
# Convert the image to row-major order, also known as "C order":
|
||||
image = np.ascontiguousarray(image)
|
||||
return image, image_raw, h, w
|
||||
|
||||
def xywh2xyxy(self, origin_h, origin_w, x):
|
||||
"""
|
||||
description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
|
||||
param:
|
||||
origin_h: height of original image
|
||||
origin_w: width of original image
|
||||
x: a numpy array of boxes, each row is a box [center_x, center_y, w, h]
|
||||
return:
|
||||
y: a numpy array of boxes, each row is a box [x1, y1, x2, y2]
|
||||
"""
|
||||
y = np.zeros_like(x)
|
||||
r_w = self.input_w / origin_w
|
||||
r_h = self.input_h / origin_h
|
||||
if r_h > r_w:
|
||||
y[:, 0] = x[:, 0]
|
||||
y[:, 2] = x[:, 2]
|
||||
y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
|
||||
y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
|
||||
y /= r_w
|
||||
else:
|
||||
y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
|
||||
y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
|
||||
y[:, 1] = x[:, 1]
|
||||
y[:, 3] = x[:, 3]
|
||||
y /= r_h
|
||||
|
||||
return y
|
||||
|
||||
def post_process(self, output, origin_h, origin_w):
|
||||
"""
|
||||
description: postprocess the prediction
|
||||
param:
|
||||
output: a numpy array like [num_boxes, cx, cy, w, h, conf, cls_id, cx, cy, w, h, conf, cls_id, ...]
|
||||
origin_h: height of original image
|
||||
origin_w: width of original image
|
||||
return:
|
||||
result_boxes: final boxes, a numpy array, each row is a box [x1, y1, x2, y2]
|
||||
result_scores: final scores, a numpy array, each element is the score corresponding to a box
|
||||
result_classid: final class ids, a numpy array, each element is the class id corresponding to a box
|
||||
"""
|
||||
num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
|
||||
# Get the num of boxes detected
|
||||
num = int(output[0])
|
||||
# Reshape to a two-dimensional ndarray
|
||||
# pred = np.reshape(output[1:], (-1, 38))[:num, :]
|
||||
pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
|
||||
# Do nms
|
||||
boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
|
||||
result_boxes = boxes[:, :4] if len(boxes) else np.array([])
|
||||
result_scores = boxes[:, 4] if len(boxes) else np.array([])
|
||||
result_classid = boxes[:, 5] if len(boxes) else np.array([])
|
||||
return result_boxes, result_scores, result_classid
|
||||
|
||||
def bbox_iou(self, box1, box2, x1y1x2y2=True):
|
||||
"""
|
||||
description: compute the IoU of two bounding boxes
|
||||
param:
|
||||
box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
|
||||
box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
|
||||
x1y1x2y2: select the coordinate format
|
||||
return:
|
||||
iou: computed iou
|
||||
"""
|
||||
if not x1y1x2y2:
|
||||
# Transform from center and width to exact coordinates
|
||||
b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
|
||||
b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
|
||||
b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
|
||||
b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
|
||||
else:
|
||||
# Get the coordinates of bounding boxes
|
||||
b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
|
||||
b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
|
||||
|
||||
# Get the coordinates of the intersection rectangle
|
||||
inter_rect_x1 = np.maximum(b1_x1, b2_x1)
|
||||
inter_rect_y1 = np.maximum(b1_y1, b2_y1)
|
||||
inter_rect_x2 = np.minimum(b1_x2, b2_x2)
|
||||
inter_rect_y2 = np.minimum(b1_y2, b2_y2)
|
||||
# Intersection area
|
||||
inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
|
||||
* np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
|
||||
# Union Area
|
||||
b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
|
||||
b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
|
||||
|
||||
iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
|
||||
|
||||
return iou
|
||||
|
||||
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
|
||||
"""
|
||||
description: Removes detections with lower object confidence score than 'conf_thres' and performs
|
||||
Non-Maximum Suppression to further filter detections.
|
||||
param:
|
||||
prediction: detections, (x1, y1, x2, y2, conf, cls_id)
|
||||
origin_h: original image height
|
||||
origin_w: original image width
|
||||
conf_thres: a confidence threshold to filter detections
|
||||
nms_thres: an IoU threshold to filter detections
|
||||
return:
|
||||
boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
|
||||
"""
|
||||
# Get the boxes that score > CONF_THRESH
|
||||
boxes = prediction[prediction[:, 4] >= conf_thres]
|
||||
# Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
|
||||
boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
|
||||
# clip the coordinates
|
||||
boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
|
||||
boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
|
||||
boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
|
||||
boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
|
||||
# Object confidence
|
||||
confs = boxes[:, 4]
|
||||
# Sort by the confs
|
||||
boxes = boxes[np.argsort(-confs)]
|
||||
# Perform non-maximum suppression
|
||||
keep_boxes = []
|
||||
while boxes.shape[0]:
|
||||
large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
|
||||
label_match = boxes[0, -1] == boxes[:, -1]
|
||||
# Indices of boxes with lower confidence scores, large IOUs and matching labels
|
||||
invalid = large_overlap & label_match
|
||||
keep_boxes += [boxes[0]]
|
||||
boxes = boxes[~invalid]
|
||||
boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
|
||||
return boxes
|
||||
|
||||
|
||||
class inferThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper, image_path_batch):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
self.image_path_batch = image_path_batch
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch))
|
||||
for i, img_path in enumerate(self.image_path_batch):
|
||||
parent, filename = os.path.split(img_path)
|
||||
save_name = os.path.join('output', filename)
|
||||
# Save image
|
||||
cv2.imwrite(save_name, batch_image_raw[i])
|
||||
print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
|
||||
|
||||
|
||||
class warmUpThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros())
|
||||
print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# load custom plugin and engine
|
||||
PLUGIN_LIBRARY = "./build_20250603/libmyplugins.so"
|
||||
engine_file_path = "./build_20250603/best.engine"
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
engine_file_path = sys.argv[1]
|
||||
if len(sys.argv) > 2:
|
||||
PLUGIN_LIBRARY = sys.argv[2]
|
||||
|
||||
ctypes.CDLL(PLUGIN_LIBRARY)
|
||||
|
||||
# load coco labels
|
||||
|
||||
# categories = ["face", "shoe", "phone", "e-bike"]
|
||||
|
||||
categories = ["helmet","non-Helmet","shoes"]
|
||||
|
||||
if os.path.exists('output/'):
|
||||
shutil.rmtree('output/')
|
||||
os.makedirs('output/')
|
||||
# a YoLov8TRT instance
|
||||
yolov8_wrapper = YoLov8TRT(engine_file_path)
|
||||
try:
|
||||
print('batch size is', yolov8_wrapper.batch_size)
|
||||
|
||||
image_dir = "images/"
|
||||
image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)
|
||||
|
||||
for i in range(10):
|
||||
# create a new thread to do warm_up
|
||||
thread1 = warmUpThread(yolov8_wrapper)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
for batch in image_path_batches:
|
||||
# create a new thread to do inference
|
||||
thread1 = inferThread(yolov8_wrapper, batch)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
finally:
|
||||
# destroy the instance
|
||||
yolov8_wrapper.destroy()
|
||||
276
yolov8_obb.cpp
Normal file
@ -0,0 +1,276 @@
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include "cuda_utils.h"
|
||||
#include "logging.h"
|
||||
#include "model.h"
|
||||
#include "postprocess.h"
|
||||
#include "preprocess.h"
|
||||
#include "utils.h"
|
||||
|
||||
Logger gLogger;
|
||||
using namespace nvinfer1;
|
||||
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
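// kOutputSize: one leading float for the detection count, followed by kMaxNumOutputBbox Detection structs flattened into floats.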
|
||||
|
||||
void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
|
||||
float& gw, int& max_channels) {
|
||||
IBuilder* builder = createInferBuilder(gLogger);
|
||||
IBuilderConfig* config = builder->createBuilderConfig();
|
||||
IHostMemory* serialized_engine = nullptr;
|
||||
|
||||
if (is_p == 6) {
|
||||
std::cout << "p6 is not supported right now" << std::endl;
|
||||
} else if (is_p == 2) {
|
||||
std::cout << "p2 is not supported right now" << std::endl;
|
||||
} else {
|
||||
serialized_engine = buildEngineYolov8Obb(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
|
||||
}
|
||||
|
||||
assert(serialized_engine);
|
||||
std::ofstream p(engine_name, std::ios::binary);
|
||||
if (!p) {
|
||||
std::cout << "could not open plan output file" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
|
||||
|
||||
delete serialized_engine;
|
||||
delete config;
|
||||
delete builder;
|
||||
}
|
||||
|
||||
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
|
||||
IExecutionContext** context) {
|
||||
std::ifstream file(engine_name, std::ios::binary);
|
||||
if (!file.good()) {
|
||||
std::cerr << "read " << engine_name << " error!" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
size_t size = 0;
|
||||
file.seekg(0, file.end);
|
||||
size = file.tellg();
|
||||
file.seekg(0, file.beg);
|
||||
char* serialized_engine = new char[size];
|
||||
assert(serialized_engine);
|
||||
file.read(serialized_engine, size);
|
||||
file.close();
|
||||
|
||||
*runtime = createInferRuntime(gLogger);
|
||||
assert(*runtime);
|
||||
*engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
|
||||
assert(*engine);
|
||||
*context = (*engine)->createExecutionContext();
|
||||
assert(*context);
|
||||
delete[] serialized_engine;
|
||||
}
|
||||
|
||||
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
|
||||
float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
|
||||
std::string cuda_post_process) {
|
||||
assert(engine->getNbBindings() == 2);
|
||||
// In order to bind the buffers, we need to know the names of the input and output tensors.
|
||||
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
|
||||
const int inputIndex = engine->getBindingIndex(kInputTensorName);
|
||||
const int outputIndex = engine->getBindingIndex(kOutputTensorName);
|
||||
assert(inputIndex == 0);
|
||||
assert(outputIndex == 1);
|
||||
// Create GPU buffers on device
|
||||
CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
|
||||
CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
|
||||
if (cuda_post_process == "c") {
|
||||
*output_buffer_host = new float[kBatchSize * kOutputSize];
|
||||
} else if (cuda_post_process == "g") {
|
||||
if (kBatchSize > 1) {
|
||||
std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
|
||||
exit(0);
|
||||
}
|
||||
// Allocate memory for decode_ptr_host and copy to device
|
||||
*decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
|
||||
CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
|
||||
}
|
||||
}
|
||||
|
||||
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
|
||||
float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
|
||||
// infer on the batch asynchronously, and DMA output back to host
|
||||
auto start = std::chrono::system_clock::now();
|
||||
context.enqueue(batchsize, buffers, stream, nullptr);
|
||||
if (cuda_post_process == "c") {
|
||||
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
|
||||
stream));
|
||||
auto end = std::chrono::system_clock::now();
|
||||
std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
|
||||
<< "ms" << std::endl;
|
||||
} else if (cuda_post_process == "g") {
|
||||
CUDA_CHECK(
|
||||
cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
|
||||
cuda_decode_obb((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
|
||||
cuda_nms_obb(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms
|
||||
CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
|
||||
sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
|
||||
stream));
|
||||
auto end = std::chrono::system_clock::now();
|
||||
std::cout << "inference and gpu postprocess time: "
|
||||
<< std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
|
||||
}
|
||||
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
|
||||
std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
|
||||
if (argc < 4)
|
||||
return false;
|
||||
if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
|
||||
wts = std::string(argv[2]);
|
||||
engine = std::string(argv[3]);
|
||||
auto sub_type = std::string(argv[4]);
|
||||
|
||||
if (sub_type[0] == 'n') {
|
||||
gd = 0.33;
|
||||
gw = 0.25;
|
||||
max_channels = 1024;
|
||||
} else if (sub_type[0] == 's') {
|
||||
gd = 0.33;
|
||||
gw = 0.50;
|
||||
max_channels = 1024;
|
||||
} else if (sub_type[0] == 'm') {
|
||||
gd = 0.67;
|
||||
gw = 0.75;
|
||||
max_channels = 576;
|
||||
} else if (sub_type[0] == 'l') {
|
||||
gd = 1.0;
|
||||
gw = 1.0;
|
||||
max_channels = 512;
|
||||
} else if (sub_type[0] == 'x') {
|
||||
gd = 1.0;
|
||||
gw = 1.25;
|
||||
max_channels = 640;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
if (sub_type.size() == 2 && sub_type[1] == '6') {
|
||||
is_p = 6;
|
||||
} else if (sub_type.size() == 2 && sub_type[1] == '2') {
|
||||
is_p = 2;
|
||||
}
|
||||
} else if (std::string(argv[1]) == "-d" && argc == 5) {
|
||||
engine = std::string(argv[2]);
|
||||
img_dir = std::string(argv[3]);
|
||||
cuda_post_process = std::string(argv[4]);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
cudaSetDevice(kGpuId);
|
||||
std::string wts_name = "";
|
||||
std::string engine_name = "";
|
||||
std::string img_dir;
|
||||
std::string sub_type = "";
|
||||
std::string cuda_post_process = "";
|
||||
int model_bboxes;
|
||||
int is_p = 0;
|
||||
float gd = 0.0f, gw = 0.0f;
|
||||
int max_channels = 0;
|
||||
|
||||
if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
|
||||
max_channels)) {
|
||||
std::cerr << "Arguments not right!" << std::endl;
|
||||
std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to "
|
||||
"plan file"
|
||||
<< std::endl;
|
||||
std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Create a model using the API directly and serialize it to a file
|
||||
if (!wts_name.empty()) {
|
||||
serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Deserialize the engine from file
|
||||
IRuntime* runtime = nullptr;
|
||||
ICudaEngine* engine = nullptr;
|
||||
IExecutionContext* context = nullptr;
|
||||
deserialize_engine(engine_name, &runtime, &engine, &context);
|
||||
cudaStream_t stream;
|
||||
CUDA_CHECK(cudaStreamCreate(&stream));
|
||||
cuda_preprocess_init(kMaxInputImageSize);
|
||||
auto out_dims = engine->getBindingDimensions(1);
|
||||
model_bboxes = out_dims.d[0];
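// model_bboxes: number of candidate boxes per image, taken from the first dimension of the output binding.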
|
||||
// Prepare cpu and gpu buffers
|
||||
float* device_buffers[2];
|
||||
float* output_buffer_host = nullptr;
|
||||
float* decode_ptr_host = nullptr;
|
||||
float* decode_ptr_device = nullptr;
|
||||
|
||||
// Read images from directory
|
||||
std::vector<std::string> file_names;
|
||||
if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
|
||||
std::cerr << "read_files_in_dir failed." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
|
||||
&decode_ptr_device, cuda_post_process);
|
||||
|
||||
// batch predict
|
||||
for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
|
||||
// Get a batch of images
|
||||
std::vector<cv::Mat> img_batch;
|
||||
std::vector<std::string> img_name_batch;
|
||||
for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
|
||||
cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
|
||||
img_batch.push_back(img);
|
||||
img_name_batch.push_back(file_names[j]);
|
||||
}
|
||||
// Preprocess
|
||||
cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
|
||||
// Run inference
|
||||
infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
|
||||
decode_ptr_device, model_bboxes, cuda_post_process);
|
||||
std::vector<std::vector<Detection>> res_batch;
|
||||
if (cuda_post_process == "c") {
|
||||
// NMS
|
||||
batch_nms_obb(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
|
||||
} else if (cuda_post_process == "g") {
|
||||
//Process gpu decode and nms results
|
||||
batch_process_obb(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
|
||||
}
|
||||
// Draw bounding boxes
|
||||
draw_bbox_obb(img_batch, res_batch);
|
||||
// Save images
|
||||
for (size_t j = 0; j < img_batch.size(); j++) {
|
||||
cv::imwrite("_" + img_name_batch[j], img_batch[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Release stream and buffers
|
||||
cudaStreamDestroy(stream);
|
||||
CUDA_CHECK(cudaFree(device_buffers[0]));
|
||||
CUDA_CHECK(cudaFree(device_buffers[1]));
|
||||
CUDA_CHECK(cudaFree(decode_ptr_device));
|
||||
delete[] decode_ptr_host;
|
||||
delete[] output_buffer_host;
|
||||
cuda_preprocess_destroy();
|
||||
// Destroy the engine
|
||||
delete context;
|
||||
delete engine;
|
||||
delete runtime;
|
||||
|
||||
// Print histogram of the output distribution
|
||||
//std::cout << "\nOutput:\n\n";
|
||||
//for (unsigned int i = 0; i < kOutputSize; i++)
|
||||
//{
|
||||
// std::cout << prob[i] << ", ";
|
||||
// if (i % 10 == 0) std::cout << std::endl;
|
||||
//}
|
||||
//std::cout << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
571
yolov8_obb_trt.py
Normal file
@ -0,0 +1,571 @@
|
||||
"""
|
||||
An example that uses TensorRT's Python API to run inference.
|
||||
"""
|
||||
import ctypes
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import cv2
|
||||
import math
|
||||
import numpy as np
|
||||
import pycuda.autoinit # noqa: F401
|
||||
import pycuda.driver as cuda
|
||||
import tensorrt as trt
|
||||
|
||||
CONF_THRESH = 0.5
|
||||
IOU_THRESHOLD = 0.4
|
||||
POSE_NUM = 17 * 3
|
||||
DET_NUM = 6
|
||||
SEG_NUM = 32
|
||||
OBB_NUM = 1
|
||||
|
||||
|
||||
def get_img_path_batches(batch_size, img_dir):
|
||||
ret = []
|
||||
batch = []
|
||||
for root, dirs, files in os.walk(img_dir):
|
||||
for name in files:
|
||||
if len(batch) == batch_size:
|
||||
ret.append(batch)
|
||||
batch = []
|
||||
batch.append(os.path.join(root, name))
|
||||
if len(batch) > 0:
|
||||
ret.append(batch)
|
||||
return ret
|
||||
|
||||
|
||||
def regularize_rboxes(rboxes):
|
||||
"""
|
||||
Regularize rotated boxes in range [0, pi/2].
|
||||
|
||||
Args:
|
||||
rboxes (numpy.ndarray): Input boxes of shape(N, 5) in xywhr format.
|
||||
|
||||
Returns:
|
||||
(numpy.ndarray): The regularized boxes.
|
||||
"""
|
||||
x, y, w, h, t = np.split(rboxes, 5, axis=-1)
|
||||
w_ = np.where(w > h, w, h)
|
||||
h_ = np.where(w > h, h, w)
|
||||
t = np.where(w > h, t, t + math.pi / 2) % math.pi
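# Keep the longer side as the width: swap w/h where needed, shift the angle by pi/2 for swapped boxes, and wrap it with modulo pi.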
|
||||
return np.concatenate([x, y, w_, h_, t], axis=-1) # regularized boxes
|
||||
|
||||
|
||||
def xywhr2xyxyxyxy(x):
|
||||
"""
|
||||
Convert batched Oriented Bounding Boxes (OBB) from [xywh, rotation] to [xy1, xy2, xy3, xy4].
|
||||
|
||||
Args:
|
||||
x (numpy.ndarray): Boxes in [cx, cy, w, h, rotation] format of shape (n, 5) or (b, n, 5).
|
||||
|
||||
Returns:
|
||||
(numpy.ndarray): Converted corner points of shape (n, 4, 2) or (b, n, 4, 2).
|
||||
"""
|
||||
# Regularize the input boxes first
|
||||
rboxes = regularize_rboxes(x)
|
||||
|
||||
ctr = rboxes[..., :2]
|
||||
w, h, angle = (rboxes[..., i: i + 1] for i in range(2, 5))
|
||||
|
||||
cos_value = np.cos(angle)
|
||||
sin_value = np.sin(angle)
|
||||
|
||||
vec1 = np.concatenate([w / 2 * cos_value, w / 2 * sin_value], axis=-1)
|
||||
vec2 = np.concatenate([-h / 2 * sin_value, h / 2 * cos_value], axis=-1)
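# vec1 and vec2 are the half-width and half-height vectors rotated by the box angle; the four corners are ctr +/- vec1 +/- vec2.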
|
||||
|
||||
pt1 = ctr + vec1 + vec2
|
||||
pt2 = ctr + vec1 - vec2
|
||||
pt3 = ctr - vec1 - vec2
|
||||
pt4 = ctr - vec1 + vec2
|
||||
|
||||
return np.stack([pt1, pt2, pt3, pt4], axis=-2)
|
||||
|
||||
|
||||
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
|
||||
"""
|
||||
description: Plots one bounding box on image img,
|
||||
this function comes from the YOLOv8 project.
|
||||
param:
|
||||
x: a box like [x1, y1, x2, y2, angle]
|
||||
img: an OpenCV image object
|
||||
color: color to draw rectangle, such as (0,255,0)
|
||||
label: str
|
||||
line_thickness: int
|
||||
return:
|
||||
no return
|
||||
|
||||
"""
|
||||
tl = (
|
||||
line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
|
||||
) # line/font thickness
|
||||
box = xywhr2xyxyxyxy(x).reshape(-1, 4, 2).squeeze()
|
||||
p1 = [int(b) for b in box[0]]
|
||||
# NOTE: cv2-version polylines needs np.asarray type.
|
||||
cv2.polylines(img, [np.asarray(box, dtype=int)], True, color, thickness=tl, lineType=cv2.LINE_AA)
|
||||
if label:
|
||||
tf = max(tl - 1, 1) # font thickness
|
||||
w, h = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] # text width, height
|
||||
outside = p1[1] - h >= 3
|
||||
p2 = p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3
|
||||
cv2.rectangle(img, p1, p2, color, -1, cv2.LINE_AA) # filled
|
||||
cv2.putText(
|
||||
img,
|
||||
label,
|
||||
(p1[0], p1[1] - 2 if outside else p1[1] + h + 2),
|
||||
0,
|
||||
tl / 3,
|
||||
[225, 255, 255],
|
||||
thickness=tf,
|
||||
lineType=cv2.LINE_AA,
|
||||
)
|
||||
|
||||
|
||||
class YoLov8TRT(object):
|
||||
"""
|
||||
description: A YOLOv8 class that wraps TensorRT ops, preprocess and postprocess ops.
|
||||
"""
|
||||
|
||||
def __init__(self, engine_file_path):
|
||||
# Create a Context on this device,
|
||||
self.ctx = cuda.Device(0).make_context()
|
||||
stream = cuda.Stream()
|
||||
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
|
||||
runtime = trt.Runtime(TRT_LOGGER)
|
||||
|
||||
# Deserialize the engine from file
|
||||
with open(engine_file_path, "rb") as f:
|
||||
engine = runtime.deserialize_cuda_engine(f.read())
|
||||
context = engine.create_execution_context()
|
||||
|
||||
host_inputs = []
|
||||
cuda_inputs = []
|
||||
host_outputs = []
|
||||
cuda_outputs = []
|
||||
bindings = []
|
||||
|
||||
for binding in engine:
|
||||
print('binding:', binding, engine.get_binding_shape(binding))
|
||||
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
|
||||
dtype = trt.nptype(engine.get_binding_dtype(binding))
|
||||
# Allocate host and device buffers
|
||||
host_mem = cuda.pagelocked_empty(size, dtype)
|
||||
cuda_mem = cuda.mem_alloc(host_mem.nbytes)
|
||||
# Append the device buffer to device bindings.
|
||||
bindings.append(int(cuda_mem))
|
||||
# Append to the appropriate list.
|
||||
if engine.binding_is_input(binding):
|
||||
self.input_w = engine.get_binding_shape(binding)[-1]
|
||||
self.input_h = engine.get_binding_shape(binding)[-2]
|
||||
host_inputs.append(host_mem)
|
||||
cuda_inputs.append(cuda_mem)
|
||||
else:
|
||||
host_outputs.append(host_mem)
|
||||
cuda_outputs.append(cuda_mem)
|
||||
|
||||
# Store
|
||||
self.stream = stream
|
||||
self.context = context
|
||||
self.engine = engine
|
||||
self.host_inputs = host_inputs
|
||||
self.cuda_inputs = cuda_inputs
|
||||
self.host_outputs = host_outputs
|
||||
self.cuda_outputs = cuda_outputs
|
||||
self.bindings = bindings
|
||||
self.batch_size = engine.max_batch_size
|
||||
self.det_output_length = host_outputs[0].shape[0]
|
||||
|
||||
def infer(self, raw_image_generator):
|
||||
threading.Thread.__init__(self)
|
||||
# Make self the active context, pushing it on top of the context stack.
|
||||
self.ctx.push()
|
||||
# Restore
|
||||
stream = self.stream
|
||||
context = self.context
|
||||
host_inputs = self.host_inputs
|
||||
cuda_inputs = self.cuda_inputs
|
||||
host_outputs = self.host_outputs
|
||||
cuda_outputs = self.cuda_outputs
|
||||
bindings = self.bindings
|
||||
# Do image preprocess
|
||||
batch_image_raw = []
|
||||
batch_origin_h = []
|
||||
batch_origin_w = []
|
||||
batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
|
||||
for i, image_raw in enumerate(raw_image_generator):
|
||||
input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
|
||||
batch_image_raw.append(image_raw)
|
||||
batch_origin_h.append(origin_h)
|
||||
batch_origin_w.append(origin_w)
|
||||
np.copyto(batch_input_image[i], input_image)
|
||||
batch_input_image = np.ascontiguousarray(batch_input_image)
|
||||
|
||||
# Copy input image to host buffer
|
||||
np.copyto(host_inputs[0], batch_input_image.ravel())
|
||||
start = time.time()
|
||||
# Transfer input data to the GPU.
|
||||
cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
|
||||
# Run inference.
|
||||
context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
|
||||
# Transfer predictions back from the GPU.
|
||||
cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
|
||||
# Synchronize the stream
|
||||
stream.synchronize()
|
||||
end = time.time()
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
# Here we use the first row of output in that batch_size = 1
|
||||
output = host_outputs[0]
|
||||
# Do postprocess
|
||||
for i in range(self.batch_size):
|
||||
result_boxes, result_scores, result_classid = self.post_process(
|
||||
output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
|
||||
batch_origin_w[i]
|
||||
)
|
||||
# Draw rectangles and labels on the original image
|
||||
for j in range(len(result_boxes)):
|
||||
box = result_boxes[j]
|
||||
np.random.seed(int(result_classid[j]))
|
||||
color = [np.random.randint(0, 255) for _ in range(3)]
|
||||
plot_one_box(
|
||||
box,
|
||||
batch_image_raw[i],
|
||||
label="{}:{:.2f}".format(
|
||||
categories[int(result_classid[j])], result_scores[j]
|
||||
),
|
||||
color=color,
|
||||
line_thickness=1
|
||||
)
|
||||
return batch_image_raw, end - start
|
||||
|
||||
def destroy(self):
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
|
||||
def get_raw_image(self, image_path_batch):
|
||||
"""
|
||||
description: Read an image from image path
|
||||
"""
|
||||
for img_path in image_path_batch:
|
||||
yield cv2.imread(img_path)
|
||||
|
||||
def get_raw_image_zeros(self, image_path_batch=None):
|
||||
"""
|
||||
description: Ready data for warmup
|
||||
"""
|
||||
for _ in range(self.batch_size):
|
||||
yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
|
||||
|
||||
def preprocess_image(self, raw_bgr_image):
|
||||
"""
|
||||
description: Convert BGR image to RGB,
|
||||
resize and pad it to target size, normalize to [0,1],
|
||||
transform to NCHW format.
|
||||
param:
|
||||
input_image_path: str, image path
|
||||
return:
|
||||
image: the processed image
|
||||
image_raw: the original image
|
||||
h: original height
|
||||
w: original width
|
||||
"""
|
||||
image_raw = raw_bgr_image
|
||||
h, w, c = image_raw.shape
|
||||
image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
|
||||
# Calculate width and height and paddings
|
||||
r_w = self.input_w / w
|
||||
r_h = self.input_h / h
|
||||
if r_h > r_w:
|
||||
tw = self.input_w
|
||||
th = int(r_w * h)
|
||||
tx1 = tx2 = 0
|
||||
ty1 = int((self.input_h - th) / 2)
|
||||
ty2 = self.input_h - th - ty1
|
||||
else:
|
||||
tw = int(r_h * w)
|
||||
th = self.input_h
|
||||
tx1 = int((self.input_w - tw) / 2)
|
||||
tx2 = self.input_w - tw - tx1
|
||||
ty1 = ty2 = 0
|
||||
# Resize the long side to the input size while keeping the aspect ratio
|
||||
image = cv2.resize(image, (tw, th))
|
||||
# Pad the short side with (128,128,128)
|
||||
image = cv2.copyMakeBorder(
|
||||
image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
|
||||
)
|
||||
image = image.astype(np.float32)
|
||||
# Normalize to [0,1]
|
||||
image /= 255.0
|
||||
# HWC to CHW format:
|
||||
image = np.transpose(image, [2, 0, 1])
|
||||
# CHW to NCHW format
|
||||
image = np.expand_dims(image, axis=0)
|
||||
# Convert the image to row-major order, also known as "C order":
|
||||
image = np.ascontiguousarray(image)
|
||||
return image, image_raw, h, w
|
||||
|
||||
def xywh2xyxy(self, origin_h, origin_w, x):
|
||||
"""
|
||||
description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
|
||||
param:
|
||||
origin_h: height of original image
|
||||
origin_w: width of original image
|
||||
x: A boxes numpy, each row is a box [center_x, center_y, w, h]
|
||||
return:
|
||||
y: A boxes numpy, each row is a box [x1, y1, x2, y2]
|
||||
"""
|
||||
y = np.zeros_like(x)
|
||||
r_w = self.input_w / origin_w
|
||||
r_h = self.input_h / origin_h
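# Undo the letterbox: subtract the padding added on the shorter side, then divide by the resize ratio.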
|
||||
if r_h > r_w:
|
||||
y[:, 0] = x[:, 0]
|
||||
y[:, 2] = x[:, 2]
|
||||
y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
|
||||
y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
|
||||
y /= r_w
|
||||
else:
|
||||
y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
|
||||
y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
|
||||
y[:, 1] = x[:, 1]
|
||||
y[:, 3] = x[:, 3]
|
||||
y /= r_h
|
||||
|
||||
return y
|
||||
|
||||
def post_process(self, output, origin_h, origin_w):
|
||||
"""
|
||||
description: postprocess the prediction
|
||||
param:
|
||||
output: A numpy array like [num_boxes, cx, cy, w, h, conf, cls_id, angle, cx, cy, w, h, conf, cls_id, angle, ...]
|
||||
origin_h: height of original image
|
||||
origin_w: width of original image
|
||||
return:
|
||||
result_boxes: final boxes, a numpy array, each row is a box [x1, y1, x2, y2, angle]
|
||||
result_scores: final scores, a numpy array, each element is the score corresponding to a box
|
||||
result_classid: final class ids, a numpy array, each element is the class id corresponding to a box
|
||||
"""
|
||||
num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
|
||||
# Get the num of boxes detected
|
||||
num = int(output[0])
|
||||
# Reshape to a two-dimensional ndarray
|
||||
# pred = np.reshape(output[1:], (-1, 38))[:num, :]
|
||||
pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
|
||||
# Do nms
|
||||
boxes = self.non_max_suppression(pred, origin_h, origin_w,
|
||||
conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
|
||||
|
||||
columns_to_keep = [0, 1, 2, 3, 89]
|
||||
result_boxes = boxes[:, columns_to_keep] if len(boxes) else np.array([])
|
||||
result_scores = boxes[:, 4] if len(boxes) else np.array([])
|
||||
result_classid = boxes[:, 5] if len(boxes) else np.array([])
|
||||
return result_boxes, result_scores, result_classid
|
||||
|
||||
def covariance_matrix(self, boxes):
|
||||
"""
|
||||
description: Generating covariance matrix from obbs.
|
||||
param:
|
||||
boxes (np.ndarray): A numpy array of shape (N, 5) representing rotated bounding boxes in xywhr format.
|
||||
|
||||
return:
|
||||
(np.ndarray): Covariance matrices corresponding to the original rotated bounding boxes.
|
||||
"""
|
||||
# Gaussian bounding boxes, ignore the center points (the first two columns) because they are not needed here.
|
||||
widths = boxes[:, 2:3].reshape(-1)
|
||||
heights = boxes[:, 3:4].reshape(-1)
|
||||
angles = boxes[:, 4].reshape(-1)
|
||||
|
||||
a, b, c = (widths ** 2) / 12, (heights ** 2) / 12, angles
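# a and b are the per-axis variances of a uniform distribution over the box (w^2/12 and h^2/12); c holds the rotation angle.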
|
||||
|
||||
cos_angles = np.cos(c)
|
||||
sin_angles = np.sin(c)
|
||||
|
||||
cos2 = cos_angles ** 2
|
||||
sin2 = sin_angles ** 2
|
||||
|
||||
return a * cos2 + b * sin2, a * sin2 + b * cos2, (a - b) * cos_angles * sin_angles
|
||||
|
||||
def bbox_iou(self, box1, box2, x1y1x2y2=True):
|
||||
"""
|
||||
description: compute the IoU of two bounding boxes
|
||||
param:
|
||||
box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
|
||||
box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
|
||||
x1y1x2y2: select the coordinate format
|
||||
return:
|
||||
iou: computed iou
|
||||
"""
|
||||
if not x1y1x2y2:
|
||||
# Transform from center and width to exact coordinates
|
||||
b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
|
||||
b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
|
||||
b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
|
||||
b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
|
||||
else:
|
||||
# Get the coordinates of bounding boxes
|
||||
b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
|
||||
b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
|
||||
|
||||
# Get the coordinates of the intersection rectangle
|
||||
inter_rect_x1 = np.maximum(b1_x1, b2_x1)
|
||||
inter_rect_y1 = np.maximum(b1_y1, b2_y1)
|
||||
inter_rect_x2 = np.minimum(b1_x2, b2_x2)
|
||||
inter_rect_y2 = np.minimum(b1_y2, b2_y2)
|
||||
# Intersection area
|
||||
inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
|
||||
* np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
|
||||
# Union Area
|
||||
b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
|
||||
b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
|
||||
|
||||
iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
|
||||
|
||||
return iou
|
||||
|
||||
def batch_probiou(self, obb1, obb2, eps=1e-7):
|
||||
"""
|
||||
description: Calculate the prob IoU between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
|
||||
param:
|
||||
obb1 (np.ndarray): A numpy array of shape (N, 5) representing ground-truth obbs in xywhr format.
|
||||
obb2 (np.ndarray): A numpy array of shape (M, 5) representing predicted obbs in xywhr format.
|
||||
eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
|
||||
return:
|
||||
iou: computed iou
|
||||
"""
|
||||
x1, y1 = obb1[:, 0], obb1[:, 1]
|
||||
x2, y2 = obb2[:, 0], obb2[:, 1]
|
||||
|
||||
a1, b1, c1 = self.covariance_matrix(obb1)
|
||||
a2, b2, c2 = self.covariance_matrix(obb2)
|
||||
|
||||
t1 = (
|
||||
((a1 + a2) * (y1 - y2) ** 2 + (b1 + b2) * (x1 - x2) ** 2) /
|
||||
((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2 + eps)
|
||||
) * 0.25
|
||||
|
||||
t2 = (
|
||||
((c1 + c2) * (x2 - x1) * (y1 - y2)) /
|
||||
((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2 + eps)
|
||||
) * 0.5
|
||||
|
||||
t3 = (
|
||||
((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2) /
|
||||
(4 * (np.clip(a1 * b1 - c1 ** 2, 0, None) * np.clip(a2 * b2 - c2 ** 2, 0, None)) ** 0.5 + eps)
|
||||
+ eps
|
||||
)
|
||||
t3 = np.log(t3) * 0.5
|
||||
|
||||
bd = np.clip(t1 + t2 + t3, eps, 100.0)
|
||||
hd = np.sqrt(1.0 - np.exp(-bd) + eps)
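# bd is the Bhattacharyya distance between the two Gaussian boxes and hd the corresponding Hellinger distance, so 1 - hd serves as the probabilistic IoU (see the paper linked in the docstring).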
|
||||
return 1 - hd
|
||||
|
||||
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
|
||||
"""
|
||||
description: Removes detections with lower object confidence score than 'conf_thres' and performs
|
||||
Non-Maximum Suppression to further filter detections.
|
||||
param:
|
||||
prediction: detections, (x1, y1, x2, y2, conf, cls_id, angle)
|
||||
origin_h: original image height
|
||||
origin_w: original image width
|
||||
conf_thres: a confidence threshold to filter detections
|
||||
nms_thres: an IoU threshold to filter detections
|
||||
return:
|
||||
boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id, angle)
|
||||
"""
|
||||
# Get the boxes that score > CONF_THRESH
|
||||
boxes = prediction[prediction[:, 4] >= conf_thres]
|
||||
col_idx = [0, 1, 2, 3, 89]
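# Columns 0-3 hold the box; column 89 is the last value of each 90-float detection and holds the OBB angle.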
|
||||
# Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
|
||||
boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
|
||||
# clip the coordinates
|
||||
boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
|
||||
boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
|
||||
boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
|
||||
boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
|
||||
# Object confidence
|
||||
confs = boxes[:, 4]
|
||||
# Sort by the confs
|
||||
boxes = boxes[np.argsort(-confs)]
|
||||
# Perform non-maximum suppression
|
||||
keep_boxes = []
|
||||
while boxes.shape[0]:
|
||||
large_overlap = self.batch_probiou(np.expand_dims(boxes[0, col_idx], 0), boxes[:, col_idx]) > nms_thres
|
||||
label_match = boxes[0, 5] == boxes[:, 5]
|
||||
# Indices of boxes with lower confidence scores, large IOUs and matching labels
|
||||
invalid = large_overlap & label_match
|
||||
keep_boxes += [boxes[0]]
|
||||
boxes = boxes[~invalid]
|
||||
boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
|
||||
|
||||
return boxes
|
||||
|
||||
|
||||
class inferThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper, image_path_batch):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
self.image_path_batch = image_path_batch
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch))
|
||||
for i, img_path in enumerate(self.image_path_batch):
|
||||
parent, filename = os.path.split(img_path)
|
||||
save_name = os.path.join('output', filename)
|
||||
# Save image
|
||||
cv2.imwrite(save_name, batch_image_raw[i])
|
||||
print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
|
||||
|
||||
|
||||
class warmUpThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros())
|
||||
print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# load custom plugin and engine
|
||||
PLUGIN_LIBRARY = "./build/libmyplugins.so"
|
||||
engine_file_path = "yolov8n-obb.engine"
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
engine_file_path = sys.argv[1]
|
||||
if len(sys.argv) > 2:
|
||||
PLUGIN_LIBRARY = sys.argv[2]
|
||||
|
||||
ctypes.CDLL(PLUGIN_LIBRARY)
|
||||
|
||||
# load DOTAV 1.5 labels
|
||||
|
||||
categories = ["plane", "ship", "storage tank", "baseball diamond", "tennis court",
|
||||
"basketball court", "ground track field", "harbor",
|
||||
"bridge", "large vehicle", "small vehicle", "helicopter",
|
||||
"roundabout", "soccer ball field", "swimming pool", "container crane"]
|
||||
|
||||
if os.path.exists('output/'):
|
||||
shutil.rmtree('output/')
|
||||
os.makedirs('output/')
|
||||
# a YoLov8TRT instance
|
||||
yolov8_wrapper = YoLov8TRT(engine_file_path)
|
||||
try:
|
||||
print('batch size is', yolov8_wrapper.batch_size)
|
||||
|
||||
image_dir = "images/"
|
||||
image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)
|
||||
|
||||
for i in range(10):
|
||||
# create a new thread to do warm_up
|
||||
thread1 = warmUpThread(yolov8_wrapper)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
for batch in image_path_batches:
|
||||
# create a new thread to do inference
|
||||
thread1 = inferThread(yolov8_wrapper, batch)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
finally:
|
||||
# destroy the instance
|
||||
yolov8_wrapper.destroy()
|
||||
277
yolov8_pose.cpp
Normal file
@ -0,0 +1,277 @@
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include "cuda_utils.h"
|
||||
#include "logging.h"
|
||||
#include "model.h"
|
||||
#include "postprocess.h"
|
||||
#include "preprocess.h"
|
||||
#include "utils.h"
|
||||
|
||||
Logger gLogger;
|
||||
using namespace nvinfer1;
|
||||
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
|
||||
|
||||
void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
|
||||
float& gw, int& max_channels) {
|
||||
IBuilder* builder = createInferBuilder(gLogger);
|
||||
IBuilderConfig* config = builder->createBuilderConfig();
|
||||
IHostMemory* serialized_engine = nullptr;
|
||||
|
||||
if (is_p == 6) {
|
||||
serialized_engine = buildEngineYolov8PoseP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
|
||||
} else if (is_p == 2) {
|
||||
std::cout << "p2 is not supported right now" << std::endl;
|
||||
} else {
|
||||
serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
|
||||
}
|
||||
|
||||
assert(serialized_engine);
|
||||
std::ofstream p(engine_name, std::ios::binary);
|
||||
if (!p) {
|
||||
std::cout << "could not open plan output file" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
|
||||
|
||||
delete serialized_engine;
|
||||
delete config;
|
||||
delete builder;
|
||||
}
|
||||
|
||||
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
|
||||
IExecutionContext** context) {
|
||||
std::ifstream file(engine_name, std::ios::binary);
|
||||
if (!file.good()) {
|
||||
std::cerr << "read " << engine_name << " error!" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
size_t size = 0;
|
||||
file.seekg(0, file.end);
|
||||
size = file.tellg();
|
||||
file.seekg(0, file.beg);
|
||||
char* serialized_engine = new char[size];
|
||||
assert(serialized_engine);
|
||||
file.read(serialized_engine, size);
|
||||
file.close();
|
||||
|
||||
*runtime = createInferRuntime(gLogger);
|
||||
assert(*runtime);
|
||||
*engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
|
||||
assert(*engine);
|
||||
*context = (*engine)->createExecutionContext();
|
||||
assert(*context);
|
||||
delete[] serialized_engine;
|
||||
}
|
||||
|
||||
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
|
||||
float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
|
||||
std::string cuda_post_process) {
|
||||
assert(engine->getNbBindings() == 2);
|
||||
// In order to bind the buffers, we need to know the names of the input and output tensors.
|
||||
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
|
||||
const int inputIndex = engine->getBindingIndex(kInputTensorName);
|
||||
const int outputIndex = engine->getBindingIndex(kOutputTensorName);
|
||||
assert(inputIndex == 0);
|
||||
assert(outputIndex == 1);
|
||||
// Create GPU buffers on device
|
||||
CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
|
||||
CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
|
||||
if (cuda_post_process == "c") {
|
||||
*output_buffer_host = new float[kBatchSize * kOutputSize];
|
||||
} else if (cuda_post_process == "g") {
|
||||
if (kBatchSize > 1) {
|
||||
std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
|
||||
exit(0);
|
||||
}
|
||||
// Allocate memory for decode_ptr_host and copy to device
|
||||
*decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
|
||||
CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
|
||||
}
|
||||
}
|
||||
|
||||
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
|
||||
float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
|
||||
// infer on the batch asynchronously, and DMA output back to host
|
||||
auto start = std::chrono::system_clock::now();
|
||||
context.enqueue(batchsize, buffers, stream, nullptr);
|
||||
if (cuda_post_process == "c") {
|
||||
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
|
||||
stream));
|
||||
auto end = std::chrono::system_clock::now();
|
||||
std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
|
||||
<< "ms" << std::endl;
|
||||
} else if (cuda_post_process == "g") {
|
||||
CUDA_CHECK(
|
||||
cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
|
||||
cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
|
||||
cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms
|
||||
CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
|
||||
sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
|
||||
stream));
|
||||
auto end = std::chrono::system_clock::now();
|
||||
std::cout << "inference and gpu postprocess time: "
|
||||
<< std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
|
||||
}
|
||||
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
|
||||
std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
|
||||
if (argc < 4)
|
||||
return false;
|
||||
if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
|
||||
wts = std::string(argv[2]);
|
||||
engine = std::string(argv[3]);
|
||||
auto sub_type = std::string(argv[4]);
|
||||
|
||||
if (sub_type[0] == 'n') {
|
||||
gd = 0.33;
|
||||
gw = 0.25;
|
||||
max_channels = 1024;
|
||||
} else if (sub_type[0] == 's') {
|
||||
gd = 0.33;
|
||||
gw = 0.50;
|
||||
max_channels = 1024;
|
||||
} else if (sub_type[0] == 'm') {
|
||||
gd = 0.67;
|
||||
gw = 0.75;
|
||||
max_channels = 576;
|
||||
} else if (sub_type[0] == 'l') {
|
||||
gd = 1.0;
|
||||
gw = 1.0;
|
||||
max_channels = 512;
|
||||
} else if (sub_type[0] == 'x') {
|
||||
gd = 1.0;
|
||||
gw = 1.25;
|
||||
max_channels = 640;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
if (sub_type.size() == 2 && sub_type[1] == '6') {
|
||||
is_p = 6;
|
||||
} else if (sub_type.size() == 2 && sub_type[1] == '2') {
|
||||
is_p = 2;
|
||||
}
|
||||
} else if (std::string(argv[1]) == "-d" && argc == 5) {
|
||||
engine = std::string(argv[2]);
|
||||
img_dir = std::string(argv[3]);
|
||||
cuda_post_process = std::string(argv[4]);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
cudaSetDevice(kGpuId);
|
||||
std::string wts_name = "";
|
||||
std::string engine_name = "";
|
||||
std::string img_dir;
|
||||
std::string sub_type = "";
|
||||
std::string cuda_post_process = "";
|
||||
int model_bboxes;
|
||||
int is_p = 0;
|
||||
float gd = 0.0f, gw = 0.0f;
|
||||
int max_channels = 0;
|
||||
|
||||
if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
|
||||
max_channels)) {
|
||||
std::cerr << "Arguments not right!" << std::endl;
|
||||
std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to "
|
||||
"plan file"
|
||||
<< std::endl;
|
||||
std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Create a model using the API directly and serialize it to a file
|
||||
if (!wts_name.empty()) {
|
||||
serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Deserialize the engine from file
|
||||
IRuntime* runtime = nullptr;
|
||||
ICudaEngine* engine = nullptr;
|
||||
IExecutionContext* context = nullptr;
|
||||
deserialize_engine(engine_name, &runtime, &engine, &context);
|
||||
cudaStream_t stream;
|
||||
CUDA_CHECK(cudaStreamCreate(&stream));
|
||||
cuda_preprocess_init(kMaxInputImageSize);
|
||||
auto out_dims = engine->getBindingDimensions(1);
|
||||
model_bboxes = out_dims.d[0];
|
||||
// Prepare cpu and gpu buffers
|
||||
float* device_buffers[2];
|
||||
float* output_buffer_host = nullptr;
|
||||
float* decode_ptr_host = nullptr;
|
||||
float* decode_ptr_device = nullptr;
|
||||
|
||||
// Read images from directory
|
||||
std::vector<std::string> file_names;
|
||||
if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
|
||||
std::cerr << "read_files_in_dir failed." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
|
||||
&decode_ptr_device, cuda_post_process);
|
||||
|
||||
// batch predict
|
||||
for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
|
||||
// Get a batch of images
|
||||
std::vector<cv::Mat> img_batch;
|
||||
std::vector<std::string> img_name_batch;
|
||||
for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
|
||||
cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
|
||||
img_batch.push_back(img);
|
||||
img_name_batch.push_back(file_names[j]);
|
||||
}
|
||||
// Preprocess
|
||||
cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
|
||||
// Run inference
|
||||
infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
|
||||
decode_ptr_device, model_bboxes, cuda_post_process);
|
||||
std::vector<std::vector<Detection>> res_batch;
|
||||
if (cuda_post_process == "c") {
|
||||
// NMS
|
||||
batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
|
||||
} else if (cuda_post_process == "g") {
|
||||
// Process gpu decode and nms results
|
||||
// todo pose in gpu
|
||||
std::cerr << "pose_postprocess is not support in gpu right now" << std::endl;
|
||||
}
|
||||
// Draw bounding boxes
|
||||
draw_bbox_keypoints_line(img_batch, res_batch);
|
||||
// Save images
|
||||
for (size_t j = 0; j < img_batch.size(); j++) {
|
||||
cv::imwrite("_" + img_name_batch[j], img_batch[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Release stream and buffers
|
||||
cudaStreamDestroy(stream);
|
||||
CUDA_CHECK(cudaFree(device_buffers[0]));
|
||||
CUDA_CHECK(cudaFree(device_buffers[1]));
|
||||
CUDA_CHECK(cudaFree(decode_ptr_device));
|
||||
delete[] decode_ptr_host;
|
||||
delete[] output_buffer_host;
|
||||
cuda_preprocess_destroy();
|
||||
// Destroy the engine
|
||||
delete context;
|
||||
delete engine;
|
||||
delete runtime;
|
||||
|
||||
// Print histogram of the output distribution
|
||||
//std::cout << "\nOutput:\n\n";
|
||||
//for (unsigned int i = 0; i < kOutputSize; i++)
|
||||
//{
|
||||
// std::cout << prob[i] << ", ";
|
||||
// if (i % 10 == 0) std::cout << std::endl;
|
||||
//}
|
||||
//std::cout << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
502
yolov8_pose_trt.py
Normal file
@ -0,0 +1,502 @@
|
||||
"""
|
||||
An example that uses TensorRT's Python API to run inference.
|
||||
"""
|
||||
import ctypes
|
||||
import os
|
||||
import shutil
|
||||
import random
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pycuda.autoinit # noqa: F401
|
||||
import pycuda.driver as cuda
|
||||
import tensorrt as trt
|
||||
|
||||
CONF_THRESH = 0.5
|
||||
IOU_THRESHOLD = 0.4
|
||||
POSE_NUM = 17 * 3
|
||||
DET_NUM = 6
|
||||
SEG_NUM = 32
|
||||
OBB_NUM = 1
|
||||
keypoint_pairs = [
|
||||
(0, 1), (0, 2), (0, 5), (0, 6), (1, 2),
|
||||
(1, 3), (2, 4), (5, 6), (5, 7), (5, 11),
|
||||
(6, 8), (6, 12), (7, 9), (8, 10), (11, 12),
|
||||
(11, 13), (12, 14), (13, 15), (14, 16)
|
||||
]
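# Index pairs of the 17 pose keypoints that are connected when drawing the skeleton lines below.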
|
||||
|
||||
|
||||
def get_img_path_batches(batch_size, img_dir):
|
||||
ret = []
|
||||
batch = []
|
||||
for root, dirs, files in os.walk(img_dir):
|
||||
for name in files:
|
||||
if len(batch) == batch_size:
|
||||
ret.append(batch)
|
||||
batch = []
|
||||
batch.append(os.path.join(root, name))
|
||||
if len(batch) > 0:
|
||||
ret.append(batch)
|
||||
return ret
|
||||
|
||||
|
||||
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
|
||||
"""
|
||||
description: Plots one bounding box on image img,
|
||||
this function comes from the YOLOv8 project.
|
||||
param:
|
||||
x: a box like [x1, y1, x2, y2]
|
||||
img: an OpenCV image object
|
||||
color: color to draw rectangle, such as (0,255,0)
|
||||
label: str
|
||||
line_thickness: int
|
||||
return:
|
||||
no return
|
||||
|
||||
"""
|
||||
tl = (
|
||||
line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
|
||||
) # line/font thickness
|
||||
color = color or [random.randint(0, 255) for _ in range(3)]
|
||||
c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
|
||||
cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
|
||||
if label:
|
||||
tf = max(tl - 1, 1) # font thickness
|
||||
t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
|
||||
c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
|
||||
cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled
|
||||
cv2.putText(
|
||||
img,
|
||||
label,
|
||||
(c1[0], c1[1] - 2),
|
||||
0,
|
||||
tl / 3,
|
||||
[225, 255, 255],
|
||||
thickness=tf,
|
||||
lineType=cv2.LINE_AA,
|
||||
)
|
||||
|
||||
|
||||
class YoLov8TRT(object):
|
||||
"""
|
||||
description: A YOLOv8 class that wraps TensorRT ops, preprocess and postprocess ops.
|
||||
"""
|
||||
|
||||
def __init__(self, engine_file_path):
|
||||
# Create a Context on this device,
|
||||
self.ctx = cuda.Device(0).make_context()
|
||||
stream = cuda.Stream()
|
||||
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
|
||||
runtime = trt.Runtime(TRT_LOGGER)
|
||||
|
||||
# Deserialize the engine from file
|
||||
with open(engine_file_path, "rb") as f:
|
||||
engine = runtime.deserialize_cuda_engine(f.read())
|
||||
context = engine.create_execution_context()
|
||||
|
||||
host_inputs = []
|
||||
cuda_inputs = []
|
||||
host_outputs = []
|
||||
cuda_outputs = []
|
||||
bindings = []
|
||||
|
||||
for binding in engine:
|
||||
print('binding:', binding, engine.get_binding_shape(binding))
|
||||
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
|
||||
dtype = trt.nptype(engine.get_binding_dtype(binding))
|
||||
# Allocate host and device buffers
|
||||
host_mem = cuda.pagelocked_empty(size, dtype)
|
||||
cuda_mem = cuda.mem_alloc(host_mem.nbytes)
|
||||
# Append the device buffer to device bindings.
|
||||
bindings.append(int(cuda_mem))
|
||||
# Append to the appropriate list.
|
||||
if engine.binding_is_input(binding):
|
||||
self.input_w = engine.get_binding_shape(binding)[-1]
|
||||
self.input_h = engine.get_binding_shape(binding)[-2]
|
||||
host_inputs.append(host_mem)
|
||||
cuda_inputs.append(cuda_mem)
|
||||
else:
|
||||
host_outputs.append(host_mem)
|
||||
cuda_outputs.append(cuda_mem)
|
||||
|
||||
# Store
|
||||
self.stream = stream
|
||||
self.context = context
|
||||
self.host_inputs = host_inputs
|
||||
self.cuda_inputs = cuda_inputs
|
||||
self.host_outputs = host_outputs
|
||||
self.cuda_outputs = cuda_outputs
|
||||
self.bindings = bindings
|
||||
self.batch_size = engine.max_batch_size
|
||||
self.det_output_size = host_outputs[0].shape[0]
|
||||
|
||||
def infer(self, raw_image_generator):
|
||||
threading.Thread.__init__(self)
|
||||
# Make self the active context, pushing it on top of the context stack.
|
||||
self.ctx.push()
|
||||
# Restore
|
||||
stream = self.stream
|
||||
context = self.context
|
||||
host_inputs = self.host_inputs
|
||||
cuda_inputs = self.cuda_inputs
|
||||
host_outputs = self.host_outputs
|
||||
cuda_outputs = self.cuda_outputs
|
||||
bindings = self.bindings
|
||||
# Do image preprocess
|
||||
batch_image_raw = []
|
||||
batch_origin_h = []
|
||||
batch_origin_w = []
|
||||
batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
|
||||
for i, image_raw in enumerate(raw_image_generator):
|
||||
input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
|
||||
batch_image_raw.append(image_raw)
|
||||
batch_origin_h.append(origin_h)
|
||||
batch_origin_w.append(origin_w)
|
||||
np.copyto(batch_input_image[i],
|
||||
input_image)
|
||||
batch_input_image = np.ascontiguousarray(batch_input_image)
|
||||
|
||||
# Copy input image to host buffer
|
||||
np.copyto(host_inputs[0], batch_input_image.ravel())
|
||||
start = time.time()
|
||||
# Transfer input data to the GPU.
|
||||
cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
|
||||
# Run inference.
|
||||
context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
|
||||
# Transfer predictions back from the GPU.
|
||||
cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
|
||||
# Synchronize the stream
|
||||
stream.synchronize()
|
||||
end = time.time()
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
# Here we use the first row of output in that batch_size = 1
|
||||
output = host_outputs[0]
|
||||
# Do postprocess
|
||||
for i in range(self.batch_size):
|
||||
|
||||
result_boxes, result_scores, result_classid, keypoints = self.post_process(
|
||||
output[i * (self.det_output_size): (i + 1) * (self.det_output_size)],
|
||||
batch_origin_h[i], batch_origin_w[i]
|
||||
)
|
||||
|
||||
# Draw rectangles and labels on the original image
|
||||
for j in range(len(result_boxes)):
|
||||
box = result_boxes[j]
|
||||
plot_one_box(
|
||||
box,
|
||||
batch_image_raw[i],
|
||||
label="{}:{:.2f}".format(
|
||||
categories[int(result_classid[j])], result_scores[j]
|
||||
),
|
||||
)
|
||||
|
||||
num_keypoints = len(keypoints[j]) // 3
|
||||
points = []
|
||||
for k in range(num_keypoints):
|
||||
x = keypoints[j][k * 3]
|
||||
y = keypoints[j][k * 3 + 1]
|
||||
confidence = keypoints[j][k * 3 + 2]
|
||||
if confidence > 0:
|
||||
points.append((int(x), int(y)))
|
||||
else:
|
||||
points.append(None)
|
||||
|
||||
# Draw lines between the keypoint index pairs
|
||||
for pair in keypoint_pairs:
|
||||
partA, partB = pair
|
||||
if points[partA] and points[partB]:
|
||||
cv2.line(batch_image_raw[i], points[partA], points[partB], (0, 255, 0), 2)
|
||||
|
||||
return batch_image_raw, end - start
|
||||
|
||||
def destroy(self):
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
|
||||
def get_raw_image(self, image_path_batch):
|
||||
"""
|
||||
description: Read an image from image path
|
||||
"""
|
||||
for img_path in image_path_batch:
|
||||
yield cv2.imread(img_path)
|
||||
|
||||
def get_raw_image_zeros(self, image_path_batch=None):
|
||||
"""
|
||||
description: Ready data for warmup
|
||||
"""
|
||||
for _ in range(self.batch_size):
|
||||
yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
|
||||
|
||||
def preprocess_image(self, raw_bgr_image):
|
||||
"""
|
||||
description: Convert BGR image to RGB,
|
||||
resize and pad it to target size, normalize to [0,1],
|
||||
transform to NCHW format.
|
||||
param:
|
||||
input_image_path: str, image path
|
||||
return:
|
||||
image: the processed image
|
||||
image_raw: the original image
|
||||
h: original height
|
||||
w: original width
|
||||
"""
|
||||
image_raw = raw_bgr_image
|
||||
h, w, c = image_raw.shape
|
||||
image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
|
||||
# Calculate width and height and paddings
|
||||
r_w = self.input_w / w
|
||||
r_h = self.input_h / h
|
||||
if r_h > r_w:
|
||||
tw = self.input_w
|
||||
th = int(r_w * h)
|
||||
tx1 = tx2 = 0
|
||||
ty1 = int((self.input_h - th) / 2)
|
||||
ty2 = self.input_h - th - ty1
|
||||
else:
|
||||
tw = int(r_h * w)
|
||||
th = self.input_h
|
||||
tx1 = int((self.input_w - tw) / 2)
|
||||
tx2 = self.input_w - tw - tx1
|
||||
ty1 = ty2 = 0
|
||||
# Resize the long side to the input size while keeping the aspect ratio
|
||||
image = cv2.resize(image, (tw, th))
|
||||
# Pad the short side with (128,128,128)
|
||||
image = cv2.copyMakeBorder(
|
||||
image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
|
||||
)
|
||||
image = image.astype(np.float32)
|
||||
# Normalize to [0,1]
|
||||
image /= 255.0
|
||||
# HWC to CHW format:
|
||||
image = np.transpose(image, [2, 0, 1])
|
||||
# CHW to NCHW format
|
||||
image = np.expand_dims(image, axis=0)
|
||||
# Convert the image to row-major order, also known as "C order":
|
||||
image = np.ascontiguousarray(image)
|
||||
return image, image_raw, h, w
|
||||
|
||||
def xywh2xyxy_with_keypoints(self, origin_h, origin_w, boxes, keypoints):
|
||||
|
||||
n = len(boxes)
|
||||
box_array = np.zeros_like(boxes)
|
||||
keypoint_array = np.zeros_like(keypoints)
|
||||
r_w = self.input_w / origin_w
|
||||
r_h = self.input_h / origin_h
|
||||
for i in range(n):
|
||||
if r_h > r_w:
|
||||
box = boxes[i]
|
||||
lmk = keypoints[i]
|
||||
box_array[i, 0] = box[0] / r_w
|
||||
box_array[i, 2] = box[2] / r_w
|
||||
box_array[i, 1] = (box[1] - (self.input_h - r_w * origin_h) / 2) / r_w
|
||||
box_array[i, 3] = (box[3] - (self.input_h - r_w * origin_h) / 2) / r_w
|
||||
|
||||
for j in range(0, len(lmk), 3):
|
||||
keypoint_array[i, j] = lmk[j] / r_w
|
||||
keypoint_array[i, j + 1] = (lmk[j + 1] - (self.input_h - r_w * origin_h) / 2) / r_w
|
||||
keypoint_array[i, j + 2] = lmk[j + 2]
|
||||
else:
|
||||
|
||||
box = boxes[i]
|
||||
lmk = keypoints[i]
|
||||
|
||||
box_array[i, 0] = (box[0] - (self.input_w - r_h * origin_w) / 2) / r_h
|
||||
box_array[i, 2] = (box[2] - (self.input_w - r_h * origin_w) / 2) / r_h
|
||||
box_array[i, 1] = box[1] / r_h
|
||||
box_array[i, 3] = box[3] / r_h
|
||||
|
||||
for j in range(0, len(lmk), 3):
|
||||
keypoint_array[i, j] = (lmk[j] - (self.input_w - r_h * origin_w) / 2) / r_h
|
||||
keypoint_array[i, j + 1] = lmk[j + 1] / r_h
|
||||
keypoint_array[i, j + 2] = lmk[j + 2]
|
||||
|
||||
return box_array, keypoint_array
|
||||
|
||||
def post_process(self, output, origin_h, origin_w):
|
||||
"""
|
||||
description: Post-process the prediction to include pose keypoints
|
||||
param:
|
||||
output: A numpy array like [num_boxes, cx, cy, w, h, conf,
|
||||
cls_id, px1, py1, pconf1,...px17, py17, pconf17] where p denotes pose keypoint
|
||||
origin_h: Height of original image
|
||||
origin_w: Width of original image
|
||||
return:
|
||||
result_boxes: Final boxes, a numpy array, each row is a box [x1, y1, x2, y2]
|
||||
result_scores: Final scores, a numpy array, each element is the score corresponding to box
|
||||
result_classid: Final classID, a numpy array, each element is the classid corresponding to box
|
||||
result_keypoints: Final keypoints, a list of numpy arrays,
|
||||
each element represents keypoints for a box, shaped as (#keypoints, 3)
|
||||
"""
|
||||
        # Number of values per detection: 38 base values (6 det + 32 seg coefficients) + 17 keypoints * 3 values each + 1 angle
|
||||
num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
|
||||
# Get the number of boxes detected
|
||||
num = int(output[0])
|
||||
# Reshape to a two-dimensional ndarray with the full detection shape
|
||||
pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
|
||||
|
||||
# Perform non-maximum suppression to filter the detections
|
||||
boxes = self.non_max_suppression(
|
||||
pred[:, :num_values_per_detection], origin_h, origin_w,
|
||||
conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
|
||||
|
||||
# Extract the bounding boxes, confidence scores, and class IDs
|
||||
result_boxes = boxes[:, :4] if len(boxes) else np.array([])
|
||||
result_scores = boxes[:, 4] if len(boxes) else np.array([])
|
||||
result_classid = boxes[:, 5] if len(boxes) else np.array([])
|
||||
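        # The last OBB_NUM column holds the rotation angle, so the POSE_NUM values
        # just before it (17 keypoints x 3: x, y, confidence) are the pose keypoints.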
result_keypoints = boxes[:, -POSE_NUM-1:-1] if len(boxes) else np.array([])
|
||||
|
||||
# Return the post-processed results including keypoints
|
||||
return result_boxes, result_scores, result_classid, result_keypoints
|
||||
|
||||
def bbox_iou(self, box1, box2, x1y1x2y2=True):
|
||||
"""
|
||||
description: compute the IoU of two bounding boxes
|
||||
param:
|
||||
box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
|
||||
box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
|
||||
x1y1x2y2: select the coordinate format
|
||||
return:
|
||||
iou: computed iou
|
||||
"""
|
||||
if not x1y1x2y2:
|
||||
# Transform from center and width to exact coordinates
|
||||
b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
|
||||
b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
|
||||
b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
|
||||
b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
|
||||
else:
|
||||
# Get the coordinates of bounding boxes
|
||||
b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
|
||||
b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
|
||||
|
||||
# Get the coordinates of the intersection rectangle
|
||||
inter_rect_x1 = np.maximum(b1_x1, b2_x1)
|
||||
inter_rect_y1 = np.maximum(b1_y1, b2_y1)
|
||||
inter_rect_x2 = np.minimum(b1_x2, b2_x2)
|
||||
inter_rect_y2 = np.minimum(b1_y2, b2_y2)
|
||||
# Intersection area
|
||||
inter_area = np.clip(
|
||||
inter_rect_x2 - inter_rect_x1 + 1, 0, None) * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
|
||||
# Union Area
|
||||
b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
|
||||
b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
|
||||
|
||||
iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
|
||||
|
||||
return iou
|
||||
|
||||
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
|
||||
"""
|
||||
description: Removes detections with lower object confidence score than 'conf_thres' and performs
|
||||
Non-Maximum Suppression to further filter detections.
|
||||
param:
|
||||
prediction: detections, (x1, y1, x2, y2, conf, cls_id)
|
||||
origin_h: original image height
|
||||
origin_w: original image width
|
||||
conf_thres: a confidence threshold to filter detections
|
||||
            nms_thres: an IoU threshold to filter detections
|
||||
return:
|
||||
boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
|
||||
"""
|
||||
# Get the boxes that score > CONF_THRESH
|
||||
boxes = prediction[prediction[:, 4] >= conf_thres]
|
||||
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
|
||||
res_array = np.copy(boxes)
|
||||
box_pred_deep_copy = np.copy(boxes[:, :4])
|
||||
keypoints_pred_deep_copy = np.copy(boxes[:, -POSE_NUM-1:-1])
|
||||
res_box, res_keypoints = self.xywh2xyxy_with_keypoints(
|
||||
origin_h, origin_w, box_pred_deep_copy, keypoints_pred_deep_copy)
|
||||
res_array[:, :4] = res_box
|
||||
res_array[:, -POSE_NUM-1:-1] = res_keypoints
|
||||
# clip the coordinates
|
||||
res_array[:, 0] = np.clip(res_array[:, 0], 0, origin_w - 1)
|
||||
res_array[:, 2] = np.clip(res_array[:, 2], 0, origin_w - 1)
|
||||
res_array[:, 1] = np.clip(res_array[:, 1], 0, origin_h - 1)
|
||||
res_array[:, 3] = np.clip(res_array[:, 3], 0, origin_h - 1)
|
||||
# Object confidence
|
||||
confs = res_array[:, 4]
|
||||
# Sort by the confs
|
||||
res_array = res_array[np.argsort(-confs)]
|
||||
# Perform non-maximum suppression
|
||||
keep_res_array = []
|
||||
while res_array.shape[0]:
|
||||
large_overlap = self.bbox_iou(np.expand_dims(res_array[0, :4], 0), res_array[:, :4]) > nms_thres
|
||||
label_match = res_array[0, 5] == res_array[:, 5]
|
||||
invalid = large_overlap & label_match
|
||||
keep_res_array.append(res_array[0])
|
||||
res_array = res_array[~invalid]
|
||||
|
||||
res_array = np.stack(keep_res_array, 0) if len(keep_res_array) else np.array([])
|
||||
return res_array
|
||||
|
||||
|
||||
class inferThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper, image_path_batch):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
self.image_path_batch = image_path_batch
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch))
|
||||
for i, img_path in enumerate(self.image_path_batch):
|
||||
parent, filename = os.path.split(img_path)
|
||||
save_name = os.path.join('output', filename)
|
||||
# Save image
|
||||
|
||||
cv2.imwrite(save_name, batch_image_raw[i])
|
||||
print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
|
||||
|
||||
|
||||
class warmUpThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros())
|
||||
print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# load custom plugin and engine
|
||||
PLUGIN_LIBRARY = "./build/libmyplugins.so"
|
||||
engine_file_path = "yolov8n-pose.engine"
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
engine_file_path = sys.argv[1]
|
||||
if len(sys.argv) > 2:
|
||||
PLUGIN_LIBRARY = sys.argv[2]
|
||||
|
||||
ctypes.CDLL(PLUGIN_LIBRARY)
|
||||
|
||||
    # load labels (this pose model only detects "person")
|
||||
|
||||
categories = ["person"]
|
||||
|
||||
if os.path.exists('output/'):
|
||||
shutil.rmtree('output/')
|
||||
os.makedirs('output/')
|
||||
# a YoLov8TRT instance
|
||||
yolov8_wrapper = YoLov8TRT(engine_file_path)
|
||||
try:
|
||||
print('batch size is', yolov8_wrapper.batch_size)
|
||||
|
||||
image_dir = "images/"
|
||||
image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)
|
||||
|
||||
for i in range(10):
|
||||
# create a new thread to do warm_up
|
||||
thread1 = warmUpThread(yolov8_wrapper)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
for batch in image_path_batches:
|
||||
# create a new thread to do inference
|
||||
thread1 = inferThread(yolov8_wrapper, batch)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
finally:
|
||||
# destroy the instance
|
||||
yolov8_wrapper.destroy()
|
||||
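The drawing loop in the pose script above connects keypoints according to keypoint_pairs, which is defined earlier in the script. As a minimal, self-contained sketch, assuming the standard 17-keypoint COCO layout (the exact pairs used by the script may differ), the skeleton and the line-drawing step could look like this:

import cv2
import numpy as np

# Hypothetical COCO-17 skeleton; each pair indexes into the 17-keypoint list.
keypoint_pairs = [
    (0, 1), (0, 2), (1, 3), (2, 4),            # head
    (5, 6), (5, 7), (7, 9), (6, 8), (8, 10),   # arms
    (5, 11), (6, 12), (11, 12),                # torso
    (11, 13), (13, 15), (12, 14), (14, 16),    # legs
]

# Dummy keypoints: (x, y) when confident, None when filtered out by confidence.
points = [(10 * k, 12 * k) for k in range(17)]
points[3] = None

canvas = np.zeros((256, 256, 3), dtype=np.uint8)
for part_a, part_b in keypoint_pairs:
    if points[part_a] and points[part_b]:
        cv2.line(canvas, points[part_a], points[part_b], (0, 255, 0), 2)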
332
yolov8_seg.cpp
Normal file
332
yolov8_seg.cpp
Normal file
@ -0,0 +1,332 @@
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include "cuda_utils.h"
|
||||
#include "logging.h"
|
||||
#include "model.h"
|
||||
#include "postprocess.h"
|
||||
#include "preprocess.h"
|
||||
#include "utils.h"
|
||||
|
||||
Logger gLogger;
|
||||
using namespace nvinfer1;
|
||||
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
|
||||
const static int kOutputSegSize = 32 * (kInputH / 4) * (kInputW / 4);
|
||||
|
||||
static cv::Rect get_downscale_rect(float bbox[4], float scale) {
|
||||
|
||||
float left = bbox[0];
|
||||
float top = bbox[1];
|
||||
float right = bbox[0] + bbox[2];
|
||||
float bottom = bbox[1] + bbox[3];
|
||||
|
||||
left = left < 0 ? 0 : left;
|
||||
top = top < 0 ? 0 : top;
|
||||
right = right > kInputW ? kInputW : right;
|
||||
bottom = bottom > kInputH ? kInputH : bottom;
|
||||
|
||||
left /= scale;
|
||||
top /= scale;
|
||||
right /= scale;
|
||||
bottom /= scale;
|
||||
return cv::Rect(int(left), int(top), int(right - left), int(bottom - top));
|
||||
}
|
||||
|
||||
std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets) {
|
||||
|
||||
std::vector<cv::Mat> masks;
|
||||
for (size_t i = 0; i < dets.size(); i++) {
|
||||
|
||||
cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1);
|
||||
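        // The prototype mask is kInputH/4 x kInputW/4, so the bbox is downscaled by 4
        // before indexing into mask_mat.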
auto r = get_downscale_rect(dets[i].bbox, 4);
|
||||
|
||||
for (int x = r.x; x < r.x + r.width; x++) {
|
||||
for (int y = r.y; y < r.y + r.height; y++) {
|
||||
float e = 0.0f;
|
||||
for (int j = 0; j < 32; j++) {
|
||||
e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x];
|
||||
}
|
||||
e = 1.0f / (1.0f + expf(-e));
|
||||
mask_mat.at<float>(y, x) = e;
|
||||
}
|
||||
}
|
||||
cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH));
|
||||
masks.push_back(mask_mat);
|
||||
}
|
||||
return masks;
|
||||
}
|
||||
|
||||
void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& sub_type, float& gd, float& gw,
|
||||
int& max_channels) {
|
||||
IBuilder* builder = createInferBuilder(gLogger);
|
||||
IBuilderConfig* config = builder->createBuilderConfig();
|
||||
IHostMemory* serialized_engine = nullptr;
|
||||
|
||||
serialized_engine = buildEngineYolov8Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
|
||||
|
||||
assert(serialized_engine);
|
||||
std::ofstream p(engine_name, std::ios::binary);
|
||||
if (!p) {
|
||||
std::cout << "could not open plan output file" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
|
||||
|
||||
delete serialized_engine;
|
||||
delete config;
|
||||
delete builder;
|
||||
}
|
||||
|
||||
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
|
||||
IExecutionContext** context) {
|
||||
std::ifstream file(engine_name, std::ios::binary);
|
||||
if (!file.good()) {
|
||||
std::cerr << "read " << engine_name << " error!" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
size_t size = 0;
|
||||
file.seekg(0, file.end);
|
||||
size = file.tellg();
|
||||
file.seekg(0, file.beg);
|
||||
char* serialized_engine = new char[size];
|
||||
assert(serialized_engine);
|
||||
file.read(serialized_engine, size);
|
||||
file.close();
|
||||
|
||||
*runtime = createInferRuntime(gLogger);
|
||||
assert(*runtime);
|
||||
*engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
|
||||
assert(*engine);
|
||||
*context = (*engine)->createExecutionContext();
|
||||
assert(*context);
|
||||
delete[] serialized_engine;
|
||||
}
|
||||
|
||||
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
|
||||
float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host,
|
||||
float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) {
|
||||
assert(engine->getNbBindings() == 3);
|
||||
// In order to bind the buffers, we need to know the names of the input and output tensors.
|
||||
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
|
||||
const int inputIndex = engine->getBindingIndex(kInputTensorName);
|
||||
const int outputIndex = engine->getBindingIndex(kOutputTensorName);
|
||||
const int outputIndex_seg = engine->getBindingIndex("proto");
|
||||
|
||||
assert(inputIndex == 0);
|
||||
assert(outputIndex == 1);
|
||||
assert(outputIndex_seg == 2);
|
||||
// Create GPU buffers on device
|
||||
CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
|
||||
CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
|
||||
CUDA_CHECK(cudaMalloc((void**)output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float)));
|
||||
|
||||
if (cuda_post_process == "c") {
|
||||
*output_buffer_host = new float[kBatchSize * kOutputSize];
|
||||
*output_seg_buffer_host = new float[kBatchSize * kOutputSegSize];
|
||||
} else if (cuda_post_process == "g") {
|
||||
if (kBatchSize > 1) {
|
||||
std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
|
||||
exit(0);
|
||||
}
|
||||
// Allocate memory for decode_ptr_host and copy to device
|
||||
*decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
|
||||
CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
|
||||
}
|
||||
}
|
||||
|
||||
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, float* output_seg,
|
||||
int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes,
|
||||
std::string cuda_post_process) {
|
||||
// infer on the batch asynchronously, and DMA output back to host
|
||||
auto start = std::chrono::system_clock::now();
|
||||
context.enqueue(batchsize, buffers, stream, nullptr);
|
||||
if (cuda_post_process == "c") {
|
||||
|
||||
std::cout << "kOutputSize:" << kOutputSize << std::endl;
|
||||
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
|
||||
stream));
|
||||
std::cout << "kOutputSegSize:" << kOutputSegSize << std::endl;
|
||||
CUDA_CHECK(cudaMemcpyAsync(output_seg, buffers[2], batchsize * kOutputSegSize * sizeof(float),
|
||||
cudaMemcpyDeviceToHost, stream));
|
||||
|
||||
auto end = std::chrono::system_clock::now();
|
||||
std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
|
||||
<< "ms" << std::endl;
|
||||
} else if (cuda_post_process == "g") {
|
||||
CUDA_CHECK(
|
||||
cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
|
||||
cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
|
||||
cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms
|
||||
CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
|
||||
sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
|
||||
stream));
|
||||
auto end = std::chrono::system_clock::now();
|
||||
std::cout << "inference and gpu postprocess time: "
|
||||
<< std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
|
||||
}
|
||||
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir,
|
||||
std::string& sub_type, std::string& cuda_post_process, std::string& labels_filename, float& gd,
|
||||
float& gw, int& max_channels) {
|
||||
if (argc < 4)
|
||||
return false;
|
||||
if (std::string(argv[1]) == "-s" && argc == 5) {
|
||||
wts = std::string(argv[2]);
|
||||
engine = std::string(argv[3]);
|
||||
sub_type = std::string(argv[4]);
|
||||
if (sub_type == "n") {
|
||||
gd = 0.33;
|
||||
gw = 0.25;
|
||||
max_channels = 1024;
|
||||
} else if (sub_type == "s") {
|
||||
gd = 0.33;
|
||||
gw = 0.50;
|
||||
max_channels = 1024;
|
||||
} else if (sub_type == "m") {
|
||||
gd = 0.67;
|
||||
gw = 0.75;
|
||||
max_channels = 576;
|
||||
} else if (sub_type == "l") {
|
||||
gd = 1.0;
|
||||
gw = 1.0;
|
||||
max_channels = 512;
|
||||
} else if (sub_type == "x") {
|
||||
gd = 1.0;
|
||||
gw = 1.25;
|
||||
max_channels = 640;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (std::string(argv[1]) == "-d" && argc == 6) {
|
||||
engine = std::string(argv[2]);
|
||||
img_dir = std::string(argv[3]);
|
||||
cuda_post_process = std::string(argv[4]);
|
||||
labels_filename = std::string(argv[5]);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
cudaSetDevice(kGpuId);
|
||||
std::string wts_name = "";
|
||||
std::string engine_name = "";
|
||||
std::string img_dir;
|
||||
std::string sub_type = "";
|
||||
std::string cuda_post_process = "";
|
||||
std::string labels_filename = "../coco.txt";
|
||||
int model_bboxes;
|
||||
float gd = 0.0f, gw = 0.0f;
|
||||
int max_channels = 0;
|
||||
|
||||
if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type, cuda_post_process, labels_filename, gd, gw,
|
||||
max_channels)) {
|
||||
std::cerr << "Arguments not right!" << std::endl;
|
||||
std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl;
|
||||
std::cerr << "./yolov8 -d [.engine] ../samples [c/g] coco_file// deserialize plan file and run inference"
|
||||
<< std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Create a model using the API directly and serialize it to a file
|
||||
if (!wts_name.empty()) {
|
||||
serialize_engine(wts_name, engine_name, sub_type, gd, gw, max_channels);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Deserialize the engine from file
|
||||
IRuntime* runtime = nullptr;
|
||||
ICudaEngine* engine = nullptr;
|
||||
IExecutionContext* context = nullptr;
|
||||
deserialize_engine(engine_name, &runtime, &engine, &context);
|
||||
cudaStream_t stream;
|
||||
CUDA_CHECK(cudaStreamCreate(&stream));
|
||||
cuda_preprocess_init(kMaxInputImageSize);
|
||||
auto out_dims = engine->getBindingDimensions(1);
|
||||
model_bboxes = out_dims.d[0];
|
||||
// Prepare cpu and gpu buffers
|
||||
float* device_buffers[3];
|
||||
float* output_buffer_host = nullptr;
|
||||
float* output_seg_buffer_host = nullptr;
|
||||
float* decode_ptr_host = nullptr;
|
||||
float* decode_ptr_device = nullptr;
|
||||
|
||||
// Read images from directory
|
||||
std::vector<std::string> file_names;
|
||||
if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
|
||||
std::cerr << "read_files_in_dir failed." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::unordered_map<int, std::string> labels_map;
|
||||
read_labels(labels_filename, labels_map);
|
||||
assert(kNumClass == labels_map.size());
|
||||
|
||||
prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host,
|
||||
&output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process);
|
||||
|
||||
    // batch predict
|
||||
for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
|
||||
// Get a batch of images
|
||||
std::vector<cv::Mat> img_batch;
|
||||
std::vector<std::string> img_name_batch;
|
||||
for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
|
||||
cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
|
||||
img_batch.push_back(img);
|
||||
img_name_batch.push_back(file_names[j]);
|
||||
}
|
||||
// Preprocess
|
||||
cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
|
||||
// Run inference
|
||||
infer(*context, stream, (void**)device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize,
|
||||
decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process);
|
||||
std::vector<std::vector<Detection>> res_batch;
|
||||
if (cuda_post_process == "c") {
|
||||
// NMS
|
||||
batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
|
||||
for (size_t b = 0; b < img_batch.size(); b++) {
|
||||
auto& res = res_batch[b];
|
||||
cv::Mat img = img_batch[b];
|
||||
auto masks = process_mask(&output_seg_buffer_host[b * kOutputSegSize], kOutputSegSize, res);
|
||||
draw_mask_bbox(img, res, masks, labels_map);
|
||||
cv::imwrite("_" + img_name_batch[b], img);
|
||||
}
|
||||
} else if (cuda_post_process == "g") {
|
||||
// Process gpu decode and nms results
|
||||
// batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
|
||||
// todo seg in gpu
|
||||
std::cerr << "seg_postprocess is not support in gpu right now" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Release stream and buffers
|
||||
cudaStreamDestroy(stream);
|
||||
CUDA_CHECK(cudaFree(device_buffers[0]));
|
||||
CUDA_CHECK(cudaFree(device_buffers[1]));
|
||||
CUDA_CHECK(cudaFree(device_buffers[2]));
|
||||
CUDA_CHECK(cudaFree(decode_ptr_device));
|
||||
delete[] decode_ptr_host;
|
||||
delete[] output_buffer_host;
|
||||
delete[] output_seg_buffer_host;
|
||||
cuda_preprocess_destroy();
|
||||
// Destroy the engine
|
||||
delete context;
|
||||
delete engine;
|
||||
delete runtime;
|
||||
|
||||
// Print histogram of the output distribution
|
||||
// std::cout << "\nOutput:\n\n";
|
||||
// for (unsigned int i = 0; i < kOutputSize; i++)
|
||||
//{
|
||||
// std::cout << prob[i] << ", ";
|
||||
// if (i % 10 == 0) std::cout << std::endl;
|
||||
//}
|
||||
// std::cout << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
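For each detection, process_mask above takes a dot product of the 32 per-detection mask coefficients with the 32 prototype channels at every pixel inside the (downscaled) box, then applies a sigmoid. A minimal NumPy sketch of the same math, with made-up shapes and random data, looks like this:

import numpy as np

proto = np.random.rand(32, 160, 160).astype(np.float32)   # prototype masks at kInputH/4 x kInputW/4
coef = np.random.rand(32).astype(np.float32)               # one detection's 32 mask coefficients

# Linear combination over the 32 channels, then sigmoid, gives a [0, 1] mask.
logits = np.tensordot(coef, proto, axes=([0], [0]))         # shape (160, 160)
mask = 1.0 / (1.0 + np.exp(-logits))
binary_mask = (mask >= 0.5).astype(np.uint8)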
580
yolov8_seg_trt.py
Normal file
580
yolov8_seg_trt.py
Normal file
@ -0,0 +1,580 @@
|
||||
"""
|
||||
An example that uses TensorRT's Python API to run inference.
|
||||
"""
|
||||
import ctypes
|
||||
import os
|
||||
import shutil
|
||||
import random
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pycuda.autoinit # noqa: F401
|
||||
import pycuda.driver as cuda
|
||||
import tensorrt as trt
|
||||
|
||||
CONF_THRESH = 0.5
|
||||
IOU_THRESHOLD = 0.4
|
||||
POSE_NUM = 17 * 3
|
||||
DET_NUM = 6
|
||||
SEG_NUM = 32
|
||||
OBB_NUM = 1
|
||||
|
||||
|
||||
def get_img_path_batches(batch_size, img_dir):
|
||||
ret = []
|
||||
batch = []
|
||||
for root, dirs, files in os.walk(img_dir):
|
||||
for name in files:
|
||||
if len(batch) == batch_size:
|
||||
ret.append(batch)
|
||||
batch = []
|
||||
batch.append(os.path.join(root, name))
|
||||
if len(batch) > 0:
|
||||
ret.append(batch)
|
||||
return ret
|
||||
|
||||
|
||||
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
|
||||
"""
|
||||
description: Plots one bounding box on image img,
|
||||
this function comes from YoLov8 project.
|
||||
param:
|
||||
        x: a box like [x1, y1, x2, y2]
|
||||
        img: an OpenCV image object
|
||||
color: color to draw rectangle, such as (0,255,0)
|
||||
label: str
|
||||
line_thickness: int
|
||||
return:
|
||||
no return
|
||||
|
||||
"""
|
||||
tl = (
|
||||
line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
|
||||
) # line/font thickness
|
||||
color = color or [random.randint(0, 255) for _ in range(3)]
|
||||
c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
|
||||
cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
|
||||
if label:
|
||||
tf = max(tl - 1, 1) # font thickness
|
||||
t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
|
||||
c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
|
||||
cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled
|
||||
cv2.putText(
|
||||
img,
|
||||
label,
|
||||
(c1[0], c1[1] - 2),
|
||||
0,
|
||||
tl / 3,
|
||||
[225, 255, 255],
|
||||
thickness=tf,
|
||||
lineType=cv2.LINE_AA,
|
||||
)
|
||||
|
||||
|
||||
class YoLov8TRT(object):
|
||||
"""
|
||||
    description: A YOLOv8 class that wraps TensorRT ops, preprocess and postprocess ops.
|
||||
"""
|
||||
|
||||
def __init__(self, engine_file_path):
|
||||
# Create a Context on this device,
|
||||
self.ctx = cuda.Device(0).make_context()
|
||||
stream = cuda.Stream()
|
||||
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
|
||||
runtime = trt.Runtime(TRT_LOGGER)
|
||||
|
||||
# Deserialize the engine from file
|
||||
with open(engine_file_path, "rb") as f:
|
||||
engine = runtime.deserialize_cuda_engine(f.read())
|
||||
context = engine.create_execution_context()
|
||||
|
||||
host_inputs = []
|
||||
cuda_inputs = []
|
||||
host_outputs = []
|
||||
cuda_outputs = []
|
||||
bindings = []
|
||||
|
||||
for binding in engine:
|
||||
            print('binding:', binding, engine.get_binding_shape(binding))
|
||||
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
|
||||
dtype = trt.nptype(engine.get_binding_dtype(binding))
|
||||
# Allocate host and device buffers
|
||||
host_mem = cuda.pagelocked_empty(size, dtype)
|
||||
cuda_mem = cuda.mem_alloc(host_mem.nbytes)
|
||||
# Append the device buffer to device bindings.
|
||||
bindings.append(int(cuda_mem))
|
||||
# Append to the appropriate list.
|
||||
if engine.binding_is_input(binding):
|
||||
self.input_w = engine.get_binding_shape(binding)[-1]
|
||||
self.input_h = engine.get_binding_shape(binding)[-2]
|
||||
host_inputs.append(host_mem)
|
||||
cuda_inputs.append(cuda_mem)
|
||||
else:
|
||||
host_outputs.append(host_mem)
|
||||
cuda_outputs.append(cuda_mem)
|
||||
|
||||
# Store
|
||||
self.stream = stream
|
||||
self.context = context
|
||||
self.engine = engine
|
||||
self.host_inputs = host_inputs
|
||||
self.cuda_inputs = cuda_inputs
|
||||
self.host_outputs = host_outputs
|
||||
self.cuda_outputs = cuda_outputs
|
||||
self.bindings = bindings
|
||||
self.batch_size = engine.max_batch_size
|
||||
|
||||
# Data length
|
||||
self.det_output_length = host_outputs[0].shape[0]
|
||||
self.seg_output_length = host_outputs[1].shape[0]
|
||||
self.seg_w = int(self.input_w / 4)
|
||||
self.seg_h = int(self.input_h / 4)
|
||||
        self.seg_c = int(self.seg_output_length / (self.seg_w * self.seg_h))
|
||||
self.det_row_output_length = self.seg_c + DET_NUM + POSE_NUM + OBB_NUM
|
||||
|
||||
# Draw mask
|
||||
self.colors_obj = Colors()
|
||||
|
||||
def infer(self, raw_image_generator):
|
||||
threading.Thread.__init__(self)
|
||||
# Make self the active context, pushing it on top of the context stack.
|
||||
self.ctx.push()
|
||||
# Restore
|
||||
stream = self.stream
|
||||
context = self.context
|
||||
host_inputs = self.host_inputs
|
||||
cuda_inputs = self.cuda_inputs
|
||||
host_outputs = self.host_outputs
|
||||
cuda_outputs = self.cuda_outputs
|
||||
bindings = self.bindings
|
||||
# Do image preprocess
|
||||
batch_image_raw = []
|
||||
batch_origin_h = []
|
||||
batch_origin_w = []
|
||||
batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
|
||||
for i, image_raw in enumerate(raw_image_generator):
|
||||
input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
|
||||
batch_image_raw.append(image_raw)
|
||||
batch_origin_h.append(origin_h)
|
||||
batch_origin_w.append(origin_w)
|
||||
np.copyto(batch_input_image[i], input_image)
|
||||
batch_input_image = np.ascontiguousarray(batch_input_image)
|
||||
|
||||
# Copy input image to host buffer
|
||||
np.copyto(host_inputs[0], batch_input_image.ravel())
|
||||
start = time.time()
|
||||
# Transfer input data to the GPU.
|
||||
cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
|
||||
# Run inference.
|
||||
context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
|
||||
# Transfer predictions back from the GPU.
|
||||
cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
|
||||
cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)
|
||||
|
||||
# Synchronize the stream
|
||||
stream.synchronize()
|
||||
end = time.time()
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
        # Here we use the first row of output since batch_size = 1
|
||||
output = host_outputs[0]
|
||||
output_proto_mask = host_outputs[1]
|
||||
# Do postprocess
|
||||
for i in range(self.batch_size):
|
||||
result_boxes, result_scores, result_classid, result_proto_coef = self.post_process(
|
||||
output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
|
||||
batch_origin_w[i]
|
||||
)
|
||||
|
||||
if result_proto_coef.shape[0] == 0:
|
||||
continue
|
||||
result_masks = self.process_mask(output_proto_mask, result_proto_coef, result_boxes, batch_origin_h[i],
|
||||
batch_origin_w[i])
|
||||
|
||||
self.draw_mask(result_masks, colors_=[self.colors_obj(x, True) for x in result_classid],
|
||||
im_src=batch_image_raw[i])
|
||||
|
||||
# Draw rectangles and labels on the original image
|
||||
for j in range(len(result_boxes)):
|
||||
box = result_boxes[j]
|
||||
plot_one_box(
|
||||
box,
|
||||
batch_image_raw[i],
|
||||
label="{}:{:.2f}".format(
|
||||
categories[int(result_classid[j])], result_scores[j]
|
||||
),
|
||||
)
|
||||
return batch_image_raw, end - start
|
||||
|
||||
def destroy(self):
|
||||
# Remove any context from the top of the context stack, deactivating it.
|
||||
self.ctx.pop()
|
||||
|
||||
def get_raw_image(self, image_path_batch):
|
||||
"""
|
||||
description: Read an image from image path
|
||||
"""
|
||||
for img_path in image_path_batch:
|
||||
yield cv2.imread(img_path)
|
||||
|
||||
def get_raw_image_zeros(self, image_path_batch=None):
|
||||
"""
|
||||
description: Ready data for warmup
|
||||
"""
|
||||
for _ in range(self.batch_size):
|
||||
yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
|
||||
|
||||
def preprocess_image(self, raw_bgr_image):
|
||||
"""
|
||||
description: Convert BGR image to RGB,
|
||||
resize and pad it to target size, normalize to [0,1],
|
||||
transform to NCHW format.
|
||||
param:
|
||||
input_image_path: str, image path
|
||||
return:
|
||||
image: the processed image
|
||||
image_raw: the original image
|
||||
h: original height
|
||||
w: original width
|
||||
"""
|
||||
image_raw = raw_bgr_image
|
||||
h, w, c = image_raw.shape
|
||||
image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
|
||||
        # Calculate width and height and paddings
|
||||
r_w = self.input_w / w
|
||||
r_h = self.input_h / h
|
||||
if r_h > r_w:
|
||||
tw = self.input_w
|
||||
th = int(r_w * h)
|
||||
tx1 = tx2 = 0
|
||||
ty1 = int((self.input_h - th) / 2)
|
||||
ty2 = self.input_h - th - ty1
|
||||
else:
|
||||
tw = int(r_h * w)
|
||||
th = self.input_h
|
||||
tx1 = int((self.input_w - tw) / 2)
|
||||
tx2 = self.input_w - tw - tx1
|
||||
ty1 = ty2 = 0
|
||||
# Resize the image with long side while maintaining ratio
|
||||
image = cv2.resize(image, (tw, th))
|
||||
# Pad the short side with (128,128,128)
|
||||
image = cv2.copyMakeBorder(
|
||||
image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
|
||||
)
|
||||
image = image.astype(np.float32)
|
||||
# Normalize to [0,1]
|
||||
image /= 255.0
|
||||
# HWC to CHW format:
|
||||
image = np.transpose(image, [2, 0, 1])
|
||||
# CHW to NCHW format
|
||||
image = np.expand_dims(image, axis=0)
|
||||
# Convert the image to row-major order, also known as "C order":
|
||||
image = np.ascontiguousarray(image)
|
||||
return image, image_raw, h, w
|
||||
|
||||
def xywh2xyxy(self, origin_h, origin_w, x):
|
||||
"""
|
||||
description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
|
||||
param:
|
||||
origin_h: height of original image
|
||||
origin_w: width of original image
|
||||
x: A boxes numpy, each row is a box [center_x, center_y, w, h]
|
||||
return:
|
||||
y: A boxes numpy, each row is a box [x1, y1, x2, y2]
|
||||
"""
|
||||
y = np.zeros_like(x)
|
||||
r_w = self.input_w / origin_w
|
||||
r_h = self.input_h / origin_h
|
||||
if r_h > r_w:
|
||||
y[:, 0] = x[:, 0]
|
||||
y[:, 2] = x[:, 2]
|
||||
y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
|
||||
y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
|
||||
y /= r_w
|
||||
else:
|
||||
y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
|
||||
y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
|
||||
y[:, 1] = x[:, 1]
|
||||
y[:, 3] = x[:, 3]
|
||||
y /= r_h
|
||||
|
||||
return y
|
||||
|
||||
def post_process(self, output, origin_h, origin_w):
|
||||
"""
|
||||
description: postprocess the prediction
|
||||
param:
|
||||
            output: A numpy array like [num_boxes, cx, cy, w, h, conf, cls_id, cx, cy, w, h, conf, cls_id, ...]
|
||||
origin_h: height of original image
|
||||
origin_w: width of original image
|
||||
return:
|
||||
            result_boxes: final boxes, a numpy array, each row is a box [x1, y1, x2, y2]
|
||||
            result_scores: final scores, a numpy array, each element is the score corresponding to the box
|
||||
            result_classid: final classid, a numpy array, each element is the classid corresponding to the box
|
||||
"""
|
||||
# Get the num of boxes detected
|
||||
num = int(output[0])
|
||||
        # Reshape to a two-dimensional ndarray
|
||||
pred = np.reshape(output[1:], (-1, self.det_row_output_length))[:num, :]
|
||||
|
||||
# Do nms
|
||||
boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
|
||||
result_boxes = boxes[:, :4] if len(boxes) else np.array([])
|
||||
result_scores = boxes[:, 4] if len(boxes) else np.array([])
|
||||
result_classid = boxes[:, 5] if len(boxes) else np.array([])
|
||||
result_proto_coef = boxes[:, DET_NUM:int(DET_NUM + SEG_NUM)] if len(boxes) else np.array([])
|
||||
return result_boxes, result_scores, result_classid, result_proto_coef
|
||||
|
||||
def bbox_iou(self, box1, box2, x1y1x2y2=True):
|
||||
"""
|
||||
description: compute the IoU of two bounding boxes
|
||||
param:
|
||||
box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
|
||||
box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
|
||||
x1y1x2y2: select the coordinate format
|
||||
return:
|
||||
iou: computed iou
|
||||
"""
|
||||
if not x1y1x2y2:
|
||||
# Transform from center and width to exact coordinates
|
||||
b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
|
||||
b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
|
||||
b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
|
||||
b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
|
||||
else:
|
||||
# Get the coordinates of bounding boxes
|
||||
b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
|
||||
b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
|
||||
|
||||
# Get the coordinates of the intersection rectangle
|
||||
inter_rect_x1 = np.maximum(b1_x1, b2_x1)
|
||||
inter_rect_y1 = np.maximum(b1_y1, b2_y1)
|
||||
inter_rect_x2 = np.minimum(b1_x2, b2_x2)
|
||||
inter_rect_y2 = np.minimum(b1_y2, b2_y2)
|
||||
# Intersection area
|
||||
inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
|
||||
* np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
|
||||
# Union Area
|
||||
b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
|
||||
b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
|
||||
|
||||
iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
|
||||
|
||||
return iou
|
||||
|
||||
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
|
||||
"""
|
||||
description: Removes detections with lower object confidence score than 'conf_thres' and performs
|
||||
Non-Maximum Suppression to further filter detections.
|
||||
param:
|
||||
prediction: detections, (x1, y1, x2, y2, conf, cls_id)
|
||||
origin_h: original image height
|
||||
origin_w: original image width
|
||||
conf_thres: a confidence threshold to filter detections
|
||||
            nms_thres: an IoU threshold to filter detections
|
||||
return:
|
||||
boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
|
||||
"""
|
||||
# Get the boxes that score > CONF_THRESH
|
||||
boxes = prediction[prediction[:, 4] >= conf_thres]
|
||||
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
|
||||
boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
|
||||
# clip the coordinates
|
||||
boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
|
||||
boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
|
||||
boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
|
||||
boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
|
||||
# Object confidence
|
||||
confs = boxes[:, 4]
|
||||
# Sort by the confs
|
||||
boxes = boxes[np.argsort(-confs)]
|
||||
# Perform non-maximum suppression
|
||||
keep_boxes = []
|
||||
while boxes.shape[0]:
|
||||
large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
|
||||
label_match = boxes[0, 5] == boxes[:, 5]
|
||||
# Indices of boxes with lower confidence scores, large IOUs and matching labels
|
||||
invalid = large_overlap & label_match
|
||||
keep_boxes += [boxes[0]]
|
||||
boxes = boxes[~invalid]
|
||||
boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
|
||||
return boxes
|
||||
|
||||
def sigmoid(self, x):
|
||||
return 1 / (1 + np.exp(-x))
|
||||
|
||||
def scale_mask(self, mask, ih, iw):
|
||||
mask = cv2.resize(mask, (self.input_w, self.input_h))
|
||||
r_w = self.input_w / (iw * 1.0)
|
||||
r_h = self.input_h / (ih * 1.0)
|
||||
if r_h > r_w:
|
||||
w = self.input_w
|
||||
h = int(r_w * ih)
|
||||
x = 0
|
||||
y = int((self.input_h - h) / 2)
|
||||
else:
|
||||
w = int(r_h * iw)
|
||||
h = self.input_h
|
||||
x = int((self.input_w - w) / 2)
|
||||
y = 0
|
||||
crop = mask[y:y + h, x:x + w]
|
||||
crop = cv2.resize(crop, (iw, ih))
|
||||
return crop
|
||||
|
||||
def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw):
|
||||
"""
|
||||
        description: Build the masks predicted by YOLOv8 instance segmentation,
|
||||
param:
|
||||
output_proto_mask: prototype mask e.g. (32, 160, 160) for 640x640 input
|
||||
result_proto_coef: prototype mask coefficients (n, 32), n represents n results
|
||||
            result_boxes: detected boxes [x1, y1, x2, y2], used to crop each mask
|
||||
ih: rows of original image
|
||||
iw: cols of original image
|
||||
return:
|
||||
mask_result: (n, ih, iw)
|
||||
"""
|
||||
result_proto_masks = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w)
|
||||
c, mh, mw = result_proto_masks.shape
|
||||
print(result_proto_masks.shape)
|
||||
print(result_proto_coef.shape)
|
||||
masks = self.sigmoid((result_proto_coef @ result_proto_masks.astype(np.float32).reshape(c, -1))).reshape(-1, mh,
|
||||
mw)
|
||||
|
||||
mask_result = []
|
||||
for mask, box in zip(masks, result_boxes):
|
||||
mask_s = np.zeros((ih, iw))
|
||||
crop_mask = self.scale_mask(mask, ih, iw)
|
||||
x1 = int(box[0])
|
||||
y1 = int(box[1])
|
||||
x2 = int(box[2])
|
||||
y2 = int(box[3])
|
||||
crop = crop_mask[y1:y2, x1:x2]
|
||||
crop = np.where(crop >= 0.5, 1, 0)
|
||||
crop = crop.astype(np.uint8)
|
||||
mask_s[y1:y2, x1:x2] = crop
|
||||
|
||||
mask_result.append(mask_s)
|
||||
mask_result = np.array(mask_result)
|
||||
return mask_result
|
||||
|
||||
def draw_mask(self, masks, colors_, im_src, alpha=0.5):
|
||||
"""
|
||||
        description: Draw masks on the image,
|
||||
param:
|
||||
masks : result_mask
|
||||
colors_: color to draw mask
|
||||
im_src : original image
|
||||
alpha : scale between original image and mask
|
||||
return:
|
||||
no return
|
||||
"""
|
||||
if len(masks) == 0:
|
||||
return
|
||||
masks = np.asarray(masks, dtype=np.uint8)
|
||||
masks = np.ascontiguousarray(masks.transpose(1, 2, 0))
|
||||
masks = np.asarray(masks, dtype=np.float32)
|
||||
colors_ = np.asarray(colors_, dtype=np.float32)
|
||||
s = masks.sum(2, keepdims=True).clip(0, 1)
|
||||
masks = (masks @ colors_).clip(0, 255)
|
||||
im_src[:] = masks * alpha + im_src * (1 - s * alpha)
|
||||
|
||||
|
||||
class inferThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper, image_path_batch):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
self.image_path_batch = image_path_batch
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch))
|
||||
for i, img_path in enumerate(self.image_path_batch):
|
||||
parent, filename = os.path.split(img_path)
|
||||
save_name = os.path.join('output', filename)
|
||||
# Save image
|
||||
cv2.imwrite(save_name, batch_image_raw[i])
|
||||
print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
|
||||
|
||||
|
||||
class warmUpThread(threading.Thread):
|
||||
def __init__(self, yolov8_wrapper):
|
||||
threading.Thread.__init__(self)
|
||||
self.yolov8_wrapper = yolov8_wrapper
|
||||
|
||||
def run(self):
|
||||
batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros())
|
||||
print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
|
||||
|
||||
|
||||
class Colors:
|
||||
def __init__(self):
|
||||
hexs = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A',
|
||||
'92CC17', '3DDB86', '1A9334', '00D4BB', '2C99A8', '00C2FF',
|
||||
'344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF',
|
||||
'FF95C8', 'FF37C7')
|
||||
self.palette = [self.hex2rgb(f'#{c}') for c in hexs]
|
||||
self.n = len(self.palette)
|
||||
|
||||
def __call__(self, i, bgr=False):
|
||||
c = self.palette[int(i) % self.n]
|
||||
return (c[2], c[1], c[0]) if bgr else c
|
||||
|
||||
@staticmethod
|
||||
def hex2rgb(h): # rgb order (PIL)
|
||||
return tuple(int(h[1 + i:1 + i + 2], 16) for i in (0, 2, 4))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# load custom plugin and engine
|
||||
PLUGIN_LIBRARY = "./build/libmyplugins.so"
|
||||
engine_file_path = "yolov8n-seg.engine"
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
engine_file_path = sys.argv[1]
|
||||
if len(sys.argv) > 2:
|
||||
PLUGIN_LIBRARY = sys.argv[2]
|
||||
|
||||
ctypes.CDLL(PLUGIN_LIBRARY)
|
||||
|
||||
# load coco labels
|
||||
|
||||
categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
|
||||
"traffic light",
|
||||
"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
|
||||
"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
|
||||
"frisbee",
|
||||
"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
|
||||
"surfboard",
|
||||
"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
|
||||
"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
|
||||
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
|
||||
"cell phone",
|
||||
"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
|
||||
"teddy bear",
|
||||
"hair drier", "toothbrush"]
|
||||
|
||||
if os.path.exists('output/'):
|
||||
shutil.rmtree('output/')
|
||||
os.makedirs('output/')
|
||||
# a YoLov8TRT instance
|
||||
yolov8_wrapper = YoLov8TRT(engine_file_path)
|
||||
try:
|
||||
print('batch size is', yolov8_wrapper.batch_size)
|
||||
|
||||
image_dir = "images/"
|
||||
image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)
|
||||
|
||||
for i in range(10):
|
||||
# create a new thread to do warm_up
|
||||
thread1 = warmUpThread(yolov8_wrapper)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
for batch in image_path_batches:
|
||||
# create a new thread to do inference
|
||||
thread1 = inferThread(yolov8_wrapper, batch)
|
||||
thread1.start()
|
||||
thread1.join()
|
||||
finally:
|
||||
# destroy the instance
|
||||
yolov8_wrapper.destroy()
|
||||
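As a minimal usage sketch run inside this module (assuming the plugin library, engine file, and image path below exist; they mirror the __main__ block above), the wrapper can also be driven directly, without the warm-up and inference threads:

import ctypes
import cv2

ctypes.CDLL("./build/libmyplugins.so")                  # plugin path as in __main__ above
categories = ["person"]                                  # placeholder labels; use the real class list
wrapper = YoLov8TRT("yolov8n-seg.engine")                # engine path as in __main__ above
try:
    # get_raw_image expects one path per engine batch slot (wrapper.batch_size of them)
    paths = ["images/sample.jpg"] * wrapper.batch_size   # hypothetical image path
    drawn, seconds = wrapper.infer(wrapper.get_raw_image(paths))
    cv2.imwrite("sample_result.jpg", drawn[0])
    print("inference took {:.2f} ms".format(seconds * 1000))
finally:
    wrapper.destroy()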
74
测试post请求.py
Normal file
74
测试post请求.py
Normal file
@ -0,0 +1,74 @@
|
||||
import datetime
|
||||
import requests
|
||||
import json
|
||||
import yaml
|
||||
import uuid
|
||||
import time
|
||||
|
||||
|
||||
with open('config.yaml', 'r') as file:
|
||||
configData = yaml.safe_load(file)
|
||||
|
||||
tokenResult = {}
|
||||
getTokenUrl = configData['dataConfig']['getTokenUrl']
|
||||
vod_channelNo = configData['video_config']['v1_channelNo']
|
||||
# Alarm message URL
|
||||
putMessageUrl = configData['dataConfig']['putMessageUrl']
|
||||
|
||||
def get_token(tokenResult):
|
||||
if 'token' in tokenResult and 'current_time' in tokenResult:
|
||||
token_time = datetime.datetime.strptime(tokenResult['current_time'],
|
||||
"%Y-%m-%d %H:%M:%S")
|
||||
current_time = datetime.datetime.now()
|
||||
time_diff = current_time - token_time
|
||||
if time_diff.total_seconds() > 20 * 60:
|
||||
            # Token expired, request a new one
|
||||
# print("token 已过期")
|
||||
response = requests.post(getTokenUrl)
|
||||
if response.status_code == 200:
|
||||
data = json.loads(response.text)
|
||||
if 'retCode' in data and data['retCode'] == '200':
|
||||
token = data['responseBody']['token']
|
||||
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
tokenResult['token'] = token
|
||||
tokenResult['current_time'] = current_time
|
||||
else:
|
||||
tokenResult['error'] = data['errorDesc']
|
||||
else:
|
||||
tokenResult['error'] = response.status_code
|
||||
token = tokenResult['token']
|
||||
return token
|
||||
|
||||
|
||||
def send_post_request(url, token, msg, picUrl, videoUrl):
|
||||
payload = {
|
||||
"tenantCode": "8",
|
||||
"channelNo": vod_channelNo,
|
||||
"alarmContent": msg,
|
||||
"alarmTime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"picInfo": [
|
||||
{"url": picUrl}
|
||||
],
|
||||
"videoInfo": [
|
||||
{"url": videoUrl}
|
||||
]
|
||||
}
|
||||
headers = {
|
||||
'X-Access-Token': token,
|
||||
'Content-Type': 'application/json'
|
||||
}
|
||||
# print(url)
|
||||
# print(headers)
|
||||
# print(payload)
|
||||
response = requests.post(url, headers=headers, data=json.dumps(payload))
|
||||
print(response)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
    token = get_token(tokenResult)
|
||||
print("token: ", token)
|
||||
|
||||
    uuid_str = str(uuid.uuid4())[:6] + str(int(time.time()))  # first 6 chars of a UUID plus a Unix timestamp
|
||||
|
||||
upload_http_url_img = configData['minioConfig']['bucket_name'] + f'/{uuid_str}_{vod_channelNo}_.jpg'
|
||||
send_post_request(putMessageUrl, token, "消息内容", upload_http_url_img, '')
|
||||
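The helper above keeps the token and its fetch time in the tokenResult dict and re-requests a token once the cached one is more than 20 minutes old, so the same dict should be reused across calls (as in the corrected __main__ call above). A minimal sketch of that expiry check in isolation, with hypothetical timestamps and no network calls:

import datetime

tokenResult = {
    "token": "dummy-token",                  # hypothetical cached token
    "current_time": "2025-01-01 08:00:00",   # hypothetical fetch time
}

token_time = datetime.datetime.strptime(tokenResult["current_time"], "%Y-%m-%d %H:%M:%S")
age = datetime.datetime.now() - token_time
needs_refresh = age.total_seconds() > 20 * 60
print("token needs refresh:", needs_refresh)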
37
读取mp4帧保存成图片.py
Normal file
37
读取mp4帧保存成图片.py
Normal file
@ -0,0 +1,37 @@
|
||||
|
||||
# url =
|
||||
|
||||
import cv2
|
||||
|
||||
# Open the video file
|
||||
# video_path = "/home/admin-root/haotian/锻8/tensorrtx/yolov8/video/180.50.13.253_07_2025042215303747.mp4"
|
||||
video_path = "/home/admin-root/haotian/锻8/tensorrtx/yolov8/video/180.50.13.253_02_20250627142949812.mp4"
|
||||
cap = cv2.VideoCapture(video_path)
|
||||
|
||||
# Check whether the video opened successfully
|
||||
if not cap.isOpened():
|
||||
print("Error: Could not open video.")
|
||||
exit()
|
||||
|
||||
|
||||
n = 0
|
||||
|
||||
# Read the video frame by frame
|
||||
while True:
|
||||
    # Read one frame; ret indicates success, frame holds the frame data
|
||||
ret, frame = cap.read()
|
||||
|
||||
    # Exit the loop if reading fails (e.g. end of video)
|
||||
if not ret:
|
||||
print("Reached end of video.")
|
||||
break
|
||||
|
||||
    # Process the frame here (e.g. display, save, analyze)
|
||||
cv2.imwrite(f"./images_20250627142949812/mp4_{n}.jpg", frame)
|
||||
print(f"保存图片{n}")
|
||||
n += 1
|
||||
|
||||
|
||||
# Release resources
|
||||
cap.release()
|
||||
# cv2.destroyAllWindows()
|
||||
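The script above writes every frame, which for long clips produces thousands of images. A hedged variant that samples one frame every N frames and creates the output directory first (the paths and the sampling step are assumptions, not values from the original script):

import os
import cv2

video_path = "input.mp4"           # assumed input clip
out_dir = "./frames"               # assumed output directory
step = 25                          # keep one frame out of every 25

os.makedirs(out_dir, exist_ok=True)
cap = cv2.VideoCapture(video_path)
n = saved = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    if n % step == 0:
        cv2.imwrite(os.path.join(out_dir, f"frame_{n}.jpg"), frame)
        saved += 1
    n += 1
cap.release()
print(f"saved {saved} of {n} frames")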