第一次提交

This commit is contained in:
shan 2026-01-06 10:04:15 +08:00
commit 7f5d8a10a3
100 changed files with 18366 additions and 0 deletions

81
.gitignore vendored Normal file
View File

@ -0,0 +1,81 @@
############################
# 视频文件(全部忽略)
############################
*.mp4
*.avi
*.mov
*.mkv
*.flv
*.wmv
*.webm
*.rm
*.rmvb
*.ts
*.m3u8
############################
# 图片文件(全部忽略)
############################
*.jpg
*.jpeg
*.png
*.bmp
*.tiff
*.tif
*.gif
*.webp
############################
# YOLO / TensorRT 模型文件
############################
*.pt
*.onnx
*.wts
*.engine
# 只保留 best.engine(注意:git 无法重新包含父目录已被整体忽略的文件,此规则只对未被忽略目录下的 best.engine 生效)
!best.engine
############################
# CMake / 编译生成文件(忽略)
############################
CMakeFiles/
CMakeCache.txt
cmake_install.cmake
Makefile
*.o
*.a
*.so
*.dll
*.exe
############################
# build 目录规则
############################
# 忽略默认 build 目录
build/
# ❗保留你指定的 build_XXXXXX 目录
!build_*/
############################
# Python 缓存(忽略)
############################
__pycache__/
*.pyc
*.pyo
############################
# 日志 / 临时文件
############################
*.log
*.tmp
*.swp
*.bak
############################
# 系统文件
############################
.DS_Store
Thumbs.db

5
001测试拼接列表.py Normal file
View File

@ -0,0 +1,5 @@
# Demo: join list elements with an underscore separator.
# Renamed `l` -> `parts`: `l` is an ambiguous identifier (PEP 8 / E741,
# easily confused with `1` and `I`).
parts = ["a", "b", "c", "d"]
print("_".join(parts))

65
CMakeLists.txt Normal file
View File

@ -0,0 +1,65 @@
cmake_minimum_required(VERSION 3.10)
project(yolov8 LANGUAGES CXX)

# Fix: set the C++ standard through the dedicated variables instead of
# injecting a raw -std flag via add_definitions (which is for -D macros).
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_compile_definitions(API_EXPORTS)

# Fix: only default to Debug when the user did not choose a build type,
# instead of unconditionally overriding -DCMAKE_BUILD_TYPE=....
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE)
endif()

set(CMAKE_CUDA_COMPILER /usr/bin/nvcc)
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# TODO(review): these absolute paths are machine-specific; consider
# find_package(CUDAToolkit) / cache variables for the TensorRT root.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message(STATUS "embed_platform on")
  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
  message(STATUS "embed_platform off")
  # cuda
  include_directories(/usr/local/cuda-12.4/include)
  link_directories(/usr/local/cuda-12.4/lib64)
  # tensorrt
  include_directories(/home/admin-root/software/TensorRT-8.6.1.6/include)
  link_directories(/home/admin-root/software/TensorRT-8.6.1.6/lib)
endif()

# Shared TensorRT plugin library (custom YOLO decode layer).
add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins PRIVATE nvinfer cudart)

# Fix: REQUIRED — without it a missing OpenCV only surfaces at link time.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

# NOTE(review): globbing misses newly added sources until re-configure;
# an explicit source list would be more robust.
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)

# One executable per task head; identical link dependencies, so build them
# in a loop instead of six hand-copied blocks. Fix: explicit PRIVATE on
# target_link_libraries (the keyword-less legacy signature was used before).
foreach(tool det seg pose cls 5u_det obb)
  add_executable(yolov8_${tool} ${PROJECT_SOURCE_DIR}/yolov8_${tool}.cpp ${SRCS})
  target_link_libraries(yolov8_${tool} PRIVATE nvinfer cudart myplugins ${OpenCV_LIBS})
endforeach()

200
README.md Normal file
View File

@ -0,0 +1,200 @@
# YOLOv8
The Pytorch implementation is [ultralytics/yolov8](https://github.com/ultralytics/ultralytics/tree/main/ultralytics).
The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github.com/xiaocao-tian/yolov8_tensorrt)
## Contributors
<a href="https://github.com/xiaocao-tian"><img src="https://avatars.githubusercontent.com/u/65889782?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/lindsayshuo"><img src="https://avatars.githubusercontent.com/u/45239466?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/xinsuinizhuan"><img src="https://avatars.githubusercontent.com/u/40679769?v=4?s=48" width="40px;" alt=""/></a>
<a href="https://github.com/Rex-LK"><img src="https://avatars.githubusercontent.com/u/74702576?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/emptysoal"><img src="https://avatars.githubusercontent.com/u/57931586?s=48&v=4" width="40px;" alt=""/></a>
<a href="https://github.com/ChangjunDAI"><img src="https://avatars.githubusercontent.com/u/65420228?s=48&v=4" width="40px;" alt=""/></a>
## Requirements
- TensorRT 8.0+
- OpenCV 3.4.0+
- ultralytics<=8.2.103
## Different versions of yolov8
Currently, we support yolov8
- For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page.
## Config
- Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments.
- Check more configs in [include/config.h](./include/config.h)
## How to Run, yolov8n as example
1. generate .wts from pytorch with .pt, or download .wts from model zoo
```
// download https://github.com/ultralytics/assets/releases/yolov8n.pt
// download https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model)
cp {tensorrtx}/yolov8/gen_wts.py {ultralytics}/ultralytics
cd {ultralytics}/ultralytics
python gen_wts.py -w yolov8n.pt -o yolov8n.wts -t detect
// a file 'yolov8n.wts' will be generated.
// For p2 model
// download https://github.com/lindsayshuo/yolov8_p2_tensorrtx/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model)
cd {ultralytics}/ultralytics
python gen_wts.py -w VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt -o VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts -t detect (only for 10 cls p2 model)
// a file 'VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts' will be generated.
// For yolov8_5u_det model
// download https://github.com/ultralytics/assets/releases/yolov5nu.pt
cd {ultralytics}/ultralytics
python gen_wts.py -w yolov5nu.pt -o yolov5nu.wts -t detect
// a file 'yolov5nu.wts' will be generated.
```
2. build tensorrtx/yolov8 and run
### Detection
```
cd {tensorrtx}/yolov8/
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file
sudo ./yolov8_det -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed.
// For example yolov8n
sudo ./yolov8_det -s yolov8n.wts yolov8.engine n
sudo ./yolov8_det -d yolov8n.engine ../images c //cpu postprocess
sudo ./yolov8_det -d yolov8n.engine ../images g //gpu postprocess
// For p2 model:
// change the "const static int kNumClass" in config.h to 10;
sudo ./yolov8_det -s VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine x2
wget https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/0000008_01999_d_0000040.jpg
cp -r 0000008_01999_d_0000040.jpg ../images
sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images c //cpu postprocess
sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images g //gpu postprocess
// For yolov8_5u_det(YOLOv5u with the anchor-free, objectness-free split head structure based on YOLOv8 features) model:
sudo ./yolov8_5u_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6]
sudo ./yolov8_5u_det -d yolov5xu.engine ../images c //cpu postprocess
sudo ./yolov8_5u_det -d yolov5xu.engine ../images g //gpu postprocess
```
### Instance Segmentation
```
# Build and serialize TensorRT engine
./yolov8_seg -s yolov8s-seg.wts yolov8s-seg.engine s
# Download the labels file
wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt
# Run inference with labels file
./yolov8_seg -d yolov8s-seg.engine ../images c coco.txt
```
### Classification
```
cd {tensorrtx}/yolov8/
// Download inference images
wget https://github.com/lindsayshuo/infer_pic/releases/download/pics/1709970363.6990473rescls.jpg
mkdir samples
cp -r 1709970363.6990473rescls.jpg samples
// Download ImageNet labels
wget https://github.com/joannzhang00/ImageNet-dataset-classes-labels/blob/main/imagenet_classes.txt
// update kClsNumClass in config.h if your model is trained on custom dataset
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8n-cls.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_cls -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file
sudo ./yolov8_cls -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed.
// For example yolov8n
sudo ./yolov8_cls -s yolov8n-cls.wts yolov8-cls.engine n
sudo ./yolov8_cls -d yolov8n-cls.engine ../samples
```
### Pose Estimation
```
cd {tensorrtx}/yolov8/
// update "kPoseNumClass = 1" in config.h
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8-pose.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_pose -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file
sudo ./yolov8_pose -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed.
// For example yolov8-pose
sudo ./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n
sudo ./yolov8_pose -d yolov8n-pose.engine ../images c //cpu postprocess
sudo ./yolov8_pose -d yolov8n-pose.engine ../images g //gpu postprocess
```
### Oriented Bounding Boxes (OBB) Estimation
```
cd {tensorrtx}/yolov8/
// update "kObbNumClass = 15" "kInputH = 1024" "kInputW = 1024" in config.h
wget https://github.com/lindsayshuo/infer_pic/releases/download/pics/obb.png
mkdir images
mv obb.png ./images
mkdir build
cd build
cp {ultralytics}/ultralytics/yolov8-obb.wts {tensorrtx}/yolov8/build
cmake ..
make
sudo ./yolov8_obb -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file
sudo ./yolov8_obb -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed.
// For example yolov8-obb
sudo ./yolov8_obb -s yolov8n-obb.wts yolov8n-obb.engine n
sudo ./yolov8_obb -d yolov8n-obb.engine ../images c //cpu postprocess
sudo ./yolov8_obb -d yolov8n-obb.engine ../images g //gpu postprocess
```
3. optional, load and run the tensorrt model in python
```
// install python-tensorrt, pycuda, etc.
// ensure the yolov8n.engine and libmyplugins.so have been built
python yolov8_det_trt.py # Detection
python yolov8_seg_trt.py # Segmentation
python yolov8_cls_trt.py # Classification
python yolov8_pose_trt.py # Pose Estimation
python yolov8_5u_det_trt.py # yolov8_5u_det(YOLOv5u with the anchor-free, objectness-free split head structure based on YOLOv8 features) model
python yolov8_obb_trt.py # Oriented Bounding Boxes (OBB) Estimation
```
# INT8 Quantization
1. Prepare calibration images, you can randomly select 1000s images from your train set. For coco, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh
2. unzip it in yolov8/build
3. set the macro `USE_INT8` in config.h, change `kInputQuantizationFolder` into your image folder path and make
4. serialize the model and test
<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247927-4d9fac00-751e-11ea-8b1b-704a0aeb3fcf.jpg" height="360px;">
</p>
## More Information
See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)

12
attendance/2025-01-08.txt Normal file
View File

@ -0,0 +1,12 @@
新人打卡, 员工名haotian ,相似度0.99852,打卡时间2025-01-08 14:38:01.425799
新人打卡, 员工名:胡同同 ,相似度0.96767,打卡时间2025-01-08 14:41:52.114084
新人打卡, 员工名:杨威 ,相似度0.98566,打卡时间2025-01-08 14:57:25.434180
新人打卡, 员工名:张建峰 ,相似度0.9608,打卡时间2025-01-08 14:57:42.841012
新人打卡, 员工名:郑俊 ,相似度0.8994,打卡时间2025-01-08 15:17:13.444042
新人打卡, 员工名:马可义 ,相似度0.90012,打卡时间2025-01-08 15:17:15.850005
新人打卡, 员工名:李同同 ,相似度0.9586,打卡时间2025-01-08 15:18:17.529510
新人打卡, 员工名:白景辰(1) ,相似度0.99092,打卡时间2025-01-08 15:18:34.346216
新人打卡, 员工名:焦军红(1) ,相似度0.97479,打卡时间2025-01-08 15:18:39.877829
新人打卡, 员工名:林时波 ,相似度0.98172,打卡时间2025-01-08 15:19:11.575465
新人打卡, 员工名:林凯 ,相似度0.9983,打卡时间2025-01-08 15:27:33.302375
新人打卡, 员工名:于波 ,相似度0.92919,打卡时间2025-01-08 15:27:34.106271

BIN
build_20250603/best.engine Normal file

Binary file not shown.

5
build_20250603/build.sh Normal file
View File

@ -0,0 +1,5 @@
# Build pipeline for this build_* directory:
# export .wts from the PyTorch checkpoint, configure/build the TensorRT
# sample binaries from the parent CMakeLists, then serialize the engine.
# Requires a Python env with torch (uncomment the activate line if needed).
# conda activate trt
python gen_wts.py -w best.pt -o best.wts -t detect
# cmake configures the parent directory's project into this directory.
cmake ..
make
# -s: serialize best.wts to best.engine; trailing 'n' selects model scale.
./yolov8_det -s best.wts best.engine n

57
build_20250603/gen_wts.py Normal file
View File

@ -0,0 +1,57 @@
"""Convert an Ultralytics YOLO .pt checkpoint into the plain-text .wts
weight format consumed by the tensorrtx C++ engine builder.

Usage: python gen_wts.py -w best.pt [-o best.wts] [-t detect]
"""
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse CLI arguments and derive the output path.

    Returns:
        tuple: (weights_path, output_path, model_type); model_type is one
        of 'detect', 'cls', 'seg', 'pose', 'obb'.

    Raises:
        SystemExit: if the input weights file does not exist.
    """
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True,
                        help='Input weights (.pt) file path (required)')
    parser.add_argument(
        '-o', '--output', help='Output (.wts) file path (optional)')
    parser.add_argument(
        '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose', 'obb'],
        help='determines the model is detection/classification')
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid input file')
    if not args.output:
        # Default output: same basename as the input, with .wts suffix.
        args.output = os.path.splitext(args.weights)[0] + '.wts'
    elif os.path.isdir(args.output):
        # Output is a directory: place <input-basename>.wts inside it.
        args.output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
    return args.weights, args.output, args.type


pt_file, wts_file, m_type = parse_args()
print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')
# Initialize
device = 'cpu'
# Load model
# weights_only=False is needed on PyTorch >= 2.6, where torch.load's
# default flipped to True and full pickled checkpoints no longer load.
model = torch.load(pt_file, map_location=device, weights_only=False)['model'].float()  # load to FP32

if m_type in ['detect', 'seg', 'pose', 'obb']:
    # Fold the stride into the anchor grid, then drop 'anchors' so it is
    # not serialized as part of the exported state dict.
    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
    delattr(model.model[-1], 'anchors')

model.to(device).eval()

with open(wts_file, 'w') as f:
    # Header line: number of tensors in the state dict.
    f.write('{}\n'.format(len(model.state_dict().keys())))
    for k, v in model.state_dict().items():
        vr = v.reshape(-1).cpu().numpy()
        # One line per tensor: "<name> <count>  <hex> <hex> ..."
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            # Big-endian IEEE-754 float32, hex-encoded.
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')

Binary file not shown.

BIN
build_20250603/yolov8_cls Normal file

Binary file not shown.

BIN
build_20250603/yolov8_det Normal file

Binary file not shown.

BIN
build_20250603/yolov8_obb Normal file

Binary file not shown.

BIN
build_20250603/yolov8_pose Normal file

Binary file not shown.

BIN
build_20250603/yolov8_seg Normal file

Binary file not shown.

BIN
build_20250630/best.engine Normal file

Binary file not shown.

5
build_20250630/build.sh Normal file
View File

@ -0,0 +1,5 @@
# Build pipeline for this build_* directory:
# export .wts from the PyTorch checkpoint, configure/build the TensorRT
# sample binaries from the parent CMakeLists, then serialize the engine.
# Requires a Python env with torch (uncomment the activate line if needed).
# conda activate trt
python gen_wts.py -w best.pt -o best.wts -t detect
# cmake configures the parent directory's project into this directory.
cmake ..
make
# -s: serialize best.wts to best.engine; trailing 'n' selects model scale.
./yolov8_det -s best.wts best.engine n

57
build_20250630/gen_wts.py Normal file
View File

@ -0,0 +1,57 @@
"""Convert an Ultralytics YOLO .pt checkpoint into the plain-text .wts
weight format consumed by the tensorrtx C++ engine builder.

Usage: python gen_wts.py -w best.pt [-o best.wts] [-t detect]
"""
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse CLI arguments and derive the output path.

    Returns:
        tuple: (weights_path, output_path, model_type); model_type is one
        of 'detect', 'cls', 'seg', 'pose', 'obb'.

    Raises:
        SystemExit: if the input weights file does not exist.
    """
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True,
                        help='Input weights (.pt) file path (required)')
    parser.add_argument(
        '-o', '--output', help='Output (.wts) file path (optional)')
    parser.add_argument(
        '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose', 'obb'],
        help='determines the model is detection/classification')
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid input file')
    if not args.output:
        # Default output: same basename as the input, with .wts suffix.
        args.output = os.path.splitext(args.weights)[0] + '.wts'
    elif os.path.isdir(args.output):
        # Output is a directory: place <input-basename>.wts inside it.
        args.output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
    return args.weights, args.output, args.type


pt_file, wts_file, m_type = parse_args()
print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')
# Initialize
device = 'cpu'
# Load model
# weights_only=False is needed on PyTorch >= 2.6, where torch.load's
# default flipped to True and full pickled checkpoints no longer load.
model = torch.load(pt_file, map_location=device, weights_only=False)['model'].float()  # load to FP32

if m_type in ['detect', 'seg', 'pose', 'obb']:
    # Fold the stride into the anchor grid, then drop 'anchors' so it is
    # not serialized as part of the exported state dict.
    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
    delattr(model.model[-1], 'anchors')

model.to(device).eval()

with open(wts_file, 'w') as f:
    # Header line: number of tensors in the state dict.
    f.write('{}\n'.format(len(model.state_dict().keys())))
    for k, v in model.state_dict().items():
        vr = v.reshape(-1).cpu().numpy()
        # One line per tensor: "<name> <count>  <hex> <hex> ..."
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            # Big-endian IEEE-754 float32, hex-encoded.
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')

Binary file not shown.

BIN
build_20250630/yolov8_cls Normal file

Binary file not shown.

BIN
build_20250630/yolov8_det Normal file

Binary file not shown.

BIN
build_20250630/yolov8_obb Normal file

Binary file not shown.

BIN
build_20250630/yolov8_pose Normal file

Binary file not shown.

BIN
build_20250630/yolov8_seg Normal file

Binary file not shown.

BIN
build_20251226/best.engine Normal file

Binary file not shown.

Binary file not shown.

BIN
build_20251226/yolov8_cls Normal file

Binary file not shown.

BIN
build_20251226/yolov8_det Normal file

Binary file not shown.

BIN
build_20251226/yolov8_obb Normal file

Binary file not shown.

BIN
build_20251226/yolov8_pose Normal file

Binary file not shown.

BIN
build_20251226/yolov8_seg Normal file

Binary file not shown.

BIN
build_250306/best.engine Normal file

Binary file not shown.

5
build_250306/build.sh Normal file
View File

@ -0,0 +1,5 @@
# Build pipeline for this build_* directory:
# export .wts from the PyTorch checkpoint, configure/build the TensorRT
# sample binaries from the parent CMakeLists, then serialize the engine.
# Requires a Python env with torch (uncomment the activate line if needed).
# conda activate trt
python gen_wts.py -w best.pt -o best.wts -t detect
# cmake configures the parent directory's project into this directory.
cmake ..
make
# -s: serialize best.wts to best.engine; trailing 'n' selects model scale.
./yolov8_det -s best.wts best.engine n

Binary file not shown.

57
build_250306/gen_wts.py Normal file
View File

@ -0,0 +1,57 @@
import sys # noqa: F401
import argparse
import os
import struct
import torch
def parse_args():
parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
parser.add_argument('-w', '--weights', required=True,
help='Input weights (.pt) file path (required)')
parser.add_argument(
'-o', '--output', help='Output (.wts) file path (optional)')
parser.add_argument(
'-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose', 'obb'],
help='determines the model is detection/classification')
args = parser.parse_args()
if not os.path.isfile(args.weights):
raise SystemExit('Invalid input file')
if not args.output:
args.output = os.path.splitext(args.weights)[0] + '.wts'
elif os.path.isdir(args.output):
args.output = os.path.join(
args.output,
os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
return args.weights, args.output, args.type
pt_file, wts_file, m_type = parse_args()
print(f'Generating .wts for {m_type} model')
# Load model
print(f'Loading {pt_file}')
# Initialize
device = 'cpu'
# Load model
model = torch.load(pt_file, map_location=device)['model'].float() # load to FP32
if m_type in ['detect', 'seg', 'pose', 'obb']:
anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
delattr(model.model[-1], 'anchors')
model.to(device).eval()
with open(wts_file, 'w') as f:
f.write('{}\n'.format(len(model.state_dict().keys())))
for k, v in model.state_dict().items():
vr = v.reshape(-1).cpu().numpy()
f.write('{} {} '.format(k, len(vr)))
for vv in vr:
f.write(' ')
f.write(struct.pack('>f', float(vv)).hex())
f.write('\n')

BIN
build_250306/yolov8_5u_det Normal file

Binary file not shown.

BIN
build_250306/yolov8_cls Normal file

Binary file not shown.

BIN
build_250306/yolov8_det Normal file

Binary file not shown.

BIN
build_250306/yolov8_obb Normal file

Binary file not shown.

BIN
build_250306/yolov8_pose Normal file

Binary file not shown.

BIN
build_250306/yolov8_seg Normal file

Binary file not shown.

View File

@ -0,0 +1,57 @@
"""Convert an Ultralytics YOLO .pt checkpoint into the plain-text .wts
weight format consumed by the tensorrtx C++ engine builder.

Usage: python gen_wts.py -w best.pt [-o best.wts] [-t detect]
"""
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse CLI arguments and derive the output path.

    Returns:
        tuple: (weights_path, output_path, model_type); model_type is one
        of 'detect', 'cls', 'seg', 'pose', 'obb'.

    Raises:
        SystemExit: if the input weights file does not exist.
    """
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True,
                        help='Input weights (.pt) file path (required)')
    parser.add_argument(
        '-o', '--output', help='Output (.wts) file path (optional)')
    parser.add_argument(
        '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose', 'obb'],
        help='determines the model is detection/classification')
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid input file')
    if not args.output:
        # Default output: same basename as the input, with .wts suffix.
        args.output = os.path.splitext(args.weights)[0] + '.wts'
    elif os.path.isdir(args.output):
        # Output is a directory: place <input-basename>.wts inside it.
        args.output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
    return args.weights, args.output, args.type


pt_file, wts_file, m_type = parse_args()
print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')
# Initialize
device = 'cpu'
# Load model
# weights_only=False is needed on PyTorch >= 2.6, where torch.load's
# default flipped to True and full pickled checkpoints no longer load.
model = torch.load(pt_file, map_location=device, weights_only=False)['model'].float()  # load to FP32

if m_type in ['detect', 'seg', 'pose', 'obb']:
    # Fold the stride into the anchor grid, then drop 'anchors' so it is
    # not serialized as part of the exported state dict.
    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
    delattr(model.model[-1], 'anchors')

model.to(device).eval()

with open(wts_file, 'w') as f:
    # Header line: number of tensors in the state dict.
    f.write('{}\n'.format(len(model.state_dict().keys())))
    for k, v in model.state_dict().items():
        vr = v.reshape(-1).cpu().numpy()
        # One line per tensor: "<name> <count>  <hex> <hex> ..."
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            # Big-endian IEEE-754 float32, hex-encoded.
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,5 @@
# Build pipeline for this build_* directory:
# export .wts from the PyTorch checkpoint, configure/build the TensorRT
# sample binaries from the parent CMakeLists, then serialize the engine.
# Requires a Python env with torch (uncomment the activate line if needed).
# conda activate trt
python gen_wts.py -w best.pt -o best.wts -t detect
# cmake configures the parent directory's project into this directory.
cmake ..
make
# -s: serialize best.wts to best.engine; trailing 'n' selects model scale.
./yolov8_det -s best.wts best.engine n

View File

@ -0,0 +1,57 @@
"""Convert an Ultralytics YOLO .pt checkpoint into the plain-text .wts
weight format consumed by the tensorrtx C++ engine builder.

Usage: python gen_wts.py -w best.pt [-o best.wts] [-t detect]
"""
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse CLI arguments and derive the output path.

    Returns:
        tuple: (weights_path, output_path, model_type); model_type is one
        of 'detect', 'cls', 'seg', 'pose', 'obb'.

    Raises:
        SystemExit: if the input weights file does not exist.
    """
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True,
                        help='Input weights (.pt) file path (required)')
    parser.add_argument(
        '-o', '--output', help='Output (.wts) file path (optional)')
    parser.add_argument(
        '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose', 'obb'],
        help='determines the model is detection/classification')
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid input file')
    if not args.output:
        # Default output: same basename as the input, with .wts suffix.
        args.output = os.path.splitext(args.weights)[0] + '.wts'
    elif os.path.isdir(args.output):
        # Output is a directory: place <input-basename>.wts inside it.
        args.output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
    return args.weights, args.output, args.type


pt_file, wts_file, m_type = parse_args()
print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')
# Initialize
device = 'cpu'
# Load model
# weights_only=False is needed on PyTorch >= 2.6, where torch.load's
# default flipped to True and full pickled checkpoints no longer load.
model = torch.load(pt_file, map_location=device, weights_only=False)['model'].float()  # load to FP32

if m_type in ['detect', 'seg', 'pose', 'obb']:
    # Fold the stride into the anchor grid, then drop 'anchors' so it is
    # not serialized as part of the exported state dict.
    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
    delattr(model.model[-1], 'anchors')

model.to(device).eval()

with open(wts_file, 'w') as f:
    # Header line: number of tensors in the state dict.
    f.write('{}\n'.format(len(model.state_dict().keys())))
    for k, v in model.state_dict().items():
        vr = v.reshape(-1).cpu().numpy()
        # One line per tensor: "<name> <count>  <hex> <hex> ..."
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            # Big-endian IEEE-754 float32, hex-encoded.
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

92
config.yaml Normal file
View File

@ -0,0 +1,92 @@
# ===============================
# TensorRT Engine 配置
# ===============================
engine_path: 'build_20251226/'
# ===============================
# 视频配置(只启用 v0 本地视频)
# ===============================
video_config:
# 输出路径
m3u8_path: 'output/'
save_path: 'mp4/'
people_save_path: 'attendance/'
# 检测类别(和 engine 对应)
categories: ["helmet", "non-Helmet", "shoe"]
# 兼容旧代码字段
m3u8_path_0: 'output/'
# ===== v0(本地视频)是唯一启用的通道 =====
v0_ip: 'local_video'
v0_channelNo: '0#'
v0_testclasses: [0, 1, 2]
v0_path: '/home/admin-root/haotian/锻8/tensorrtx/yolov8/video/场景1.mp4'
# ===== 其余通道全部禁用(别删,防止 KeyError)=====
v1_ip: ''
v1_path: ''
v2_ip: ''
v2_path: ''
v3_ip: ''
v3_path: ''
v4_ip: ''
v4_path: ''
v5_ip: ''
v5_path: ''
v6_ip: ''
v6_path: ''
v7_ip: ''
v7_path: ''
v8_ip: ''
v8_path: ''
v9_ip: ''
v9_path: ''
v10_ip: ''
v10_path: ''
v11_ip: ''
v11_path: ''
# ===============================
# MinIO先留着不影响跑视频
# ===============================
minioConfig:
endpoint: '180.50.12.100:9000/'
access_key: 'admin'
secret_key: '12345678aA'
secure: false
bucket_name: 'vi-attachment'
# ===============================
# 数据上报(不影响本地跑)
# ===============================
dataConfig:
getTokenUrl: 'http://180.50.12.100/api/appsys/sso/httpheader/login/v1?username_=szls'
putMessageUrl: 'http://180.50.12.100/api/edge/edgecallmanages/vi-alarm/v1'
timeInterval: 600
# ===============================
# 人脸服务(即使不用也不能缺)
# ===============================
compreface_service:
domain: 'http://180.50.12.104'
port: '8000'
api_key: '88f43f2f-1483-4ad0-ae6c-8f1a800c3acd'
det_prob_threshold: 0.99
limit: 0

135
config.yaml.back_d8_1_mp4 Normal file
View File

@ -0,0 +1,135 @@
# engine_path: 'build_250306/'
engine_path: 'build_20251226/'
video_config:
# 保存m3u8文件路径
# m3u8_path: '/home/pro/hls/mid/'
# m3u8_path: '/workspace/hls_data/mid/'
m3u8_path: 'output/'
# 保存mp4文件路径
# save_path: '/home/pro/tensorrtx-master/yolov8/mp4/'
save_path: 'mp4/'
people_save_path: 'attendance/'
# categories : ["face", "shoe", "phone", "e-bike"]
categories : ["helmet", "non-Helmet", "shoe"]
# m3u8_path_0: '/workspace/hls_data/mid/'
m3u8_path_0: 'output/'
v0_ip: 'test243'
v0_channelNo: '0#'
v0_testclasses : [0, 1, 2]
# v0_path: 'rtsp://180.50.12.106:8554/camera_test/2'
v0_path: '/home/admin-root/haotian/锻8/tensorrtx/yolov8/video/场景1.mp4'
v1_ip: '180.50.13.20'
v1_channelNo: 'D19'
v1_testclasses : [1, 2]
v1_path: 'rtsp://admin:sy12345678@180.50.13.20:554/Streaming/Channels/102'
v2_ip: '180.50.13.21'
# v2_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v2_channelNo: 'D20'
v2_testclasses : [1, 2]
v2_path: 'rtsp://admin:sy12345678@180.50.13.21:554/Streaming/Channels/102 '
v3_ip: '180.50.13.22'
# v3_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v3_channelNo: 'D21'
v3_testclasses : [1, 2]
v3_path: 'rtsp://admin:sy12345678@180.50.13.22:554/Streaming/Channels/102'
v4_ip: '180.50.13.23'
# v4_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v4_channelNo: 'D22'
v4_testclasses : [1, 2]
v4_path: 'rtsp://admin:sy12345678@180.50.13.23:554/Streaming/Channels/102'
v5_ip: '180.50.13.24'
# v5_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v5_channelNo: 'D23'
v5_testclasses : [1, 2]
v5_path: 'rtsp://admin:sy12345678@180.50.13.24:554/Streaming/Channels/102'
v6_ip: '192.168.21.30'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v6_channelNo: '6#'
v6_testclasses : [0]
v6_path: 'rtsp://admin:12345678a@192.168.21.30:554/Streaming/Channels/101'
v7_ip: '192.168.21.37'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v7_channelNo: '7#'
v7_testclasses : [0]
v7_path: 'rtsp://admin:12345678a@192.168.21.37:554/Streaming/Channels/101'
v8_ip: '192.168.21.50'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v8_channelNo: '8#'
v8_testclasses: [0]
v8_path: 'rtsp://admin:12345678a@192.168.21.50:554/Streaming/Channels/101'
v9_ip: '192.168.21.51'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v9_channelNo: '9#'
v9_testclasses: [0]
v9_path: 'rtsp://admin:12345678a@192.168.21.51:554/Streaming/Channels/101'
v10_ip: '192.168.21.18'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v10_channelNo: '10#'
v10_testclasses: [0]
v10_path: 'rtsp://admin:12345678a@192.168.21.18:554/Streaming/Channels/101'
v11_ip: '192.168.21.55'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v11_channelNo: '11#'
v11_testclasses: [0]
v11_path: 'rtsp://admin:12345678a@192.168.21.55:554/Streaming/Channels/101'
minioConfig:
# endpoint: '10.0.0.58:9000/'
# access_key: 'root'
# secret_key: '@root123456'
# secure: False
# bucket_name: 'miniotest'
#bucketName: vi
endpoint: '180.50.12.100:9000/'
access_key: 'admin'
secret_key: '12345678aA'
secure: False
bucket_name: 'vi-attachment'
dataConfig:
# getTokenUrl: 'http://192.168.220.202/api/appsys/sso/httpheader/login/v1?username_=digital'
getTokenUrl: 'http://180.50.12.100/api/appsys/sso/httpheader/login/v1?username_=szls'
# putMessageUrl: 'http://192.168.220.200/api/edge/edgecallmanages/vi-alarm/v1'
putMessageUrl: 'http://180.50.12.100/api/edge/edgecallmanages/vi-alarm/v1'
timeInterval: 600
# getTokenUrl: 'http://192.168.220.202/api/appsys/sso/httpheader/login/v1/username_=digital'
# putMessageUrl: 'http://192.168.220.202/api/edge/edgecallmanages/vi-alarm/v1'
compreface_service:
domain: 'http://180.50.12.104'
port: '8000'
api_key: '88f43f2f-1483-4ad0-ae6c-8f1a800c3acd'
# api_key: 'ab77978a-cc2b-4fa0-8959-6294e856721a'
# api_key: '6d89a2ce-b71a-4894-96bb-03c6712e86d0'
# 人脸置信度,>0.9 就判断为人脸
det_prob_threshold: 0.99
# 识别图像中人脸的个数,0 代表没有限制。
limit: 0

131
config.yaml.back_first Normal file
View File

@ -0,0 +1,131 @@
engine_path: 'build250306/'
video_config:
# 保存m3u8文件路径
m3u8_path: '/home/pro/hls/mid/'
# m3u8_path: '/workspace/hls_data/mid/'
# 保存mp4文件路径
# save_path: '/home/pro/tensorrtx-master/yolov8/mp4/'
save_path: 'mp4/'
people_save_path: 'attendance/'
# categories : ["face", "shoe", "phone", "e-bike"]
categories : ["helmet", "non-Helmet", "shoe"]
m3u8_path_0: '/workspace/hls_data/mid/'
v0_ip: 'test243'
v0_channelNo: '0#'
v0_testclasses : [0, 1, 2]
v0_path: 'rtsp://180.50.12.106:8554/camera_test/2'
v1_ip: '180.50.13.20'
v1_channelNo: 'D19'
v1_testclasses : [1, 2]
v1_path: 'rtsp://admin:sy12345678@180.50.13.20:554/Streaming/Channels/102'
v2_ip: '180.50.13.21'
# v2_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v2_channelNo: 'D20'
v2_testclasses : [1, 2]
v2_path: 'rtsp://admin:sy12345678@180.50.13.21:554/Streaming/Channels/102 '
v3_ip: '180.50.13.22'
# v3_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v3_channelNo: 'D21'
v3_testclasses : [1, 2]
v3_path: 'rtsp://admin:sy12345678@180.50.13.22:554/Streaming/Channels/102'
v4_ip: '180.50.13.23'
# v4_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v4_channelNo: 'D22'
v4_testclasses : [1, 2]
v4_path: 'rtsp://admin:sy12345678@180.50.13.23:554/Streaming/Channels/102'
v5_ip: '180.50.13.24'
# v5_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v5_channelNo: 'D23'
v5_testclasses : [1, 2]
v5_path: 'rtsp://admin:sy12345678@180.50.13.24:554/Streaming/Channels/102'
v6_ip: '192.168.21.30'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v6_channelNo: '6#'
v6_testclasses : [0]
v6_path: 'rtsp://admin:12345678a@192.168.21.30:554/Streaming/Channels/101'
v7_ip: '192.168.21.37'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v7_channelNo: '7#'
v7_testclasses : [0]
v7_path: 'rtsp://admin:12345678a@192.168.21.37:554/Streaming/Channels/101'
v8_ip: '192.168.21.50'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v8_channelNo: '8#'
v8_testclasses: [0]
v8_path: 'rtsp://admin:12345678a@192.168.21.50:554/Streaming/Channels/101'
v9_ip: '192.168.21.51'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v9_channelNo: '9#'
v9_testclasses: [0]
v9_path: 'rtsp://admin:12345678a@192.168.21.51:554/Streaming/Channels/101'
v10_ip: '192.168.21.18'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v10_channelNo: '10#'
v10_testclasses: [0]
v10_path: 'rtsp://admin:12345678a@192.168.21.18:554/Streaming/Channels/101'
v11_ip: '192.168.21.55'
# v6_path: 'rtsp://10.0.0.17:8554/camera_test/2'
v11_channelNo: '11#'
v11_testclasses: [0]
v11_path: 'rtsp://admin:12345678a@192.168.21.55:554/Streaming/Channels/101'
minioConfig:
# endpoint: '10.0.0.58:9000/'
# access_key: 'root'
# secret_key: '@root123456'
# secure: False
# bucket_name: 'miniotest'
#bucketName: vi
endpoint: '180.50.12.100:9000/'
access_key: 'admin'
secret_key: '12345678aA'
secure: False
bucket_name: 'vi-attachment'
dataConfig:
# getTokenUrl: 'http://192.168.220.202/api/appsys/sso/httpheader/login/v1?username_=digital'
getTokenUrl: 'http://180.50.12.100/api/appsys/sso/httpheader/login/v1?username_=szls'
# putMessageUrl: 'http://192.168.220.200/api/edge/edgecallmanages/vi-alarm/v1'
putMessageUrl: 'http://180.50.12.100/api/edge/edgecallmanages/vi-alarm/v1'
timeInterval: 600
# getTokenUrl: 'http://192.168.220.202/api/appsys/sso/httpheader/login/v1/username_=digital'
# putMessageUrl: 'http://192.168.220.202/api/edge/edgecallmanages/vi-alarm/v1'
compreface_service:
domain: 'http://180.50.12.104'
port: '8000'
api_key: '88f43f2f-1483-4ad0-ae6c-8f1a800c3acd'
# api_key: 'ab77978a-cc2b-4fa0-8959-6294e856721a'
# api_key: '6d89a2ce-b71a-4894-96bb-03c6712e86d0'
# Face confidence threshold: a detection with probability above this value is treated as a face
det_prob_threshold: 0.99
# Maximum number of faces to recognize in an image; 0 means no limit
limit: 0

1206
d8_1.py Normal file

File diff suppressed because it is too large Load Diff

1205
d8_1_mp4 copy.py Normal file

File diff suppressed because it is too large Load Diff

1205
d8_1_mp4.py Normal file

File diff suppressed because it is too large Load Diff

1203
d8_2_new copy 2.py Normal file

File diff suppressed because it is too large Load Diff

1203
d8_2_new copy.py Normal file

File diff suppressed because it is too large Load Diff

1256
d8_2_new.py Normal file

File diff suppressed because it is too large Load Diff

57
gen_wts.py Normal file
View File

@ -0,0 +1,57 @@
import sys # noqa: F401
import argparse
import os
import struct
import torch
def parse_args():
    """Parse command-line options for the .pt -> .wts conversion.

    Returns:
        tuple: (weights_path, output_path, model_type).

    Raises:
        SystemExit: if the weights path does not name an existing file.
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', required=True,
                     help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output',
                     help='Output (.wts) file path (optional)')
    cli.add_argument('-t', '--type', type=str, default='detect',
                     choices=['detect', 'cls', 'seg', 'pose', 'obb'],
                     help='determines the model is detection/classification')
    opts = cli.parse_args()

    if not os.path.isfile(opts.weights):
        raise SystemExit('Invalid input file')

    if not opts.output:
        # No output given: sit the .wts next to the weights file.
        opts.output = os.path.splitext(opts.weights)[0] + '.wts'
    elif os.path.isdir(opts.output):
        # A directory was given: place <weights-stem>.wts inside it.
        stem = os.path.splitext(os.path.basename(opts.weights))[0]
        opts.output = os.path.join(opts.output, stem + '.wts')

    return opts.weights, opts.output, opts.type
# ---- Script entry: dump a YOLOv8 .pt checkpoint into the .wts text format ----
pt_file, wts_file, m_type = parse_args()
print(f'Generating .wts for {m_type} model')
# Load model
print(f'Loading {pt_file}')
# Initialize
device = 'cpu'
# Load model
model = torch.load(pt_file, map_location=device)['model'].float()  # load to FP32
if m_type in ['detect', 'seg', 'pose', 'obb']:
    # NOTE(review): anchor_grid is computed but never used afterwards —
    # presumably a leftover from the YOLOv5 exporter; confirm before removing.
    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
    # Strip the 'anchors' buffer so it is not written into the state dict.
    delattr(model.model[-1], 'anchors')
model.to(device).eval()
# .wts layout: first line is the tensor count, then one line per tensor:
# "<name> <num_floats>" followed by each value as big-endian float32 hex.
with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(model.state_dict().keys())))
    for k, v in model.state_dict().items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')

BIN
images.zip Normal file

Binary file not shown.

36
include/block.h Normal file
View File

@ -0,0 +1,36 @@
#pragma once
#include <map>
#include <string>
#include <vector>
#include "NvInfer.h"

// Builder helpers for assembling YOLOv8 networks with the TensorRT network
// definition API. Implementations are not in this header; the per-function
// notes below are inferred from names/signatures — confirm against block.cpp.

// Presumably computes the convolution padding for kernel size `ksize` — TODO confirm.
int calculateP(int ksize);

// Parses a .wts weight file into a name -> nvinfer1::Weights map.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

// Conv2d + BatchNorm + SiLU composite block; `lname` is the weight-name prefix.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
int ch, int k, int s, int p, std::string lname);

// YOLOv8 C2f module (n bottlenecks, optional shortcut, expansion e).
nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
int c2, int n, bool shortcut, float e, std::string lname);

// C2 module variant.
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
int c2, int n, bool shortcut, float e, std::string lname);

// C3 module (YOLOv5-style CSP bottleneck).
nvinfer1::IElementWiseLayer* C3(nvinfer1::INetworkDefinition* network,
std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
int c2, int n, bool shortcut, float e, std::string lname);

// Spatial Pyramid Pooling - Fast, with pooling kernel k.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
int c2, int k, std::string lname);

// Distribution Focal Loss decode head helper.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname);

// Appends the custom YoloLayer plugin that decodes the detection heads;
// `px_arry` holds the per-head strides.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
int px_arry_num, int num_class, bool is_segmentation, bool is_pose, bool is_obb);

39
include/calibrator.h Normal file
View File

@ -0,0 +1,39 @@
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include <NvInfer.h>
#include <string>
#include <vector>
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
//! Feeds batches of calibration images from `img_dir` to TensorRT during
//! INT8 calibration and caches the resulting table under `calib_table_name`.
//! Member semantics are inferred from names — implementation not in view.
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
{
public:
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
    virtual ~Int8EntropyCalibrator2();
    int getBatchSize() const TRT_NOEXCEPT override;
    // Fills `bindings` with the next preprocessed batch; returns false when exhausted.
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

private:
    int batchsize_;
    int input_w_;                          // network input width
    int input_h_;                          // network input height
    int img_idx_;                          // index of the next image to consume
    std::string img_dir_;                  // calibration image folder
    std::vector<std::string> img_files_;   // file names found in img_dir_
    size_t input_count_;                   // floats per batch buffer
    std::string calib_table_name_;         // on-disk cache file name
    const char* input_blob_name_;
    bool read_cache_;                      // reuse an existing cache if present
    void* device_input_;                   // device buffer fed to TensorRT
    std::vector<char> calib_cache_;
};

#endif  // ENTROPY_CALIBRATOR_H

31
include/config.h Normal file
View File

@ -0,0 +1,31 @@
// Build-time precision selection for the TensorRT engine (pick exactly one).
#define USE_FP16
//#define USE_FP32
//#define USE_INT8

// Engine I/O tensor names.
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";

// Detection model configuration.
const static int kNumClass = 3;
const static int kBatchSize = 1;
const static int kGpuId = 0;
const static int kInputH = 640;
const static int kInputW = 640;
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
const static int kMaxInputImageSize = 3000 * 3000;
const static int kMaxNumOutputBbox = 1000;

//Quantization input image folder path
const static char* kInputQuantizationFolder = "./coco_calib";

// Classfication model's number of classes
constexpr static int kClsNumClass = 1000;
// Classfication model's input shape
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;

// pose model's number of classes
constexpr static int kPoseNumClass = 1;
const static int kNumberOfPoints = 17;  // number of keypoints total

// obb model's number of classes
constexpr static int kObbNumClass = 15;

18
include/cuda_utils.h Normal file
View File

@ -0,0 +1,18 @@
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>
#include <cassert>   // assert() is used by CUDA_CHECK
#include <iostream>  // std::cerr is used by CUDA_CHECK
// Fix: the macro below expands to std::cerr and assert(); previously this
// header relied on the including file to transitively pull in <iostream>
// and <cassert>, which made it fail to compile in isolation.

// CUDA_CHECK(call): evaluate a CUDA runtime call and, on any error, print a
// file:line diagnostic and abort via assert.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = callstr;\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
            assert(0);\
        }\
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_

504
include/logging.h Normal file
View File

@ -0,0 +1,504 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H
#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
#include "macros.h"
using Severity = nvinfer1::ILogger::Severity;
// Stream buffer that accumulates a log line and, on sync/destruction, writes
// "<prefix><message>" to the wrapped output stream (with a timestamp printed
// to std::cout first) when logging is enabled.
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    // Fix: the move constructor previously initialized only mOutput, leaving
    // mShouldLog uninitialized (undefined behavior when read) and mPrefix
    // empty, so moved-from consumers (e.g. the LOG_* helpers returning by
    // value) could drop the prefix or log state. Copy all members.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    // Emits the buffered message: timestamp to std::cout, then prefix+message
    // to the wrapped stream; afterwards clears the buffer and flushes.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;   // destination stream (cout or cerr)
    std::string mPrefix;     // severity tag such as "[E] "
    bool mShouldLog;         // gate computed from reportable severity
};
//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase
{
public:
    // Constructs the buffer member before LogStreamConsumer's std::ostream
    // base is initialized (see the ordering note on LogStreamConsumer).
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    LogStreamConsumerBuffer mBuffer;  // owned buffer handed to std::ostream
};
//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//! Order of base classes is LogStreamConsumerBase and then std::ostream.
//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//! Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    // Move constructor: rebuilds the base with the source's settings so the
    // new ostream is linked to this object's own mBuffer.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    // Re-evaluates whether this consumer's severity passes the new threshold.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    // kINFO and below go to stdout, warnings/errors to stderr.
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    bool mShouldLog;
    Severity mSeverity;
};
//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger
{
public:
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    // Every TensorRT message is tagged "[TRT] " and routed through a
    // LogStreamConsumer, which applies the severity filter and prefix.
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        // set once reportTestStart() has been called
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    // The report* helpers below return process exit codes so a sample's
    // main() can simply `return Logger::reportPass(atom);`.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;  // minimum severity that gets emitted
};
namespace
{
// These helpers return a LogStreamConsumer by value, which relies on its
// move constructor working correctly.

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//         ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}
} // anonymous namespace
#endif // TENSORRT_LOGGING_H

29
include/macros.h Normal file
View File

@ -0,0 +1,29 @@
#ifndef __MACROS_H
#define __MACROS_H

#include "NvInfer.h"

// API: symbol import/export macro. When the library itself is built
// (API_EXPORTS defined) symbols are exported; consumers get dllimport on
// MSVC and a no-op on other toolchains.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else
#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TensorRT 8+ marks ILogger/IPlugin virtuals noexcept and enqueue const;
// these macros let the same code build against older TensorRT versions.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H

43
include/model.h Normal file
View File

@ -0,0 +1,43 @@
#pragma once

#include <assert.h>
#include <string>
#include "NvInfer.h"

// Engine builders for the YOLOv8 model family. Each function constructs the
// network from a .wts weight file and returns the serialized engine blob.
// gd/gw are the depth/width multiples of the model variant (n/s/m/l/x);
// max_channels caps the channel count — presumably per the Ultralytics
// scaling table; implementations not in view.

nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);

// P6 (1280-input) detection variant.
nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);

// P2 (high-resolution head) detection variant.
nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);

// Classification model.
nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw);

// Instance-segmentation model.
nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);

// Pose (keypoint) models.
nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);
nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);

// YOLOv5u (anchor-free v5) detection models.
nvinfer1::IHostMemory* buildEngineYolov8_5uDet(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);
nvinfer1::IHostMemory* buildEngineYolov8_5uDetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd,
float& gw, int& max_channels);

// Oriented-bounding-box model.
nvinfer1::IHostMemory* buildEngineYolov8Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
int& max_channels);

41
include/postprocess.h Normal file
View File

@ -0,0 +1,41 @@
#pragma once

#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"

// Post-processing declarations for YOLOv8 outputs: decoding the raw plugin
// output buffer into Detection structs, NMS (CPU and CUDA), and drawing.
// Implementations are not in this header.

// Preprocessing functions
// Maps a network-space bbox back to a rectangle in the original image.
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// Processing functions
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch);
void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch);
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count);
void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
                                 cv::Mat& img, int count);

// NMS functions
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);
void batch_nms(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh = 0.5);
void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);
void batch_nms_obb(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
                   float conf_thresh, float nms_thresh = 0.5);

// CUDA-related functions
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream);
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream);
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

// Drawing functions
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map);

16
include/preprocess.h Normal file
View File

@ -0,0 +1,16 @@
#pragma once

#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "types.h"
#include <map>

// CUDA preprocessing pipeline (implemented in preprocess.cu; notes below are
// inferred from names — confirm against the implementation).

// Allocates staging buffers sized for max_image_size pixels; call once.
void cuda_preprocess_init(int max_image_size);
// Frees the buffers allocated by cuda_preprocess_init.
void cuda_preprocess_destroy();
// Presumably letterbox-resizes/normalizes one image into dst on `stream`.
void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, int dst_width, int dst_height, cudaStream_t stream);
// Batch variant of cuda_preprocess.
void cuda_batch_preprocess(std::vector<cv::Mat> &img_batch, float *dst, int dst_width, int dst_height, cudaStream_t stream);

19
include/types.h Normal file
View File

@ -0,0 +1,19 @@
#pragma once
#include "config.h"

// One decoded detection as laid out in the plugin's float output buffer.
struct alignas(float) Detection {
    //center_x center_y w h
    float bbox[4];
    float conf;  // bbox_conf * cls_conf
    float class_id;
    float mask[32];  // presumably segmentation mask coefficients — confirm in yololayer.cu
    float keypoints[kNumberOfPoints * 3];  // keypoints array with dynamic size based on kNumberOfPoints
    float angle;  // obb angle
};

struct AffineMatrix {
    float value[6];  // 2x3 affine transform (letterbox mapping)
};

// Floats per decoded box in the compact device output buffer.
// sizeof(AffineMatrix)/sizeof(float) == 6, so this is 7 — matching the
// seven fields named in the comment below.
const int bbox_element =
        sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag

86
include/utils.h Normal file
View File

@ -0,0 +1,86 @@
#pragma once
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include <fstream>
// Letterbox-resizes `img` to (input_w x input_h): scales preserving aspect
// ratio, centers the result, and pads the borders with gray (128,128,128).
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    int w, h, x, y;
    float r_w = input_w / (img.cols*1.0);
    float r_h = input_h / (img.rows*1.0);
    if (r_h > r_w) {
        // Width is the limiting dimension: pad top/bottom.
        w = input_w;
        h = r_w * img.rows;
        x = 0;
        y = (input_h - h) / 2;
    } else {
        // Height is the limiting dimension: pad left/right.
        w = r_h * img.cols;
        h = input_h;
        x = (input_w - w) / 2;
        y = 0;
    }
    cv::Mat re(h, w, CV_8UC3);
    cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
    cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
    return out;
}
// Collects the names (not full paths) of all entries in `p_dir_name`,
// skipping "." and "..". Returns 0 on success, -1 if the directory cannot
// be opened.
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *p_dir = opendir(p_dir_name);
    if (p_dir == nullptr) {
        return -1;
    }

    struct dirent* p_file = nullptr;
    while ((p_file = readdir(p_dir)) != nullptr) {
        if (strcmp(p_file->d_name, ".") != 0 &&
            strcmp(p_file->d_name, "..") != 0) {
            //std::string cur_file_name(p_dir_name);
            //cur_file_name += "/";
            //cur_file_name += p_file->d_name;
            // Only the bare entry name is stored; callers must re-prefix the
            // directory when opening the file.
            std::string cur_file_name(p_file->d_name);
            file_names.push_back(cur_file_name);
        }
    }

    closedir(p_dir);
    return 0;
}
// Trims leading AND trailing space characters (' ') from `str`.
// Note: despite the name, trailing spaces are removed too, and other
// whitespace (tabs, newlines) is left untouched. An all-space or empty
// input is returned unchanged.
static inline std::string trim_leading_whitespace(const std::string& str) {
    const size_t begin = str.find_first_not_of(' ');
    if (begin == std::string::npos) {
        // Nothing but spaces (or empty): keep original behavior of
        // returning the input as-is.
        return str;
    }
    const size_t end = str.find_last_not_of(' ');
    return str.substr(begin, end - begin + 1);
}
// Src: https://stackoverflow.com/questions/16605967
// Formats `a_value` in fixed-point notation with `n` digits after the
// decimal point (default 2), e.g. to_string_with_precision(3.14159f) -> "3.14".
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream formatted;
    formatted.setf(std::ios::fixed, std::ios::floatfield);
    formatted.precision(n);
    formatted << a_value;
    return formatted.str();
}
// Loads class labels, one per line, into `labels_map` keyed by the 0-based
// line index. Each line is stripped of surrounding spaces.
// Returns 0 on success, -1 if the file could not be opened (previously an
// unopenable file failed silently and returned 0 with an empty map; -1 is
// consistent with read_files_in_dir above).
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        return -1;
    }

    // Read each line of the file
    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        // Strip the line of any leading or trailing whitespace
        line = trim_leading_whitespace(line);

        // Add the stripped line to the labels_map, using the loop index as the key
        labels_map[index] = line;
        index++;
    }

    // Close the file
    file.close();
    return 0;
}

371
plugin/yololayer.cu Normal file
View File

@ -0,0 +1,371 @@
#include <assert.h>
#include <math.h>
#include <iostream>
#include <vector>
#include "cuda_utils.h"
#include "types.h"
#include "yololayer.h"
namespace Tn {
// POD serialization helpers for the plugin's serialize()/deserialize pair.
// NOTE(review): the reinterpret_cast store/load assumes `buffer` is suitably
// aligned for T; memcpy would avoid that assumption — confirm before changing.

// Writes `val` at `buffer` and advances the cursor by sizeof(T).
template <typename T>
void write(char*& buffer, const T& val) {
    *reinterpret_cast<T*>(buffer) = val;
    buffer += sizeof(T);
}

// Reads a T from `buffer` into `val` and advances the cursor by sizeof(T).
template <typename T>
void read(const char*& buffer, T& val) {
    val = *reinterpret_cast<const T*>(buffer);
    buffer += sizeof(T);
}
}  // namespace Tn
// Logistic (sigmoid) activation evaluated on-device in single precision.
// Fix: use expf() instead of exp() — exp(float) promotes to double on the
// device, which is dramatically slower on consumer GPUs; expf keeps the
// whole expression in fp32 with the same result up to rounding.
__device__ float sigmoid(float x) {
    return 1.0f / (1.0f + expf(-x));
}
namespace nvinfer1 {
// Build-time constructor: captures decode parameters for the YOLO output
// layer. `strides` (length `stridesLength`) is deep-copied into an owned
// array released by the destructor.
// NOTE(review): the class manages mStrides with raw new[]; if a copy
// constructor exists elsewhere it must deep-copy too — confirm in yololayer.h.
YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth,
                                 int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb,
                                 const int* strides, int stridesLength) {
    mClassCount = classCount;
    mNumberofpoints = numberofpoints;
    mConfthreshkeypoints = confthreshkeypoints;
    mYoloV8NetWidth = netWidth;
    mYoloV8netHeight = netHeight;
    mMaxOutObject = maxOut;
    mStridesLength = stridesLength;
    mStrides = new int[stridesLength];
    memcpy(mStrides, strides, stridesLength * sizeof(int));
    is_segmentation_ = is_segmentation;
    is_pose_ = is_pose;
    is_obb_ = is_obb;
}
// Releases the owned strides array allocated in the constructors.
YoloLayerPlugin::~YoloLayerPlugin() {
    if (mStrides != nullptr) {
        delete[] mStrides;
        mStrides = nullptr;
    }
}
// Deserialization constructor: reads fields back in exactly the order
// serialize() wrote them (any change must be mirrored there and in
// getSerializationSize()). The trailing assert verifies the full buffer
// was consumed.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    using namespace Tn;
    const char *d = reinterpret_cast<const char*>(data), *a = d;
    read(d, mClassCount);
    read(d, mNumberofpoints);
    read(d, mConfthreshkeypoints);
    read(d, mThreadCount);
    read(d, mYoloV8NetWidth);
    read(d, mYoloV8netHeight);
    read(d, mMaxOutObject);
    read(d, mStridesLength);
    // Strides were flattened inline; rebuild the owned array.
    mStrides = new int[mStridesLength];
    for (int i = 0; i < mStridesLength; ++i) {
        read(d, mStrides[i]);
    }
    read(d, is_segmentation_);
    read(d, is_pose_);
    read(d, is_obb_);
    assert(d == a + length);
}
// Serialize the full plugin configuration into `buffer`; the field order
// must match the deserialization constructor above.
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    using namespace Tn;
    char *d = static_cast<char*>(buffer), *a = d;
    write(d, mClassCount);
    write(d, mNumberofpoints);
    write(d, mConfthreshkeypoints);
    write(d, mThreadCount);
    write(d, mYoloV8NetWidth);
    write(d, mYoloV8netHeight);
    write(d, mMaxOutObject);
    write(d, mStridesLength);
    for (int i = 0; i < mStridesLength; ++i) {
        write(d, mStrides[i]);
    }
    write(d, is_segmentation_);
    write(d, is_pose_);
    write(d, is_obb_);
    // Exactly getSerializationSize() bytes must have been written.
    assert(d == a + getSerializationSize());
}
// Byte count produced by serialize(); keep this sum in sync with the field
// list written there (including the variable-length strides array).
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) +
           sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) +
           sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_) + sizeof(is_obb_);
}
// No device-side resources need to be set up.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// Output is a flat float tensor: 1 count slot followed by mMaxOutObject
// packed Detection structs (implicit-batch API, so batch is not included).
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                                    int nbInputDims) TRT_NOEXCEPT {
    int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float);
    return nvinfer1::Dims3(total_size + 1, 1, 1);
}
// Store the namespace string pointer supplied by TensorRT (not copied;
// TensorRT keeps it alive for the plugin's lifetime).
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// The single output tensor is always FP32.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                                      int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

// Batch broadcasting is not used by this plugin.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                                   int nbInputs) const TRT_NOEXCEPT {
    return false;
}

bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
    return false;
}
// No per-configuration state is needed; these lifecycle hooks are no-ops.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
                                      nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{};

void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{};

void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

// Type/version strings must match YoloPluginCreator's getPluginName()/
// getPluginVersion() so (de)serialization can find the creator.
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// TensorRT-owned teardown entry point.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
    delete this;
}
// Deep-copy the plugin (TensorRT may clone during build and execution).
// The strides array is re-copied by the build-time constructor, so the
// clone owns independent memory.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    YoloLayerPlugin* p =
            new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight,
                                mMaxOutObject, is_segmentation_, is_pose_, is_obb_, mStrides, mStridesLength);
    p->setPluginNamespace(mPluginNamespace);
    return p;
}
// Implicit-batch execution entry point: forwards straight to the CUDA
// implementation; output 0 receives the packed detections.
int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs,
                             void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
    forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize);
    return 0;
}
// Device-side logistic (sigmoid) activation used for the class scores.
__device__ float Logist(float x) {
    return 1.0f / (1.0f + expf(-x));
}
// One thread per grid cell per batch image.  Each cell's channels are laid
// out channel-major: the value of channel c for cell e is
// input[e + c * total_grid].  A thread decodes its cell's best class, and —
// if the sigmoid score passes a hard-coded 0.1 pre-filter — appends one
// Detection to the per-image output slot.  output[slot][0] is an atomic
// candidate counter; Detection structs are packed right after it.
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
                             int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem,
                             bool is_segmentation, bool is_pose, bool is_obb) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= numElements)
        return;
    const int N_kpts = nk;
    int total_grid = grid_h * grid_w;
    // Channels per cell: 4 box distances + class logits + optional extras
    // (32 mask coefficients, 3 values per keypoint, 1 OBB angle).
    int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0);
    int batchIdx = idx / total_grid;
    int elemIdx = idx % total_grid;
    const float* curInput = input + batchIdx * total_grid * info_len;
    int outputIdx = batchIdx * outputElem;
    // Best class: sigmoid over the class logits, keep the arg-max.
    int class_id = 0;
    float max_cls_prob = 0.0;
    for (int i = 4; i < 4 + classes; i++) {
        float p = Logist(curInput[elemIdx + i * total_grid]);
        if (p > max_cls_prob) {
            max_cls_prob = p;
            class_id = i - 4;
        }
    }
    // Hard-coded pre-filter; final thresholding happens later in NMS.
    if (max_cls_prob < 0.1)
        return;
    // Reserve an output slot.  The counter itself may grow past
    // maxoutobject, but no Detection is ever written beyond the buffer.
    int count = (int)atomicAdd(output + outputIdx, 1);
    if (count >= maxoutobject)
        return;
    char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
    Detection* det = (Detection*)(data);
    int row = elemIdx / grid_w;
    int col = elemIdx % grid_w;
    det->conf = max_cls_prob;
    det->class_id = class_id;
    // Box channels 0..3 are (left, top, right, bottom) distances from the
    // cell center in cell units; convert to input-image pixels (x1,y1,x2,y2).
    det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
    det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
    det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
    det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;
    if (is_segmentation) {
        // 32 mask coefficients stored after box + classes (+ pose/obb channels).
        for (int k = 0; k < 32; ++k) {
            det->mask[k] =
                    curInput[elemIdx + (4 + classes + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0) + k) * total_grid];
        }
    }
    if (is_pose) {
        // Each keypoint occupies a channel triple (x, y, conf) after
        // box + classes (+ seg/obb channels).
        for (int kpt = 0; kpt < N_kpts; kpt++) {
            int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3) * total_grid;
            int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 1) * total_grid;
            int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 2) * total_grid;
            float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]);
            float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride;
            float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride;
            // Keypoints below the threshold or outside their own box are
            // marked invalid with -1 sentinels.
            bool is_within_bbox =
                    kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3];
            if (kpt_confidence < confkeypoints || !is_within_bbox) {
                det->keypoints[kpt * 3] = -1;
                det->keypoints[kpt * 3 + 1] = -1;
                det->keypoints[kpt * 3 + 2] = -1;
            } else {
                det->keypoints[kpt * 3] = kpt_x;
                det->keypoints[kpt * 3 + 1] = kpt_y;
                det->keypoints[kpt * 3 + 2] = kpt_confidence;
            }
        }
    }
    if (is_obb) {
        // Oriented box: the extra channel holds the angle logit.
        // sigmoid(.) - 0.25 scaled by pi yields an angle in (-pi/4, 3pi/4).
        // The box is re-encoded as (cx, cy, w, h) + angle, overwriting the
        // corner-format bbox written above.
        double pi = M_PI;
        auto angle_inx = curInput[elemIdx + (4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) +
                                             0) * total_grid];
        auto angle = (sigmoid(angle_inx) - 0.25f) * pi;
        auto cos1 = cos(angle);
        auto sin1 = sin(angle);
        // Rotate the center offset by the decoded angle.
        auto xf = (curInput[elemIdx + 2 * total_grid] - curInput[elemIdx + 0 * total_grid]) / 2;
        auto yf = (curInput[elemIdx + 3 * total_grid] - curInput[elemIdx + 1 * total_grid]) / 2;
        auto x = xf * cos1 - yf * sin1;
        auto y = xf * sin1 + yf * cos1;
        float cx = (col + 0.5f + x) * stride;
        float cy = (row + 0.5f + y) * stride;
        float w1 = (curInput[elemIdx + 0 * total_grid] + curInput[elemIdx + 2 * total_grid]) * stride;
        float h1 = (curInput[elemIdx + 1 * total_grid] + curInput[elemIdx + 3 * total_grid]) * stride;
        det->bbox[0] = cx;
        det->bbox[1] = cy;
        det->bbox[2] = w1;
        det->bbox[3] = h1;
        det->angle = angle;
    }
}
// Launch the decode kernel once per detection scale.
// Fixes relative to the original:
//  * the redundant, unchecked cudaMemsetAsync of only the first slot was
//    removed (the loop below already zeroes every per-image counter);
//  * the per-launch thread count is now a local value, so a small grid no
//    longer permanently shrinks mThreadCount for later scales and calls;
//  * grid sizes are computed inline, dropping the per-call new[]/delete[].
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                                 int mYoloV8NetWidth, int batchSize) {
    // Floats per image slot: 1 counter + packed Detection structs.
    int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
    // Zero the detection counter (first float) of every image's slot.
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }
    for (int i = 0; i < mStridesLength; ++i) {
        int stride = mStrides[i];
        int grid_h = mYoloV8netHeight / stride;
        int grid_w = mYoloV8NetWidth / stride;
        // One thread per grid cell per image at this scale.
        int numElem = grid_h * grid_w * batchSize;
        int threads = mThreadCount;
        if (numElem < threads)
            threads = numElem;
        CalDetection<<<(numElem + threads - 1) / threads, threads, 0, stream>>>(
                inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints,
                mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_, is_obb_);
    }
}
// Static creator state.  The plugin consumes a single custom "combinedInfo"
// field assembled by addYoLoLayer(), so no attribute descriptors are
// registered here and the advertised field collection is empty.
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}
// Name/version must match YoloLayerPlugin::getPluginType()/getPluginVersion()
// so deserialization can locate this creator in the registry.
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
    return &mFC;
}
// Build a plugin from the single int32 "combinedInfo" field packed by
// addYoLoLayer(): 9 header ints followed by the per-scale stride list.
// NOTE(review): confthreshkeypoints is read from an int slot — the float
// kConfThreshKeypoints was truncated toward zero when addYoLoLayer() stored
// it into an int vector.  Confirm whether a 0/1-valued threshold is intended.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    assert(fc->nbFields == 1);
    assert(strcmp(fc->fields[0].name, "combinedInfo") == 0);
    const int* combinedInfo = static_cast<const int*>(fc->fields[0].data);
    int netinfo_count = 9;  // header slots preceding the stride array
    int class_count = combinedInfo[0];
    int numberofpoints = combinedInfo[1];
    float confthreshkeypoints = combinedInfo[2];
    int input_w = combinedInfo[3];
    int input_h = combinedInfo[4];
    int max_output_object_count = combinedInfo[5];
    bool is_segmentation = combinedInfo[6];
    bool is_pose = combinedInfo[7];
    bool is_obb = combinedInfo[8];
    // Remaining ints are the per-scale strides.
    const int* px_arry = combinedInfo + netinfo_count;
    int px_arry_length = fc->fields[0].length - netinfo_count;
    YoloLayerPlugin* obj =
            new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h,
                                max_output_object_count, is_segmentation, is_pose, is_obb, px_arry, px_arry_length);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}
// Recreate a plugin instance from an engine blob.
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
                                                     size_t serialLength) TRT_NOEXCEPT {
    // This object will be deleted when the network is destroyed, which will
    // call YoloLayerPlugin::destroy()
    YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}
} // namespace nvinfer1

110
plugin/yololayer.h Normal file
View File

@ -0,0 +1,110 @@
#pragma once
#include <string>
#include <vector>
#include "NvInfer.h"
#include "macros.h"
namespace nvinfer1 {
// TensorRT IPluginV2IOExt implementation that decodes raw YOLOv8 head
// tensors into packed Detection records on the GPU.
class API YoloLayerPlugin : public IPluginV2IOExt {
   public:
    // Build-time constructor; `strides` (length `stridesLength`) lists the
    // per-scale downsampling factors and is copied by the plugin.
    YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight,
                    int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength);
    // Deserialization constructor (inverse of serialize()).
    YoloLayerPlugin(const void* data, size_t length);
    ~YoloLayerPlugin();
    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
    nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;
    int initialize() TRT_NOEXCEPT override;
    virtual void terminate() TRT_NOEXCEPT override {}
    virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }
    virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace,
                        cudaStream_t stream) TRT_NOEXCEPT override;
    virtual size_t getSerializationSize() const TRT_NOEXCEPT override;
    virtual void serialize(void* buffer) const TRT_NOEXCEPT override;
    // Only linear FP32 tensors are supported.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs,
                                   int nbOutputs) const TRT_NOEXCEPT override {
        return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
    }
    const char* getPluginType() const TRT_NOEXCEPT override;
    const char* getPluginVersion() const TRT_NOEXCEPT override;
    void destroy() TRT_NOEXCEPT override;
    IPluginV2IOExt* clone() const TRT_NOEXCEPT override;
    void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;
    const char* getPluginNamespace() const TRT_NOEXCEPT override;
    // `override` was missing here in the original declaration; this does
    // override IPluginV2Ext::getOutputDataType.
    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
                                         int32_t nbInputs) const TRT_NOEXCEPT override;
    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                      int nbInputs) const TRT_NOEXCEPT override;
    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;
    void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                         IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;
    void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out,
                         int32_t nbOutput) TRT_NOEXCEPT override;
    void detachFromContext() TRT_NOEXCEPT override;

   private:
    // Kernel launcher; one launch per detection scale.
    void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                    int mYoloV8NetWidth, int batchSize);
    int mThreadCount = 256;      // CUDA threads per block
    const char* mPluginNamespace;
    int mClassCount;             // number of object classes
    int mNumberofpoints;         // keypoints per detection (pose head)
    float mConfthreshkeypoints;  // per-keypoint confidence threshold
    int mYoloV8NetWidth;         // network input width
    int mYoloV8netHeight;        // network input height
    int mMaxOutObject;           // capacity of the output detection buffer
    bool is_segmentation_;
    bool is_pose_;
    bool is_obb_;
    int* mStrides;               // owned copy of per-scale strides
    int mStridesLength;
};
// Registry-facing factory for YoloLayerPlugin ("YoloLayer_TRT", version "1").
class API YoloPluginCreator : public IPluginCreator {
   public:
    YoloPluginCreator();
    ~YoloPluginCreator() override = default;
    const char* getPluginName() const TRT_NOEXCEPT override;
    const char* getPluginVersion() const TRT_NOEXCEPT override;
    const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;
    // Expects a single int32 field named "combinedInfo" (see createPlugin).
    nvinfer1::IPluginV2IOExt* createPlugin(const char* name,
                                           const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;
    nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData,
                                                size_t serialLength) TRT_NOEXCEPT override;
    void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; }
    const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

   private:
    std::string mNamespace;
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
};
// Makes the creator discoverable through getPluginRegistry().
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
} // namespace nvinfer1

309
src/block.cpp Normal file
View File

@ -0,0 +1,309 @@
#include "block.h"
#include <assert.h>
#include <math.h>
#include <fstream>
#include <iostream>
#include "config.h"
#include "yololayer.h"
// Derive convolution padding from the kernel size.
// NOTE(review): ksize / 3 matches "same" padding (k / 2) only for the
// kernel sizes used in this file (k = 1 -> 0, k = 3 -> 1); it would be
// wrong for k >= 5 — confirm before reusing with larger kernels.
int calculateP(int ksize) {
    return ksize / 3;
}
// Load a TensorRT-style .wts file into a name -> Weights map.
// Format: first line is the entry count, then one entry per line:
//   "<name> <word count> <hex word> <hex word> ..."
// The value buffers are heap-allocated and intentionally never freed here:
// nvinfer1::Weights only references the memory, which must stay alive until
// the engine is built.
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, nvinfer1::Weights> WeightMap;
    std::ifstream input(file);
    assert(input.is_open() &&
           "Unable to load weight file. please check if the "
           ".wts file path is right!!!!!!");
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");
    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        uint32_t size;
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = nvinfer1::DataType::kFLOAT;
        // Each stored word is a 4-byte uint32_t.  The original allocated
        // sizeof(val) (= sizeof(uint32_t*), 8 bytes on 64-bit) per element,
        // silently doubling every weight buffer.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert(val != nullptr && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; x++) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        WeightMap[name] = wt;
    }
    return WeightMap;
}
// Fuse BatchNorm2d parameters into a TensorRT per-channel IScaleLayer:
//   y = (x - mean) / sqrt(var + eps) * gamma + beta
// expressed as shift/scale/power.  The buffers are heap-allocated and must
// outlive engine building, so they are not freed here.
// The map is now taken by reference: the original took it by value, so the
// ".scale"/".shift"/".power" insertions below were silently dropped with
// the copy instead of being recorded for later bookkeeping.
static nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network,
                                             std::map<std::string, nvinfer1::Weights>& weightMap,
                                             nvinfer1::ITensor& input, std::string lname, float eps) {
    float* gamma = (float*)weightMap[lname + ".weight"].values;
    float* beta = (float*)weightMap[lname + ".bias"].values;
    float* mean = (float*)weightMap[lname + ".running_mean"].values;
    float* var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    // scale = gamma / sqrt(var + eps)
    float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
    }
    nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len};

    // shift = beta - mean * gamma / sqrt(var + eps)
    float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
    }
    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len};

    // power = 1 (identity exponent)
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        pval[i] = 1.0;
    }
    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;
    nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
    assert(output);
    return output;
}
// Conv2d (bias-free, k x k, stride s, padding p) + fused BatchNorm + SiLU.
// SiLU(x) = x * sigmoid(x) is built from the bn output and an elementwise
// product with its own sigmoid.
nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                        int ch, int k, int s, int p, std::string lname) {
    nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* conv =
            network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty);
    assert(conv);
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    conv->setPaddingNd(nvinfer1::DimsHW{p, p});
    nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
    nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
    nvinfer1::IElementWiseLayer* ew =
            network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
    assert(ew);
    return ew;
}
// YOLOv8 bottleneck: two 3x3 ConvBnSiLU blocks, with a residual add when
// `shortcut` is requested and the in/out channel counts match.
// NOTE(review): parameter `e` is unused here — the hidden width is fixed
// to c2; confirm whether a width multiplier was intended.
nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int c1, int c2, bool shortcut, float e, std::string lname) {
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname + ".cv1");
    nvinfer1::IElementWiseLayer* conv2 =
            convBnSiLU(network, weightMap, *conv1->getOutput(0), c2, 3, 1, 1, lname + ".cv2");
    if (shortcut && c1 == c2) {
        nvinfer1::IElementWiseLayer* ew =
                network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
        return ew;
    }
    return conv2;
}
// C3-style bottleneck: 1x1 reduce to c2*e channels, 3x3 expand back to c2,
// with an optional residual add when channel counts match.
static nvinfer1::ILayer* bottleneck_c3(nvinfer1::INetworkDefinition* network,
                                       std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
                                       int c1, int c2, bool shortcut, float e, std::string lname) {
    nvinfer1::IElementWiseLayer* cv1 =
            convBnSiLU(network, weightMap, input, (int)((float)c2 * e), 1, 1, calculateP(1), lname + ".cv1");
    nvinfer1::IElementWiseLayer* cv2 =
            convBnSiLU(network, weightMap, *cv1->getOutput(0), c2, 3, 1, calculateP(3), lname + ".cv2");
    if (shortcut && c1 == c2) {
        auto ew = network->addElementWise(input, *cv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
        return ew;
    }
    return cv2;
}
// YOLOv8 C2f block: 1x1 expand to 2*c_ channels, split in half along the
// channel axis, run n bottlenecks on the second half while concatenating
// every intermediate output, then fuse everything with a final 1x1 conv.
nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                 int c2, int n, bool shortcut, float e, std::string lname) {
    int c_ = (float)c2 * e;  // hidden channel count
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2 * c_, 1, 1, 0, lname + ".cv1");
    // Split cv1's output in half along the channel dim (d.d[0] in
    // implicit-batch CHW layout).
    nvinfer1::Dims d = conv1->getOutput(0)->getDimensions();
    nvinfer1::ISliceLayer* split1 =
            network->addSlice(*conv1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
                              nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split2 =
            network->addSlice(*conv1->getOutput(0), nvinfer1::Dims3{d.d[0] / 2, 0, 0},
                              nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ITensor* inputTensor0[] = {split1->getOutput(0), split2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor0, 2);
    // Chain bottlenecks on the second half, growing the concat each step.
    nvinfer1::ITensor* y1 = split2->getOutput(0);
    for (int i = 0; i < n; i++) {
        auto* b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, 1.0, lname + ".m." + std::to_string(i));
        y1 = b->getOutput(0);
        nvinfer1::ITensor* inputTensors[] = {cat->getOutput(0), b->getOutput(0)};
        cat = network->addConcatenation(inputTensors, 2);
    }
    nvinfer1::IElementWiseLayer* conv2 =
            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
    return conv2;
}
// YOLOv8 C2 block: 1x1 expand to 2*hidden channels, split in half, run n
// bottlenecks on the first half only, concat with the untouched second
// half, then fuse with a final 1x1 conv.
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname) {
    assert(network != nullptr);
    int hidden_channels = static_cast<int>(c2 * e);
    // cv1 branch
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, input, 2 * hidden_channels, 1, 1, 0, lname + ".cv1");
    nvinfer1::ITensor* cv1_out = conv1->getOutput(0);
    // Split the output of cv1 into two tensors along the channel dim
    // (dims.d[0] in implicit-batch CHW layout).
    nvinfer1::Dims dims = cv1_out->getDimensions();
    nvinfer1::ISliceLayer* split1 =
            network->addSlice(*cv1_out, nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{dims.d[0] / 2, dims.d[1], dims.d[2]},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split2 =
            network->addSlice(*cv1_out, nvinfer1::Dims3{dims.d[0] / 2, 0, 0},
                              nvinfer1::Dims3{dims.d[0] / 2, dims.d[1], dims.d[2]}, nvinfer1::Dims3{1, 1, 1});
    // Create y1 bottleneck sequence
    nvinfer1::ITensor* y1 = split1->getOutput(0);
    for (int i = 0; i < n; ++i) {
        auto* bottleneck_layer = bottleneck(network, weightMap, *y1, hidden_channels, hidden_channels, shortcut, 1.0,
                                            lname + ".m." + std::to_string(i));
        y1 = bottleneck_layer->getOutput(0);  // update 'y1' to be the output of the current bottleneck
    }
    // Concatenate y1 with the second split of cv1
    nvinfer1::ITensor* concatInputs[2] = {y1, split2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(concatInputs, 2);
    // cv2 to produce the final output
    nvinfer1::IElementWiseLayer* conv2 =
            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
    return conv2;
}
// CSP C3 block: two parallel 1x1 branches; n chained bottlenecks on the
// first branch; concat both branches; 1x1 fuse back to c2 channels.
nvinfer1::IElementWiseLayer* C3(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname) {
    int c_ = (float)c2 * e;  // hidden channel count
    nvinfer1::IElementWiseLayer* cv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, calculateP(1), lname + ".cv1");
    nvinfer1::IElementWiseLayer* cv2 = convBnSiLU(network, weightMap, input, c_, 1, 1, calculateP(1), lname + ".cv2");
    nvinfer1::ITensor* y1 = cv1->getOutput(0);
    for (int i = 0; i < n; i++) {
        auto b = bottleneck_c3(network, weightMap, *y1, c_, c_, shortcut, 1.0, lname + ".m." + std::to_string(i));
        y1 = b->getOutput(0);
    }
    nvinfer1::ITensor* inputTensors[] = {y1, cv2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 2);
    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, calculateP(1), lname + ".cv3");
    return conv3;
}
// Spatial Pyramid Pooling - Fast: 1x1 reduce to c1/2, three chained k x k
// max-pools (stride 1, same padding), concat all four feature maps, then a
// 1x1 conv to c2 channels.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname) {
    int c_ = c1 / 2;
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, 0, lname + ".cv1");
    nvinfer1::IPoolingLayer* pool1 =
            network->addPoolingNd(*conv1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
    pool1->setStrideNd(nvinfer1::DimsHW{1, 1});
    pool1->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
    nvinfer1::IPoolingLayer* pool2 =
            network->addPoolingNd(*pool1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
    pool2->setStrideNd(nvinfer1::DimsHW{1, 1});
    pool2->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
    nvinfer1::IPoolingLayer* pool3 =
            network->addPoolingNd(*pool2->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
    pool3->setStrideNd(nvinfer1::DimsHW{1, 1});
    pool3->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
    nvinfer1::ITensor* inputTensors[] = {conv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0),
                                         pool3->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 4);
    nvinfer1::IElementWiseLayer* conv2 =
            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
    return conv2;
}
// Distribution Focal Loss decode: reshape the box logits to (4, 16, grid),
// softmax over the 16 bins, reduce each distribution with a bias-free 1x1
// conv using the weights stored under `lname`, and reshape to (4, grid).
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) {
    nvinfer1::IShuffleLayer* shuffle1 = network->addShuffle(input);
    shuffle1->setReshapeDimensions(nvinfer1::Dims3{4, 16, grid});
    shuffle1->setSecondTranspose(nvinfer1::Permutation{1, 0, 2});
    // Softmax over the 16 distribution bins (axis 0 after the transpose).
    nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*shuffle1->getOutput(0));
    nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* conv =
            network->addConvolutionNd(*softmax->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], bias_empty);
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    conv->setPaddingNd(nvinfer1::DimsHW{p, p});
    nvinfer1::IShuffleLayer* shuffle2 = network->addShuffle(*conv->getOutput(0));
    shuffle2->setReshapeDimensions(nvinfer1::Dims2{4, grid});
    return shuffle2;
}
// Append the custom YOLO decode plugin to the network.  Configuration is
// packed into a single int32 PluginField "combinedInfo": 9 header values
// followed by the per-scale stride array.
// NOTE(review): kConfThreshKeypoints is a float squeezed into an int slot
// (combinedInfo[2]), so it is truncated toward zero here — confirm this is
// intended before relying on fractional keypoint thresholds.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, int num_class, bool is_segmentation, bool is_pose,
                                       bool is_obb) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    const int netinfo_count = 9;  // header slots preceding the stride array
    const int total_count = netinfo_count + px_arry_num;  // Total number of elements for netinfo and px_arry combined.
    std::vector<int> combinedInfo(total_count);
    // Fill in the 9 header slots; layout must match createPlugin().
    combinedInfo[0] = num_class;
    combinedInfo[1] = kNumberOfPoints;
    combinedInfo[2] = kConfThreshKeypoints;
    combinedInfo[3] = kInputW;
    combinedInfo[4] = kInputH;
    combinedInfo[5] = kMaxNumOutputBbox;
    combinedInfo[6] = is_segmentation;
    combinedInfo[7] = is_pose;
    combinedInfo[8] = is_obb;
    // Copy the contents of px_arry into the combinedInfo vector after the
    // 9 header elements.
    std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);
    // Now let's create the PluginField object to hold this combined information.
    nvinfer1::PluginField pluginField;
    pluginField.name = "combinedInfo";  // This can be any name that the plugin will recognize
    pluginField.data = combinedInfo.data();
    pluginField.type = nvinfer1::PluginFieldType::kINT32;
    pluginField.length = combinedInfo.size();
    // Create the PluginFieldCollection to hold the PluginField object.
    nvinfer1::PluginFieldCollection pluginFieldCollection;
    pluginFieldCollection.nbFields = 1;  // We have just one field, but it's a combined array
    pluginFieldCollection.fields = &pluginField;
    // Create the plugin object using the PluginFieldCollection.
    nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection);
    // We assume that the plugin is to be added onto the network.
    // Prepare input tensors for the YOLO Layer.
    std::vector<nvinfer1::ITensor*> inputTensors;
    for (auto det : dets) {
        inputTensors.push_back(det->getOutput(0));  // Assuming each IConcatenationLayer has one output tensor.
    }
    // Add the plugin to the network using the prepared input tensors.
    nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject);
    return yoloLayer;  // Return the added YOLO layer.
}

80
src/calibrator.cpp Normal file
View File

@ -0,0 +1,80 @@
#include <iostream>
#include <iterator>
#include <fstream>
#include <opencv2/dnn/dnn.hpp>
#include "calibrator.h"
#include "cuda_utils.h"
#include "utils.h"
// INT8 entropy calibrator: allocates a device staging buffer for one batch
// (3 x H x W floats per image) and collects the calibration image names
// from `img_dir`.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
                                               const char* calib_table_name, const char* input_blob_name,
                                               bool read_cache)
    : batchsize_(batchsize)
    , input_w_(input_w)
    , input_h_(input_h)
    , img_idx_(0)  // cursor into img_files_
    , img_dir_(img_dir)
    , calib_table_name_(calib_table_name)
    , input_blob_name_(input_blob_name)
    , read_cache_(read_cache)
{
    input_count_ = 3 * input_w * input_h * batchsize;
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    read_files_in_dir(img_dir, img_files_);
}
// Release the device staging buffer.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
    CUDA_CHECK(cudaFree(device_input_));
}
// Batch size reported to TensorRT's calibration loop.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT
{
    return batchsize_;
}
// Supply the next calibration batch.  Returns false when fewer than a full
// batch of images remain (ends calibration) or when an image fails to load.
// Images are letterbox-preprocessed, packed into an NCHW blob with 1/255
// scaling and BGR->RGB swap, and copied to the device buffer.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT
{
    if (img_idx_ + batchsize_ > (int)img_files_.size()) {
        return false;
    }
    std::vector<cv::Mat> input_imgs_;
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << "  " << i << std::endl;
        cv::Mat temp = cv::imread(img_dir_ + img_files_[i]);
        if (temp.empty()){
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_);
        input_imgs_.push_back(pr_img);
    }
    img_idx_ += batchsize_;
    // scale 1/255, no mean subtraction, swapRB=true, crop=false
    cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    // The single binding must be the expected input blob.
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}
// Return a previously written calibration table (if caching is enabled and
// the file exists), letting TensorRT skip the image pass.  Returns nullptr
// when there is no usable cache.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT
{
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();
    std::ifstream input(calib_table_name_, std::ios::binary);
    input >> std::noskipws;  // keep every byte, including whitespace
    if (read_cache_ && input.good())
    {
        std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(calib_cache_));
    }
    length = calib_cache_.size();
    return length ? calib_cache_.data() : nullptr;
}
// Persist the calibration table produced by TensorRT for reuse on later runs.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT
{
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
    std::ofstream output(calib_table_name_, std::ios::binary);
    output.write(reinterpret_cast<const char*>(cache), length);
}

2750
src/model.cpp Normal file

File diff suppressed because it is too large Load Diff

507
src/postprocess.cpp Normal file
View File

@ -0,0 +1,507 @@
#include "postprocess.h"

#include <algorithm>
#include <cmath>
#include <iostream>  // Include this header for printing

#include "utils.h"
// Map a detection box from letterboxed network-input coordinates
// (kInputW x kInputH, bbox = x1,y1,x2,y2) back to original-image pixels,
// undoing the scale and the centering offset on the padded axis, then clamp
// the rect to the image bounds.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    float l, r, t, b;
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);
    if (r_h > r_w) {
        // Width-limited: vertical padding was added; remove it from y.
        l = bbox[0];
        r = bbox[2];
        t = bbox[1] - (kInputH - r_w * img.rows) / 2;
        b = bbox[3] - (kInputH - r_w * img.rows) / 2;
        l = l / r_w;
        r = r / r_w;
        t = t / r_w;
        b = b / r_w;
    } else {
        // Height-limited: horizontal padding was added; remove it from x.
        l = bbox[0] - (kInputW - r_h * img.cols) / 2;
        r = bbox[2] - (kInputW - r_h * img.cols) / 2;
        t = bbox[1];
        b = bbox[3];
        l = l / r_h;
        r = r / r_h;
        t = t / r_h;
        b = b / r_h;
    }
    // Clamp to the image so the returned rect is always valid.
    l = std::max(0.0f, l);
    t = std::max(0.0f, t);
    int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
    int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));
    return cv::Rect(int(round(l)), int(round(t)), width, height);
}
// Same letterbox inversion as get_rect(), but additionally rescales the
// keypoint array IN PLACE: lmk holds kNumberOfPoints (x, y, conf) triples;
// x/y are mapped back to original-image pixels, confidences are untouched.
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) {
    float l, r, t, b;
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);
    if (r_h > r_w) {
        // Width-limited: remove vertical padding, divide by the scale.
        l = bbox[0] / r_w;
        r = bbox[2] / r_w;
        t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w;
        b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w;
        for (int i = 0; i < kNumberOfPoints * 3; i += 3) {
            lmk[i] /= r_w;
            lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w;
            // lmk[i + 2] (confidence) is left unchanged
        }
    } else {
        // Height-limited: remove horizontal padding, divide by the scale.
        l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h;
        r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h;
        t = bbox[1] / r_h;
        b = bbox[3] / r_h;
        for (int i = 0; i < kNumberOfPoints * 3; i += 3) {
            lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h;
            lmk[i + 1] /= r_h;
            // lmk[i + 2] (confidence) is left unchanged
        }
    }
    // Clamp the rect to the image bounds.
    l = std::max(0.0f, l);
    t = std::max(0.0f, t);
    int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
    int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));
    return cv::Rect(int(round(l)), int(round(t)), width, height);
}
// Intersection-over-union of two axis-aligned boxes given as [x1, y1, x2, y2].
static float iou(float lbox[4], float rbox[4]) {
    float ix1 = (std::max)(lbox[0], rbox[0]);
    float ix2 = (std::min)(lbox[2], rbox[2]);
    float iy1 = (std::max)(lbox[1], rbox[1]);
    float iy2 = (std::min)(lbox[3], rbox[3]);
    // No overlap on either axis.
    if (iy1 > iy2 || ix1 > ix2)
        return 0.0f;
    float inter_area = (ix2 - ix1) * (iy2 - iy1);
    float l_area = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]);
    float r_area = (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]);
    return inter_area / (l_area + r_area - inter_area);
}
// Sort predicate: descending confidence; ties broken by leftmost x so the
// ordering is fully deterministic.
static bool cmp(const Detection& a, const Detection& b) {
    if (a.conf != b.conf)
        return a.conf > b.conf;
    return a.bbox[0] < b.bbox[0];
}
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
int det_size = sizeof(Detection) / sizeof(float);
std::map<float, std::vector<Detection>> m;
for (int i = 0; i < output[0]; i++) {
if (output[1 + det_size * i + 4] <= conf_thresh || isnan(output[1 + det_size * i + 4]))
continue;
Detection det;
memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
if (m.count(det.class_id) == 0)
m.emplace(det.class_id, std::vector<Detection>());
m[det.class_id].push_back(det);
}
for (auto it = m.begin(); it != m.end(); it++) {
auto& dets = it->second;
std::sort(dets.begin(), dets.end(), cmp);
for (size_t m = 0; m < dets.size(); ++m) {
auto& item = dets[m];
res.push_back(item);
for (size_t n = m + 1; n < dets.size(); ++n) {
if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
dets.erase(dets.begin() + n);
--n;
}
}
}
}
}
// Run per-image NMS over a batched, flattened output buffer; image i occupies
// output[i * output_size .. (i + 1) * output_size).
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    for (int b = 0; b < batch_size; ++b) {
        float* image_output = output + b * output_size;
        nms(res_batch[b], image_output, conf_thresh, nms_thresh);
    }
}
// Convert the GPU-decoded buffer into Detection structs and append them to res.
// Layout per object: [left, top, right, bottom, conf, class_id, keep_flag];
// objects whose keep_flag was cleared by GPU NMS are skipped.  img is unused here.
void process_decode_ptr_host(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img,
                             int count) {
    Detection det;
    for (int i = 0; i < count; i++) {
        const float* obj = decode_ptr_host + 1 + i * bbox_element;
        if ((int)obj[6] != 1)
            continue;  // suppressed by GPU NMS
        det.bbox[0] = obj[0];
        det.bbox[1] = obj[1];
        det.bbox[2] = obj[2];
        det.bbox[3] = obj[3];
        det.conf = obj[4];
        det.class_id = obj[5];
        res.push_back(det);
    }
}
// Convert GPU-decoded results for a whole batch into per-image Detection vectors.
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    // First float of the buffer is the number of decoded objects, capped to the max.
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        // NOTE(review): the per-image stride `i * count` does not match the per-image
        // buffer size of 1 + kMaxNumOutputBbox * bbox_element floats; it is only correct
        // for batch_size == 1 (the GPU post-process path enforces kBatchSize == 1 in
        // prepare_buffer).  Confirm before enabling batched GPU post-processing.
        process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}
// Draw detection rectangles and integer class-id labels onto each image of the batch.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    for (size_t img_idx = 0; img_idx < img_batch.size(); img_idx++) {
        cv::Mat image = img_batch[img_idx];
        for (auto& det : res_batch[img_idx]) {
            // Rescale the network-space box back to original-image coordinates.
            cv::Rect box = get_rect(image, det.bbox);
            cv::rectangle(image, box, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(image, std::to_string((int)det.class_id), cv::Point(box.x, box.y - 1),
                        cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
        }
    }
}
// Draw pose results on each image: detection box + class id, keypoint dots, and
// skeleton line segments.  Keypoints/bones are drawn only when the relevant
// score(s) exceed 0.5.
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    // Keypoint index pairs to connect (17-keypoint, COCO-style skeleton layout —
    // presumably nose/eyes/ears/shoulders/elbows/... — confirm against the pose model).
    const std::vector<std::pair<int, int>> skeleton_pairs = {
        {0, 1}, {0, 2}, {0, 5}, {0, 6}, {1, 2}, {1, 3}, {2, 4}, {5, 6}, {5, 7}, {5, 11},
        {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}};
    for (size_t i = 0; i < img_batch.size(); i++) {
        auto& res = res_batch[i];
        cv::Mat img = img_batch[i];
        for (size_t j = 0; j < res.size(); j++) {
            // Rescales both the box and the keypoints (in place) to image coordinates.
            cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints);
            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2,
                        cv::Scalar(0xFF, 0xFF, 0xFF), 2);
            // Keypoint dots: every third value is the score; draw when > 0.5.
            for (int k = 0; k < kNumberOfPoints * 3; k += 3) {
                if (res[j].keypoints[k + 2] > 0.5) {
                    cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3,
                               cv::Scalar(0, 0x27, 0xC1), -1);
                }
            }
            // Skeleton bones: draw a segment only when both endpoints pass the score check.
            for (const auto& bone : skeleton_pairs) {
                int kp1_idx = bone.first * 3;
                int kp2_idx = bone.second * 3;
                if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) {
                    cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]);
                    cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]);
                    cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2);
                }
            }
        }
    }
}
// Crop the letterbox padding out of a network-sized mask, then resize the
// remaining region back to the original image resolution.
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
    float ratio_w = kInputW / (img.cols * 1.0);
    float ratio_h = kInputH / (img.rows * 1.0);
    int crop_x, crop_y, crop_w, crop_h;
    if (ratio_h > ratio_w) {
        // Width-limited letterbox: padding above and below.
        crop_w = kInputW;
        crop_h = ratio_w * img.rows;
        crop_x = 0;
        crop_y = (kInputH - crop_h) / 2;
    } else {
        // Height-limited letterbox: padding left and right.
        crop_w = ratio_h * img.cols;
        crop_h = kInputH;
        crop_x = (kInputW - crop_w) / 2;
        crop_y = 0;
    }
    cv::Mat rescaled;
    cv::resize(mask(cv::Rect(crop_x, crop_y, crop_w, crop_h)), rescaled, img.size());
    return rescaled;
}
// Draw instance-segmentation results: alpha-blend each mask over the image inside
// its box, then draw the box outline and a filled banner with class name + confidence.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map) {
    // Fixed per-class color palette (0xRRGGBB values, decoded to BGR below).
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    for (size_t i = 0; i < dets.size(); i++) {
        // Mask arrives at network resolution; rescale it to the original image first.
        cv::Mat img_mask = scale_mask(masks[i], img);
        auto color = colors[(int)dets[i].class_id % colors.size()];
        auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);
        cv::Rect r = get_rect(img, dets[i].bbox);
        // Blend mask pixels (mask value > 0.5) 50/50 with the class color, box area only.
        for (int x = r.x; x < r.x + r.width; x++) {
            for (int y = r.y; y < r.y + r.height; y++) {
                float val = img_mask.at<float>(y, x);
                if (val <= 0.5)
                    continue;
                img.at<cv::Vec3b>(y, x)[0] = img.at<cv::Vec3b>(y, x)[0] / 2 + bgr[0] / 2;
                img.at<cv::Vec3b>(y, x)[1] = img.at<cv::Vec3b>(y, x)[1] / 2 + bgr[1] / 2;
                img.at<cv::Vec3b>(y, x)[2] = img.at<cv::Vec3b>(y, x)[2] / 2 + bgr[2] / 2;
            }
        }
        cv::rectangle(img, r, bgr, 2);
        // Get the size of the text
        cv::Size textSize =
                cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf),
                                cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL);
        // Set the top left corner of the rectangle
        cv::Point topLeft(r.x, r.y - textSize.height);
        // Set the bottom right corner of the rectangle
        cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);
        // Set the thickness of the rectangle lines
        int lineThickness = 2;
        // Draw the filled label banner, then the label text on top of it.
        cv::rectangle(img, topLeft, bottomRight, bgr, -1);
        cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf),
                    cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
    }
}
// Convert the GPU-decoded oriented-box buffer into Detection structs.  Per-object
// layout: [cx, cy, w, h, conf, class_id, keep_flag, angle]; objects whose
// keep_flag was cleared by GPU NMS are skipped.  img is unused here.
void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
                                 cv::Mat& img, int count) {
    Detection det;
    for (int i = 0; i < count; i++) {
        int basic_pos = 1 + i * bbox_element;
        int keep_flag = decode_ptr_host[basic_pos + 6];
        if (keep_flag == 1) {
            det.bbox[0] = decode_ptr_host[basic_pos + 0];
            det.bbox[1] = decode_ptr_host[basic_pos + 1];
            det.bbox[2] = decode_ptr_host[basic_pos + 2];
            det.bbox[3] = decode_ptr_host[basic_pos + 3];
            det.conf = decode_ptr_host[basic_pos + 4];
            det.class_id = decode_ptr_host[basic_pos + 5];
            det.angle = decode_ptr_host[basic_pos + 7];
            res.push_back(det);
        }
    }
}
// Convert GPU-decoded oriented-box results for a whole batch into Detection vectors.
void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    // First float of the buffer is the number of decoded objects, capped to the max.
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        // NOTE(review): as in batch_process(), the `i * count` stride is only correct
        // for batch_size == 1 — confirm before enabling batched GPU post-processing.
        process_decode_ptr_host_obb(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}
// Gaussian covariance (a, b, c) of an oriented box, following the ProbIoU
// formulation: axis variances w^2/12 and h^2/12 rotated by the box angle.
std::tuple<float, float, float> convariance_matrix(Detection res) {
    float var_w = res.bbox[2] * res.bbox[2] / 12.0;
    float var_h = res.bbox[3] * res.bbox[3] / 12.0;
    float cos_r = std::cos(res.angle);
    float sin_r = std::sin(res.angle);
    float cos2 = cos_r * cos_r;
    float sin2 = sin_r * sin_r;
    return std::make_tuple(var_w * cos2 + var_h * sin2,
                           var_w * sin2 + var_h * cos2,
                           (var_w - var_h) * cos_r * sin_r);
}
// Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
// Returns 1 - Hellinger distance, so higher means more overlap.
// Fix: the previous version constructed tuples from uninitialized floats before
// overwriting them (reads of indeterminate values); unpack directly with std::tie.
static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) {
    float a1, b1, c1, a2, b2, c2;
    std::tie(a1, b1, c1) = convariance_matrix(res1);
    std::tie(a2, b2, c2) = convariance_matrix(res2);
    float x1 = res1.bbox[0], y1 = res1.bbox[1];
    float x2 = res2.bbox[0], y2 = res2.bbox[1];
    // Bhattacharyya-distance terms between the two Gaussians.
    float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) /
               ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps);
    float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps);
    float t3 = std::log(
            ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2)) /
                    (4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) +
                     eps) +
            eps);
    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    // Clamp the distance so exp/sqrt below stay well-behaved.
    bd = std::max(std::min(bd, 100.0f), eps);
    float hd = std::sqrt(1.0 - std::exp(-bd) + eps);
    return 1 - hd;
}
void nms_obb(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
int det_size = sizeof(Detection) / sizeof(float);
std::map<float, std::vector<Detection>> m;
for (int i = 0; i < output[0]; i++) {
if (output[1 + det_size * i + 4] <= conf_thresh)
continue;
Detection det;
memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
if (m.count(det.class_id) == 0)
m.emplace(det.class_id, std::vector<Detection>());
m[det.class_id].push_back(det);
}
for (auto it = m.begin(); it != m.end(); it++) {
auto& dets = it->second;
std::sort(dets.begin(), dets.end(), cmp);
for (size_t m = 0; m < dets.size(); ++m) {
auto& item = dets[m];
res.push_back(item);
for (size_t n = m + 1; n < dets.size(); ++n) {
if (probiou(item, dets[n]) >= nms_thresh) {
dets.erase(dets.begin() + n);
--n;
}
}
}
}
}
// Run per-image oriented-box NMS over a batched, flattened output buffer;
// image i occupies output[i * output_size .. (i + 1) * output_size).
void batch_nms_obb(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
                   float conf_thresh, float nms_thresh) {
    res_batch.resize(batch_size);
    for (int b = 0; b < batch_size; ++b) {
        float* image_output = output + b * output_size;
        nms_obb(res_batch[b], image_output, conf_thresh, nms_thresh);
    }
}
// Compute the four corner points of an oriented box (center x/y, w, h, angle in
// radians), rescaled to original-image coordinates and clamped to the image.
// Fix: removed leftover debug std::cout prints and the dead `normal_angle`
// computation (it was only printed, never used).
static std::vector<cv::Point> get_corner(cv::Mat& img, const Detection& box) {
    float cos_value, sin_value;
    // Calculate center point and width/height
    float x1 = box.bbox[0];
    float y1 = box.bbox[1];
    float w = box.bbox[2];
    float h = box.bbox[3];
    float angle = box.angle * 180.0f / CV_PI;  // Convert radians to degrees
    // Canonicalize so the long side is the width; rotate the angle to match.
    if (h >= w) {
        std::swap(w, h);
        angle = fmod(angle + 90.0f, 180.0f);  // Adjust angle to be within [0, 180)
    }
    // Bring the angle into [0, 180] degrees.
    if (angle < 0) {
        angle += 360.0f;  // Convert to positive value
    }
    if (angle > 180.0f) {
        angle -= 180.0f;  // Subtract 180 from angles greater than 180
    }
    cos_value = std::cos(angle * CV_PI / 180.0f);  // Convert to radians
    sin_value = std::sin(angle * CV_PI / 180.0f);
    // Axis-aligned extent of the box, used only to rescale via get_rect().
    float l = x1 - w / 2;  // Left boundary
    float r = x1 + w / 2;  // Right boundary
    float t = y1 - h / 2;  // Top boundary
    float b = y1 + h / 2;  // Bottom boundary
    float bbox[4] = {l, t, r, b};
    cv::Rect rect = get_rect(img, bbox);
    float x_ = (rect.x + rect.x + rect.width) / 2;   // Center x in image coordinates
    float y_ = (rect.y + rect.y + rect.height) / 2;  // Center y in image coordinates
    float width = rect.width;
    float height = rect.height;
    // Corner = center +/- half-width vector +/- half-height vector.
    std::vector<cv::Point> corner_points(4);
    float vec1x = width / 2 * cos_value;
    float vec1y = width / 2 * sin_value;
    float vec2x = -height / 2 * sin_value;
    float vec2y = height / 2 * cos_value;
    corner_points[0] = cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y)));  // Top-left corner
    corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y)));  // Top-right corner
    corner_points[2] =
            cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y)));  // Bottom-right corner
    corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y)));  // Bottom-left corner
    // Clamp corner points to the image bounds.
    for (auto& point : corner_points) {
        point.x = std::max(0, std::min(point.x, img.cols - 1));
        point.y = std::max(0, std::min(point.y, img.rows - 1));
    }
    return corner_points;
}
// Draw oriented-box results: a rotated polygon per detection plus a small filled
// label ("class_id:confidence") placed above the first corner when there is room,
// otherwise below it.
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    // Fixed per-class color palette (0xRRGGBB values, decoded to BGR below).
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    for (size_t i = 0; i < img_batch.size(); i++) {
        auto& res = res_batch[i];
        auto& img = img_batch[i];
        for (auto& obj : res) {
            auto color = colors[(int)obj.class_id % colors.size()];
            auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);
            // Rotated corners in image coordinates (already clamped to the image).
            auto corner_points = get_corner(img, obj);
            cv::polylines(img, std::vector<std::vector<cv::Point>>{corner_points}, true, bgr, 1);
            auto text = (std::to_string((int)(obj.class_id)) + ":" + to_string_with_precision(obj.conf));
            cv::Size textsize = cv::getTextSize(text, 0, 0.3, 1, nullptr);
            int width = textsize.width;
            int height = textsize.height;
            // Place the label above the first corner if it fits, otherwise below.
            bool outside = (corner_points[0].y - height >= 3) ? true : false;
            cv::Point p1(corner_points[0].x, corner_points[0].y), p2;
            p2.x = corner_points[0].x + width;
            if (outside) {
                p2.y = corner_points[0].y - height - 3;
            } else {
                p2.y = corner_points[0].y + height + 3;
            }
            // Filled background rectangle, then the label text on top.
            cv::rectangle(img, p1, p2, bgr, -1, cv::LINE_AA);
            cv::putText(
                    img, text,
                    cv::Point(corner_points[0].x, (outside ? corner_points[0].y - 2 : corner_points[0].y + height + 2)),
                    0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA);
        }
    }
}

193
src/postprocess.cu Normal file
View File

@ -0,0 +1,193 @@
//
// Created by lindsay on 23-7-17.
//
#include "postprocess.h"
#include "types.h"
// Compact raw oriented-box predictions into parray = [count, obj0, obj1, ...]
// (bbox_element floats per object).  One thread per candidate box.
static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                         int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;
    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));
    // Fix: reject low-confidence candidates BEFORE reserving an output slot.  The
    // previous order (atomicAdd first, confidence check after) inflated parray[0]
    // with rejected boxes, leaving stale garbage slots that downstream code read
    // back as detections.
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;
    //[center_x center_y w h conf class_id mask[32] keypoints[51] angle]
    float cx = pitem[0];
    float cy = pitem[1];
    float width = pitem[2];
    float height = pitem[3];
    float label = pitem[5];
    float angle = pitem[89];
    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = cx;
    *pout_item++ = cy;
    *pout_item++ = width;
    *pout_item++ = height;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
    *pout_item++ = angle;
}
// Compact raw predictions into parray = [count, obj0, obj1, ...], one thread per
// candidate.  Each emitted object is [left, top, right, bottom, conf, class_id, keep_flag].
static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                     int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;
    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));
    // Fix: reject low-confidence candidates BEFORE reserving an output slot.  The
    // previous order (atomicAdd first, confidence check after) inflated parray[0]
    // with rejected boxes, leaving stale garbage slots that downstream code read
    // back as detections.
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;
    float left = pitem[0];
    float top = pitem[1];
    float right = pitem[2];
    float bottom = pitem[3];
    float label = pitem[5];
    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
}
// IoU of two axis-aligned boxes given as left/top/right/bottom coordinates.
static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop,
                                float bright, float bbottom) {
    float inter_w = max(min(aright, bright) - max(aleft, bleft), 0.0f);
    float inter_h = max(min(abottom, bbottom) - max(atop, btop), 0.0f);
    float inter_area = inter_w * inter_h;
    if (inter_area == 0.0f)
        return 0.0f;
    float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return inter_area / (a_area + b_area - inter_area);
}
// Greedy NMS on the compacted buffer: each thread owns one detection and clears
// its OWN keep_flag (slot 6) if any same-class detection with higher confidence
// overlaps it by more than `threshold`.  Equal confidences are tie-broken by
// index so exactly one of the pair survives.
static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) {
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    int count = bboxes[0];
    if (position >= count)
        return;
    float* pcurrent = bboxes + 1 + position * bbox_element;
    for (int i = 0; i < count; ++i) {
        float* pitem = bboxes + 1 + i * bbox_element;
        if (i == position || pcurrent[5] != pitem[5])
            continue;  // skip self and detections of other classes
        if (pitem[4] >= pcurrent[4]) {
            // Tie-break on equal confidence by index.
            if (pitem[4] == pcurrent[4] && i < position)
                continue;
            float iou =
                    box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1], pitem[2], pitem[3]);
            if (iou > threshold) {
                pcurrent[6] = 0;  // mark this thread's own box as suppressed
                return;
            }
        }
    }
}
// Gaussian covariance (a, b, c) of an oriented box: axis variances w^2/12 and
// h^2/12 rotated by angle r.
static __device__ void convariance_matrix(float w, float h, float r, float& a, float& b, float& c) {
    float var_w = w * w / 12.0f;
    float var_h = h * h / 12.0f;
    float cos_r = cosf(r);
    float sin_r = sinf(r);
    float cos2 = cos_r * cos_r;
    float sin2 = sin_r * sin_r;
    a = var_w * cos2 + var_h * sin2;
    b = var_w * sin2 + var_h * cos2;
    c = (var_w - var_h) * sin_r * cos_r;
}
// Probabilistic IoU between two oriented boxes (center, size, rotation in
// radians); higher means more overlap.
static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2,
                                    float h2, float r2, float eps = 1e-7) {
    // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
    float a1, b1, c1, a2, b2, c2;
    convariance_matrix(w1, h1, r1, a1, b1, c1);
    convariance_matrix(w2, h2, r2, a2, b2, c2);
    // Bhattacharyya-distance terms between the two Gaussians.
    float t1 = ((a1 + a2) * powf(cy1 - cy2, 2) + (b1 + b2) * powf(cx1 - cx2, 2)) /
               ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);
    float t2 = ((c1 + c2) * (cx2 - cx1) * (cy1 - cy2)) / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);
    float t3 = logf(((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2)) /
                            (4 * sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f)) * sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f)) + eps) +
                    eps);
    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    // Clamp the distance so exp/sqrt below stay well-behaved.
    bd = fmaxf(fminf(bd, 100.0f), eps);
    float hd = sqrtf(1.0f - expf(-bd) + eps);
    return 1 - hd;
}
// Greedy NMS for oriented boxes using probabilistic IoU; same structure as
// nms_kernel: each thread clears its OWN keep_flag (slot 6) when suppressed by a
// higher-confidence same-class box, with ties broken by index.
static __global__ void nms_kernel_obb(float* bboxes, int max_objects, float threshold) {
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    int count = bboxes[0];
    if (position >= count)
        return;
    float* pcurrent = bboxes + 1 + position * bbox_element;
    for (int i = 0; i < count; ++i) {
        float* pitem = bboxes + 1 + i * bbox_element;
        if (i == position || pcurrent[5] != pitem[5])
            continue;  // skip self and detections of other classes
        if (pitem[4] >= pcurrent[4]) {
            // Tie-break on equal confidence by index.
            if (pitem[4] == pcurrent[4] && i < position)
                continue;
            // Slot 7 holds the box angle (see decode_kernel_obb output layout).
            float iou = box_probiou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pcurrent[7], pitem[0], pitem[1],
                                    pitem[2], pitem[3], pitem[7]);
            if (iou > threshold) {
                pcurrent[6] = 0;  // mark this thread's own box as suppressed
                return;
            }
        }
    }
}
// Launch one thread per candidate box to compact raw predictions into parray.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream) {
    const int threads_per_block = 256;
    int num_blocks = ceil(num_bboxes / (float)threads_per_block);
    decode_kernel<<<num_blocks, threads_per_block, 0, stream>>>(predict, num_bboxes, confidence_threshold, parray,
                                                                max_objects);
}
// One thread per decoded box; each thread checks itself against all other boxes.
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    const int threads_per_block = max_objects < 256 ? max_objects : 256;
    int num_blocks = ceil(max_objects / (float)threads_per_block);
    nms_kernel<<<num_blocks, threads_per_block, 0, stream>>>(parray, max_objects, nms_threshold);
}
// Launch one thread per candidate oriented box to compact predictions into parray.
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream) {
    const int threads_per_block = 256;
    int num_blocks = ceil(num_bboxes / (float)threads_per_block);
    decode_kernel_obb<<<num_blocks, threads_per_block, 0, stream>>>(predict, num_bboxes, confidence_threshold, parray,
                                                                    max_objects);
}
// One thread per decoded oriented box; each checks itself against all other boxes.
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    const int threads_per_block = max_objects < 256 ? max_objects : 256;
    int num_blocks = ceil(max_objects / (float)threads_per_block);
    nms_kernel_obb<<<num_blocks, threads_per_block, 0, stream>>>(parray, max_objects, nms_threshold);
}

139
src/preprocess.cu Normal file
View File

@ -0,0 +1,139 @@
#include "cuda_utils.h"
#include "preprocess.h"
static uint8_t* img_buffer_host = nullptr;
static uint8_t* img_buffer_device = nullptr;
// Fused preprocessing kernel: bilinear warp-affine (letterbox resize) from the
// uint8 BGR source into the float dst tensor, with BGR->RGB channel swap, /255
// normalization, and interleaved HWC -> planar CHW layout.  One thread per
// destination pixel; `edge` is dst_width * dst_height; d2s maps destination
// coordinates back to source coordinates; const_value_st is the padding value.
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;
    // Rows of the 2x3 destination->source affine matrix.
    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];
    // Destination pixel coordinates from the linear thread index.
    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;
    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range: fill with the constant padding value
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        // Bilinear interpolation over the 4 neighbours; neighbours that fall
        // outside the image read the constant padding value instead.
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;
        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;
        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;
            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }
        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;
            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }
        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }
    // bgr to rgb: swap the first and third channels
    float t = c2;
    c2 = c0;
    c0 = t;
    // normalization to [0, 1]
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;
    // rgbrgbrgb to rrrgggbbb: write each channel into its own plane
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}
// Upload one uint8 BGR image via the shared pinned staging buffer and letterbox
// it into the dst input tensor (dst_width x dst_height planar RGB floats) on the
// given stream.  NOTE(review): consecutive calls reuse the same static staging
// buffers, so the stream must be synchronized between calls (see
// cuda_batch_preprocess) — confirm before calling concurrently.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    int img_size = src_width * src_height * 3;
    // copy data to pinned memory
    memcpy(img_buffer_host, src, img_size);
    // copy data to device memory
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));
    // Source->destination letterbox affine: uniform scale, centered in the input.
    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);
    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
    // Invert to get the destination->source mapping the kernel consumes.
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
    memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
    // One thread per destination pixel; 128 is the grey padding value.
    int jobs = dst_height * dst_width;
    int threads = 256;
    int blocks = ceil(jobs / (float)threads);
    warpaffine_kernel<<<blocks, threads, 0, stream>>>(img_buffer_device, src_width * 3, src_width, src_height, dst,
                                                      dst_width, dst_height, 128, d2s, jobs);
}
// Preprocess each image of the batch into its slice of the contiguous input tensor.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    const int floats_per_image = dst_width * dst_height * 3;
    for (size_t idx = 0; idx < img_batch.size(); ++idx) {
        float* dst_slice = dst + idx * floats_per_image;
        cuda_preprocess(img_batch[idx].ptr(), img_batch[idx].cols, img_batch[idx].rows, dst_slice, dst_width,
                        dst_height, stream);
        // Synchronize before the next iteration reuses the shared pinned staging buffer.
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}
// Allocate the staging buffers for the largest expected image (3 bytes per
// pixel): pinned host memory for fast async H2D copies plus a matching device
// buffer.  Fix: compute the byte count in size_t so max_image_size * 3 cannot
// overflow int before being widened.
void cuda_preprocess_init(int max_image_size) {
    const size_t buffer_bytes = (size_t)max_image_size * 3;
    // prepare input data in pinned memory
    CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, buffer_bytes));
    // prepare input data in device memory
    CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, buffer_bytes));
}
// Release the staging buffers and reset the pointers so a second destroy (or a
// later cuda_preprocess_init) is safe; cudaFree/cudaFreeHost accept null pointers.
void cuda_preprocess_destroy() {
    CUDA_CHECK(cudaFree(img_buffer_device));
    img_buffer_device = nullptr;
    CUDA_CHECK(cudaFreeHost(img_buffer_host));
    img_buffer_host = nullptr;
}

278
yolov8_5u_det.cpp Normal file
View File

@ -0,0 +1,278 @@
#include <cassert>
#include <fstream>
#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"
Logger gLogger;
using namespace nvinfer1;
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Build a YOLOv8-5u detection engine from a .wts weight file and write the
// serialized plan to engine_name.  is_p == 6 selects the P6 variant builder.
// NOTE(review): sub_type is not used here — variant selection happens entirely
// via is_p/gd/gw/max_channels computed in parse_args; confirm before removing it.
void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
                      float& gw, int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;
    if (is_p == 6) {
        serialized_engine =
                buildEngineYolov8_5uDetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else {
        serialized_engine = buildEngineYolov8_5uDet(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    }
    assert(serialized_engine);
    // Dump the serialized plan to disk.
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    // TensorRT objects are destroyed with delete here.
    delete serialized_engine;
    delete config;
    delete builder;
}
// Load a serialized TensorRT engine from disk and create the runtime, engine,
// and execution context through the output pointers.  Aborts (assert) on failure.
// Fix: the plan buffer is now a std::vector instead of a raw new[]/delete[] pair,
// so it is released on every exit path.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    // Determine the file size, then read the whole plan into memory.
    file.seekg(0, file.end);
    size_t size = file.tellg();
    file.seekg(0, file.beg);
    std::vector<char> serialized_engine(size);
    file.read(serialized_engine.data(), size);
    file.close();
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine.data(), size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
}
// Allocate device input/output buffers and the host-side result buffers for the
// chosen post-processing mode: "c" = CPU NMS (raw output copied to host),
// "g" = GPU decode + NMS (compact decode buffer copied to host).
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and
    // output tensors. Note that indices are guaranteed to be less than
    // IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    if (cuda_post_process == "c") {
        // CPU post-processing: the full raw output tensor is copied back to host.
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        // GPU post-processing currently only handles a single image at a time.
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            exit(0);
        }
        // Allocate memory for decode_ptr_host and copy to device
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}
// Run one inference pass on the batch.  "c": copy the raw output tensor back to
// `output` for CPU NMS.  "g": decode + NMS on the GPU, then copy the compacted
// result into decode_ptr_host.  Synchronizes the stream before returning.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    context.enqueue(batchsize, buffers, stream, nullptr);
    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    } else if (cuda_post_process == "g") {
        // Zero the decode buffer (its first float is the object count) before decoding.
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox,
                 stream);  // cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference and gpu postprocess time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
}
// Parse CLI arguments.
//   -s <wts> <engine> <sub_type>    serialize: fills wts/engine/sub_type and the
//                                   per-scale gd/gw/max_channels (and is_p for P6).
//   -d <engine> <img_dir> <c|g>     deserialize: fills engine/img_dir/cuda_post_process.
// Returns false on malformed input.
// Fix: the -s branch previously declared `auto sub_type = ...`, shadowing the
// output parameter, so the caller's sub_type was never assigned.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;
    if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        sub_type = std::string(argv[4]);
        // Depth/width multipliers and channel cap per model scale (n/s/m/l/x).
        if (sub_type[0] == 'n') {
            gd = 0.33;
            gw = 0.25;
            max_channels = 1024;
        } else if (sub_type[0] == 's') {
            gd = 0.33;
            gw = 0.50;
            max_channels = 1024;
        } else if (sub_type[0] == 'm') {
            gd = 0.67;
            gw = 0.75;
            max_channels = 576;
        } else if (sub_type[0] == 'l') {
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
        } else if (sub_type[0] == 'x') {
            gd = 1.33;
            gw = 1.25;
            max_channels = 640;
        } else {
            return false;
        }
        // A trailing '6' (e.g. "n6") selects the P6 engine builder.
        if (sub_type.size() == 2 && sub_type[1] == '6') {
            is_p = 6;
        }
    } else if (std::string(argv[1]) == "-d" && argc == 5) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
    } else {
        return false;
    }
    return true;
}
// Entry point for the yolov8_5u detector.
// "-s" serializes a .wts weights file into a TensorRT engine plan;
// "-d" deserializes an engine and runs batched detection over a directory of
// images, with CPU ("c") or GPU ("g") postprocessing, writing annotated
// copies prefixed with "_" into the working directory.
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    std::string sub_type = "";
    std::string cuda_post_process = "";
    int model_bboxes;
    int is_p = 0;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;
    if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov8_5u_det -s [.wts] [.engine] "
                     "[n/s/m/l/x//n6/s6/m6/l6/x6] // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov8_5u_det -d [.engine] ../samples [c/g]// deserialize "
                     "plan file and run inference"
                  << std::endl;
        return -1;
    }
    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
        return 0;
    }
    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    // Binding 1 is the output tensor; its first dimension is the number of
    // candidate boxes the engine emits per image.
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;
    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }
    // decode_ptr_* are only allocated for the "g" postprocess path
    // (see prepare_buffer).
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);
    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        }
        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);
        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;
    // Print histogram of the output distribution
    // std::cout << "\nOutput:\n\n";
    // for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //     std::cout << prob[i] << ", ";
    //     if (i % 10 == 0) std::cout << std::endl;
    //}
    // std::cout << std::endl;
    return 0;
}

462
yolov8_5u_det_trt.py Normal file
View File

@ -0,0 +1,462 @@
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt
# Detection filtering thresholds.
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
# Per-detection field counts; post_process sizes one output record as
# DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM floats.
POSE_NUM = 17 * 3  # 17 keypoints x 3 values each
DET_NUM = 6  # cx, cy, w, h, conf, cls_id (see post_process docstring)
SEG_NUM = 32  # segmentation head values per box -- TODO confirm against engine output
OBB_NUM = 1  # extra oriented-box value per box -- TODO confirm against engine output
def get_img_path_batches(batch_size, img_dir):
    """Walk img_dir recursively and chunk every file path found into batches.

    Returns a list of lists of paths; each inner list holds at most
    batch_size entries and the final one may be shorter. An empty directory
    yields an empty list.
    """
    paths = [os.path.join(dirpath, fname)
             for dirpath, _dirnames, filenames in os.walk(img_dir)
             for fname in filenames]
    return [paths[i:i + batch_size] for i in range(0, len(paths), batch_size)]
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Plots one bounding box on image img,
                 this function comes from YoLov8 project.
    param:
        x:              a box likes [x1,y1,x2,y2]
        img:            a opencv image object
        color:          color to draw rectangle, such as (0,255,0)
        label:          str
        line_thickness: int
    return:
        no return
    """
    # Derive a thickness from the image size when none is given.
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    color = color or [random.randint(0, 255) for _ in range(3)]
    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)
    if label:
        font_thickness = max(thickness - 1, 1)
        text_size = cv2.getTextSize(label, 0, fontScale=thickness / 3, thickness=font_thickness)[0]
        # Filled background box for the label, drawn just above the corner.
        label_corner = (top_left[0] + text_size[0], top_left[1] - text_size[1] - 3)
        cv2.rectangle(img, top_left, label_corner, color, -1, cv2.LINE_AA)
        cv2.putText(
            img,
            label,
            (top_left[0], top_left[1] - 2),
            0,
            thickness / 3,
            [225, 255, 255],
            thickness=font_thickness,
            lineType=cv2.LINE_AA,
        )
class YoLov8TRT(object):
    """
    description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops.
    """
    def __init__(self, engine_file_path):
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)
        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()
        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []
        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # Input binding shape ends with (..., H, W).
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        # Flattened length of one image's detection output.
        self.det_output_length = host_outputs[0].shape[0]

    def infer(self, raw_image_generator):
        """Run one batch of BGR images through the engine.

        Returns (annotated images, elapsed seconds for the H2D/infer/D2H span).
        """
        # NOTE(review): this class does not subclass threading.Thread, so this
        # call looks like a leftover -- confirm before removing.
        threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)
        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        # Do postprocess
        for i in range(self.batch_size):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            input_image_path: str, image path
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate widht and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        # Letterbox: scale by the limiting ratio, pad the other axis evenly.
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        # Undo the letterbox padding applied in preprocess_image, then
        # rescale back to original-image coordinates.
        if r_h > r_w:
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h
        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a numpy, each element is the score correspoing to box
            result_classid: finally classid, a numpy, each element is the classid correspoing to box
        """
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimentional ndarray
        # pred = np.reshape(output[1:], (-1, 38))[:num, :]
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (the +1 treats coordinates as inclusive pixels)
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            # Suppress boxes of the same class that overlap the current best.
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes
class inferThread(threading.Thread):
    """Worker thread: runs one batch of image paths through the TRT wrapper
    and writes the annotated results into the output/ directory."""

    def __init__(self, yolov8_wrapper, image_path_batch):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        images = self.yolov8_wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = self.yolov8_wrapper.infer(images)
        for idx, img_path in enumerate(self.image_path_batch):
            _parent, filename = os.path.split(img_path)
            # Save image
            cv2.imwrite(os.path.join('output', filename), batch_image_raw[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
class warmUpThread(threading.Thread):
    """Worker thread: feeds zero images through the TRT wrapper once to
    warm up the engine, printing the elapsed time."""

    def __init__(self, yolov8_wrapper):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        zeros = self.yolov8_wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = self.yolov8_wrapper.infer(zeros)
        print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
# Script entry: load the custom TensorRT plugin library and an engine
# (both overridable via argv), then warm up and run detection over images/
# using one worker thread per batch, saving results to output/.
if __name__ == "__main__":
    # load custom plugin and engine
    PLUGIN_LIBRARY = "./build/libmyplugins.so"
    engine_file_path = "yolov5xu.engine"
    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]
    if len(sys.argv) > 2:
        PLUGIN_LIBRARY = sys.argv[2]
    # Loading the shared library registers the TRT plugins as a side effect.
    ctypes.CDLL(PLUGIN_LIBRARY)
    # load coco labels
    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
                  "traffic light",
                  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
                  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
                  "frisbee",
                  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
                  "surfboard",
                  "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                  "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
                  "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
                  "cell phone",
                  "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
                  "teddy bear",
                  "hair drier", "toothbrush"]
    # Start from a clean output directory on every run.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)
        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)
        for i in range(10):
            # create a new thread to do warm_up
            thread1 = warmUpThread(yolov8_wrapper)
            thread1.start()
            thread1.join()
        for batch in image_path_batches:
            # create a new thread to do inference
            thread1 = inferThread(yolov8_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()

285
yolov8_cls.cpp Normal file
View File

@ -0,0 +1,285 @@
#include "cuda_utils.h"
#include "logging.h"
#include "utils.h"
#include "model.h"
#include "config.h"
#include "calibrator.h"
#include <iostream>
#include <chrono>
#include <cmath>
#include <numeric>
#include <opencv2/opencv.hpp>
using namespace nvinfer1;
static Logger gLogger;
const static int kOutputSize = kClsNumClass;
// Center-crop each image to its largest square, resize to
// (dst_width, dst_height), convert BGR->RGB, scale to [0,1], and write the
// whole batch into `output` in planar NCHW layout.
void batch_preprocess(std::vector<cv::Mat>& imgs, float* output, int dst_width=224, int dst_height=224) {
    const int plane = dst_height * dst_width;
    for (size_t b = 0; b < imgs.size(); b++) {
        const int rows = imgs[b].rows;
        const int cols = imgs[b].cols;
        const int side = std::min(rows, cols);
        // Largest centered square crop.
        cv::Rect roi((cols - side) / 2, (rows - side) / 2, side, side);
        cv::Mat img = imgs[b](roi);
        cv::resize(img, img, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR);
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
        img.convertTo(img, CV_32F, 1 / 255.0);
        std::vector<cv::Mat> channels(3);
        cv::split(img, channels);
        // Copy each channel plane into the CHW slot for this batch item.
        float* dst = output + b * 3 * plane;
        for (int c = 0; c < 3; ++c) {
            for (int row = 0; row < dst_height; ++row) {
                for (int col = 0; col < dst_width; ++col) {
                    dst[c * plane + row * dst_width + col] = channels[c].at<float>(row, col);
                }
            }
        }
    }
}
// Numerically-stable softmax over prob[0..n-1].
// Subtracting the maximum logit before exponentiation prevents expf from
// overflowing to inf (and the result from collapsing to NaN) for large
// inputs; the normalized result is mathematically unchanged.
// Returns an empty vector when n <= 0.
std::vector<float> softmax(float *prob, int n) {
    std::vector<float> res;
    if (n <= 0)
        return res;
    res.reserve(n);
    float max_v = prob[0];
    for (int i = 1; i < n; i++) {
        if (prob[i] > max_v)
            max_v = prob[i];
    }
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        float t = expf(prob[i] - max_v);
        res.push_back(t);
        sum += t;
    }
    for (int i = 0; i < n; i++) {
        res[i] /= sum;
    }
    return res;
}
// Return the indices of the k largest values in vec, ordered from largest
// to smallest (fewer than k entries when vec is shorter than k).
std::vector<int> topk(const std::vector<float>& vec, int k) {
    const int k_num = std::min<int>(vec.size(), k);
    std::vector<size_t> order(vec.size());
    std::iota(order.begin(), order.end(), 0);
    // Only the first k_num positions need to be fully ordered.
    std::partial_sort(order.begin(), order.begin() + k_num, order.end(),
                      [&vec](size_t a, size_t b) { return vec[a] > vec[b]; });
    return std::vector<int>(order.begin(), order.begin() + k_num);
}
// Load class labels from file_name, one label per line.
// Aborts (assert) with a message on stderr when the file cannot be opened.
std::vector<std::string> read_classes(std::string file_name) {
    std::ifstream ifs(file_name, std::ios::in);
    if (!ifs.is_open()) {
        std::cerr << file_name << " is not found, pls refer to README and download it." << std::endl;
        assert(0);
    }
    std::vector<std::string> classes;
    for (std::string line; std::getline(ifs, line);) {
        classes.push_back(line);
    }
    ifs.close();
    return classes;
}
// Parse CLI arguments for the classifier:
//   -s <wts> <engine> <n|s|m|l|x>  -> serialize mode, fills gd/gw gains
//   -d <engine> <img_dir>          -> inference mode
// Returns false on any malformed input.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw, std::string& img_dir) {
    if (argc < 4)
        return false;
    const std::string mode(argv[1]);
    if (mode == "-s" && argc == 5) {
        wts = argv[2];
        engine = argv[3];
        const std::string net(argv[4]);
        switch (net[0]) {
            case 'n': gd = 0.33; gw = 0.25; break;
            case 's': gd = 0.33; gw = 0.50; break;
            case 'm': gd = 0.67; gw = 0.75; break;
            case 'l': gd = 1.0;  gw = 1.0;  break;
            case 'x': gd = 1.0;  gw = 1.25; break;
            default:  return false;
        }
        return true;
    }
    if (mode == "-d" && argc == 4) {
        engine = argv[2];
        img_dir = argv[3];
        return true;
    }
    return false;
}
// Allocate one batch worth of buffers for classification:
// device-side input/output via cudaMalloc (returned through
// gpu_input_buffer / gpu_output_buffer) and host-side staging arrays via
// new[] (cpu_input_buffer / output_buffer_host — caller must delete[]).
void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer, float** cpu_input_buffer, float** output_buffer_host) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kClsInputH * kClsInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, kBatchSize * kOutputSize * sizeof(float)));
    *cpu_input_buffer = new float[kBatchSize * 3 * kClsInputH * kClsInputW];
    *output_buffer_host = new float[kBatchSize * kOutputSize];
}
// Copy the preprocessed batch to the GPU, run the engine, and copy the class
// scores back to the host. Blocks until the stream has drained, so `output`
// is safe to read on return.
void infer(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
    // Wrap the synchronize in CUDA_CHECK like every other CUDA call in this
    // file, so async copy/kernel errors surfaced here are not silently lost.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}
// Build the YOLOv8-cls network with the given depth/width gains from the
// .wts weights file and write the serialized TensorRT plan to engine_name.
// Aborts via assert when the build or the output file fails.
// NOTE(review): max_batchsize is currently unused (only referenced in the
// commented-out call below) — confirm whether it should be passed through.
void serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name, std::string& engine_name) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    // Create model to populate the network, then set the outputs and create an engine
    IHostMemory *serialized_engine = nullptr;
    //engine = buildEngineYolov8Cls(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
    serialized_engine = buildEngineYolov8Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw);
    assert(serialized_engine);
    // Save engine to file
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cerr << "Could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    // Close everything down
    delete serialized_engine;
    delete config;
    delete builder;
}
// Read a serialized TensorRT plan from disk and create the runtime, engine
// and execution context through the out-parameters. Aborts via assert on
// any failure; on success the caller owns (and must delete) all three.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    // Determine the file size, then read the whole plan into memory.
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    // The plan bytes are copied by deserializeCudaEngine; safe to free now.
    delete[] serialized_engine;
}
// Entry point for the yolov8 classifier.
// "-s" serializes a .wts weights file into a TensorRT engine plan;
// "-d" deserializes an engine and runs batched classification over a
// directory of images, printing the top-3 ImageNet labels per image.
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    float gd = 0.0f, gw = 0.0f;
    std::string img_dir;
    if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir)) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolov8_cls -s [.wts] [.engine] [n/s/m/l/x or c gd gw]  // serialize model to plan file" << std::endl;
        std::cerr << "./yolov8_cls -d [.engine] ../samples  // deserialize plan file and run inference" << std::endl;
        return -1;
    }
    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(kBatchSize, gd, gw, wts_name, engine_name);
        return 0;
    }
    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* cpu_input_buffer = nullptr;
    float* output_buffer_host = nullptr;
    prepare_buffers(engine, &device_buffers[0], &device_buffers[1], &cpu_input_buffer, &output_buffer_host);
    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }
    // Read imagenet labels
    auto classes = read_classes("imagenet_classes.txt");
    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess
        batch_preprocess(img_batch, cpu_input_buffer);
        // Run inference
        auto start = std::chrono::system_clock::now();
        infer(*context, stream, (void**)device_buffers, cpu_input_buffer, output_buffer_host, kBatchSize);
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        // Postprocess and get top-k result
        for (size_t b = 0; b < img_name_batch.size(); b++) {
            float* p = &output_buffer_host[b * kOutputSize];
            auto res = softmax(p, kOutputSize);
            auto topk_idx = topk(res, 3);
            std::cout << img_name_batch[b] << std::endl;
            for (auto idx: topk_idx) {
                std::cout << "  " << classes[idx] << " " << res[idx] << std::endl;
            }
        }
    }
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    delete[] cpu_input_buffer;
    delete[] output_buffer_host;
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;
    return 0;
}

283
yolov8_cls_trt.py Normal file
View File

@ -0,0 +1,283 @@
"""
An example that uses TensorRT's Python api to make inferences.
"""
import os
import shutil
import sys
import threading
import time
import cv2
import numpy as np
import torch
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
def get_img_path_batches(batch_size, img_dir):
    """Collect every file under img_dir (recursively) and group the paths
    into batches of at most batch_size; the last batch may be shorter."""
    batches = []
    current = []
    for dirpath, _dirs, filenames in os.walk(img_dir):
        for fname in filenames:
            if len(current) == batch_size:
                batches.append(current)
                current = []
            current.append(os.path.join(dirpath, fname))
    if current:
        batches.append(current)
    return batches
# Load the ImageNet label names (one per line) at import time; indexed by
# class id in YoLov8TRT.postprocess_cls.
with open("imagenet_classes.txt") as f:
    classes = [line.strip() for line in f.readlines()]
class YoLov8TRT(object):
"""
description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops.
"""
def __init__(self, engine_file_path):
# Create a Context on this device,
self.ctx = cuda.Device(0).make_context()
stream = cuda.Stream()
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)
# Deserialize the engine from file
with open(engine_file_path, "rb") as f:
engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
host_inputs = []
cuda_inputs = []
host_outputs = []
cuda_outputs = []
bindings = []
self.mean = (0.485, 0.456, 0.406)
self.std = (0.229, 0.224, 0.225)
for binding in engine:
print('binding:', binding, engine.get_binding_shape(binding))
size = trt.volume(engine.get_binding_shape(
binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
cuda_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(cuda_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
self.input_w = engine.get_binding_shape(binding)[-1]
self.input_h = engine.get_binding_shape(binding)[-2]
host_inputs.append(host_mem)
cuda_inputs.append(cuda_mem)
else:
host_outputs.append(host_mem)
cuda_outputs.append(cuda_mem)
# Store
self.stream = stream
self.context = context
self.engine = engine
self.host_inputs = host_inputs
self.cuda_inputs = cuda_inputs
self.host_outputs = host_outputs
self.cuda_outputs = cuda_outputs
self.bindings = bindings
self.batch_size = engine.max_batch_size
def infer(self, raw_image_generator):
threading.Thread.__init__(self)
# Make self the active context, pushing it on top of the context stack.
self.ctx.push()
# Restore
stream = self.stream
context = self.context
engine = self.engine
host_inputs = self.host_inputs
cuda_inputs = self.cuda_inputs
host_outputs = self.host_outputs
cuda_outputs = self.cuda_outputs
bindings = self.bindings
# Do image preprocess
batch_image_raw = []
batch_input_image = np.empty(
shape=[self.batch_size, 3, self.input_h, self.input_w])
for i, image_raw in enumerate(raw_image_generator):
batch_image_raw.append(image_raw)
input_image = self.preprocess_cls_image(image_raw)
np.copyto(batch_input_image[i], input_image)
batch_input_image = np.ascontiguousarray(batch_input_image)
# Copy input image to host buffer
np.copyto(host_inputs[0], batch_input_image.ravel())
start = time.time()
# Transfer input data to the GPU.
cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
# Run inference.
context.execute_async(batch_size=self.batch_size,
bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
# Synchronize the stream
stream.synchronize()
end = time.time()
# Remove any context from the top of the context stack, deactivating it.
self.ctx.pop()
# Here we use the first row of output in that batch_size = 1
output = host_outputs[0]
# Do postprocess
for i in range(self.batch_size):
classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls(
output)
cv2.putText(batch_image_raw[i], str(
classes_ls), (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
print(classes_ls, predicted_conf_ls)
return batch_image_raw, end - start
def destroy(self):
# Remove any context from the top of the context stack, deactivating it.
self.ctx.pop()
def get_raw_image(self, image_path_batch):
"""
description: Read an image from image path
"""
for img_path in image_path_batch:
yield cv2.imread(img_path)
def get_raw_image_zeros(self, image_path_batch=None):
"""
description: Ready data for warmup
"""
for _ in range(self.batch_size):
yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
def preprocess_cls_image(self, raw_bgr_image, dst_width=224, dst_height=224):
"""
description: Convert BGR image to RGB,
crop the center square frame,
resize it to target size, normalize to [0,1],
transform to NCHW format.
param:
raw_bgr_image: numpy array, raw BGR image
dst_width: int, target image width
dst_height: int, target image height
return:
image: the processed image
image_raw: the original image
h: original height
w: original width
"""
image_raw = raw_bgr_image
h, w, c = image_raw.shape
# Crop the center square frame
m = min(h, w)
top = (h - m) // 2
left = (w - m) // 2
image = raw_bgr_image[top:top + m, left:left + m]
# Resize the image with target size while maintaining ratio
image = cv2.resize(image, (dst_width, dst_height), interpolation=cv2.INTER_LINEAR)
# Convert BGR to RGB
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Normalize to [0,1]
image = image.astype(np.float32) / 255.0
# HWC to CHW format
image = image.transpose(2, 0, 1)
# CHW to NCHW format (add batch dimension)
image = np.expand_dims(image, axis=0)
# Convert the image to row-major order, also known as "C order"
image = np.ascontiguousarray(image)
batch_data = np.expand_dims(image, axis=0)
return batch_data
def postprocess_cls(self, output_data):
    """
    description: Turn raw classifier logits into per-image predictions:
                 softmax over classes, take the top 3, report the top-1
                 label name, confidence and category id for each image.
    param:
        output_data: flat numpy array of logits for the whole batch
    return:
        (class names, top-1 confidences, top-1 category ids), one entry
        per image in the batch
    """
    logits = torch.Tensor(output_data.reshape(self.batch_size, -1))
    probs = torch.nn.functional.softmax(logits, dim=1)
    top_scores, top_indices = torch.topk(probs, 3)
    classes_ls = []
    predicted_conf_ls = []
    category_id_ls = []
    for row in range(top_indices.shape[0]):
        best_id = top_indices[row][0].item()
        category_id_ls.append(best_id)
        predicted_conf_ls.append(top_scores[row][0].item())
        # `classes` is a module-level label table defined elsewhere in the file.
        classes_ls.append(classes[best_id])
    return classes_ls, predicted_conf_ls, category_id_ls
class inferThread(threading.Thread):
    """Worker thread: run one batch through the TRT wrapper and save results."""

    def __init__(self, yolov8_wrapper, image_path_batch):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov8_wrapper
        batch_image_raw, use_time = wrapper.infer(
            wrapper.get_raw_image(self.image_path_batch))
        # Save image: each annotated frame goes to output/<original filename>.
        for idx, img_path in enumerate(self.image_path_batch):
            filename = os.path.split(img_path)[1]
            cv2.imwrite(os.path.join('output', filename), batch_image_raw[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(
            self.image_path_batch, use_time * 1000))
class warmUpThread(threading.Thread):
    """Worker thread: push one batch of blank frames through the engine."""

    def __init__(self, yolov8_wrapper):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image_zeros())
        print(
            'warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
if __name__ == "__main__":
    # load custom plugin and engine
    engine_file_path = sys.argv[1] if len(sys.argv) > 1 else "./yolov8x-cls-fp32.engine"
    # Start every run with a clean output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)
        image_dir = "samples/"
        image_path_batches = get_img_path_batches(
            yolov8_wrapper.batch_size, image_dir)
        # Ten warm-up rounds with blank frames before timing real batches.
        for _ in range(10):
            warm_up = warmUpThread(yolov8_wrapper)
            warm_up.start()
            warm_up.join()
        # Run every batch of real images, one worker thread at a time.
        for batch in image_path_batches:
            worker = inferThread(yolov8_wrapper, batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()

276
yolov8_det.cpp Normal file
View File

@ -0,0 +1,276 @@
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"
// Global TensorRT logger shared by builder/runtime creation below.
Logger gLogger;
using namespace nvinfer1;
// Per-image output size in floats: one detection-count slot followed by
// kMaxNumOutputBbox fixed-size Detection records.
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Build a YOLOv8 detection engine from `wts_name` weights and write the
// serialized plan to `engine_name`. `is_p` selects the head variant
// (6 -> P6, 2 -> P2, anything else -> default P5); gd/gw/max_channels scale
// the network. NOTE(review): `sub_type` is accepted but unused here — the
// variant is fully described by is_p/gd/gw/max_channels.
void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
                      float& gw, int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;
    if (is_p == 6) {
        serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else if (is_p == 2) {
        serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else {
        serialized_engine = buildEngineYolov8Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    }
    assert(serialized_engine);
    // Persist the serialized plan as a binary file.
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    delete serialized_engine;
    delete config;
    delete builder;
}
// Load a serialized engine plan from disk and create the runtime, engine and
// execution context. All three out-parameters are owned by the caller and must
// be deleted in reverse order (context, engine, runtime). Aborts via assert
// on any failure.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    // Determine file size, then read the whole plan into a temporary buffer.
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    // The plan bytes are copied by deserializeCudaEngine; safe to free now.
    delete[] serialized_engine;
}
// Allocate device input/output buffers plus the host-side result buffer(s).
// "c" (CPU post-process): allocates output_buffer_host for the raw engine output.
// "g" (GPU post-process): allocates decode buffers on host and device instead;
// only batch size 1 is supported in this mode.
// All allocations are owned by the caller (cudaFree / delete[]).
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            exit(0);
        }
        // Allocate memory for decode_ptr_host and copy to device
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}
// Enqueue one batch on `stream`, then DMA results back to the host.
// "c": copies the raw engine output into `output` for CPU-side NMS later.
// "g": runs GPU decode + NMS kernels and copies only the compact decoded
//      buffer into decode_ptr_host.
// Blocks until all work on the stream has finished before returning.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    context.enqueue(batchsize, buffers, stream, nullptr);
    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        // NOTE(review): this timestamp is taken before cudaStreamSynchronize,
        // so the printed time measures enqueue latency, not completed GPU work.
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    } else if (cuda_post_process == "g") {
        // Clear the decode buffer (first float is the detection counter).
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference and gpu postprocess time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
}
// Parse command-line arguments.
//   -s <wts> <engine> <sub_type>   serialize: fills wts/engine/sub_type and the
//                                  scaling parameters gd/gw/max_channels; is_p
//                                  becomes 6 or 2 for the "*6"/"*2" variants.
//   -d <engine> <img_dir> <c|g>    deserialize: fills engine/img_dir and the
//                                  post-processing mode ("c" = CPU, "g" = GPU).
// Returns false if the arguments match neither form.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;
    if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        // BUG FIX: the original wrote `auto sub_type = std::string(argv[4]);`,
        // declaring a local that shadowed the output parameter, so the caller's
        // sub_type was never filled in. Assign through the reference instead.
        sub_type = std::string(argv[4]);
        if (sub_type[0] == 'n') {
            gd = 0.33;
            gw = 0.25;
            max_channels = 1024;
        } else if (sub_type[0] == 's') {
            gd = 0.33;
            gw = 0.50;
            max_channels = 1024;
        } else if (sub_type[0] == 'm') {
            gd = 0.67;
            gw = 0.75;
            max_channels = 576;
        } else if (sub_type[0] == 'l') {
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
        } else if (sub_type[0] == 'x') {
            gd = 1.0;
            gw = 1.25;
            max_channels = 640;
        } else {
            return false;
        }
        // A two-character sub type selects the P6 / P2 head variant.
        if (sub_type.size() == 2 && sub_type[1] == '6') {
            is_p = 6;
        } else if (sub_type.size() == 2 && sub_type[1] == '2') {
            is_p = 2;
        }
    } else if (std::string(argv[1]) == "-d" && argc == 5) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
    } else {
        return false;
    }
    return true;
}
// Entry point: either serialize a .wts into an engine plan (-s) or run batched
// detection inference over a directory of images (-d).
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    std::string sub_type = "";
    std::string cuda_post_process = "";
    int model_bboxes;
    int is_p = 0;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;
    if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl;
        return -1;
    }
    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
        return 0;
    }
    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    // Dim 0 of the output binding is the number of candidate boxes the model emits.
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;
    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);
    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess (letterbox + normalize, done on the GPU)
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            //Process gpu decode and nms results
            batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        }
        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);
        // Save images (prefixed with "_" in the current working directory)
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;
    // Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    //std::cout << std::endl;
    return 0;
}

451
yolov8_det_trt.py Normal file
View File

@ -0,0 +1,451 @@
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt
# Detection confidence threshold and NMS IoU threshold.
CONF_THRESH = 0.2
IOU_THRESHOLD = 0.4
# Per-detection field counts in the engine's flat output layout:
POSE_NUM = 17 * 3  # 17 keypoints x (x, y, conf) for pose models
DET_NUM = 6        # cx, cy, w, h, conf, class_id
SEG_NUM = 32       # mask coefficients for segmentation models
OBB_NUM = 1        # extra angle field for oriented bounding boxes
def get_img_path_batches(batch_size, img_dir):
    """Walk img_dir recursively and group the file paths into batches.

    Every batch holds exactly batch_size paths except possibly the last,
    which holds whatever remains. Returns a list of lists of paths.
    """
    batches = []
    current = []
    for root, _, files in os.walk(img_dir):
        for name in files:
            # A full batch is flushed only once another file arrives.
            if len(current) == batch_size:
                batches.append(current)
                current = []
            current.append(os.path.join(root, name))
    if current:
        batches.append(current)
    return batches
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw a single bounding box (and optional label tag) on img
                 in place.
    param:
        x: box as [x1, y1, x2, y2]
        img: an opencv image object (modified in place)
        color: BGR color such as (0, 255, 0); random if omitted
        label: optional text drawn in a filled tag above the box
        line_thickness: line/font thickness; derived from image size if omitted
    return:
        no return
    """
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line/font thickness
    color = color or [random.randint(0, 255) for _ in range(3)]
    c1 = (int(x[0]), int(x[1]))
    c2 = (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        c2 = (c1[0] + t_size[0], c1[1] - t_size[1] - 3)
        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled label background
        cv2.putText(
            img,
            label,
            (c1[0], c1[1] - 2),
            0,
            tl / 3,
            [225, 255, 255],
            thickness=tf,
            lineType=cv2.LINE_AA,
        )
class YoLov8TRT(object):
    """
    description: A YOLOv8 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        # engine_file_path: path to a serialized TensorRT engine plan file.
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)
        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        # NOTE(review): the implicit-batch binding API used below
        # (get_binding_shape / max_batch_size / binding_is_input) is deprecated
        # in newer TensorRT releases — confirm the installed version provides it.
        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # Input layout is (..., H, W): record the spatial size.
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        # NOTE(review): this is the FULL output buffer length (it already
        # includes the batch dimension); the per-image slicing in infer()
        # therefore assumes batch_size == 1 — confirm for multi-batch engines.
        self.det_output_length = host_outputs[0].shape[0]

    def infer(self, raw_image_generator):
        """Run one batch through the engine and draw detections on the images.

        raw_image_generator yields BGR frames; returns (annotated images,
        elapsed seconds covering the copy / execute / copy-back span).
        """
        # NOTE(review): this Thread.__init__ call looks like a leftover; it has
        # no effect since self is not a Thread.
        threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        # Do postprocess
        for i in range(self.batch_size):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup (one zero frame per batch slot)
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: numpy array, raw BGR image
        return:
            image: the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top/bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left/right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right.
                     Also undoes the letterbox scaling/padding applied in preprocess_image.
        param:
            origin_h: height of original image
            origin_w: width of original image
            x: A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y: A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Remove vertical padding, then rescale by the width ratio.
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Remove horizontal padding, then rescale by the height ratio.
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h
        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output: A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
            origin_h: height of original image
            origin_w: width of original image
        return:
            result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a numpy, each element is the score corresponding to box
            result_classid: finally classid, a numpy, each element is the classid corresponding to box
        """
        # Fixed per-detection record width shared by det/seg/pose/obb layouts.
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        # pred = np.reshape(output[1:], (-1, 38))[:num, :]
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (the +1 treats coordinates as inclusive pixels)
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
                     Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates to the original image bounds
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression: greedily keep the highest-confidence
        # box and drop same-class boxes that overlap it beyond nms_thres.
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes
class inferThread(threading.Thread):
    """Worker thread: run one batch through the TRT wrapper and save results."""

    def __init__(self, yolov8_wrapper, image_path_batch):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov8_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image(self.image_path_batch))
        # Save image: each annotated frame goes to output/<original filename>.
        for idx, img_path in enumerate(self.image_path_batch):
            filename = os.path.split(img_path)[1]
            cv2.imwrite(os.path.join('output', filename), batch_image_raw[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
class warmUpThread(threading.Thread):
    """Worker thread: push one batch of blank frames through the engine."""

    def __init__(self, yolov8_wrapper):
        super().__init__()
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image_zeros())
        print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
if __name__ == "__main__":
    # load custom plugin and engine
    PLUGIN_LIBRARY = "./build_20250603/libmyplugins.so"
    engine_file_path = "./build_20250603/best.engine"
    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]
    if len(sys.argv) > 2:
        PLUGIN_LIBRARY = sys.argv[2]
    ctypes.CDLL(PLUGIN_LIBRARY)
    # load coco labels
    # categories = ["face", "shoe", "phone", "e-bike"]
    categories = ["helmet","non-Helmet","shoes"]
    # Start every run with a clean output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)
        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)
        # Ten warm-up rounds with blank frames before timing real batches.
        for _ in range(10):
            warm_up = warmUpThread(yolov8_wrapper)
            warm_up.start()
            warm_up.join()
        # Run every batch of real images, one worker thread at a time.
        for batch in image_path_batches:
            worker = inferThread(yolov8_wrapper, batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()

276
yolov8_obb.cpp Normal file
View File

@ -0,0 +1,276 @@
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"
// Global TensorRT logger shared by builder/runtime creation below.
Logger gLogger;
using namespace nvinfer1;
// Per-image output size in floats: one detection-count slot followed by
// kMaxNumOutputBbox fixed-size Detection records.
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Build a YOLOv8-OBB engine from `wts_name` weights and write the serialized
// plan to `engine_name`. Only the default P5 head is supported for OBB.
// NOTE(review): `sub_type` is accepted but unused here.
void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
                      float& gw, int& max_channels) {
    // BUG FIX: the original printed "not supported" for is_p == 6/2 but fell
    // through with serialized_engine == nullptr; with NDEBUG the assert below
    // disappears and serialized_engine->data() dereferenced a null pointer
    // (it also leaked builder/config). Reject unsupported variants up front.
    if (is_p == 6) {
        std::cout << "p6 is not supported right now" << std::endl;
        return;
    }
    if (is_p == 2) {
        std::cout << "p2 is not supported right now" << std::endl;
        return;
    }
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine =
            buildEngineYolov8Obb(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    assert(serialized_engine);
    // Persist the serialized plan as a binary file.
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    delete serialized_engine;
    delete config;
    delete builder;
}
// Load a serialized engine plan from disk and create the runtime, engine and
// execution context. All three out-parameters are owned by the caller and must
// be deleted in reverse order (context, engine, runtime). Aborts via assert
// on any failure.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    // Determine file size, then read the whole plan into a temporary buffer.
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    // The plan bytes are copied by deserializeCudaEngine; safe to free now.
    delete[] serialized_engine;
}
// Allocate device input/output buffers plus the host-side result buffer(s).
// "c" (CPU post-process): allocates output_buffer_host for the raw engine output.
// "g" (GPU post-process): allocates decode buffers on host and device instead;
// only batch size 1 is supported in this mode.
// All allocations are owned by the caller (cudaFree / delete[]).
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            exit(0);
        }
        // Allocate memory for decode_ptr_host and copy to device
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}
// Enqueue one batch on `stream`, then DMA results back to the host.
// "c": copies the raw engine output into `output` for CPU-side NMS later.
// "g": runs the OBB-specific GPU decode + NMS kernels and copies only the
//      compact decoded buffer into decode_ptr_host.
// Blocks until all work on the stream has finished before returning.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    context.enqueue(batchsize, buffers, stream, nullptr);
    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        // NOTE(review): this timestamp is taken before cudaStreamSynchronize,
        // so the printed time measures enqueue latency, not completed GPU work.
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    } else if (cuda_post_process == "g") {
        // Clear the decode buffer (first float is the detection counter).
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode_obb((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms_obb(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference and gpu postprocess time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
}
// Parse command-line arguments.
//   -s <wts> <engine> <sub_type>   serialize: fills wts/engine/sub_type and the
//                                  scaling parameters gd/gw/max_channels; is_p
//                                  becomes 6 or 2 for the "*6"/"*2" variants.
//   -d <engine> <img_dir> <c|g>    deserialize: fills engine/img_dir and the
//                                  post-processing mode ("c" = CPU, "g" = GPU).
// Returns false if the arguments match neither form.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;
    if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        // BUG FIX: the original wrote `auto sub_type = std::string(argv[4]);`,
        // declaring a local that shadowed the output parameter, so the caller's
        // sub_type was never filled in. Assign through the reference instead.
        sub_type = std::string(argv[4]);
        if (sub_type[0] == 'n') {
            gd = 0.33;
            gw = 0.25;
            max_channels = 1024;
        } else if (sub_type[0] == 's') {
            gd = 0.33;
            gw = 0.50;
            max_channels = 1024;
        } else if (sub_type[0] == 'm') {
            gd = 0.67;
            gw = 0.75;
            max_channels = 576;
        } else if (sub_type[0] == 'l') {
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
        } else if (sub_type[0] == 'x') {
            gd = 1.0;
            gw = 1.25;
            max_channels = 640;
        } else {
            return false;
        }
        // A two-character sub type selects the P6 / P2 head variant.
        if (sub_type.size() == 2 && sub_type[1] == '6') {
            is_p = 6;
        } else if (sub_type.size() == 2 && sub_type[1] == '2') {
            is_p = 2;
        }
    } else if (std::string(argv[1]) == "-d" && argc == 5) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
    } else {
        return false;
    }
    return true;
}
// Entry point for the YOLOv8-OBB TensorRT demo.
// "-s" mode builds and serializes an engine from a .wts file; "-d" mode
// deserializes an engine and runs batched inference over a directory of
// images, with postprocessing on CPU ("c") or GPU ("g").
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    std::string sub_type = "";
    std::string cuda_post_process = "";
    int model_bboxes;
    int is_p = 0;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;
    if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl;
        return -1;
    }
    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
        return 0;
    }
    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    // Binding 1 is the output tensor; its first dimension is the number of
    // candidate boxes the model emits.
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;
    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);
    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess (letterbox + normalize) directly on the GPU
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // CPU NMS over the raw host output
            batch_nms_obb(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            batch_process_obb(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        }
        // Draw oriented bounding boxes in-place on the batch images
        draw_bbox_obb(img_batch, res_batch);
        // Save annotated images next to the binary, prefixed with "_"
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }
    // Release stream and buffers (host buffers were new[]-allocated in prepare_buffer)
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine (context before engine before runtime)
    delete context;
    delete engine;
    delete runtime;
    return 0;
}

571
yolov8_obb_trt.py Normal file
View File

@ -0,0 +1,571 @@
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import sys
import threading
import time
import cv2
import math
import numpy as np
import pycuda.autoinit # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1
def get_img_path_batches(batch_size, img_dir):
    """Recursively collect every file under img_dir and group the paths
    into batches of at most batch_size (walk order; last batch may be short).
    """
    batches = []
    current = []
    for root, _dirs, files in os.walk(img_dir):
        for fname in files:
            if len(current) == batch_size:
                batches.append(current)
                current = []
            current.append(os.path.join(root, fname))
    if current:
        batches.append(current)
    return batches
def regularize_rboxes(rboxes):
    """
    Regularize rotated boxes so that width >= height and angle lies in [0, pi).

    Args:
        rboxes (numpy.ndarray): Input boxes of shape (N, 5) in xywhr format.
    Returns:
        (numpy.ndarray): The regularized boxes, same shape as the input.
    """
    x, y, w, h, t = np.split(rboxes, 5, axis=-1)
    wide = w > h
    # Swap the two extents wherever height exceeds width, rotating the
    # angle by 90 degrees to compensate, then wrap the angle into [0, pi).
    long_side = np.where(wide, w, h)
    short_side = np.where(wide, h, w)
    angle = np.where(wide, t, t + math.pi / 2) % math.pi
    return np.concatenate([x, y, long_side, short_side, angle], axis=-1)
def xywhr2xyxyxyxy(x):
    """
    Convert batched Oriented Bounding Boxes (OBB) from [xywh, rotation] to four corner points.

    Args:
        x (numpy.ndarray): Boxes in [cx, cy, w, h, rotation] format of shape (n, 5) or (b, n, 5).
    Returns:
        (numpy.ndarray): Corner points of shape (n, 4, 2) or (b, n, 4, 2),
        ordered pt1, pt2, pt3, pt4 around the box.
    """
    # Canonicalize the boxes (w >= h, angle in [0, pi)) before converting.
    boxes = regularize_rboxes(x)
    center = boxes[..., :2]
    half_w = boxes[..., 2:3] / 2
    half_h = boxes[..., 3:4] / 2
    angle = boxes[..., 4:5]
    cos_a = np.cos(angle)
    sin_a = np.sin(angle)
    # Half-extent vectors along the rotated width and height axes.
    vec_w = np.concatenate([half_w * cos_a, half_w * sin_a], axis=-1)
    vec_h = np.concatenate([-half_h * sin_a, half_h * cos_a], axis=-1)
    corners = [
        center + vec_w + vec_h,
        center + vec_w - vec_h,
        center - vec_w - vec_h,
        center - vec_w + vec_h,
    ]
    return np.stack(corners, axis=-2)
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one oriented bounding box (and optional label) on img in place.
    param:
        x: a box likes [x1,y1,x2,y2,angle]
        img: a opencv image object
        color: color to draw rectangle, such as (0,255,0)
        label: str
        line_thickness: int
    return:
        no return
    """
    # Derive a thickness from the image size unless the caller fixed one.
    thickness = (
        line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    )
    corners = xywhr2xyxyxyxy(x).reshape(-1, 4, 2).squeeze()
    anchor = [int(v) for v in corners[0]]
    # NOTE: cv2-version polylines needs np.asarray type.
    cv2.polylines(img, [np.asarray(corners, dtype=int)], True, color, thickness=thickness, lineType=cv2.LINE_AA)
    if label:
        font_thickness = max(thickness - 1, 1)
        text_w, text_h = cv2.getTextSize(label, 0, fontScale=thickness / 3, thickness=font_thickness)[0]
        # Place the label above the first corner when there is room, else below.
        fits_above = anchor[1] - text_h >= 3
        opposite = anchor[0] + text_w, anchor[1] - text_h - 3 if fits_above else anchor[1] + text_h + 3
        cv2.rectangle(img, anchor, opposite, color, -1, cv2.LINE_AA)  # filled background
        cv2.putText(
            img,
            label,
            (anchor[0], anchor[1] - 2 if fits_above else anchor[1] + text_h + 2),
            0,
            thickness / 3,
            [225, 255, 255],
            thickness=font_thickness,
            lineType=cv2.LINE_AA,
        )
class YoLov8TRT(object):
    """
    description: A YOLOv8-OBB class that wraps TensorRT ops, preprocess and postprocess ops.

    Owns a CUDA context, a deserialized TensorRT engine, and page-locked
    host/device buffers. call infer() with an iterable of BGR images and
    destroy() when finished.
    """

    def __init__(self, engine_file_path):
        # Create a dedicated CUDA context on device 0; it is pushed/popped
        # around every inference so this object can be used across threads.
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)
        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()
        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []
        # Allocate one host + device buffer pair per engine binding.
        # NOTE(review): uses the implicit-batch API (max_batch_size,
        # get_binding_shape), which requires an engine built that way and a
        # TensorRT version that still exposes it — confirm against the build.
        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host (page-locked) and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # Input binding shape is (..., H, W); remember the network size.
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
        # Store everything needed by infer()/destroy()
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        # Flat length of one image's output slice in host_outputs[0]
        self.det_output_length = host_outputs[0].shape[0]

    def infer(self, raw_image_generator):
        # NOTE(review): this class is not a Thread; this init call looks like
        # a copy-paste leftover from the threaded examples — confirm/remove.
        threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        # Restore handles allocated in __init__
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess: letterbox each image into the NCHW batch tensor
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)
        # Copy input image to page-locked host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        # Do postprocess per image slice, then draw the results in place
        for i in range(self.batch_size):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                # Seed with the class id so each class gets a stable color
                np.random.seed(int(result_classid[j]))
                color = [np.random.randint(0, 255) for _ in range(3)]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                    color=color,
                    line_thickness=1
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path (generator, one BGR array per path)
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup (zero-filled HWC uint8 images)
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: an opencv BGR image (HWC uint8)
        return:
            image: the processed image (1x3xHxW float32)
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings for the letterbox
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top/bottom
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left/right
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right,
                     undoing the letterbox offset/scale applied in preprocess_image.
        param:
            origin_h: height of original image
            origin_w: width of original image
            x: A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y: A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Vertical padding was added: shift y, then rescale by r_w
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Horizontal padding was added: shift x, then rescale by r_h
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h
        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output: A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id,angle cx,cy,w,h,conf,cls_id,angle ...]
            origin_h: height of original image
            origin_w: width of original image
        return:
            result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2, angle]
            result_scores: finally scores, a numpy, each element is the score correspoing to box
            result_classid: finally classid, a numpy, each element is the classid correspoing to box
        """
        # Per-box record layout: 6 det values + 32 seg + 51 pose + 1 obb angle = 90
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the num of boxes detected (element 0 is the count)
        num = int(output[0])
        # Reshape the flat tail into a (num, 90) two dimensional ndarray
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w,
                                         conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        # Columns 0-3 are xyxy, column 89 is the OBB angle
        columns_to_keep = [0, 1, 2, 3, 89]
        result_boxes = boxes[:, columns_to_keep] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def covariance_matrix(self, boxes):
        """
        description: Generating covariance matrix from obbs.
        param:
            boxes (np.ndarray): A numpy of shape (N, 5) representing rotated bounding boxes, with xywhr format.
        return:
            (np.ndarray): Covariance matrixs (a, b, c components) corresponding to original rotated bounding boxes.
        """
        # Gaussian bounding boxes, ignore the center points (the first two columns) because they are not needed here.
        widths = boxes[:, 2:3].reshape(-1)
        heights = boxes[:, 3:4].reshape(-1)
        angles = boxes[:, 4].reshape(-1)
        a, b, c = (widths ** 2) / 12, (heights ** 2) / 12, angles
        cos_angles = np.cos(c)
        sin_angles = np.sin(c)
        cos2 = cos_angles ** 2
        sin2 = sin_angles ** 2
        return a * cos2 + b * sin2, a * sin2 + b * cos2, (a - b) * cos_angles * sin_angles

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two axis-aligned bounding boxes.
        NOTE(review): not called by the OBB NMS path below (which uses
        batch_probiou); kept for API parity with the other demos.
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (clamped at zero when boxes do not overlap)
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
        return iou

    def batch_probiou(self, obb1, obb2, eps=1e-7):
        """
        description: Calculate the prob IoU between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
        param:
            obb1 (np.ndarray): A numpy of shape (N, 5) representing ground truth obbs, with xywhr format.
            obb2 (np.ndarray): A numpy of shape (M, 5) representing predicted obbs, with xywhr format.
            eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
        return:
            iou: computed iou
        """
        x1, y1 = obb1[:, 0], obb1[:, 1]
        x2, y2 = obb2[:, 0], obb2[:, 1]
        a1, b1, c1 = self.covariance_matrix(obb1)
        a2, b2, c2 = self.covariance_matrix(obb2)
        # Bhattacharyya distance terms t1..t3 between the two Gaussians
        t1 = (
            ((a1 + a2) * (y1 - y2) ** 2 + (b1 + b2) * (x1 - x2) ** 2) /
            ((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2 + eps)
        ) * 0.25
        t2 = (
            ((c1 + c2) * (x2 - x1) * (y1 - y2)) /
            ((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2 + eps)
        ) * 0.5
        t3 = (
            ((a1 + a2) * (b1 + b2) - (c1 + c2) ** 2) /
            (4 * (np.clip(a1 * b1 - c1 ** 2, 0, None) * np.clip(a2 * b2 - c2 ** 2, 0, None)) ** 0.5 + eps)
            + eps
        )
        t3 = np.log(t3) * 0.5
        # Clamp the distance, then map to a similarity in [0, 1]
        bd = np.clip(t1 + t2 + t3, eps, 100.0)
        hd = np.sqrt(1.0 - np.exp(-bd) + eps)
        return 1 - hd

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression (using probabilistic IoU on the rotated boxes) to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id, angle)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id, angle)
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Columns used for rotated IoU: xyxy plus the angle at index 89
        col_idx = [0, 1, 2, 3, 89]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates to the original image bounds
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs, highest first
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression: greedily keep the best box and
        # drop same-class boxes that overlap it too much
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.batch_probiou(np.expand_dims(boxes[0, col_idx], 0), boxes[:, col_idx]) > nms_thres
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes
class inferThread(threading.Thread):
    """Worker thread: run inference on one batch of image paths and write
    the annotated images into the ``output/`` directory."""

    def __init__(self, yolov8_wrapper, image_path_batch):
        threading.Thread.__init__(self)
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov8_wrapper
        paths = self.image_path_batch
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image(paths))
        for idx, img_path in enumerate(paths):
            _parent, filename = os.path.split(img_path)
            # Save annotated image under output/ with the original filename
            cv2.imwrite(os.path.join('output', filename), batch_image_raw[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
class warmUpThread(threading.Thread):
    """Worker thread: run one warm-up inference pass on zero-filled images."""

    def __init__(self, yolov8_wrapper):
        threading.Thread.__init__(self)
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image_zeros())
        print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
if __name__ == "__main__":
    # Load custom TensorRT plugin library and pick the engine file.
    # Optional CLI overrides: argv[1] = engine path, argv[2] = plugin library.
    PLUGIN_LIBRARY = "./build/libmyplugins.so"
    engine_file_path = "yolov8n-obb.engine"
    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]
    if len(sys.argv) > 2:
        PLUGIN_LIBRARY = sys.argv[2]
    ctypes.CDLL(PLUGIN_LIBRARY)
    # load DOTAV 1.5 labels
    categories = ["plane", "ship", "storage tank", "baseball diamond", "tennis court",
                  "basketball court", "ground track field", "harbor",
                  "bridge", "large vehicle", "small vehicle", "helicopter",
                  "roundabout", "soccer ball field", "swimming pool", "container crane"]
    # Start from an empty output/ directory each run.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)
        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)
        # Warm up the GPU with 10 dummy inference passes before timing real work.
        for i in range(10):
            # create a new thread to do warm_up
            thread1 = warmUpThread(yolov8_wrapper)
            thread1.start()
            thread1.join()
        for batch in image_path_batches:
            # create a new thread to do inference
            thread1 = inferThread(yolov8_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance (pops the CUDA context even on error)
        yolov8_wrapper.destroy()

277
yolov8_pose.cpp Normal file
View File

@ -0,0 +1,277 @@
#include <cassert>
#include <fstream>
#include <iostream>
#include <vector>

#include <opencv2/opencv.hpp>

#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"
Logger gLogger;
using namespace nvinfer1;
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Build a YOLOv8-pose TensorRT engine from the given .wts weights and write
// the serialized plan to `engine_name`.
// is_p selects the network variant: 6 -> P6 model, 2 -> P2 (not implemented),
// anything else -> the standard model. gd/gw are depth/width multipliers and
// max_channels is the channel cap passed to the builder.
// NOTE(review): sub_type is accepted but unused here — confirm it is only
// needed by the caller.
void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
                      float& gw, int& max_channels) {
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serialized_engine = nullptr;
    if (is_p == 6) {
        serialized_engine = buildEngineYolov8PoseP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    } else if (is_p == 2) {
        // P2 build not implemented: serialized_engine stays null and the
        // assert below fires.
        std::cout << "p2 is not supported right now" << std::endl;
    } else {
        serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
    }
    assert(serialized_engine);
    // Write the serialized plan to disk.
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cout << "could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
    delete serialized_engine;
    delete config;
    delete builder;
}
// Load a serialized TensorRT plan from `engine_name` and create the runtime,
// engine and execution context. On any failure an assert fires (debug builds).
// The caller owns *runtime/*engine/*context and must delete them in reverse
// order (context, engine, runtime).
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    // Determine the plan size, then read it into an RAII buffer.
    // (std::vector replaces the original raw new[]/delete[] pair, which was
    // not exception-safe and carried a meaningless assert after new.)
    file.seekg(0, file.end);
    const size_t size = static_cast<size_t>(file.tellg());
    file.seekg(0, file.beg);
    std::vector<char> serialized_engine(size);
    file.read(serialized_engine.data(), size);
    file.close();
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine.data(), size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
}
// Allocate the device input/output buffers and, depending on the
// postprocess mode, the host-side buffers:
//   "c" -> CPU postprocess: host copy of the raw network output.
//   "g" -> GPU postprocess: host + device decode buffers (batch size 1 only).
// All returned buffers are owned by the caller (cudaFree / delete[]).
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    if (cuda_post_process == "c") {
        *output_buffer_host = new float[kBatchSize * kOutputSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            exit(0);
        }
        // Allocate memory for decode_ptr_host and copy to device
        // (layout: 1 count slot + kMaxNumOutputBbox records of bbox_element floats)
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}
// Run one batch through the network on `stream` and bring the results back
// to the host. "c" copies the raw output for CPU postprocess; "g" runs the
// CUDA decode + NMS kernels and copies the decoded records instead.
// NOTE(review): the printed duration is measured before the final
// cudaStreamSynchronize, so it covers kernel launch/enqueue time, not
// necessarily completed GPU work — confirm intent.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    context.enqueue(batchsize, buffers, stream, nullptr);
    if (cuda_post_process == "c") {
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    } else if (cuda_post_process == "g") {
        // Clear the decode buffer (slot 0 holds the detection count)
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference and gpu postprocess time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }
    // Wait for all queued work (inference + copies) to finish.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}
// Parse the command line.
//
// Serialize mode:  ./yolov8 -s <.wts> <.engine> <sub_type>   (argc == 5 or 7)
// Inference mode:  ./yolov8 -d <.engine> <img_dir> <c|g>     (argc == 5)
//
// sub_type's first character selects the model scale (n/s/m/l/x) and sets the
// depth/width multipliers gd/gw plus max_channels; an optional second
// character '6' or '2' selects the P6/P2 variant via is_p.
// Returns true when the arguments are valid, false otherwise.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;
    if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        // BUGFIX: the original wrote `auto sub_type = std::string(argv[4]);`,
        // declaring a local that shadowed the output reference parameter, so
        // the caller's sub_type was never filled in. Assign to the parameter.
        sub_type = std::string(argv[4]);
        if (sub_type[0] == 'n') {
            gd = 0.33;
            gw = 0.25;
            max_channels = 1024;
        } else if (sub_type[0] == 's') {
            gd = 0.33;
            gw = 0.50;
            max_channels = 1024;
        } else if (sub_type[0] == 'm') {
            gd = 0.67;
            gw = 0.75;
            max_channels = 576;
        } else if (sub_type[0] == 'l') {
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
        } else if (sub_type[0] == 'x') {
            gd = 1.0;
            gw = 1.25;
            max_channels = 640;
        } else {
            return false;
        }
        // Optional variant suffix: "n6" -> P6 model, "n2" -> P2 model.
        if (sub_type.size() == 2 && sub_type[1] == '6') {
            is_p = 6;
        } else if (sub_type.size() == 2 && sub_type[1] == '2') {
            is_p = 2;
        }
    } else if (std::string(argv[1]) == "-d" && argc == 5) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
    } else {
        return false;
    }
    return true;
}
// Entry point for the YOLOv8-pose TensorRT demo.
// "-s" mode builds and serializes an engine from a .wts file; "-d" mode
// deserializes an engine and runs batched inference over a directory of
// images. Postprocess is CPU-only ("c"); the GPU path ("g") is not
// implemented for pose yet.
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    std::string sub_type = "";
    std::string cuda_post_process = "";
    int model_bboxes;
    int is_p = 0;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;
    if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl;
        return -1;
    }
    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
        return 0;
    }
    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    // Binding 1 is the output tensor; its first dimension is the number of
    // candidate boxes the model emits.
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;
    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);
    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess (letterbox + normalize) directly on the GPU
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
              decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // CPU NMS over the raw host output
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            // todo pose in gpu
            std::cerr << "pose_postprocess is not support in gpu right now" << std::endl;
        }
        // Draw bounding boxes and keypoint skeleton lines in-place
        draw_bbox_keypoints_line(img_batch, res_batch);
        // Save annotated images next to the binary, prefixed with "_"
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }
    // Release stream and buffers (host buffers were new[]-allocated in prepare_buffer)
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine (context before engine before runtime)
    delete context;
    delete engine;
    delete runtime;
    return 0;
}

502
yolov8_pose_trt.py Normal file
View File

@ -0,0 +1,502 @@
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1
keypoint_pairs = [
(0, 1), (0, 2), (0, 5), (0, 6), (1, 2),
(1, 3), (2, 4), (5, 6), (5, 7), (5, 11),
(6, 8), (6, 12), (7, 9), (8, 10), (11, 12),
(11, 13), (12, 14), (13, 15), (14, 16)
]
def get_img_path_batches(batch_size, img_dir):
    """Recursively collect every file path under img_dir (os.walk order) and
    return them chunked into batches of at most batch_size."""
    all_paths = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(img_dir)
        for name in files
    ]
    return [all_paths[i:i + batch_size] for i in range(0, len(all_paths), batch_size)]
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one axis-aligned bounding box (and optional label) on img in place.
    param:
        x: a box likes [x1,y1,x2,y2]
        img: a opencv image object
        color: color to draw rectangle, such as (0,255,0)
        label: str
        line_thickness: int
    return:
        no return
    """
    # Derive a thickness from the image size unless the caller fixed one.
    thickness = (
        line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    )
    color = color or [random.randint(0, 255) for _ in range(3)]
    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)
    if label:
        font_thickness = max(thickness - 1, 1)
        text_size = cv2.getTextSize(label, 0, fontScale=thickness / 3, thickness=font_thickness)[0]
        # Filled background band above the box for the label text.
        label_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
        cv2.rectangle(img, top_left, label_corner, color, -1, cv2.LINE_AA)
        cv2.putText(
            img,
            label,
            (top_left[0], top_left[1] - 2),
            0,
            thickness / 3,
            [225, 255, 255],
            thickness=font_thickness,
            lineType=cv2.LINE_AA,
        )
class YoLov8TRT(object):
    """
    description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops.
    """
    def __init__(self, engine_file_path):
        # Create a Context on this device,
        # pushed/popped around every inference call (see infer/destroy).
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)
        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()
        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []
        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            # Total element count for this binding across the whole batch.
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers (pinned host memory for async copies)
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # Network input resolution taken from the binding shape (..., H, W).
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
        # Store
        self.stream = stream
        self.context = context
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        # Per-image length of the flat detection output buffer.
        self.det_output_size = host_outputs[0].shape[0]
    def infer(self, raw_image_generator):
        """
        description: Run one batch through the engine and draw boxes/skeletons.
        param:
            raw_image_generator: yields BGR images (one per batch slot)
        return:
            batch_image_raw: list of annotated images
            elapsed: inference wall time in seconds
        NOTE(review): YoLov8TRT does not subclass threading.Thread; the
        Thread.__init__ call below looks like a leftover — confirm and remove.
        """
        threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i],
                      input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)
        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        # Do postprocess
        for i in range(self.batch_size):
            result_boxes, result_scores, result_classid, keypoints = self.post_process(
                output[i * (self.det_output_size): (i + 1) * (self.det_output_size)],
                batch_origin_h[i], batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            # (`categories` is a module-level label list defined in __main__).
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
                num_keypoints = len(keypoints[j]) // 3
                points = []
                for k in range(num_keypoints):
                    x = keypoints[j][k * 3]
                    y = keypoints[j][k * 3 + 1]
                    confidence = keypoints[j][k * 3 + 2]
                    # Keypoints with non-positive confidence are skipped when drawing.
                    if confidence > 0:
                        points.append((int(x), int(y)))
                    else:
                        points.append(None)
                # Draw skeleton lines between the configured keypoint index pairs.
                for pair in keypoint_pairs:
                    partA, partB = pair
                    if points[partA] and points[partB]:
                        cv2.line(batch_image_raw[i], points[partA], points[partB], (0, 255, 0), 2)
        return batch_image_raw, end - start
    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)
    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup (zero-filled HxWx3 uint8 frames);
        image_path_batch is ignored, kept for interface symmetry.
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: ndarray, original BGR image
        return:
            image: the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate widht and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        # Letterbox: scale by the limiting side, pad the other symmetrically.
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w
    def xywh2xyxy_with_keypoints(self, origin_h, origin_w, boxes, keypoints):
        """
        description: Undo the letterbox transform for boxes and keypoints,
        mapping network-input coordinates back to original-image coordinates.
        param:
            origin_h/origin_w: original image size
            boxes: (n, 4) array in network-input coordinates
            keypoints: (n, 51) array of [x, y, conf] triplets
        return:
            (box_array, keypoint_array) in original-image coordinates
        """
        n = len(boxes)
        box_array = np.zeros_like(boxes)
        keypoint_array = np.zeros_like(keypoints)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        for i in range(n):
            # Mirror of preprocess_image: subtract the padding offset, divide by scale.
            if r_h > r_w:
                box = boxes[i]
                lmk = keypoints[i]
                box_array[i, 0] = box[0] / r_w
                box_array[i, 2] = box[2] / r_w
                box_array[i, 1] = (box[1] - (self.input_h - r_w * origin_h) / 2) / r_w
                box_array[i, 3] = (box[3] - (self.input_h - r_w * origin_h) / 2) / r_w
                for j in range(0, len(lmk), 3):
                    keypoint_array[i, j] = lmk[j] / r_w
                    keypoint_array[i, j + 1] = (lmk[j + 1] - (self.input_h - r_w * origin_h) / 2) / r_w
                    keypoint_array[i, j + 2] = lmk[j + 2]
            else:
                box = boxes[i]
                lmk = keypoints[i]
                box_array[i, 0] = (box[0] - (self.input_w - r_h * origin_w) / 2) / r_h
                box_array[i, 2] = (box[2] - (self.input_w - r_h * origin_w) / 2) / r_h
                box_array[i, 1] = box[1] / r_h
                box_array[i, 3] = box[3] / r_h
                for j in range(0, len(lmk), 3):
                    keypoint_array[i, j] = (lmk[j] - (self.input_w - r_h * origin_w) / 2) / r_h
                    keypoint_array[i, j + 1] = lmk[j + 1] / r_h
                    keypoint_array[i, j + 2] = lmk[j + 2]
        return box_array, keypoint_array
    def post_process(self, output, origin_h, origin_w):
        """
        description: Post-process the prediction to include pose keypoints
        param:
            output: A numpy array like [num_boxes, cx, cy, w, h, conf,
            cls_id, px1, py1, pconf1,...px17, py17, pconf17] where p denotes pose keypoint
            origin_h: Height of original image
            origin_w: Width of original image
        return:
            result_boxes: Final boxes, a numpy array, each row is a box [x1, y1, x2, y2]
            result_scores: Final scores, a numpy array, each element is the score corresponding to box
            result_classid: Final classID, a numpy array, each element is the classid corresponding to box
            result_keypoints: Final keypoints, a list of numpy arrays,
            each element represents keypoints for a box, shaped as (#keypoints, 3)
        """
        # Values per detection row: 6 det fields + 32 seg coefficients
        # + 17 keypoints * 3 values + 1 OBB angle = 90.
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the number of boxes detected (first float of the buffer is the count).
        num = int(output[0])
        # Reshape to a two-dimensional ndarray with the full detection shape
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Perform non-maximum suppression to filter the detections
        boxes = self.non_max_suppression(
            pred[:, :num_values_per_detection], origin_h, origin_w,
            conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        # Extract the bounding boxes, confidence scores, and class IDs
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        # Keypoints occupy the slice just before the trailing OBB angle value.
        result_keypoints = boxes[:, -POSE_NUM-1:-1] if len(boxes) else np.array([])
        # Return the post-processed results including keypoints
        return result_boxes, result_scores, result_classid, result_keypoints
    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (+1 treats coordinates as inclusive pixel indices)
        inter_area = np.clip(
            inter_rect_x2 - inter_rect_x1 + 1, 0, None) * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
        return iou
    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        res_array = np.copy(boxes)
        box_pred_deep_copy = np.copy(boxes[:, :4])
        keypoints_pred_deep_copy = np.copy(boxes[:, -POSE_NUM-1:-1])
        res_box, res_keypoints = self.xywh2xyxy_with_keypoints(
            origin_h, origin_w, box_pred_deep_copy, keypoints_pred_deep_copy)
        res_array[:, :4] = res_box
        res_array[:, -POSE_NUM-1:-1] = res_keypoints
        # clip the coordinates to the original image bounds
        res_array[:, 0] = np.clip(res_array[:, 0], 0, origin_w - 1)
        res_array[:, 2] = np.clip(res_array[:, 2], 0, origin_w - 1)
        res_array[:, 1] = np.clip(res_array[:, 1], 0, origin_h - 1)
        res_array[:, 3] = np.clip(res_array[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = res_array[:, 4]
        # Sort by the confs
        res_array = res_array[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_res_array = []
        while res_array.shape[0]:
            # Suppress boxes that overlap the current best AND share its class id.
            large_overlap = self.bbox_iou(np.expand_dims(res_array[0, :4], 0), res_array[:, :4]) > nms_thres
            label_match = res_array[0, 5] == res_array[:, 5]
            invalid = large_overlap & label_match
            keep_res_array.append(res_array[0])
            res_array = res_array[~invalid]
        res_array = np.stack(keep_res_array, 0) if len(keep_res_array) else np.array([])
        return res_array
class inferThread(threading.Thread):
    """Worker thread: runs one batch of images through the TRT wrapper and
    writes the annotated results into the output/ directory."""
    def __init__(self, yolov8_wrapper, image_path_batch):
        threading.Thread.__init__(self)
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch
    def run(self):
        frames, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch))
        for idx, img_path in enumerate(self.image_path_batch):
            # Keep only the basename; all results land in output/.
            _, filename = os.path.split(img_path)
            cv2.imwrite(os.path.join('output', filename), frames[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
class warmUpThread(threading.Thread):
    """Worker thread: feeds zero-filled frames through the wrapper to warm up
    the CUDA/TensorRT pipeline before timing real images."""
    def __init__(self, yolov8_wrapper):
        threading.Thread.__init__(self)
        self.yolov8_wrapper = yolov8_wrapper
    def run(self):
        frames, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros())
        print('warm_up->{}, time->{:.2f}ms'.format(frames[0].shape, use_time * 1000))
if __name__ == "__main__":
    # load custom plugin and engine (paths can be overridden on the command line:
    # argv[1] = engine path, argv[2] = plugin library path)
    PLUGIN_LIBRARY = "./build/libmyplugins.so"
    engine_file_path = "yolov8n-pose.engine"
    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]
    if len(sys.argv) > 2:
        PLUGIN_LIBRARY = sys.argv[2]
    # Loading the shared library registers the custom TRT plugins.
    ctypes.CDLL(PLUGIN_LIBRARY)
    # load coco labels (pose model detects a single class)
    categories = ["person"]
    # Start with a clean output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)
        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)
        # 10 warmup rounds before timing real batches.
        for i in range(10):
            # create a new thread to do warm_up
            thread1 = warmUpThread(yolov8_wrapper)
            thread1.start()
            thread1.join()
        for batch in image_path_batches:
            # create a new thread to do inference
            thread1 = inferThread(yolov8_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance (pops the CUDA context created in __init__)
        yolov8_wrapper.destroy()

332
yolov8_seg.cpp Normal file
View File

@ -0,0 +1,332 @@
#include <cassert>
#include <chrono>
#include <cmath>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

#include <opencv2/opencv.hpp>

#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"
// TensorRT logger shared by engine build and deserialization below.
Logger gLogger;
using namespace nvinfer1;
// Per-image detection buffer: fixed-capacity Detection slots plus one leading count float.
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Mask prototype buffer: 32 channels at 1/4 of the network input resolution.
const static int kOutputSegSize = 32 * (kInputH / 4) * (kInputW / 4);
// Map a [x, y, w, h] bbox (network-input coordinates) to a rectangle in the
// downscaled mask-prototype plane, clamped to the network input bounds.
static cv::Rect get_downscale_rect(float bbox[4], float scale) {
    // Corner coordinates from [x, y, w, h].
    float x1 = bbox[0];
    float y1 = bbox[1];
    float x2 = bbox[0] + bbox[2];
    float y2 = bbox[1] + bbox[3];
    // Clamp to the network input so the rect stays inside the prototype plane.
    x1 = x1 < 0 ? 0 : x1;
    y1 = y1 < 0 ? 0 : y1;
    x2 = x2 > kInputW ? kInputW : x2;
    y2 = y2 > kInputH ? kInputH : y2;
    // Downscale into the prototype resolution.
    x1 /= scale;
    y1 /= scale;
    x2 /= scale;
    y2 /= scale;
    return cv::Rect(int(x1), int(y1), int(x2 - x1), int(y2 - y1));
}
std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets) {
std::vector<cv::Mat> masks;
for (size_t i = 0; i < dets.size(); i++) {
cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1);
auto r = get_downscale_rect(dets[i].bbox, 4);
for (int x = r.x; x < r.x + r.width; x++) {
for (int y = r.y; y < r.y + r.height; y++) {
float e = 0.0f;
for (int j = 0; j < 32; j++) {
e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x];
}
e = 1.0f / (1.0f + expf(-e));
mask_mat.at<float>(y, x) = e;
}
}
cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH));
masks.push_back(mask_mat);
}
return masks;
}
void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& sub_type, float& gd, float& gw,
int& max_channels) {
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();
IHostMemory* serialized_engine = nullptr;
serialized_engine = buildEngineYolov8Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
assert(serialized_engine);
std::ofstream p(engine_name, std::ios::binary);
if (!p) {
std::cout << "could not open plan output file" << std::endl;
assert(false);
}
p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
delete serialized_engine;
delete config;
delete builder;
}
// Load a serialized TensorRT plan from engine_name and create the runtime,
// engine and execution context. All three out-parameters are owned by the
// caller and must be deleted (see main's cleanup).
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    // Measure the file size, then read the whole plan.
    file.seekg(0, file.end);
    size_t size = file.tellg();
    file.seekg(0, file.beg);
    // RAII buffer instead of raw new[]/delete[] — no leak on early exit.
    std::vector<char> serialized_engine(size);
    file.read(serialized_engine.data(), size);
    file.close();
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine.data(), size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
}
// Allocate device buffers for the three engine bindings (input, detection
// output, mask prototype output) and the host-side buffers needed by the
// chosen post-processing mode ("c" = CPU, "g" = GPU decode/NMS).
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host,
                    float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) {
    assert(engine->getNbBindings() == 3);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    const int outputIndex_seg = engine->getBindingIndex("proto");
    // The buffer array passed to enqueue() relies on this exact binding order.
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    assert(outputIndex_seg == 2);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float)));
    if (cuda_post_process == "c") {
        // CPU post-process: copy both outputs back to host memory.
        *output_buffer_host = new float[kBatchSize * kOutputSize];
        *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize];
    } else if (cuda_post_process == "g") {
        if (kBatchSize > 1) {
            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
            exit(0);
        }
        // Allocate memory for decode_ptr_host and copy to device
        // (layout: 1 leading count float + kMaxNumOutputBbox rows of bbox_element floats).
        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
    }
}
// Enqueue one batch on the stream and bring results back to the host.
// "c": copy raw detection + prototype outputs for CPU post-processing.
// "g": decode and NMS on the GPU, then copy the compact result buffer.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, float* output_seg,
           int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes,
           std::string cuda_post_process) {
    // infer on the batch asynchronously, and DMA output back to host
    auto start = std::chrono::system_clock::now();
    context.enqueue(batchsize, buffers, stream, nullptr);
    if (cuda_post_process == "c") {
        std::cout << "kOutputSize:" << kOutputSize << std::endl;
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                   stream));
        std::cout << "kOutputSegSize:" << kOutputSegSize << std::endl;
        CUDA_CHECK(cudaMemcpyAsync(output_seg, buffers[2], batchsize * kOutputSegSize * sizeof(float),
                                   cudaMemcpyDeviceToHost, stream));
        // NOTE(review): "end" is taken before cudaStreamSynchronize below, so the
        // printed time covers only the async enqueue/copy submission, not GPU
        // completion — confirm whether that is intended.
        auto end = std::chrono::system_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    } else if (cuda_post_process == "g") {
        // Zero the decode buffer (its first float is the detection counter).
        CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                   stream));
        auto end = std::chrono::system_clock::now();
        std::cout << "inference and gpu postprocess time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }
    // Block until all queued work (inference + copies) has finished.
    CUDA_CHECK(cudaStreamSynchronize(stream));
}
// Parse the command line.
//   -s <wts> <engine> <n|s|m|l|x>          : serialize mode; fills gd/gw/max_channels
//   -d <engine> <img_dir> <c|g> <labels>   : inference mode
// Returns false on any malformed invocation.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, std::string& labels_filename, float& gd,
                float& gw, int& max_channels) {
    if (argc < 4)
        return false;
    const std::string mode(argv[1]);
    if (mode == "-s" && argc == 5) {
        wts = argv[2];
        engine = argv[3];
        sub_type = argv[4];
        // Depth multiplier (gd), width multiplier (gw) and channel cap per size variant.
        if (sub_type == "n") {
            gd = 0.33;
            gw = 0.25;
            max_channels = 1024;
        } else if (sub_type == "s") {
            gd = 0.33;
            gw = 0.50;
            max_channels = 1024;
        } else if (sub_type == "m") {
            gd = 0.67;
            gw = 0.75;
            max_channels = 576;
        } else if (sub_type == "l") {
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
        } else if (sub_type == "x") {
            gd = 1.0;
            gw = 1.25;
            max_channels = 640;
        } else {
            return false;
        }
        return true;
    }
    if (mode == "-d" && argc == 6) {
        engine = argv[2];
        img_dir = argv[3];
        cuda_post_process = argv[4];
        labels_filename = argv[5];
        return true;
    }
    return false;
}
// Entry point: either serialize a .wts into a TensorRT plan (-s), or run
// batched segmentation inference over a directory of images (-d).
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    std::string wts_name = "";
    std::string engine_name = "";
    std::string img_dir;
    std::string sub_type = "";
    std::string cuda_post_process = "";
    std::string labels_filename = "../coco.txt";
    int model_bboxes;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;
    if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type, cuda_post_process, labels_filename, gd, gw,
                    max_channels)) {
        std::cerr << "Arguments not right!" << std::endl;
        std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file" << std::endl;
        std::cerr << "./yolov8 -d [.engine] ../samples [c/g] coco_file// deserialize plan file and run inference"
                  << std::endl;
        return -1;
    }
    // Create a model using the API directly and serialize it to a file
    // (serialize mode exits here; nothing below runs).
    if (!wts_name.empty()) {
        serialize_engine(wts_name, engine_name, sub_type, gd, gw, max_channels);
        return 0;
    }
    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    // First dimension of the detection output is the per-image box capacity.
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    // Prepare cpu and gpu buffers
    float* device_buffers[3];
    float* output_buffer_host = nullptr;
    float* output_seg_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;
    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }
    std::unordered_map<int, std::string> labels_map;
    read_labels(labels_filename, labels_map);
    // The label file must match the number of classes compiled into the model.
    assert(kNumClass == labels_map.size());
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host,
                   &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process);
    // // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        // Preprocess (letterbox + normalize on the GPU)
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        // Run inference
        infer(*context, stream, (void**)device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize,
              decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process);
        std::vector<std::vector<Detection>> res_batch;
        if (cuda_post_process == "c") {
            // NMS
            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
            for (size_t b = 0; b < img_batch.size(); b++) {
                auto& res = res_batch[b];
                cv::Mat img = img_batch[b];
                auto masks = process_mask(&output_seg_buffer_host[b * kOutputSegSize], kOutputSegSize, res);
                draw_mask_bbox(img, res, masks, labels_map);
                // Results are written next to the executable with a "_" prefix.
                cv::imwrite("_" + img_name_batch[b], img);
            }
        } else if (cuda_post_process == "g") {
            // Process gpu decode and nms results
            // batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
            // todo seg in gpu
            std::cerr << "seg_postprocess is not support in gpu right now" << std::endl;
        }
    }
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(device_buffers[2]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    delete[] output_seg_buffer_host;
    cuda_preprocess_destroy();
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;
    // Print histogram of the output distribution
    // std::cout << "\nOutput:\n\n";
    // for (unsigned int i = 0; i < kOutputSize; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << std::endl;
    //}
    // std::cout << std::endl;
    return 0;
}

580
yolov8_seg_trt.py Normal file
View File

@ -0,0 +1,580 @@
"""
An example that uses TensorRT's Python api to make inferences.
"""
import ctypes
import os
import shutil
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1
def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group all file paths into
    batches of at most batch_size entries.
    param:
        batch_size: int, maximum paths per batch
        img_dir: str, root directory to walk
    return:
        list of batches (lists of paths); the last one may be partial
    """
    all_batches = []
    pending = []
    for root, _, files in os.walk(img_dir):
        for name in files:
            if len(pending) == batch_size:
                # Batch is full: emit it and start collecting the next one.
                all_batches.append(pending)
                pending = []
            pending.append(os.path.join(root, name))
    if pending:
        all_batches.append(pending)
    return all_batches
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Plots one bounding box on image img,
                 this function comes from YoLov8 project.
    param:
        x: a box likes [x1,y1,x2,y2]
        img: a opencv image object
        color: color to draw rectangle, such as (0,255,0)
        label: str
        line_thickness: int
    return:
        no return
    """
    # Line thickness scales with image size when not given explicitly.
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    box_color = color or [random.randint(0, 255) for _ in range(3)]
    pt1 = (int(x[0]), int(x[1]))
    pt2 = (int(x[2]), int(x[3]))
    cv2.rectangle(img, pt1, pt2, box_color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)
        text_w, text_h = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        # Filled label background above the box's top-left corner.
        cv2.rectangle(img, pt1, (pt1[0] + text_w, pt1[1] - text_h - 3), box_color, -1, cv2.LINE_AA)
        cv2.putText(
            img,
            label,
            (pt1[0], pt1[1] - 2),
            0,
            tl / 3,
            [225, 255, 255],
            thickness=tf,
            lineType=cv2.LINE_AA,
        )
class YoLov8TRT(object):
"""
description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops.
"""
def __init__(self, engine_file_path):
# Create a Context on this device,
self.ctx = cuda.Device(0).make_context()
stream = cuda.Stream()
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)
# Deserialize the engine from file
with open(engine_file_path, "rb") as f:
engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
host_inputs = []
cuda_inputs = []
host_outputs = []
cuda_outputs = []
bindings = []
for binding in engine:
print('bingding:', binding, engine.get_binding_shape(binding))
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
cuda_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(cuda_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
self.input_w = engine.get_binding_shape(binding)[-1]
self.input_h = engine.get_binding_shape(binding)[-2]
host_inputs.append(host_mem)
cuda_inputs.append(cuda_mem)
else:
host_outputs.append(host_mem)
cuda_outputs.append(cuda_mem)
# Store
self.stream = stream
self.context = context
self.engine = engine
self.host_inputs = host_inputs
self.cuda_inputs = cuda_inputs
self.host_outputs = host_outputs
self.cuda_outputs = cuda_outputs
self.bindings = bindings
self.batch_size = engine.max_batch_size
# Data length
self.det_output_length = host_outputs[0].shape[0]
self.seg_output_length = host_outputs[1].shape[0]
self.seg_w = int(self.input_w / 4)
self.seg_h = int(self.input_h / 4)
self.seg_c = int(self.seg_output_length / (self.seg_w * self.seg_w))
self.det_row_output_length = self.seg_c + DET_NUM + POSE_NUM + OBB_NUM
# Draw mask
self.colors_obj = Colors()
    def infer(self, raw_image_generator):
        """
        description: Run one batch through the engine, draw masks and boxes on
        the original images and return them with the elapsed inference time.
        param:
            raw_image_generator: yields BGR images, one per batch slot
        return:
            batch_image_raw: list of annotated images
            elapsed: wall time in seconds for copy + inference + copy-back
        NOTE(review): YoLov8TRT does not subclass threading.Thread; the
        Thread.__init__ call below looks like a leftover — confirm and remove.
        """
        threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)
        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU (detections + mask prototypes).
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        output_proto_mask = host_outputs[1]
        # Do postprocess
        for i in range(self.batch_size):
            result_boxes, result_scores, result_classid, result_proto_coef = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
                batch_origin_w[i]
            )
            # No detections for this image: nothing to draw.
            if result_proto_coef.shape[0] == 0:
                continue
            result_masks = self.process_mask(output_proto_mask, result_proto_coef, result_boxes, batch_origin_h[i],
                                             batch_origin_w[i])
            # Per-class colors for the mask overlay.
            self.draw_mask(result_masks, colors_=[self.colors_obj(x, True) for x in result_classid],
                           im_src=batch_image_raw[i])
            # Draw rectangles and labels on the original image
            # (`categories` is a module-level label list defined in __main__).
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start
    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        # Call exactly once when the wrapper is no longer needed; otherwise the
        # CUDA context created in __init__ stays alive.
        self.ctx.pop()
def get_raw_image(self, image_path_batch):
"""
description: Read an image from image path
"""
for img_path in image_path_batch:
yield cv2.imread(img_path)
def get_raw_image_zeros(self, image_path_batch=None):
"""
description: Ready data for warmup
"""
for _ in range(self.batch_size):
yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
def preprocess_image(self, raw_bgr_image):
    """
    description: Letterbox-preprocess one BGR frame for the engine:
        convert to RGB, resize preserving aspect ratio, pad the short side
        with gray (128, 128, 128), scale to [0, 1], and lay out as a
        contiguous NCHW float32 array.
    param:
        raw_bgr_image: original image as an HxWx3 BGR uint8 array
    return:
        image: preprocessed NCHW float32 array (C order)
        image_raw: the untouched input image
        h: original height
        w: original width
    """
    image_raw = raw_bgr_image
    h, w, _ = image_raw.shape
    rgb = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
    scale_w = self.input_w / w
    scale_h = self.input_h / h
    if scale_h > scale_w:
        # Width is the limiting side: pad top/bottom.
        new_w = self.input_w
        new_h = int(scale_w * h)
        pad_left = pad_right = 0
        pad_top = int((self.input_h - new_h) / 2)
        pad_bottom = self.input_h - new_h - pad_top
    else:
        # Height is the limiting side: pad left/right.
        new_w = int(scale_h * w)
        new_h = self.input_h
        pad_left = int((self.input_w - new_w) / 2)
        pad_right = self.input_w - new_w - pad_left
        pad_top = pad_bottom = 0
    resized = cv2.resize(rgb, (new_w, new_h))
    padded = cv2.copyMakeBorder(
        resized, pad_top, pad_bottom, pad_left, pad_right,
        cv2.BORDER_CONSTANT, None, (128, 128, 128)
    )
    # Normalize, HWC -> CHW, add batch axis, force C-contiguous layout.
    chw = np.transpose(padded.astype(np.float32) / 255.0, [2, 0, 1])
    nchw = np.ascontiguousarray(np.expand_dims(chw, axis=0))
    return nchw, image_raw, h, w
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Map nx4 boxes from letterboxed network-input coordinates
        back to original-image coordinates by removing the padding offset
        and dividing out the resize ratio.
        NOTE(review): despite the name, the visible code only shifts and
        rescales the four columns -- it does not convert center/size to
        corner form. Presumably the engine already emits corner boxes;
        confirm against the detection plugin.
    param:
        origin_h: height of the original image
        origin_w: width of the original image
        x: nx4 numpy array of boxes in network-input coordinates
    return:
        y: nx4 numpy array of boxes in original-image coordinates
    """
    y = np.zeros_like(x)
    ratio_w = self.input_w / origin_w
    ratio_h = self.input_h / origin_h
    if ratio_h > ratio_w:
        # Letterboxed top/bottom: remove vertical padding, scale by width ratio.
        pad = (self.input_h - ratio_w * origin_h) / 2
        y[:, 0] = x[:, 0]
        y[:, 2] = x[:, 2]
        y[:, 1] = x[:, 1] - pad
        y[:, 3] = x[:, 3] - pad
        y /= ratio_w
    else:
        # Letterboxed left/right: remove horizontal padding, scale by height ratio.
        pad = (self.input_w - ratio_h * origin_w) / 2
        y[:, 0] = x[:, 0] - pad
        y[:, 2] = x[:, 2] - pad
        y[:, 1] = x[:, 1]
        y[:, 3] = x[:, 3]
        y /= ratio_h
    return y
def post_process(self, output, origin_h, origin_w):
    """
    description: Decode the raw detection buffer and run NMS.
    param:
        output: flat numpy buffer: output[0] is the number of detections,
            followed by fixed-size rows (box, conf, cls_id, mask coefficients)
        origin_h: height of the original image
        origin_w: width of the original image
    return:
        result_boxes: nx4 [x1, y1, x2, y2] boxes after NMS
        result_scores: confidence per kept box
        result_classid: class id per kept box
        result_proto_coef: mask prototype coefficients per kept box
    """
    detected = int(output[0])
    # Drop the leading count, then view one detection per row and keep
    # only the rows the engine actually filled.
    rows = np.reshape(output[1:], (-1, self.det_row_output_length))[:detected, :]
    kept = self.non_max_suppression(
        rows, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
    if len(kept):
        return (kept[:, :4], kept[:, 4], kept[:, 5],
                kept[:, DET_NUM:int(DET_NUM + SEG_NUM)])
    return np.array([]), np.array([]), np.array([]), np.array([])
def bbox_iou(self, box1, box2, x1y1x2y2=True):
    """
    description: Vectorized IoU between the rows of box1 and box2
        (broadcasting applies when box1 has a single row).
    param:
        box1: boxes as (x1, y1, x2, y2) or (cx, cy, w, h) rows
        box2: boxes in the same format
        x1y1x2y2: True when boxes are already in corner form
    return:
        iou: numpy array of IoU values
    """
    if x1y1x2y2:
        b1_x1, b1_y1, b1_x2, b1_y2 = (box1[:, k] for k in range(4))
        b2_x1, b2_y1, b2_x2, b2_y2 = (box2[:, k] for k in range(4))
    else:
        # Center/size form: expand to corners first.
        half_w1, half_h1 = box1[:, 2] / 2, box1[:, 3] / 2
        half_w2, half_h2 = box2[:, 2] / 2, box2[:, 3] / 2
        b1_x1, b1_x2 = box1[:, 0] - half_w1, box1[:, 0] + half_w1
        b1_y1, b1_y2 = box1[:, 1] - half_h1, box1[:, 1] + half_h1
        b2_x1, b2_x2 = box2[:, 0] - half_w2, box2[:, 0] + half_w2
        b2_y1, b2_y2 = box2[:, 1] - half_h2, box2[:, 1] + half_h2
    # Intersection extents; the +1 treats coordinates as inclusive pixels.
    inter_w = np.clip(np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1) + 1, 0, None)
    inter_h = np.clip(np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1) + 1, 0, None)
    inter_area = inter_w * inter_h
    area1 = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
    area2 = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
    # Epsilon guards against division by zero for degenerate boxes.
    return inter_area / (area1 + area2 - inter_area + 1e-16)
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
    """
    description: Drop detections below 'conf_thres', map boxes back to the
        original image, then greedily suppress overlapping boxes of the
        same class (class-aware NMS).
    param:
        prediction: detection rows (box, conf, cls_id, ...)
        origin_h: original image height
        origin_w: original image width
        conf_thres: confidence threshold
        nms_thres: IoU threshold above which a same-class box is suppressed
    return:
        boxes: surviving rows (x1, y1, x2, y2, conf, cls_id, ...)
    """
    # Boolean indexing copies, so in-place edits below don't touch the input.
    candidates = prediction[prediction[:, 4] >= conf_thres]
    # Back-project boxes to original-image coordinates, then clamp to bounds.
    candidates[:, :4] = self.xywh2xyxy(origin_h, origin_w, candidates[:, :4])
    candidates[:, 0] = np.clip(candidates[:, 0], 0, origin_w - 1)
    candidates[:, 2] = np.clip(candidates[:, 2], 0, origin_w - 1)
    candidates[:, 1] = np.clip(candidates[:, 1], 0, origin_h - 1)
    candidates[:, 3] = np.clip(candidates[:, 3], 0, origin_h - 1)
    # Process highest-confidence boxes first.
    candidates = candidates[np.argsort(-candidates[:, 4])]
    survivors = []
    while candidates.shape[0]:
        top = candidates[0]
        overlapping = self.bbox_iou(np.expand_dims(top[:4], 0), candidates[:, :4]) > nms_thres
        same_class = top[5] == candidates[:, 5]
        survivors.append(top)
        # Discard boxes that both overlap the winner and share its class.
        candidates = candidates[~(overlapping & same_class)]
    return np.stack(survivors, 0) if len(survivors) else np.array([])
def sigmoid(self, x):
    # Logistic function: squashes logits into (0, 1), elementwise.
    neg_exp = np.exp(-x)
    return 1.0 / (1.0 + neg_exp)
def scale_mask(self, mask, ih, iw):
    """
    description: Undo the letterbox on a prototype-resolution mask:
        upsample to network input size, crop away the padded region,
        then resize the crop to the original image size.
    param:
        mask: single-channel mask at prototype resolution
        ih: rows of the original image
        iw: cols of the original image
    return:
        mask resized to (ih, iw)
    """
    full = cv2.resize(mask, (self.input_w, self.input_h))
    ratio_w = self.input_w / (iw * 1.0)
    ratio_h = self.input_h / (ih * 1.0)
    if ratio_h > ratio_w:
        # Original image was padded top/bottom.
        crop_w = self.input_w
        crop_h = int(ratio_w * ih)
        x0 = 0
        y0 = int((self.input_h - crop_h) / 2)
    else:
        # Original image was padded left/right.
        crop_w = int(ratio_h * iw)
        crop_h = self.input_h
        x0 = int((self.input_w - crop_w) / 2)
        y0 = 0
    region = full[y0:y0 + crop_h, x0:x0 + crop_w]
    return cv2.resize(region, (iw, ih))
def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw):
    """
    description: Build one binary mask per detection from the prototype
        masks and the per-box mask coefficients (YOLOv8 instance
        segmentation head).
    param:
        output_proto_mask: flat prototype-mask buffer, reshaped to
            (seg_c, seg_h, seg_w), e.g. (32, 160, 160) for 640x640 input
        result_proto_coef: (n, seg_c) mask coefficients, one row per detection
        result_boxes: (n, 4) boxes used to crop each mask to its detection
        ih: rows of original image
        iw: cols of original image
    return:
        mask_result: (n, ih, iw) array of 0/1 masks
    """
    result_proto_masks = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w)
    c, mh, mw = result_proto_masks.shape
    # NOTE: removed leftover debug print() calls that dumped array shapes
    # to stdout on every inference.
    # Linear combination of prototypes per detection, squashed to (0, 1).
    masks = self.sigmoid(
        result_proto_coef @ result_proto_masks.astype(np.float32).reshape(c, -1)
    ).reshape(-1, mh, mw)
    mask_result = []
    for mask, box in zip(masks, result_boxes):
        mask_s = np.zeros((ih, iw))
        crop_mask = self.scale_mask(mask, ih, iw)
        x1 = int(box[0])
        y1 = int(box[1])
        x2 = int(box[2])
        y2 = int(box[3])
        # Binarize inside the detection box; everything outside stays zero.
        crop = np.where(crop_mask[y1:y2, x1:x2] >= 0.5, 1, 0).astype(np.uint8)
        mask_s[y1:y2, x1:x2] = crop
        mask_result.append(mask_s)
    return np.array(mask_result)
def draw_mask(self, masks, colors_, im_src, alpha=0.5):
    """
    description: Alpha-blend instance masks onto the image, in place.
    param:
        masks: (n, h, w) binary masks
        colors_: one color triple per mask
        im_src: original image, modified in place
        alpha: blend weight given to the mask colors
    return:
        no return (im_src is updated in place)
    """
    if len(masks) == 0:
        return
    # (n, h, w) -> (h, w, n) so each pixel's mask vector can multiply
    # the (n, 3) color matrix.
    stacked = np.ascontiguousarray(np.asarray(masks, dtype=np.uint8).transpose(1, 2, 0))
    stacked = np.asarray(stacked, dtype=np.float32)
    palette = np.asarray(colors_, dtype=np.float32)
    # Per-pixel coverage: 1 where any mask is set, else 0.
    coverage = stacked.sum(2, keepdims=True).clip(0, 1)
    colored = (stacked @ palette).clip(0, 255)
    im_src[:] = colored * alpha + im_src * (1 - coverage * alpha)
class inferThread(threading.Thread):
    """Worker thread: run one batch through the engine and save annotated frames."""

    def __init__(self, yolov8_wrapper, image_path_batch):
        # yolov8_wrapper: shared YoLov8TRT instance (its infer() handles the CUDA context)
        # image_path_batch: list of image file paths making up this batch
        threading.Thread.__init__(self)
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        raw_images = self.yolov8_wrapper.get_raw_image(self.image_path_batch)
        batch_image_raw, use_time = self.yolov8_wrapper.infer(raw_images)
        for i, img_path in enumerate(self.image_path_batch):
            parent, filename = os.path.split(img_path)
            # Persist the annotated frame under output/ keeping the original name.
            save_name = os.path.join('output', filename)
            cv2.imwrite(save_name, batch_image_raw[i])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
class warmUpThread(threading.Thread):
    """Worker thread: push a batch of blank frames through the engine to warm it up."""

    def __init__(self, yolov8_wrapper):
        # yolov8_wrapper: shared YoLov8TRT instance providing blank frames and infer()
        threading.Thread.__init__(self)
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        zeros_batch = self.yolov8_wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = self.yolov8_wrapper.infer(zeros_batch)
        print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
class Colors:
    """Fixed 20-color palette (hex-defined), cycled by class index."""

    def __init__(self):
        hexs = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A',
                '92CC17', '3DDB86', '1A9334', '00D4BB', '2C99A8', '00C2FF',
                '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF',
                'FF95C8', 'FF37C7')
        self.palette = [self.hex2rgb('#' + code) for code in hexs]
        self.n = len(self.palette)

    def __call__(self, i, bgr=False):
        """Return color i (mod palette size); BGR order when bgr=True, else RGB."""
        r, g, b = self.palette[int(i) % self.n]
        return (b, g, r) if bgr else (r, g, b)

    @staticmethod
    def hex2rgb(h):
        """Convert '#RRGGBB' to an (r, g, b) int tuple (PIL/RGB order)."""
        return tuple(int(h[1 + i:1 + i + 2], 16) for i in (0, 2, 4))
if __name__ == "__main__":
    # Default plugin library and engine; both can be overridden from the CLI:
    # argv[1] = engine path, argv[2] = plugin .so path.
    PLUGIN_LIBRARY = "./build/libmyplugins.so"
    engine_file_path = "yolov8n-seg.engine"
    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]
    if len(sys.argv) > 2:
        PLUGIN_LIBRARY = sys.argv[2]
    # Load the TensorRT custom-layer plugin before deserializing the engine.
    ctypes.CDLL(PLUGIN_LIBRARY)
    # COCO class labels, indexed by class id (read by the infer thread when
    # labeling boxes).
    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
                  "traffic light",
                  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
                  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
                  "frisbee",
                  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
                  "surfboard",
                  "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                  "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
                  "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
                  "cell phone",
                  "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
                  "teddy bear",
                  "hair drier", "toothbrush"]
    # Start from a clean output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)
        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)
        # Warm the engine up with 10 blank-frame batches before timing real work.
        for i in range(10):
            # create a new thread to do warm_up
            thread1 = warmUpThread(yolov8_wrapper)
            thread1.start()
            thread1.join()
        for batch in image_path_batches:
            # create a new thread to do inference
            thread1 = inferThread(yolov8_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()

74
测试post请求.py Normal file
View File

@ -0,0 +1,74 @@
import datetime
import requests
import json
import yaml
import uuid
import time

# Module-level configuration, loaded once at import time from config.yaml.
with open('config.yaml', 'r') as file:
    configData = yaml.safe_load(file)

# Shared token cache: holds 'token' and 'current_time' once fetched.
tokenResult = {}
# Endpoint that issues an access token.
getTokenUrl = configData['dataConfig']['getTokenUrl']
# Channel number attached to alarms.
vod_channelNo = configData['video_config']['v1_channelNo']
# Alarm-message endpoint URL.
putMessageUrl = configData['dataConfig']['putMessageUrl']
def get_token(tokenResult=None):
    """
    Return a valid access token, refreshing it when it is missing or older
    than 20 minutes.

    param:
        tokenResult: dict used as the token cache. Defaults to the
            module-level cache so `get_token()` with no argument works
            (the original required signature made that call raise
            TypeError). On a failed refresh an 'error' key is set with
            the HTTP status code or the server's error description.
    return:
        the cached or freshly fetched token string
    raises:
        KeyError: if no token was ever fetched successfully (same as the
            original behavior of indexing tokenResult['token']).
    """
    if tokenResult is None:
        # Parameter shadows the module-level cache; fetch it explicitly.
        tokenResult = globals()['tokenResult']
    refresh = True
    if 'token' in tokenResult and 'current_time' in tokenResult:
        token_time = datetime.datetime.strptime(tokenResult['current_time'],
                                                "%Y-%m-%d %H:%M:%S")
        age = (datetime.datetime.now() - token_time).total_seconds()
        # Tokens are treated as valid for 20 minutes.
        refresh = age > 20 * 60
    if refresh:
        response = requests.post(getTokenUrl)
        if response.status_code == 200:
            data = json.loads(response.text)
            if 'retCode' in data and data['retCode'] == '200':
                tokenResult['token'] = data['responseBody']['token']
                tokenResult['current_time'] = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
            else:
                tokenResult['error'] = data['errorDesc']
        else:
            tokenResult['error'] = response.status_code
    return tokenResult['token']
def send_post_request(url, token, msg, picUrl, videoUrl):
    """
    description: POST one alarm (text plus optional picture/video URLs)
        to the platform endpoint.
    param:
        url: alarm endpoint
        token: access token sent in the X-Access-Token header
        msg: alarm text
        picUrl: picture URL (may be empty)
        videoUrl: video URL (may be empty)
    return:
        None (the raw response object is only printed)
    """
    alarm_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    payload = {
        "tenantCode": "8",
        "channelNo": vod_channelNo,
        "alarmContent": msg,
        "alarmTime": alarm_time,
        "picInfo": [{"url": picUrl}],
        "videoInfo": [{"url": videoUrl}],
    }
    request_headers = {
        'X-Access-Token': token,
        'Content-Type': 'application/json',
    }
    response = requests.post(url, headers=request_headers, data=json.dumps(payload))
    print(response)
if __name__ == '__main__':
    # Pass the module-level cache explicitly: get_token's original signature
    # requires it, so the bare get_token() call raised TypeError.
    token = get_token(tokenResult)
    print("token: ", token)
    # Unique picture name: first 6 UUID chars + unix timestamp + channel number.
    uuid_str = str(uuid.uuid4())[:6] + str(int(time.time()))
    upload_http_url_img = configData['minioConfig']['bucket_name'] + f'/{uuid_str}_{vod_channelNo}_.jpg'
    send_post_request(putMessageUrl, token, "消息内容", upload_http_url_img, '')

View File

@ -0,0 +1,37 @@
import os
import cv2

# Source video to split into individual frames.
video_path = "/home/admin-root/haotian/锻8/tensorrtx/yolov8/video/180.50.13.253_02_20250627142949812.mp4"
# Destination directory for the extracted JPEG frames.
output_dir = "./images_20250627142949812"

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

# cv2.imwrite returns False (without raising) when the directory does not
# exist, so the original script silently saved nothing. Create it up front.
os.makedirs(output_dir, exist_ok=True)

n = 0
while True:
    # ret: whether a frame was read; frame: the BGR frame data.
    ret, frame = cap.read()
    if not ret:
        # End of stream (or a read error).
        print("Reached end of video.")
        break
    save_path = os.path.join(output_dir, f"mp4_{n}.jpg")
    if not cv2.imwrite(save_path, frame):
        # Surface write failures instead of looping silently.
        print(f"Error: failed to write {save_path}")
        break
    print(f"保存图片{n}")
    n += 1

cap.release()