Polygraphy is a high-level tool built on top of the TensorRT API. It simplifies common workflows, in particular model conversion, performance analysis, and debugging, by providing higher-level abstractions that automate many otherwise tedious operations. In the previous article, we used Polygraphy to accelerate inference by converting a PyTorch model to TensorRT.

The native TensorRT API, however, sits at a lower level and gives the user fine-grained control over every stage of the engine's lifecycle, including building, optimization, and inference. In this article we therefore walk through the detailed steps of converting a PyTorch model to TensorRT using the native API.

Objective

Following the official demo, convert the FCN-ResNet101 PyTorch model into a TensorRT engine, demonstrating how to accelerate inference with the native TensorRT API rather than Polygraphy.

Steps

step1. Set Up the Environment

Building on the environment from the previous article, one additional package is needed: pycuda.

pip install pycuda
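
A quick import check such as the following can confirm that both packages are usable before moving on (a minimal sketch; the printed values depend on your installation):

import tensorrt as trt
import pycuda.autoinit  # creates a CUDA context on import

print("TensorRT version:", trt.__version__)
print("CUDA device:", pycuda.autoinit.device.name())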

step2. Perform the Conversion

step2.1 Define the Model

import torch
import torch.nn as nn

output_onnx="fcn-resnet101.onnx"

# FC-ResNet101 pretrained model from torch-hub extended with argmax layer
class FCN_ResNet101(nn.Module):
def __init__(self):
super(FCN_ResNet101, self).__init__()
self.model = torch.hub.load('pytorch/vision:v0.6.0', 'fcn_resnet101', pretrained=True)

def forward(self, inputs):
x = self.model(inputs)['out']
x = x.argmax(1, keepdims=True)
return x

model = FCN_ResNet101()
model.eval()
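
Before exporting, it can be worth a quick forward pass on a dummy batch to confirm the output layout (a minimal sketch; the 224x224 size is arbitrary):

with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))
print(out.shape, out.dtype)  # expected: torch.Size([1, 1, 224, 224]) with an integer dtype from argmax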

step2.2 Define the Input Tensor

# Generate input tensor with random values
input_tensor = torch.rand(4, 3, 224, 224)

This tensor gives the ONNX exporter a concrete input shape, so that it can trace the model and build the computation graph correctly during export; the values themselves are random and only serve the trace.

step2.3 Perform the Conversion: PyTorch to ONNX

output_onnx="fcn-resnet101.onnx"

# Export torch model to ONNX
print("Exporting ONNX model {}".format(output_onnx))
torch.onnx.export(model, input_tensor, output_onnx,
opset_version=12,
do_constant_folding=True,
input_names=["input"],
output_names=["output"],
dynamic_axes={"input": {0: "batch", 2: "height", 3: "width"},
"output": {0: "batch", 2: "height", 3: "width"}},
verbose=False)

Here, the batch size, height, and width are all declared as dynamic axes.
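
To verify the export and confirm that the dynamic axes were recorded, the onnx package can inspect the saved graph (a minimal sketch, assuming onnx is installed):

import onnx

onnx_model = onnx.load(output_onnx)
onnx.checker.check_model(onnx_model)
# The dynamic dimensions appear as named dim_param fields ("batch", "height", "width")
print(onnx_model.graph.input[0].type.tensor_type.shape)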

step2.4 Perform the Conversion: ONNX to TensorRT

Run the following command:

trtexec --onnx=fcn-resnet101.onnx --saveEngine=fcn-resnet101.engine --optShapes=input:1x3x1026x1282

To enable a dynamic batch size and FP16 precision, add the following arguments:

trtexec --onnx=fcn-resnet101.onnx \
--fp16 \
--saveEngine=fcn-resnet101.engine \
--minShapes=input:1x3x1026x1282 \
--optShapes=input:4x3x1026x1282 \
--maxShapes=input:8x3x1026x1282
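
trtexec is a command-line wrapper around the builder; since the theme of this article is the native API, the same engine can also be built from Python. Below is a minimal sketch, assuming TensorRT 8.x (the function name build_fcn_engine is illustrative):

import tensorrt as trt

def build_fcn_engine(onnx_path, engine_path):
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("failed to parse {}".format(onnx_path))

    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.FP16)  # mirrors --fp16

    # Optimization profile mirroring --minShapes / --optShapes / --maxShapes
    profile = builder.create_optimization_profile()
    profile.set_shape("input",
                      (1, 3, 1026, 1282),   # min
                      (4, 3, 1026, 1282),   # opt
                      (8, 3, 1026, 1282))   # max
    config.add_optimization_profile(profile)

    # Serialize and save the engine, like --saveEngine
    with open(engine_path, "wb") as f:
        f.write(builder.build_serialized_network(network, config))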

step2.5 Run Inference with TensorRT

First, import the necessary packages:

from tqdm import tqdm
import numpy as np
import os
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

import matplotlib.pyplot as plt
from PIL import Image

Then prepare the test data:

TRT_LOGGER = trt.Logger()

# Filenames of TensorRT plan file and input/output images.
engine_file = "fcn-resnet101_fp16.engine"
input_file = "input.ppm"# 输入图片的路径
output_file = "output.ppm"# mask可视化结果的保存路径

Next, define the preprocessing and postprocessing functions:

# For torchvision models, input images are loaded into a range of [0, 1] and
# normalized using mean = [0.485, 0.456, 0.406] and stddev = [0.229, 0.224, 0.225].
def preprocess(image):
    # Mean normalization
    mean = np.array([0.485, 0.456, 0.406]).astype('float32')
    stddev = np.array([0.229, 0.224, 0.225]).astype('float32')
    data = (np.asarray(image).astype('float32') / float(255.0) - mean) / stddev

    # Switch from HWC to CHW order
    return np.moveaxis(data, 2, 0)

def postprocess(data):
    num_classes = 21
    # Create a color palette, selecting a color for each class
    palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
    colors = np.array([palette * i % 255 for i in range(num_classes)]).astype("uint8")
    # Plot the segmentation predictions for 21 classes in different colors
    img = Image.fromarray(data.astype('uint8'), mode='P')
    img.putpalette(colors)
    return img
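
As a quick sanity check, preprocess should turn an H x W x 3 image into a 3 x H x W float32 array (a minimal sketch with a random image):

arr = preprocess(np.random.randint(0, 256, (1026, 1282, 3), dtype=np.uint8))
print(arr.shape, arr.dtype)  # expected: (3, 1026, 1282) float32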

We are now ready to run inference through the native TensorRT API.

First, define a function that loads the exported TensorRT engine:

def load_engine(engine_file_path):
    assert os.path.exists(engine_file_path)
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())
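
Once deserialized, the engine's bindings can be listed to confirm the input/output names that the inference code below relies on (a minimal sketch, using the TensorRT 8.x binding API; dynamic dimensions print as -1 until a shape is set):

engine = load_engine("fcn-resnet101.engine")
for i in range(engine.num_bindings):
    print(i, engine.get_binding_name(i), engine.get_binding_shape(i))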

Then define the inference function:

def infer(engine, input_file, output_file):
    print("Reading input image from file {}".format(input_file))
    with Image.open(input_file) as img:
        input_image = preprocess(img)
        image_width = img.width
        image_height = img.height
        print(input_image.shape, image_height, image_width)

    with engine.create_execution_context() as context:
        # Set input shape based on image dimensions for inference
        context.set_binding_shape(engine.get_binding_index("input"), (1, 3, image_height, image_width))
        # Allocate host and device buffers
        bindings = []
        for binding in engine:
            print('binding: ', binding)
            binding_idx = engine.get_binding_index(binding)
            size = trt.volume(context.get_binding_shape(binding_idx))
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            if engine.binding_is_input(binding):
                input_buffer = np.ascontiguousarray(input_image)
                input_memory = cuda.mem_alloc(input_image.nbytes)
                bindings.append(int(input_memory))
            else:
                output_buffer = cuda.pagelocked_empty(size, dtype)
                output_memory = cuda.mem_alloc(output_buffer.nbytes)
                bindings.append(int(output_memory))

        stream = cuda.Stream()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(input_memory, input_buffer, stream)
        # Run inference
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        # Transfer prediction output from the GPU.
        cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)
        # Synchronize the stream
        stream.synchronize()

    with postprocess(np.reshape(output_buffer, (image_height, image_width))) as img:
        print("Writing output image to file {}".format(output_file))
        img.convert('RGB').save(output_file, "PPM")
        # print(output_buffer)  # uncomment to inspect the raw class indices
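
As an aside, the binding-index calls used above (set_binding_shape, get_binding_index, binding_is_input) are deprecated in newer TensorRT releases in favor of a name-based tensor API, and are removed in TensorRT 10. A rough equivalent of the shape setup and I/O inspection, as a sketch (print_io_info is an illustrative helper name):

def print_io_info(engine, context, image_height, image_width):
    # Name-based tensor API (TensorRT >= 8.5); binding-index calls were removed in TRT 10
    context.set_input_shape("input", (1, 3, image_height, image_width))
    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        mode = engine.get_tensor_mode(name)  # trt.TensorIOMode.INPUT or .OUTPUT
        print(name, mode, context.get_tensor_shape(name), trt.nptype(engine.get_tensor_dtype(name)))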

Finally, run inference on a test image:

# According to: https://github.com/NVIDIA/TensorRT/issues/2052
engine = load_engine(engine_file)       # load the engine
infer(engine, input_file, output_file)  # run inference
del engine                              # release the engine's memory once inference is done
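
To put a number on the speedup, the end-to-end call can be timed over a few repetitions (a minimal sketch; absolute latency depends on the GPU, precision, and image size, and includes file I/O here):

import time

engine = load_engine(engine_file)
runs = 10
start = time.perf_counter()
for _ in range(runs):
    infer(engine, input_file, output_file)
print("Average latency: {:.1f} ms".format((time.perf_counter() - start) / runs * 1000))
del engine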