In a recent project I needed to speed up TensorFlow code running on a Xavier, and since the Xavier already ships with TensorRT, I used TensorRT directly for the acceleration. This article targets TensorFlow; if you use another framework you will need to adapt the steps accordingly.
Introduction to TensorRT
TensorRT is NVIDIA's neural network inference acceleration engine built on CUDA and cuDNN. Compared with running inference in a general deep learning framework on CPU or GPU, it can deliver 10x or even 100x speedups, greatly improving the inference speed of deep learning models on edge devices.
TensorRT accelerates inference mainly in two ways: by fusing and merging layers in the model, and by lowering precision, e.g. running in FP16 or INT8.
The Xavier platform comes with the C++ version of TensorRT preinstalled, so this article does not cover installing TensorRT.
The overall workflow for using TensorRT is as follows:
- Train the model on a PC and export it as a pb model file
- On the PC, convert the model file into a format TensorRT can consume; for TensorFlow this means converting the pb file to the uff format
- On the Xavier, build an engine from the model file
- On the Xavier, run inference with the engine
Exporting the pb model file
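How the pb file is produced depends on your training code; typically the trained graph is frozen (the variables are converted to constants) and written out as a single .pb file, which is what the conversion step below expects.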
Converting the pb file to a uff file
See my other article on installing TensorRT 5.1 on Ubuntu 16.04. Once TensorRT is installed, you can convert the model with the convert-to-uff command; -o specifies the name of the converted model.
convert-to-uff model.pb -o model.uff
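Keep track of the names of the graph's input and output nodes: they are needed later when registering the input and output with the UFF parser on the Xavier.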
Building the engine on the Xavier
In my opinion the hardest part of this step is writing the CMakeLists.txt, since you have to pull in the CUDA libraries as well as the TensorRT headers and shared libraries. My CMakeLists.txt is shown below; both this step and the next one use this same file.
cmake_minimum_required(VERSION 2.8)
project(tensorrt)
find_package(OpenCV REQUIRED)
# CUDA headers
include_directories(/usr/local/cuda/include)
# TensorRT shared libraries
link_libraries("/usr/lib/aarch64-linux-gnu/libnvparsers.so")
link_libraries("/usr/lib/aarch64-linux-gnu/libnvinfer.so")
# CUDA runtime library
link_libraries("/usr/local/cuda/lib64/libcudart.so")
# executables
add_executable(tensorrt tensorrt.cpp)
add_executable(uff_to_plan uff_to_plan.cpp)
# link OpenCV
target_link_libraries(tensorrt ${OpenCV_LIBS})
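With this file in place, the usual out-of-source build (mkdir build && cd build && cmake .. && make) produces the two executables, tensorrt and uff_to_plan.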
The code that builds the engine and serializes it to disk is shown below.
One thing to be aware of in this step: TensorFlow uses the input layout (height, width, channel), i.e. HWC, whereas TensorRT works with CHW. This layout change is handled automatically when the model is converted, so you do not need to do it yourself here, but when you later run inference with TensorRT you must convert the input image to CHW (the cvImageToTensor helper in the inference section below does exactly this).
#include <fstream>
#include <iostream>
#include <string>
#include "NvInfer.h"
#include "NvUffParser.h"

using namespace std;
using namespace nvinfer1;
using namespace nvuffparser;

class Logger : public ILogger
{
    void log(Severity severity, const char *msg) override
    {
        cout << msg << endl;
    }
} gLogger;

int main(int argc, char *argv[])
{
    /* inputName, outputName, modelName, planFilename, inputHeight, inputWidth,
       maxBatchSize and maxWorkspaceSize are model-specific values; define them
       yourself or parse them from argv */
    /* parse uff */
    IBuilder *builder = createInferBuilder(gLogger);
    INetworkDefinition *network = builder->createNetwork();
    IUffParser *parser = createUffParser();
    /* register input and output (the names must match the nodes in the uff model) */
    parser->registerInput(inputName.c_str(), DimsCHW(3, inputHeight, inputWidth), UffInputOrder::kNCHW);
    parser->registerOutput(outputName.c_str());
    if (!parser->parse(modelName.c_str(), *network, DataType::kFLOAT))
    {
        cout << "Failed to parse UFF\n";
        builder->destroy();
        parser->destroy();
        network->destroy();
        return 1;
    }
    /* build engine */
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(maxWorkspaceSize);
    /* use FP16 */
    builder->setFp16Mode(true);
    // builder->setInt8Mode(true);
    ICudaEngine *engine = builder->buildCudaEngine(*network);
    /* serialize engine and write to file */
    ofstream planFile;
    planFile.open(planFilename);
    IHostMemory *serializedEngine = engine->serialize();
    planFile.write((char *)serializedEngine->data(), serializedEngine->size());
    planFile.close();
    /* clean up */
    builder->destroy();
    parser->destroy();
    network->destroy();
    engine->destroy();
    serializedEngine->destroy();
    return 0;
}
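The snippet above refers to a handful of model-specific values that are not defined in it. As a purely illustrative sketch (every concrete value here is a placeholder, not taken from the original model), they could be defined like this:

/* illustrative placeholders only -- substitute your own model's values */
const string modelName = "model.uff";      // the uff file produced by convert-to-uff
const string planFilename = "model.plan";  // placeholder: where to write the serialized engine
const string inputName = "input";          // placeholder: input node name of your graph
const string outputName = "output";        // placeholder: output node name of your graph
const int inputHeight = 256;               // placeholder input resolution
const int inputWidth = 256;
const int maxBatchSize = 1;
const size_t maxWorkspaceSize = 1 << 30;   // workspace the builder may use, here 1 GiB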
Running inference with TensorRT
In the previous step the engine was built, serialized, and saved as a plan file. In this step we read the plan file back, deserialize it to rebuild the engine, and then use the engine for inference.
void cvImageToTensor(const cv::Mat &image, float *tensor, nvinfer1::Dims dimensions)
{
    const size_t channels = dimensions.d[0];
    const size_t height = dimensions.d[1];
    const size_t width = dimensions.d[2];
    // TODO: validate dimensions match
    const size_t stridesCv[3] = {width * channels, channels, 1};
    const size_t strides[3] = {height * width, width, 1};
    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            for (int k = 0; k < channels; k++)
            {
                const size_t offsetCv = i * stridesCv[0] + j * stridesCv[1] + k * stridesCv[2];
                const size_t offset = k * strides[0] + i * strides[1] + j * strides[2];
                tensor[offset] = (float)image.data[offsetCv];
            }
        }
    }
}
void solve()
{
    /* planFileName, imageFolderName, inputName, outputName and kBatchSize are
       assumed to be defined elsewhere, as are the helpers getImages,
       numTensorElements and TensorToImage (sketches below) */
    /* rebuild the engine from the plan file */
    ifstream planFile(planFileName);
    stringstream planBuffer;
    planBuffer << planFile.rdbuf();
    string plan = planBuffer.str();
    IRuntime *runtime = createInferRuntime(gLogger);
    ICudaEngine *engine = runtime->deserializeCudaEngine((void *)plan.data(), plan.size(), nullptr);
    IExecutionContext *context = engine->createExecutionContext();
    // get the input and output binding indices
    int inputBindingIndex, outputBindingIndex;
    inputBindingIndex = engine->getBindingIndex(inputName.c_str());
    outputBindingIndex = engine->getBindingIndex(outputName.c_str());
    Dims inputDims, outputDims;
    inputDims = engine->getBindingDimensions(inputBindingIndex);
    outputDims = engine->getBindingDimensions(outputBindingIndex);
    // get the input and output sizes
    int inputWidth, inputHeight, outputHeight, outputWidth;
    inputHeight = inputDims.d[1];
    inputWidth = inputDims.d[2];
    outputHeight = outputDims.d[1];
    outputWidth = outputDims.d[2];
    /* allocate host buffers for the input and output */
    float *inputDataHost, *outputDataHost;
    size_t numInput, numOutput;
    numInput = numTensorElements(inputDims);
    numOutput = numTensorElements(outputDims);
    inputDataHost = (float *)malloc(numInput * sizeof(float));
    outputDataHost = (float *)malloc(numOutput * sizeof(float));
    /* allocate device buffers */
    void *inputDataDevice, *outputDataDevice;
    cudaMalloc(&inputDataDevice, numInput * sizeof(float));
    cudaMalloc(&outputDataDevice, numOutput * sizeof(float));
    if (inputDataDevice == nullptr || outputDataDevice == nullptr)
    {
        std::cerr << "Out of memory" << std::endl;
        exit(1);
    }
    void *bindings[2];
    bindings[inputBindingIndex] = inputDataDevice;
    bindings[outputBindingIndex] = outputDataDevice;
    // get the image names
    vector<string> images;
    getImages(imageFolderName, images);
    cout << "Executing inference engine..." << endl;
    for (size_t i = 0; i < images.size(); ++i)
    {
        string imageFileName = images[i];
        cv::Mat image = cv::imread(imageFolderName + imageFileName);
        /* BGR to RGB */
        cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
        /* resize to the network input size */
        cv::resize(image, image, cv::Size(inputWidth, inputHeight));
        /* convert HWC to float CHW */
        cvImageToTensor(image, inputDataHost, inputDims);
        cudaMemcpy(inputDataDevice, inputDataHost, numInput * sizeof(float), cudaMemcpyHostToDevice);
        /* execute engine */
        context->execute(kBatchSize, bindings);
        /* transfer output back to host */
        cudaMemcpy(outputDataHost, outputDataDevice, numOutput * sizeof(float), cudaMemcpyDeviceToHost);
        cv::Mat preds(outputHeight, outputWidth, CV_8UC1);
        TensorToImage(outputDataHost, preds, outputDims);
        // post-processing omitted
        cout << i << "/" << images.size() << " is over" << endl;
    }
    /* release in reverse order of creation */
    context->destroy();
    engine->destroy();
    runtime->destroy();
    free(inputDataHost);
    free(outputDataHost);
    cudaFree(inputDataDevice);
    cudaFree(outputDataDevice);
}
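The inference code relies on a few small helpers that are not shown above: numTensorElements, getImages, and TensorToImage. Below are minimal sketches of what they need to do; numTensorElements simply multiplies the binding dimensions together, getImages is sketched here with POSIX dirent, and TensorToImage assumes a single-channel CHW output that is cast back into an 8-bit cv::Mat. Adapt them to your own model's output format.

#include <dirent.h>

/* total number of elements for a binding with the given dimensions */
size_t numTensorElements(nvinfer1::Dims dims)
{
    if (dims.nbDims == 0)
        return 0;
    size_t size = 1;
    for (int i = 0; i < dims.nbDims; i++)
        size *= dims.d[i];
    return size;
}

/* collect the file names in a folder, skipping "." and ".." */
void getImages(const string &folder, vector<string> &images)
{
    DIR *dir = opendir(folder.c_str());
    if (dir == nullptr)
        return;
    struct dirent *entry;
    while ((entry = readdir(dir)) != nullptr)
    {
        string name = entry->d_name;
        if (name != "." && name != "..")
            images.push_back(name);
    }
    closedir(dir);
}

/* sketch assuming a single-channel CHW output: cast each float back to 8-bit */
void TensorToImage(const float *tensor, cv::Mat &image, nvinfer1::Dims dimensions)
{
    const int height = dimensions.d[1];
    const int width = dimensions.d[2];
    for (int i = 0; i < height; i++)
        for (int j = 0; j < width; j++)
            image.at<uchar>(i, j) = (uchar)tensor[i * width + j];
}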
Reference code
- https://github.com/NVIDIA-AI-IOT/tf_to_trt_image_classification
- The TensorRT samples that ship with the Xavier, under /usr/src/tensorrt/examples