tensorRT的使用包括兩個階段, build and runtime (deployment):
build:Import and optimize trained models to generate inference engines
build階段主要完成模型轉換(從caffe或TensorFlow到TensorRT),在模型轉換時會完成前述優化過程中的層間融合,精度校準。這一步的輸出是一個針對特定GPU平臺和網絡模型的優化過的TensorRT模型,這個TensorRT模型可以序列化存儲到磁盤或內存中。存儲到磁盤中的文件稱之爲 plan file。
build階段依次實例化以下對象
- ILogger
- IBuilder
- INetworkDefinition
- IParser
- ICudaEngine
- serialize成IHostMemory
build示例代碼
// Build phase: parse a Caffe model into a TensorRT network, build an
// optimized ICudaEngine for the current GPU, then serialize it so it can
// be written out as a plan file.
// NOTE(review): gLogger, pluginFactory, engine_file, enginefile, dataType,
// deployFile, modelFile, OUTPUT_BLOB_NAMES, maxBatchSize, engine and
// TRTModelStream are declared elsewhere in the file -- confirm their types
// against the enclosing scope.
//brief: Builds an engine from a network definition.
// Create an IBuilder; gLogger (an ILogger) receives build diagnostics.
IBuilder *builder = createInferBuilder(gLogger);
//brief: A network definition for input to the builder.
// Create an empty network definition for the parser to populate.
INetworkDefinition *network = builder->createNetwork();
//brief: Class used for parsing Caffe models. Allows users to export models trained using Caffe to TRT.
// Create an ICaffeParser (derives from IParser) to import the Caffe model.
ICaffeParser *parser = createCaffeParser();
//brief: Set the IPluginFactory used to create the user defined plugins.
// Register the factory that instantiates user-defined (custom) layers.
parser->setPluginFactory(&pluginFactory);
engine_file = enginefile;
// Bail out early if the GPU lacks fast support for the requested precision.
if ((dataType == DataType::kINT8 && !builder->platformHasFastInt8()) ||
(dataType == DataType::kHALF && !builder->platformHasFastFp16()))
return false;
// Parse the prototxt + caffemodel pair into the network definition.
// For INT8 the weights are imported as FP32; calibration happens later.
const IBlobNameToTensor *blobNameToTensor = parser->parse(
deployFile.c_str(),
modelFile.c_str(),
*network,
dataType == DataType::kINT8 ? DataType::kFLOAT: dataType);
// specify which tensors are output
for (auto &s : OUTPUT_BLOB_NAMES)
//markOutput brief: Mark a tensor as a network output.
// Mark each named blob as a network output tensor.
network->markOutput(*blobNameToTensor->find(s.c_str()));
//find brief: Given a blob name, returns a pointer to a ITensor object.
// Build the engine
// Configure batch size and scratch-memory budget before building.
builder->setMaxBatchSize(maxBatchSize);
//workspaceSize brief: The maximum GPU temporary memory which the engine can use at execution time.
builder->setMaxWorkspaceSize(1 << 30);
//brief: Build a CUDA engine from a network definition.
// Build the optimized ICudaEngine (layer fusion, kernel auto-tuning, etc.).
engine = builder->buildCudaEngine(*network);
assert(engine);
// Network and parser are no longer needed once the engine exists.
network->destroy();
parser->destroy();
// serialize the engine,
//brief: Serialize the network to a stream.
// Serialize the engine into host memory (IHostMemory); this buffer is what
// gets stored on disk as the plan file.
TRTModelStream = engine->serialize();
engine->destroy();
builder->destroy();
pluginFactory.destroyPlugin();
runtime (deploy):Generate runtime inference engine for inference
runtime或者說是deploy階段主要完成推理過程,Kernel Auto-Tuning 和 Dynamic Tensor Memory 應該是在這裏完成的。將上面一個步驟中的plan文件首先反序列化,並創建一個 runtime engine,然後就可以輸入數據(比如測試集或數據集之外的圖片),然後輸出分類向量結果或檢測結果。
tensorRT的好處就是不需要安裝其他深度學習框架,就可以實現部署和推理。
runtime階段實例化以下對象
- IRuntime
- ICudaEngine
- IExecutionContext
以下是runtime示例代碼
// Runtime (deploy) phase: deserialize the serialized engine, bind I/O
// buffers, and run one asynchronous inference on a CUDA stream.
// NOTE(review): runtime, engine, context, buffers, stream, input, output,
// inputDim, outputDim, batchSize, size_of_single_input, INPUT_BLOB_NAME,
// OUTPUT_BLOB_NAMES, gLogger, pluginFactory and CUDA_CHECK are declared
// elsewhere in the file -- confirm their types against the enclosing scope.
// Create the IRuntime object; gLogger receives runtime diagnostics.
runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
// Deserialize the plan bytes back into an ICudaEngine.
engine = runtime->deserializeCudaEngine(TRTModelStream->data(), TRTModelStream->size(), &pluginFactory);
assert(engine != nullptr);
std::cout << "createinference" << std::endl;
// List every binding (engine I/O slot) and whether it is input or output.
for (int bi = 0; bi < engine->getNbBindings(); bi++)
{
if (engine->bindingIsInput(bi) == true) printf("Binding %d (%s): Input.\n", bi, engine->getBindingName(bi));
else printf("Binding %d (%s): Output.\n", bi, engine->getBindingName(bi));
}
std::cout << "****TensorRT Phase:********doInference" << std::endl;
// Create an IExecutionContext; it holds per-invocation state and is what
// actually launches the kernels.
context = engine->createExecutionContext();
//context->setProfiler(&gProfiler);
assert(engine->getNbBindings() == 2);
// In order to bind the buffers, we need to know the names of the
// input and output tensors, then look up their binding indices.
inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME.c_str());// inputIndex=0
outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAMES[0].c_str());// outputIndex=1
//std::cout<<"outputIndex = "<<outputIndex<<std::endl;
// Allocate GPU memory for input / output data.
CUDA_CHECK(
cudaMalloc(&buffers[inputIndex], batchSize * inputDim.c() * inputDim.h() * inputDim.w() * sizeof(float)));
CUDA_CHECK(cudaMalloc(
&buffers[outputIndex], batchSize * outputDim.c() * outputDim.h() * outputDim.w() * sizeof(float)));
// Use a CUDA stream to manage the concurrency of copying and executing.
CUDA_CHECK(cudaStreamCreate(&stream));
// Host -> device: copy the input data into the device input buffer.
// FIX: wrapped in CUDA_CHECK for consistency with every other CUDA call here.
CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input,
batchSize * size_of_single_input,
cudaMemcpyHostToDevice, stream));
// Launch inference asynchronously on the same stream.
context->enqueue(batchSize, buffers, stream, nullptr);
// Device -> host: copy the model output back to host memory.
// FIX: pass `stream` explicitly; the original omitted it, so this copy ran
// on the default stream and was NOT ordered after enqueue() on `stream`,
// risking a read of incomplete output.
CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex],
batchSize * outputDim.c() * outputDim.h() * outputDim.w() * sizeof(float),
cudaMemcpyDeviceToHost, stream));
// Block the host until everything queued on the stream has finished, so
// `output` is safe to read afterwards.
CUDA_CHECK(cudaStreamSynchronize(stream));
參考
TensorRT(1)-介紹-使用-安裝:https://arleyzhang.github.io/articles/7f4b25ce/