Inference Workflow
1. Logging interface
TensorRT can be initialized in one of two ways:
① create an IBuilder object to optimize a network (the optimized network can then be serialized to a file);
② create an IRuntime object to execute an optimized network (loaded from a serialized file).
In either case you must implement a logging interface, through which TensorRT reports errors, warnings, and informational messages.
Logger.cpp
Logger gLogger{Logger::Severity::kWARNING};
//! gLoggerSample sets the default reportable severity level for sample-specific logging.
Logger gLoggerSample{Logger::Severity::kINFO};
LogStreamConsumer gLogVerbose{LOG_VERBOSE(gLoggerSample)};
LogStreamConsumer gLogInfo{LOG_INFO(gLoggerSample)};
LogStreamConsumer gLogWarning{LOG_WARN(gLoggerSample)};
LogStreamConsumer gLogError{LOG_ERROR(gLoggerSample)};
LogStreamConsumer gLogFatal{LOG_FATAL(gLoggerSample)};
void setReportableSeverity(Logger::Severity severity)
{
gLogger.setReportableSeverity(severity);
gLogVerbose.setReportableSeverity(severity);
gLogInfo.setReportableSeverity(severity);
gLogWarning.setReportableSeverity(severity);
gLogError.setReportableSeverity(severity);
gLogFatal.setReportableSeverity(severity);
}
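The Logger class above wraps TensorRT's nvinfer1::ILogger interface. A minimal sketch of implementing the interface directly (the class name MinimalLogger is illustrative, not part of the sample) only needs to override log():
#include "NvInfer.h"
#include <iostream>
// Minimal ILogger: print warnings and errors, drop less severe messages.
class MinimalLogger : public nvinfer1::ILogger
{
public:
    void log(Severity severity, const char* msg) override
    {
        // Severities are ordered kINTERNAL_ERROR < kERROR < kWARNING < kINFO < kVERBOSE.
        if (severity <= Severity::kWARNING)
            std::cerr << msg << std::endl;
    }
};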
2. Create the builder and network
IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
INetworkDefinition* network = builder->createNetwork();
3. Create a parser and parse the model
The following serialized model formats are currently supported: Caffe, ONNX (exported from PyTorch, Caffe2, etc.), and UFF (TensorFlow).
Caffe:
Create the parser
ICaffeParser* parser = createCaffeParser();
Parse the model
const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse(
locateFile(mParams.prototxtFileName, mParams.dataDirs).c_str(),
locateFile(mParams.weightsFileName, mParams.dataDirs).c_str(),
*network,
nvinfer1::DataType::kFLOAT);
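Note that parse() only returns a blob-name-to-tensor mapping; the outputs still need to be marked on the network before building. sampleMNIST does this roughly as follows, where mParams.outputTensorNames is the sample's list of output blob names:
// Mark each named Caffe blob as a network output so the engine exposes it.
for (auto& s : mParams.outputTensorNames)
    network->markOutput(*blobNameToTensor->find(s.c_str()));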
Configure the builder and build the engine
builder->setMaxBatchSize(mParams.batchSize);
builder->setMaxWorkspaceSize(16_MB);
builder->allowGPUFallback(true);
builder->setFp16Mode(mParams.fp16);
builder->setInt8Mode(mParams.int8);
builder->setStrictTypeConstraints(true);
samplesCommon::enableDLA(builder, mParams.dlaCore);
mEngine = std::shared_ptr<nvinfer1::ICudaEngine>(builder->buildCudaEngine(*network), samplesCommon::InferDeleter());
Inference: the key code
samplesCommon::BufferManager buffers(mEngine, mParams.batchSize);
auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
// Fill the host-side input buffer, then copy it to the device before enqueueing.
float* hostInputBuffer = static_cast<float*>(buffers.getHostBuffer(inputTensorName));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
buffers.copyInputToDeviceAsync(stream);
if (!context->enqueue(mParams.batchSize, buffers.getDeviceBindings().data(), stream, nullptr))
    return false;
buffers.copyOutputToHostAsync(stream);
cudaStreamSynchronize(stream);
// prob points at the host-side copy of the network output.
const float* prob = static_cast<const float*>(buffers.getHostBuffer(outputTensorName));
float val{0.0f};
int idx{0};
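// Find the class with the highest score and print a simple ASCII histogram of all 10 digit scores.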
for (unsigned int i = 0; i < 10; i++)
{
val = std::max(val, prob[i]);
if (val == prob[i])
idx = i;
gLogInfo << i << ": " << std::string(int(std::floor(prob[i] * 10 + 0.5f)), '*') << "\n";
}
gLogInfo << std::endl;
return (idx == groundTruthDigit && val > 0.9f);
ONNX:
Create the parser
auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());
Parse the ONNX model
if ( !parser->parseFromFile( locateFile(modelFile, gArgs.dataDirs).c_str(), static_cast<int>(gLogger.getReportableSeverity()) ) )
{
gLogError << "Failure while parsing ONNX file" << std::endl;
return false;
}
Configure the builder and build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(1 << 20);
builder->setFp16Mode(gArgs.runInFp16);
builder->setInt8Mode(gArgs.runInInt8);
if (gArgs.runInInt8)
{
samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
}
samplesCommon::enableDLA(builder, gArgs.useDLACore);
ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);
// we can destroy the parser
parser->destroy();
// serialize the engine, then close everything down
trtModelStream = engine->serialize();
engine->destroy();
network->destroy();
builder->destroy();
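Step ① above mentioned that the optimized network can be saved as a serialized file. A minimal sketch using the trtModelStream just obtained (the file name sample.engine is illustrative):
#include <fstream>
#include <iterator>
#include <vector>
// Write the serialized engine to disk...
std::ofstream engineFile("sample.engine", std::ios::binary);
engineFile.write(static_cast<const char*>(trtModelStream->data()), trtModelStream->size());
engineFile.close();
// ...and read it back later to feed IRuntime::deserializeCudaEngine().
std::ifstream inFile("sample.engine", std::ios::binary);
std::vector<char> engineData((std::istreambuf_iterator<char>(inFile)),
                             std::istreambuf_iterator<char>());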
Deserialize the engine and run inference
IRuntime* runtime = createInferRuntime(gLogger.getTRTLogger());
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
assert(engine != nullptr);
trtModelStream->destroy();
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
// run inference
float prob[OUTPUT_SIZE];
doInference(*context, data, prob, 1);
MNIST inference fills prob, i.e. the network output:
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// Input and output buffer pointers that we pass to the engine; the engine requires exactly
// IEngine::getNbBindings() of these, but in this case we know there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex{}, outputIndex{};
for (int b = 0; b < engine.getNbBindings(); ++b)
{
if (engine.bindingIsInput(b))
inputIndex = b;
else
outputIndex = b;
}
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}
UFF:
Create the parser
IUffParser* parser = createUffParser();
Register the input and output
parser->registerInput("Input", DimsCHW(3, 300, 300), UffInputOrder::kNCHW);
// MarkOutput_0 is a node created by the UFF converter when we specify an output with -O.
parser->registerOutput("MarkOutput_0");
Configure the builder and build the engine
builder->setMaxBatchSize(maxBatchSize);
// The _GB literal operator is defined in common/common.h
builder->setMaxWorkspaceSize(1_GB);
// We need about 1GB of scratch space for the plugin layer for batch size 5.
if (gArgs.runInInt8)
{
builder->setInt8Mode(gArgs.runInInt8);
builder->setInt8Calibrator(calibrator);
}
builder->setFp16Mode(gArgs.runInFp16);
samplesCommon::enableDLA(builder, gArgs.useDLACore);
gLogInfo << "Begin building engine..." << std::endl;
ICudaEngine* engine = builder->buildCudaEngine(*network);
// We don't need the network any more, and we can destroy the parser.
network->destroy();
parser->destroy();
// Serialize the engine, then close everything down.
trtModelStream = engine->serialize();
builder->destroy();
engine->destroy();
After obtaining the serialized trtModelStream, preprocess the images: data holds the image data, detectionOut receives the network's detection output, and keepCount the number of detections kept for each image.
IRuntime* runtime = createInferRuntime(gLogger.getTRTLogger());
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
assert(engine != nullptr);
trtModelStream->destroy();
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
vector<float> data(N * INPUT_C * INPUT_H * INPUT_W);
vector<float> detectionOut(N * detectionOutputParam.keepTopK * 7);
vector<int> keepCount(N);
// Run inference.
doInference(*context, &data[0], &detectionOut[0], &keepCount[0], N);
sampleUffSSD C++ inference
1. Download the model:
http://download.tensorflow.org/models/object_detection/ssd_inception_v2_coco_2017_11_17.tar.gz
2. From the TensorRT-5.1.5.0\samples\sampleUffSSD directory, run the conversion below; the uff package ships with the Linux TensorRT distribution and is installed with pip install.
python3 xxx\Anaconda3\envs\python36\Lib\site-packages\uff\bin\convert_to_uff.py \
ssd_inception_v2_coco_2017_11_17/frozen_inference_graph.pb -O NMS \
-p config.py \
-o ssd_inception_v2_coco_2017_11_17/sample_ssd_relu6.uff
3. Open sample_uff_ssd.sln under TensorRT-5.1.5.0\samples\sampleUffSSD in VS2015, update the include and library paths to match your own CUDA and TensorRT installations, then build and run.
The full source follows for reference.
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cublas_v2.h>
#include <cudnn.h>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <string.h>
#include <time.h>
#include <unordered_map>
#include <vector>
#include "BatchStreamPPM.h"
#include "NvUffParser.h"
#include "logger.h"
#include "common.h"
#include "argsParser.h"
#include "NvInferPlugin.h"
#include "EntropyCalibrator.h"
using namespace nvinfer1;
using namespace nvuffparser;
using namespace std; // Needed for the unqualified string/vector/getline used below.
const std::string gSampleName = "TensorRT.sample_uff_ssd";
static samplesCommon::Args gArgs;
static constexpr int OUTPUT_CLS_SIZE = 91;
const char* OUTPUT_BLOB_NAME0 = "NMS";
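// INPUT_BLOB_NAME and the INPUT_C/INPUT_H/INPUT_W constants used below come from BatchStreamPPM.h.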
//INT8 Calibration, currently set to calibrate over 100 images
static constexpr int CAL_BATCH_SIZE = 10;
static constexpr int FIRST_CAL_BATCH = 0, NB_CAL_BATCHES = 10;
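// Field order: {shareLocation, varianceEncodedInTarget, backgroundLabelId, numClasses, topK, keepTopK,
//               confidenceThreshold, nmsThreshold, codeType, inputOrder, confSigmoid, isNormalized}.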
DetectionOutputParameters detectionOutputParam{true, false, 0, OUTPUT_CLS_SIZE, 100, 100, 0.5, 0.6, CodeTypeSSD::TF_CENTER, {0, 2, 1}, true, true};
// Visualization
const float visualizeThreshold = 0.5;
void printOutput(int64_t eltCount, DataType dtype, void* buffer)
{
assert(samplesCommon::getElementSize(dtype) == sizeof(float));
size_t memSize = eltCount * samplesCommon::getElementSize(dtype);
float* outputs = new float[eltCount];
CHECK(cudaMemcpyAsync(outputs, buffer, memSize, cudaMemcpyDeviceToHost));
int maxIdx = std::distance(outputs, std::max_element(outputs, outputs + eltCount));
gLogVerbose << "Output:\n";
for (int64_t eltIdx = 0; eltIdx < eltCount; ++eltIdx)
{
gLogVerbose << eltIdx << " => " << outputs[eltIdx] << "\t : ";
if (eltIdx == maxIdx)
gLogVerbose << "***";
gLogVerbose << "\n";
}
gLogVerbose << std::endl;
delete[] outputs;
}
std::string locateFile(const std::string& input)
{
std::vector<std::string> dirs{"data/ssd/",
"data/ssd/VOC2007/",
"data/ssd/VOC2007/PPMImages/",
"data/samples/ssd/",
"data/int8_samples/ssd/",
"int8/ssd/",
"data/samples/ssd/VOC2007/",
"data/samples/ssd/VOC2007/PPMImages/"};
return locateFile(input, dirs);
}
void populateTFInputData(float* data)
{
auto fileName = locateFile("inp_bus.txt");
std::ifstream labelFile(fileName);
string line;
int id = 0;
while (getline(labelFile, line))
{
istringstream iss(line);
float num;
iss >> num;
data[id++] = num;
}
return;
}
void populateClassLabels(std::string (&CLASSES)[OUTPUT_CLS_SIZE])
{
auto fileName = locateFile("ssd_coco_labels.txt");
std::ifstream labelFile(fileName);
string line;
int id = 0;
while (getline(labelFile, line))
{
CLASSES[id++] = line;
}
return;
}
std::vector<std::pair<int64_t, DataType>>
calculateBindingBufferSizes(const ICudaEngine& engine, int nbBindings, int batchSize)
{
std::vector<std::pair<int64_t, DataType>> sizes;
for (int i = 0; i < nbBindings; ++i)
{
Dims dims = engine.getBindingDimensions(i);
DataType dtype = engine.getBindingDataType(i);
int64_t eltCount = samplesCommon::volume(dims) * batchSize;
sizes.push_back(std::make_pair(eltCount, dtype));
}
return sizes;
}
ICudaEngine* loadModelAndCreateEngine(const char* uffFile, int maxBatchSize,
IUffParser* parser, IInt8Calibrator* calibrator, IHostMemory*& trtModelStream)
{
// Create the builder
IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
assert(builder != nullptr);
// Parse the UFF model to populate the network, then set the outputs.
INetworkDefinition* network = builder->createNetwork();
gLogInfo << "Begin parsing model..." << std::endl;
if (!parser->parse(uffFile, *network, nvinfer1::DataType::kFLOAT))
{
gLogError << "Failure while parsing UFF file" << std::endl;
return nullptr;
}
gLogInfo << "End parsing model..." << std::endl;
// Build the engine.
builder->setMaxBatchSize(maxBatchSize);
// The _GB literal operator is defined in common/common.h
builder->setMaxWorkspaceSize(1_GB); // We need about 1GB of scratch space for the plugin layer for batch size 5.
if (gArgs.runInInt8)
{
builder->setInt8Mode(gArgs.runInInt8);
builder->setInt8Calibrator(calibrator);
}
builder->setFp16Mode(gArgs.runInFp16);
samplesCommon::enableDLA(builder, gArgs.useDLACore);
gLogInfo << "Begin building engine..." << std::endl;
ICudaEngine* engine = builder->buildCudaEngine(*network);
if (!engine)
{
gLogError << "Unable to create engine" << std::endl;
return nullptr;
}
gLogInfo << "End building engine..." << std::endl;
// We don't need the network any more, and we can destroy the parser.
network->destroy();
parser->destroy();
// Serialize the engine, then close everything down.
trtModelStream = engine->serialize();
builder->destroy();
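// The UFF parser uses protobuf internally; release its global state now that parsing is done.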
shutdownProtobufLibrary();
return engine;
}
void doInference(IExecutionContext& context, float* inputData, float* detectionOut, int* keepCount, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// Input and output buffer pointers that we pass to the engine; the engine requires exactly
// IEngine::getNbBindings() of these, but in this case we know there is exactly 1 input and 2 outputs.
int nbBindings = engine.getNbBindings();
std::vector<void*> buffers(nbBindings);
std::vector<std::pair<int64_t, DataType>> buffersSizes = calculateBindingBufferSizes(engine, nbBindings, batchSize);
for (int i = 0; i < nbBindings; ++i)
{
auto bufferSizesOutput = buffersSizes[i];
buffers[i] = samplesCommon::safeCudaMalloc(bufferSizesOutput.first * samplesCommon::getElementSize(bufferSizesOutput.second));
}
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings().
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex0 = engine.getBindingIndex(OUTPUT_BLOB_NAME0),
outputIndex1 = outputIndex0 + 1; //engine.getBindingIndex(OUTPUT_BLOB_NAME1);
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], inputData, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
auto t_start = std::chrono::high_resolution_clock::now();
context.execute(batchSize, &buffers[0]);
auto t_end = std::chrono::high_resolution_clock::now();
float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
gLogInfo << "Time taken for inference is " << total << " ms." << std::endl;
for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx)
{
if (engine.bindingIsInput(bindingIdx))
continue;
auto bufferSizesOutput = buffersSizes[bindingIdx];
printOutput(bufferSizesOutput.first, bufferSizesOutput.second,
buffers[bindingIdx]);
}
CHECK(cudaMemcpyAsync(detectionOut, buffers[outputIndex0], batchSize * detectionOutputParam.keepTopK * 7 * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(keepCount, buffers[outputIndex1], batchSize * sizeof(int), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// Release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex0]));
CHECK(cudaFree(buffers[outputIndex1]));
}
class FlattenConcat : public IPluginV2
{
public:
FlattenConcat(int concatAxis, bool ignoreBatch)
: mIgnoreBatch(ignoreBatch)
, mConcatAxisID(concatAxis)
{
assert(mConcatAxisID == 1 || mConcatAxisID == 2 || mConcatAxisID == 3);
}
//clone constructor
FlattenConcat(int concatAxis, bool ignoreBatch, int numInputs, int outputConcatAxis, int* inputConcatAxis)
: mIgnoreBatch(ignoreBatch)
, mConcatAxisID(concatAxis)
, mOutputConcatAxis(outputConcatAxis)
, mNumInputs(numInputs)
{
CHECK(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
for (int i = 0; i < mNumInputs; ++i)
mInputConcatAxis[i] = inputConcatAxis[i];
}
FlattenConcat(const void* data, size_t length)
{
const char *d = reinterpret_cast<const char*>(data), *a = d;
mIgnoreBatch = read<bool>(d);
mConcatAxisID = read<int>(d);
assert(mConcatAxisID == 1 || mConcatAxisID == 2 || mConcatAxisID == 3);
mOutputConcatAxis = read<int>(d);
mNumInputs = read<int>(d);
CHECK(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
CHECK(cudaMallocHost((void**) &mCopySize, mNumInputs * sizeof(size_t)));
std::for_each(mInputConcatAxis, mInputConcatAxis + mNumInputs, [&](int& inp) { inp = read<int>(d); });
mCHW = read<nvinfer1::DimsCHW>(d);
std::for_each(mCopySize, mCopySize + mNumInputs, [&](size_t& inp) { inp = read<size_t>(d); });
assert(d == a + length);
}
~FlattenConcat()
{
if (mInputConcatAxis)
CHECK(cudaFreeHost(mInputConcatAxis));
if (mCopySize)
CHECK(cudaFreeHost(mCopySize));
}
int getNbOutputs() const override { return 1; }
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
assert(nbInputDims >= 1);
assert(index == 0);
mNumInputs = nbInputDims;
CHECK(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
mOutputConcatAxis = 0;
for (int i = 0; i < nbInputDims; ++i)
{
int flattenInput = 0;
assert(inputs[i].nbDims == 3);
if (mConcatAxisID != 1)
assert(inputs[i].d[0] == inputs[0].d[0]);
if (mConcatAxisID != 2)
assert(inputs[i].d[1] == inputs[0].d[1]);
if (mConcatAxisID != 3)
assert(inputs[i].d[2] == inputs[0].d[2]);
flattenInput = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2];
mInputConcatAxis[i] = flattenInput;
mOutputConcatAxis += mInputConcatAxis[i];
}
return DimsCHW(mConcatAxisID == 1 ? mOutputConcatAxis : 1,
mConcatAxisID == 2 ? mOutputConcatAxis : 1,
mConcatAxisID == 3 ? mOutputConcatAxis : 1);
}
int initialize() override
{
CHECK(cublasCreate(&mCublas));
return 0;
}
void terminate() override
{
CHECK(cublasDestroy(mCublas));
}
size_t getWorkspaceSize(int) const override { return 0; }
int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override
{
int numConcats = 1;
assert(mConcatAxisID != 0);
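// numConcats is the product of the dimensions in front of the concat axis; each such slice
// of every input is interleaved into the concatenated output below.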
numConcats = std::accumulate(mCHW.d, mCHW.d + mConcatAxisID - 1, 1, std::multiplies<int>());
cublasSetStream(mCublas, stream);
if (!mIgnoreBatch)
numConcats *= batchSize;
float* output = reinterpret_cast<float*>(outputs[0]);
int offset = 0;
for (int i = 0; i < mNumInputs; ++i)
{
const float* input = reinterpret_cast<const float*>(inputs[i]);
float* inputTemp;
CHECK(cudaMalloc(&inputTemp, mCopySize[i] * batchSize));
CHECK(cudaMemcpyAsync(inputTemp, input, mCopySize[i] * batchSize, cudaMemcpyDeviceToDevice, stream));
for (int n = 0; n < numConcats; ++n)
{
CHECK(cublasScopy(mCublas, mInputConcatAxis[i],
inputTemp + n * mInputConcatAxis[i], 1,
output + (n * mOutputConcatAxis + offset), 1));
}
CHECK(cudaFree(inputTemp));
offset += mInputConcatAxis[i];
}
return 0;
}
size_t getSerializationSize() const override
{
return sizeof(bool) + sizeof(int) * (3 + mNumInputs) + sizeof(nvinfer1::Dims) + (sizeof(size_t) * mNumInputs);
}
void serialize(void* buffer) const override
{
char *d = reinterpret_cast<char*>(buffer), *a = d;
write(d, mIgnoreBatch);
write(d, mConcatAxisID);
write(d, mOutputConcatAxis);
write(d, mNumInputs);
for (int i = 0; i < mNumInputs; ++i)
{
write(d, mInputConcatAxis[i]);
}
write(d, mCHW);
for (int i = 0; i < mNumInputs; ++i)
{
write(d, mCopySize[i]);
}
assert(d == a + getSerializationSize());
}
void configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override
{
assert(nbOutputs == 1);
mCHW = inputs[0];
assert(inputs[0].nbDims == 3);
CHECK(cudaMallocHost((void**) &mCopySize, nbInputs * sizeof(size_t)));
for (int i = 0; i < nbInputs; ++i)
{
mCopySize[i] = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2] * sizeof(float);
}
}
bool supportsFormat(DataType type, PluginFormat format) const override
{
return (type == DataType::kFLOAT && format == PluginFormat::kNCHW);
}
const char* getPluginType() const override { return "FlattenConcat_TRT"; }
const char* getPluginVersion() const override { return "1"; }
void destroy() override { delete this; }
IPluginV2* clone() const override
{
return new FlattenConcat(mConcatAxisID, mIgnoreBatch, mNumInputs, mOutputConcatAxis, mInputConcatAxis);
}
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
template <typename T>
void write(char*& buffer, const T& val) const
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template <typename T>
T read(const char*& buffer)
{
T val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
return val;
}
size_t* mCopySize = nullptr;
bool mIgnoreBatch{false};
int mConcatAxisID{0}, mOutputConcatAxis{0}, mNumInputs{0};
int* mInputConcatAxis = nullptr;
nvinfer1::Dims mCHW;
cublasHandle_t mCublas;
std::string mNamespace;
};
namespace
{
const char* FLATTENCONCAT_PLUGIN_VERSION{"1"};
const char* FLATTENCONCAT_PLUGIN_NAME{"FlattenConcat_TRT"};
} // namespace
class FlattenConcatPluginCreator : public IPluginCreator
{
public:
FlattenConcatPluginCreator()
{
mPluginAttributes.emplace_back(PluginField("axis", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("ignoreBatch", nullptr, PluginFieldType::kINT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
~FlattenConcatPluginCreator() {}
const char* getPluginName() const override { return FLATTENCONCAT_PLUGIN_NAME; }
const char* getPluginVersion() const override { return FLATTENCONCAT_PLUGIN_VERSION; }
const PluginFieldCollection* getFieldNames() override { return &mFC; }
IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
{
const PluginField* fields = fc->fields;
for (int i = 0; i < fc->nbFields; ++i)
{
const char* attrName = fields[i].name;
if (!strcmp(attrName, "axis"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mConcatAxisID = *(static_cast<const int*>(fields[i].data));
}
if (!strcmp(attrName, "ignoreBatch"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mIgnoreBatch = *(static_cast<const int*>(fields[i].data)) != 0;
}
}
return new FlattenConcat(mConcatAxisID, mIgnoreBatch);
}
IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
{
//This object will be deleted when the network is destroyed, which will
//call FlattenConcat::destroy()
return new FlattenConcat(serialData, serialLength);
}
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
static PluginFieldCollection mFC;
bool mIgnoreBatch{false};
int mConcatAxisID;
static std::vector<PluginField> mPluginAttributes;
std::string mNamespace = "";
};
PluginFieldCollection FlattenConcatPluginCreator::mFC{};
std::vector<PluginField> FlattenConcatPluginCreator::mPluginAttributes;
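// Register the creator with the global plugin registry so the UFF parser can resolve
// FlattenConcat_TRT nodes by plugin name and version.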
REGISTER_TENSORRT_PLUGIN(FlattenConcatPluginCreator);
void printHelp(const char* name)
{
std::cout << "Usage: " << name << "\n"
<< "Optional Parameters:\n"
<< " -h, --help Display help information.\n"
<< " --useDLACore=N Specify the DLA engine to run on.\n"
<< " --fp16 Specify to run in fp16 mode.\n"
<< " --int8 Specify to run in int8 mode." << std::endl;
}
int main(int argc, char* argv[])
{
// Parse command-line arguments.
bool argsOK = samplesCommon::parseArgs(gArgs, argc, argv);
if (gArgs.help)
{
printHelp(argv[0]);
return EXIT_SUCCESS;
}
if (!argsOK)
{
gLogError << "Invalid arguments" << std::endl;
printHelp(argv[0]);
return EXIT_FAILURE;
}
auto sampleTest = gLogger.defineTest(gSampleName, argc, const_cast<const char**>(argv));
gLogger.reportTestStart(sampleTest);
initLibNvInferPlugins(&gLogger.getTRTLogger(), "");
auto fileName = locateFile("sample_ssd_relu6.uff");
gLogInfo << fileName << std::endl;
const int N = 1;
auto parser = createUffParser();
BatchStream calibrationStream(CAL_BATCH_SIZE, NB_CAL_BATCHES);
parser->registerInput("Input", DimsCHW(3, 300, 300), UffInputOrder::kNCHW);
// MarkOutput_0 is a node created by the UFF converter when we specify an output with -O.
parser->registerOutput("MarkOutput_0");
IHostMemory* trtModelStream{nullptr};
std::unique_ptr<IInt8Calibrator> calibrator;
calibrator.reset(new Int8EntropyCalibrator2(calibrationStream, FIRST_CAL_BATCH, "UffSSD", INPUT_BLOB_NAME));
ICudaEngine* tmpEngine = loadModelAndCreateEngine(fileName.c_str(), N, parser, calibrator.get(), trtModelStream);
assert(tmpEngine != nullptr);
assert(trtModelStream != nullptr);
tmpEngine->destroy();
// Available images.
std::vector<std::string> imageList = {"dog.ppm", "bus.ppm"};
std::vector<samplesCommon::PPM<INPUT_C, INPUT_H, INPUT_W>> ppms(N);
assert(ppms.size() <= imageList.size());
gLogInfo << " Num batches " << N << std::endl;
for (int i = 0; i < N; ++i)
{
readPPMFile(locateFile(imageList[i]), ppms[i]);
}
vector<float> data(N * INPUT_C * INPUT_H * INPUT_W);
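// Convert each interleaved HWC uint8 PPM image to planar CHW floats, scaling [0, 255] to [-1, 1].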
for (int i = 0, volImg = INPUT_C * INPUT_H * INPUT_W; i < N; ++i)
{
for (int c = 0; c < INPUT_C; ++c)
{
for (unsigned j = 0, volChl = INPUT_H * INPUT_W; j < volChl; ++j)
{
data[i * volImg + c * volChl + j] = (2.0 / 255.0) * float(ppms[i].buffer[j * INPUT_C + c]) - 1.0;
}
}
}
gLogInfo << " Data Size " << data.size() << std::endl;
// Deserialize the engine.
gLogInfo << "*** deserializing" << std::endl;
IRuntime* runtime = createInferRuntime(gLogger.getTRTLogger());
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
assert(engine != nullptr);
trtModelStream->destroy();
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
// Host memory for outputs.
vector<float> detectionOut(N * detectionOutputParam.keepTopK * 7);
vector<int> keepCount(N);
// Run inference.
doInference(*context, &data[0], &detectionOut[0], &keepCount[0], N);
gLogInfo << " KeepCount " << keepCount[0] << std::endl;
std::string CLASSES[OUTPUT_CLS_SIZE];
populateClassLabels(CLASSES);
bool pass = true;
for (int p = 0; p < N; ++p)
{
int numDetections = 0;
// at least one correct detection
bool correctDetection = false;
for (int i = 0; i < keepCount[p]; ++i)
{
float* det = &detectionOut[0] + (p * detectionOutputParam.keepTopK + i) * 7;
if (det[2] < visualizeThreshold)
continue;
// Output format for each detection is stored in the below order
// [image_id, label, confidence, xmin, ymin, xmax, ymax]
assert((int) det[1] < OUTPUT_CLS_SIZE);
std::string storeName = CLASSES[(int) det[1]] + "-" + std::to_string(det[2]) + ".ppm";
numDetections++;
if ((p == 0 && CLASSES[(int) det[1]] == "dog") || (p == 1 && (CLASSES[(int) det[1]] == "truck" || CLASSES[(int) det[1]] == "car")))
{
correctDetection = true;
}
gLogInfo << "Detected " << CLASSES[(int) det[1]].c_str()
<< " in the image " << int(det[0]) << " (" << ppms[p].fileName.c_str() << ")"
<< " with confidence " << det[2] * 100.f << " and coordinates (" << det[3] * INPUT_W << "," << det[4] * INPUT_H << ")"
<< ",(" << det[5] * INPUT_W << "," << det[6] * INPUT_H << ")."
<< std::endl;
gLogInfo << "Result stored in " << storeName.c_str() << "." << std::endl;
samplesCommon::writePPMFileWithBBox(storeName, ppms[p], {det[3] * INPUT_W, det[4] * INPUT_H, det[5] * INPUT_W, det[6] * INPUT_H});
}
pass &= correctDetection;
pass &= numDetections >= 1;
}
// Destroy the engine.
context->destroy();
engine->destroy();
runtime->destroy();
return gLogger.reportTest(sampleTest, pass);
}