Commit 4f952c3

fantes authored and mergify[bot] committed on Apr 15, 2021

fix(TensorRT): fix some memory allocation weirdness in trt backend

1 parent 0c716a6, commit 4f952c3

File tree: 3 files changed, +51 -47 lines changed


CMakeLists.txt

+1 -1

@@ -88,7 +88,7 @@ if (NOT EXISTS ${CMAKE_BINARY_DIR}/src)
     COMMAND bash -c "mkdir ${CMAKE_BINARY_DIR}/src")
 endif()
 
-set(CMAKE_CXX_FLAGS "-g -O2 -Wall -Wextra -fopenmp -fPIC -std=c++14 -DUSE_OPENCV -DUSE_LMDB")
+set(CMAKE_CXX_FLAGS "-g -O2 -Wall -Wextra -fopenmp -fPIC -std=c++14 -DUSE_OPENCV -DUSE_LMDB -Wl,--no-as-needed -ltcmalloc_minimal")
 
 if(WARNING)
   string(APPEND CMAKE_CXX_FLAGS " -Werror")
src/backends/tensorrt/tensorrtlib.cc

+47 -45

@@ -564,52 +564,54 @@ namespace dd
                                        + out_blob);
         }
 
-      if (_bbox)
-        {
-          _outputIndex1 = _engine->getBindingIndex("keep_count");
-          _buffers.resize(3);
-          _floatOut.resize(_max_batch_size * _top_k * 7);
-          _keepCount.resize(_max_batch_size);
-          if (inputc._bw)
-            cudaMalloc(&_buffers.data()[_inputIndex],
-                       _max_batch_size * inputc._height * inputc._width
-                           * sizeof(float));
-          else
-            cudaMalloc(&_buffers.data()[_inputIndex],
-                       _max_batch_size * 3 * inputc._height * inputc._width
-                           * sizeof(float));
-          cudaMalloc(&_buffers.data()[_outputIndex0],
-                     _max_batch_size * _top_k * 7 * sizeof(float));
-          cudaMalloc(&_buffers.data()[_outputIndex1],
-                     _max_batch_size * sizeof(int));
-        }
-      else if (_ctc)
-        {
-          throw MLLibBadParamException(
-              "ocr not yet implemented over tensorRT backend");
-        }
-      else if (_timeserie)
+      if (_first_predict)
         {
-          throw MLLibBadParamException(
-              "timeseries not yet implemented over tensorRT backend");
-        }
-      else // classification / regression
-        {
-          if (_regression)
-            _buffers.resize(1);
-          else
-            _buffers.resize(2);
-          _floatOut.resize(_max_batch_size * this->_nclasses);
-          if (inputc._bw)
-            cudaMalloc(&_buffers.data()[_inputIndex],
-                       _max_batch_size * inputc._height * inputc._width
-                           * sizeof(float));
-          else
-            cudaMalloc(&_buffers.data()[_inputIndex],
-                       _max_batch_size * 3 * inputc._height * inputc._width
-                           * sizeof(float));
-          cudaMalloc(&_buffers.data()[_outputIndex0],
-                     _max_batch_size * _nclasses * sizeof(float));
+          _first_predict = false;
+
+          if (_bbox)
+            {
+              _outputIndex1 = _engine->getBindingIndex("keep_count");
+              _buffers.resize(3);
+              _floatOut.resize(_max_batch_size * _top_k * 7);
+              _keepCount.resize(_max_batch_size);
+              if (inputc._bw)
+                cudaMalloc(&_buffers.data()[_inputIndex],
+                           _max_batch_size * inputc._height * inputc._width
+                               * sizeof(float));
+              else
+                cudaMalloc(&_buffers.data()[_inputIndex],
+                           _max_batch_size * 3 * inputc._height
+                               * inputc._width * sizeof(float));
+              cudaMalloc(&_buffers.data()[_outputIndex0],
+                         _max_batch_size * _top_k * 7 * sizeof(float));
+              cudaMalloc(&_buffers.data()[_outputIndex1],
+                         _max_batch_size * sizeof(int));
+            }
+          else if (_ctc)
+            {
+              throw MLLibBadParamException(
+                  "ocr not yet implemented over tensorRT backend");
+            }
+          else if (_timeserie)
+            {
+              throw MLLibBadParamException(
+                  "timeseries not yet implemented over tensorRT backend");
+            }
+          else // classification / regression
+            {
+              _buffers.resize(2);
+              _floatOut.resize(_max_batch_size * this->_nclasses);
+              if (inputc._bw)
+                cudaMalloc(&_buffers.data()[_inputIndex],
+                           _max_batch_size * inputc._height * inputc._width
+                               * sizeof(float));
+              else
+                cudaMalloc(&_buffers.data()[_inputIndex],
+                           _max_batch_size * 3 * inputc._height
+                               * inputc._width * sizeof(float));
+              cudaMalloc(&_buffers.data()[_outputIndex0],
+                         _max_batch_size * _nclasses * sizeof(float));
+            }
         }
     }
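
What the diff shows: the cudaMalloc calls that size the input/output bindings are now guarded by _first_predict, so the device buffers (already sized for _max_batch_size) are allocated at most once per predictor instance and reused across predict() calls, which matches the commit title's aim of fixing allocation weirdness. The removed branch that resized _buffers to 1 for regression is also gone, so classification and regression now both allocate two bindings. Below is a self-contained sketch of the same allocate-on-first-use pattern; the names (DeviceBuffers, ensure_allocated) are illustrative and not part of the DeepDetect code:

    // first_predict_buffers.cc -- illustrative sketch, not DeepDetect code.
    // Device buffers are allocated once, on the first inference call, sized
    // for the maximum batch, then reused on later calls.
    // Build e.g. with nvcc, or with g++ plus the CUDA include/lib paths and -lcudart.
    #include <cuda_runtime.h>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    class DeviceBuffers
    {
    public:
      DeviceBuffers(int max_batch, std::size_t in_elems, std::size_t out_elems)
          : _max_batch(max_batch), _in_elems(in_elems), _out_elems(out_elems)
      {
      }

      ~DeviceBuffers()
      {
        for (void *p : _buffers)
          cudaFree(p); // freeing a null pointer is a no-op
      }

      // Called at the top of every predict(); only the first call allocates.
      void ensure_allocated()
      {
        if (!_first_predict)
          return;
        _first_predict = false;
        _buffers.resize(2, nullptr); // one input binding, one output binding
        check(cudaMalloc(&_buffers[0],
                         _max_batch * _in_elems * sizeof(float)));
        check(cudaMalloc(&_buffers[1],
                         _max_batch * _out_elems * sizeof(float)));
      }

      void **data()
      {
        return _buffers.data(); // what would be handed to the TRT execution context
      }

    private:
      static void check(cudaError_t err)
      {
        if (err != cudaSuccess)
          std::fprintf(stderr, "cudaMalloc failed: %s\n",
                       cudaGetErrorString(err));
      }

      bool _first_predict = true; // same idea as the flag added in tensorrtlib.h
      int _max_batch;
      std::size_t _in_elems, _out_elems;
      std::vector<void *> _buffers;
    };

    int main()
    {
      DeviceBuffers bufs(/*max_batch=*/8, /*in_elems=*/3 * 224 * 224,
                         /*out_elems=*/1000);
      for (int call = 0; call < 3; ++call)
        bufs.ensure_allocated(); // allocates on the first iteration only
      return 0;
    }

The sketch omits the per-model branching (bbox / ctc / timeseries / classification) that the real backend keeps inside the guard.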

src/backends/tensorrt/tensorrtlib.h

+3 -1

@@ -140,6 +140,9 @@ namespace dd
     int _outputIndex0;
     int _outputIndex1;
 
+    bool _first_predict
+        = true; // do some cuda allocations only at first predict
+
     bool _explicit_batch
         = false; /**< whether TRT uses explicit batch model (ONNX). */
 
@@ -154,6 +157,5 @@ namespace dd
 
     nvinfer1::ICudaEngine *read_engine_from_onnx();
   };
-
 }
 #endif
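
The new flag uses an in-class default member initializer, matching the style of _explicit_batch just below it, so every constructed backend starts with _first_predict set to true without any constructor change; the predict() path in tensorrtlib.cc above flips it to false on entry to the one-time allocation block.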
