Commit 4f952c3

fantes authored and mergify[bot] committed on Apr 15, 2021

fix(TensorRT): fix some memory allocation weirdness in trt backend

1 parent 0c716a6, commit 4f952c3

File tree: 3 files changed, +51 -47 lines changed


CMakeLists.txt

+1 -1

@@ -88,7 +88,7 @@ if (NOT EXISTS ${CMAKE_BINARY_DIR}/src)
     COMMAND bash -c "mkdir ${CMAKE_BINARY_DIR}/src")
 endif()
 
-set(CMAKE_CXX_FLAGS "-g -O2 -Wall -Wextra -fopenmp -fPIC -std=c++14 -DUSE_OPENCV -DUSE_LMDB")
+set(CMAKE_CXX_FLAGS "-g -O2 -Wall -Wextra -fopenmp -fPIC -std=c++14 -DUSE_OPENCV -DUSE_LMDB -Wl,--no-as-needed -ltcmalloc_minimal")
 
 if(WARNING)
   string(APPEND CMAKE_CXX_FLAGS " -Werror")
src/backends/tensorrt/tensorrtlib.cc

+47 -45

@@ -564,52 +564,54 @@ namespace dd
                                        + out_blob);
         }
 
-      if (_bbox)
-        {
-          _outputIndex1 = _engine->getBindingIndex("keep_count");
-          _buffers.resize(3);
-          _floatOut.resize(_max_batch_size * _top_k * 7);
-          _keepCount.resize(_max_batch_size);
-          if (inputc._bw)
-            cudaMalloc(&_buffers.data()[_inputIndex],
-                       _max_batch_size * inputc._height * inputc._width
-                           * sizeof(float));
-          else
-            cudaMalloc(&_buffers.data()[_inputIndex],
-                       _max_batch_size * 3 * inputc._height * inputc._width
-                           * sizeof(float));
-          cudaMalloc(&_buffers.data()[_outputIndex0],
-                     _max_batch_size * _top_k * 7 * sizeof(float));
-          cudaMalloc(&_buffers.data()[_outputIndex1],
-                     _max_batch_size * sizeof(int));
-        }
-      else if (_ctc)
-        {
-          throw MLLibBadParamException(
-              "ocr not yet implemented over tensorRT backend");
-        }
-      else if (_timeserie)
+      if (_first_predict)
         {
-          throw MLLibBadParamException(
-              "timeseries not yet implemented over tensorRT backend");
-        }
-      else // classification / regression
-        {
-          if (_regression)
-            _buffers.resize(1);
-          else
-            _buffers.resize(2);
-          _floatOut.resize(_max_batch_size * this->_nclasses);
-          if (inputc._bw)
-            cudaMalloc(&_buffers.data()[_inputIndex],
-                       _max_batch_size * inputc._height * inputc._width
-                           * sizeof(float));
-          else
-            cudaMalloc(&_buffers.data()[_inputIndex],
-                       _max_batch_size * 3 * inputc._height * inputc._width
-                           * sizeof(float));
-          cudaMalloc(&_buffers.data()[_outputIndex0],
-                     _max_batch_size * _nclasses * sizeof(float));
+          _first_predict = false;
+
+          if (_bbox)
+            {
+              _outputIndex1 = _engine->getBindingIndex("keep_count");
+              _buffers.resize(3);
+              _floatOut.resize(_max_batch_size * _top_k * 7);
+              _keepCount.resize(_max_batch_size);
+              if (inputc._bw)
+                cudaMalloc(&_buffers.data()[_inputIndex],
+                           _max_batch_size * inputc._height * inputc._width
+                               * sizeof(float));
+              else
+                cudaMalloc(&_buffers.data()[_inputIndex],
+                           _max_batch_size * 3 * inputc._height
+                               * inputc._width * sizeof(float));
+              cudaMalloc(&_buffers.data()[_outputIndex0],
+                         _max_batch_size * _top_k * 7 * sizeof(float));
+              cudaMalloc(&_buffers.data()[_outputIndex1],
+                         _max_batch_size * sizeof(int));
+            }
+          else if (_ctc)
+            {
+              throw MLLibBadParamException(
+                  "ocr not yet implemented over tensorRT backend");
+            }
+          else if (_timeserie)
+            {
+              throw MLLibBadParamException(
+                  "timeseries not yet implemented over tensorRT backend");
+            }
+          else // classification / regression
+            {
+              _buffers.resize(2);
+              _floatOut.resize(_max_batch_size * this->_nclasses);
+              if (inputc._bw)
+                cudaMalloc(&_buffers.data()[_inputIndex],
+                           _max_batch_size * inputc._height * inputc._width
+                               * sizeof(float));
+              else
+                cudaMalloc(&_buffers.data()[_inputIndex],
+                           _max_batch_size * 3 * inputc._height
+                               * inputc._width * sizeof(float));
+              cudaMalloc(&_buffers.data()[_outputIndex0],
+                         _max_batch_size * _nclasses * sizeof(float));
+            }
         }
     }
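
What the diff shows: the cudaMalloc calls that size the input/output bindings are now guarded by _first_predict, so the device buffers (already sized for _max_batch_size) are allocated at most once per predictor instance and reused across predict() calls, which matches the commit title's aim of fixing allocation weirdness. The removed branch that resized _buffers to 1 for regression is also gone, so classification and regression now both allocate two bindings. Below is a self-contained sketch of the same allocate-on-first-use pattern; the names (DeviceBuffers, ensure_allocated) are illustrative and not part of the DeepDetect code:

    // first_predict_buffers.cc -- illustrative sketch, not DeepDetect code.
    // Device buffers are allocated once, on the first inference call, sized
    // for the maximum batch, then reused on later calls.
    // Build e.g. with nvcc, or with g++ plus the CUDA include/lib paths and -lcudart.
    #include <cuda_runtime.h>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    class DeviceBuffers
    {
    public:
      DeviceBuffers(int max_batch, std::size_t in_elems, std::size_t out_elems)
          : _max_batch(max_batch), _in_elems(in_elems), _out_elems(out_elems)
      {
      }

      ~DeviceBuffers()
      {
        for (void *p : _buffers)
          cudaFree(p); // freeing a null pointer is a no-op
      }

      // Called at the top of every predict(); only the first call allocates.
      void ensure_allocated()
      {
        if (!_first_predict)
          return;
        _first_predict = false;
        _buffers.resize(2, nullptr); // one input binding, one output binding
        check(cudaMalloc(&_buffers[0],
                         _max_batch * _in_elems * sizeof(float)));
        check(cudaMalloc(&_buffers[1],
                         _max_batch * _out_elems * sizeof(float)));
      }

      void **data()
      {
        return _buffers.data(); // what would be handed to the TRT execution context
      }

    private:
      static void check(cudaError_t err)
      {
        if (err != cudaSuccess)
          std::fprintf(stderr, "cudaMalloc failed: %s\n",
                       cudaGetErrorString(err));
      }

      bool _first_predict = true; // same idea as the flag added in tensorrtlib.h
      int _max_batch;
      std::size_t _in_elems, _out_elems;
      std::vector<void *> _buffers;
    };

    int main()
    {
      DeviceBuffers bufs(/*max_batch=*/8, /*in_elems=*/3 * 224 * 224,
                         /*out_elems=*/1000);
      for (int call = 0; call < 3; ++call)
        bufs.ensure_allocated(); // allocates on the first iteration only
      return 0;
    }

The sketch omits the per-model branching (bbox / ctc / timeseries / classification) that the real backend keeps inside the guard.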

src/backends/tensorrt/tensorrtlib.h

+3 -1

@@ -140,6 +140,9 @@ namespace dd
     int _outputIndex0;
     int _outputIndex1;
 
+    bool _first_predict
+        = true; // do some cuda allocations only at first predict
+
     bool _explicit_batch
         = false; /**< whether TRT uses explicit batch model (ONNX). */
 
@@ -154,6 +157,5 @@ namespace dd
 
     nvinfer1::ICudaEngine *read_engine_from_onnx();
   };
-
 }
 #endif
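
The new flag uses an in-class default member initializer, matching the style of _explicit_batch just below it, so every constructed backend starts with _first_predict set to true without any constructor change; the predict() path in tensorrtlib.cc above flips it to false on entry to the one-time allocation block.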
