From a1568d91515485d6156c94a39199babdd192634b Mon Sep 17 00:00:00 2001
From: Sergei Nikolaev <snikolaev@nvidia.com>
Date: Fri, 24 Jun 2016 19:16:32 -0700
Subject: [PATCH 1/5] Mark 0.15.5

---
 CMakeLists.txt | 2 +-
 Makefile       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 760138042f3..caca9029dc7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ endif()
 project(Caffe C CXX)
 
 # ---[ Caffe version
-set(CAFFE_TARGET_VERSION "0.15.4")
+set(CAFFE_TARGET_VERSION "0.15.5")
 set(CAFFE_TARGET_SOVERSION "0.15")
 add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION})
 
diff --git a/Makefile b/Makefile
index 5fee716a35d..c07b5d6175c 100644
--- a/Makefile
+++ b/Makefile
@@ -38,7 +38,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib
 STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a
 DYNAMIC_VERSION_MAJOR 		:= 0
 DYNAMIC_VERSION_MINOR 		:= 15
-DYNAMIC_VERSION_REVISION 	:= 4
+DYNAMIC_VERSION_REVISION 	:= 5
 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so
 DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR)
 DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION)

From aa0b249a3746625ae58eb77bd2f277186dda0f5c Mon Sep 17 00:00:00 2001
From: Luke Yeager <luke.yeager@gmail.com>
Date: Mon, 27 Jun 2016 13:26:14 -0700
Subject: [PATCH 2/5] Only print learning rate once for multi-GPU solver

---
 src/caffe/solvers/sgd_solver.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp
index 09ddaaff915..d9fef193e0f 100644
--- a/src/caffe/solvers/sgd_solver.cpp
+++ b/src/caffe/solvers/sgd_solver.cpp
@@ -101,7 +101,8 @@ void SGDSolver<Dtype>::ClipGradients() {
 template <typename Dtype>
 void SGDSolver<Dtype>::ApplyUpdate() {
   Dtype rate = GetLearningRate();
-  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
+  if (Caffe::root_solver() && this->param_.display() &&
+      this->iter_ % this->param_.display() == 0) {
     LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
   }
   ClipGradients();

From b59099811581be356c39b270ee9c222a9c7785c4 Mon Sep 17 00:00:00 2001
From: Sergei Nikolaev <snikolaev@nvidia.com>
Date: Mon, 27 Jun 2016 20:43:10 -0700
Subject: [PATCH 3/5] Workspace size computation corrected, diagnostics fixed

---
 include/caffe/layers/cudnn_conv_layer.hpp |  4 ++-
 include/caffe/util/gpu_memory.hpp         |  2 ++
 src/caffe/layers/cudnn_conv_layer.cpp     | 37 +++++++++++++++--------
 3 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/caffe/layers/cudnn_conv_layer.hpp b/include/caffe/layers/cudnn_conv_layer.hpp
index 2e7b25598cf..c89f6f5dd66 100644
--- a/include/caffe/layers/cudnn_conv_layer.hpp
+++ b/include/caffe/layers/cudnn_conv_layer.hpp
@@ -76,7 +76,7 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   cudnnFilterDescriptor_t      filter_desc_;
   vector<cudnnConvolutionDescriptor_t> conv_descs_;
 
-  int bottom_offset_, top_offset_, weight_offset_, bias_offset_;
+  int bottom_offset_, top_offset_, bias_offset_;
 
   size_t *workspace_fwd_sizes_;
   size_t *workspace_bwd_data_sizes_;
@@ -93,6 +93,8 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
                    const vector<Blob<Dtype>*>& top,
                    const size_t workspace_bytes);
 
+  size_t ComputeFindExWorkspaceSize();
+
   vector<cudnnTensorDescriptor_t>      cached_bottom_descs_;
   vector<cudnnConvolutionDescriptor_t> cached_conv_descs_;
   bool IsBottomDescChanged(const vector<Blob<Dtype>*>& bottom);
diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp
index fb6dea1ffc6..a023d7030ad 100644
--- a/include/caffe/util/gpu_memory.hpp
+++ b/include/caffe/util/gpu_memory.hpp
@@ -21,6 +21,7 @@ struct GPUMemory {
       int device = INVALID_DEVICE,
       cudaStream_t stream = cudaStreamDefault) {
     if (!try_allocate(reinterpret_cast<void**>(ptr), size, device, stream)) {
+      CUDA_CHECK(cudaGetDevice(&device));
       LOG(FATAL) << "Out of memory: failed to allocate " << size
           << " bytes on device " << device;
     }
@@ -89,6 +90,7 @@ struct GPUMemory {
 
     void reserve(size_t size, int device = INVALID_DEVICE) {
       if (!try_reserve(size, device)) {
+        CUDA_CHECK(cudaGetDevice(&device));
         LOG(FATAL) << "Out of memory: failed to allocate " << size
             << " bytes on device " << device;
       }
diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
index 4097648dd9d..3b46ee21fba 100644
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -92,6 +92,26 @@ void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
   initialized_cached_descs_ = false;
 }
 
+template <typename Dtype>
+size_t CuDNNConvolutionLayer<Dtype>::ComputeFindExWorkspaceSize() {
+  // Workspace budget for FindEx, in bytes: the MAX_WORKSPACE_RATIO share
+  // of the limit reported by GPUMemory::GetInfo, minus the room taken by
+  // the weights-sized buffer. Yields 0 when that share is too small.
+  size_t limit_bytes, total_bytes;
+  GPUMemory::GetInfo(&limit_bytes, &total_bytes);
+  // Using all of memory may result in failure of workspace.reserve.
+  // TODO: Since 95% of memory might be too large, we can allocate
+  //       exactly how much FindEx needs by taking the maximum
+  //       workspace among all algorithms (requires an initial call
+  //       to FindEx with workspace size 0).
+  const size_t budget_bytes = limit_bytes * MAX_WORKSPACE_RATIO;
+  const size_t weights_bytes = this->weight_offset_ * sizeof(Dtype);
+  if (budget_bytes < weights_bytes) {
+    return 0UL;
+  }
+  return budget_bytes - weights_bytes;
+}
+
 template <typename Dtype>
 void CuDNNConvolutionLayer<Dtype>::Reshape(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -168,8 +188,6 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
 
   // Ask cuDNN to find the best algorithm
   if (use_algo_seeker_) {
-    size_t workspace_limit_bytes, total_memory;
-    GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
     // FindEx: A workspace of size workspace_bytes is allocated for FindEx.
     //         Besides, workspace, a buffer is allocated for the output of
     //         FindEx-backward-filter. The size of buffer is as big as weights.
@@ -181,19 +199,12 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
       // most of memory for allocating layer blobs.
       workspace_bytes = INITIAL_WORKSPACE_SIZE;
     } else {
-      // Use 95% of available memory.
-      // Using all of memory may result in failure of workspace.reserve.
-      // TODO: Since 95% of memory might be too large, we can allocate
-      //       exactly how much FindEx needs by taking the maximum
-      //       workspace among all algorithms (requires an initial call
-      //       to FindEx with workspace size 0).
-      workspace_bytes = workspace_limit_bytes * MAX_WORKSPACE_RATIO;
+      workspace_bytes = ComputeFindExWorkspaceSize();
       // Sometimes closer to zero we might have memory info diverged from
       // reality. If try_reserve fails, it updates the info internally and
       // we have to re-evaluate the workspace size.
       if (!WORKSPACE.try_reserve(workspace_bytes)) {
-        GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
-        workspace_bytes = workspace_limit_bytes * MAX_WORKSPACE_RATIO;
+        workspace_bytes = ComputeFindExWorkspaceSize();
       }
       // Avoid seeking for an algorithm in subsequent iterations
       use_algo_seeker_ = false;
@@ -296,7 +307,7 @@ void CuDNNConvolutionLayer<Dtype>::FindExConvAlgo(
 
   // Allocate temporary buffer for weights used for backward filter FindEx
   void *tmp_weights;
-  const int tmp_weights_size = sizeof(Dtype) * weight_offset_;
+  const int tmp_weights_size = sizeof(Dtype) * this->weight_offset_;
   GPUMemory::allocate(&tmp_weights, tmp_weights_size);
 
   for (int i = 0; i < bottom.size(); i++) {
@@ -469,7 +480,7 @@ void CuDNNConvolutionLayer<Dtype>::UpdateWorkspaceDemand(int size) {
   }
   // We might grab too much before calling Get/FindEx.
   // Reserve the only amount needed.
-  if (WORKSPACE_SIZE < WORKSPACE.size()) {
+  if (WORKSPACE_SIZE < WORKSPACE.size() && !use_modest_workspace_) {
     WORKSPACE.release();
     WORKSPACE.reserve(WORKSPACE_SIZE);
   }  // else: reserve in Fwd/Bwd calls

From 0f1194b40232ddb3518c29b1f618f4196ae4a2cb Mon Sep 17 00:00:00 2001
From: Sergei Nikolaev <snikolaev@nvidia.com>
Date: Tue, 28 Jun 2016 22:17:13 -0700
Subject: [PATCH 4/5] Mark 0.15.6

---
 CMakeLists.txt | 2 +-
 Makefile       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index caca9029dc7..6f1a9674e7e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ endif()
 project(Caffe C CXX)
 
 # ---[ Caffe version
-set(CAFFE_TARGET_VERSION "0.15.5")
+set(CAFFE_TARGET_VERSION "0.15.6")
 set(CAFFE_TARGET_SOVERSION "0.15")
 add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION})
 
diff --git a/Makefile b/Makefile
index c07b5d6175c..e18321bb6b4 100644
--- a/Makefile
+++ b/Makefile
@@ -38,7 +38,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib
 STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a
 DYNAMIC_VERSION_MAJOR 		:= 0
 DYNAMIC_VERSION_MINOR 		:= 15
-DYNAMIC_VERSION_REVISION 	:= 5
+DYNAMIC_VERSION_REVISION 	:= 6
 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so
 DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR)
 DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION)

From 911baa43cf4b0cab0055006a3669b6a73b82e167 Mon Sep 17 00:00:00 2001
From: Sergei Nikolaev <snikolaev@nvidia.com>
Date: Wed, 29 Jun 2016 20:09:15 -0700
Subject: [PATCH 5/5] Hot fix preventing redundant Find*Ex calls during TEST
 phase

---
 src/caffe/layers/cudnn_conv_layer.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu
index 0348e93774c..67cdc02275b 100644
--- a/src/caffe/layers/cudnn_conv_layer.cu
+++ b/src/caffe/layers/cudnn_conv_layer.cu
@@ -58,6 +58,8 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     // NOLINT_NEXT_LINE(whitespace/operators)
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy));
   }
+  // Possibly use faster algorithms by allowing larger workspace.
+  use_modest_workspace_ = false;
 }
 
 template<typename Dtype>
@@ -130,8 +132,6 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     // NOLINT_NEXT_LINE(whitespace/operators)
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy));
   }
-  // Possibly use faster algorithms by allowing larger workspace.
-  use_modest_workspace_ = false;
 }
 
 INSTANTIATE_LAYER_GPU_FUNCS(CuDNNConvolutionLayer);