From a1568d91515485d6156c94a39199babdd192634b Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Fri, 24 Jun 2016 19:16:32 -0700 Subject: [PATCH 1/5] Mark 0.15.5 --- CMakeLists.txt | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 760138042f3..caca9029dc7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ endif() project(Caffe C CXX) # ---[ Caffe version -set(CAFFE_TARGET_VERSION "0.15.4") +set(CAFFE_TARGET_VERSION "0.15.5") set(CAFFE_TARGET_SOVERSION "0.15") add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION}) diff --git a/Makefile b/Makefile index 5fee716a35d..c07b5d6175c 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 0 DYNAMIC_VERSION_MINOR := 15 -DYNAMIC_VERSION_REVISION := 4 +DYNAMIC_VERSION_REVISION := 5 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION) From aa0b249a3746625ae58eb77bd2f277186dda0f5c Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Mon, 27 Jun 2016 13:26:14 -0700 Subject: [PATCH 2/5] Only print learning rate once for multi-GPU solver --- src/caffe/solvers/sgd_solver.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 09ddaaff915..d9fef193e0f 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -101,7 +101,8 @@ void SGDSolver::ClipGradients() { template void SGDSolver::ApplyUpdate() { Dtype rate = GetLearningRate(); - if (this->param_.display() && this->iter_ % this->param_.display() == 0) { + if (Caffe::root_solver() && this->param_.display() && + this->iter_ % this->param_.display() == 0) { LOG(INFO) << "Iteration " << this->iter_ << 
", lr = " << rate; } ClipGradients(); From b59099811581be356c39b270ee9c222a9c7785c4 Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Mon, 27 Jun 2016 20:43:10 -0700 Subject: [PATCH 3/5] Workspace size computation corrected, diagnostics fixed --- include/caffe/layers/cudnn_conv_layer.hpp | 4 ++- include/caffe/util/gpu_memory.hpp | 2 ++ src/caffe/layers/cudnn_conv_layer.cpp | 37 +++++++++++++++-------- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/include/caffe/layers/cudnn_conv_layer.hpp b/include/caffe/layers/cudnn_conv_layer.hpp index 2e7b25598cf..c89f6f5dd66 100644 --- a/include/caffe/layers/cudnn_conv_layer.hpp +++ b/include/caffe/layers/cudnn_conv_layer.hpp @@ -76,7 +76,7 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { cudnnFilterDescriptor_t filter_desc_; vector conv_descs_; - int bottom_offset_, top_offset_, weight_offset_, bias_offset_; + int bottom_offset_, top_offset_, bias_offset_; size_t *workspace_fwd_sizes_; size_t *workspace_bwd_data_sizes_; @@ -93,6 +93,8 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { const vector*>& top, const size_t workspace_bytes); + size_t ComputeFindExWorkspaceSize(); + vector cached_bottom_descs_; vector cached_conv_descs_; bool IsBottomDescChanged(const vector*>& bottom); diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp index fb6dea1ffc6..a023d7030ad 100644 --- a/include/caffe/util/gpu_memory.hpp +++ b/include/caffe/util/gpu_memory.hpp @@ -21,6 +21,7 @@ struct GPUMemory { int device = INVALID_DEVICE, cudaStream_t stream = cudaStreamDefault) { if (!try_allocate(reinterpret_cast(ptr), size, device, stream)) { + CUDA_CHECK(cudaGetDevice(&device)); LOG(FATAL) << "Out of memory: failed to allocate " << size << " bytes on device " << device; } @@ -89,6 +90,7 @@ struct GPUMemory { void reserve(size_t size, int device = INVALID_DEVICE) { if (!try_reserve(size, device)) { + CUDA_CHECK(cudaGetDevice(&device)); LOG(FATAL) << "Out of memory: failed to allocate 
" << size << " bytes on device " << device; } diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 4097648dd9d..3b46ee21fba 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -92,6 +92,26 @@ void CuDNNConvolutionLayer::LayerSetUp( initialized_cached_descs_ = false; } +template +size_t CuDNNConvolutionLayer::ComputeFindExWorkspaceSize() { + size_t workspace_limit_bytes, total_memory, workspace_bytes; + GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory); + // Use 95% of available memory. + // Using all of memory may result in failure of workspace.reserve. + // TODO: Since 95% of memory might be too large, we can allocate + // exactly how much FindEx needs by taking the maximum + // workspace among all algorithms (requires an initial call + // to FindEx with workspace size 0). + workspace_bytes = workspace_limit_bytes * MAX_WORKSPACE_RATIO; + const size_t weights_size = this->weight_offset_ * sizeof(Dtype); + if (workspace_bytes >= weights_size) { + workspace_bytes -= weights_size; + } else { + return 0UL; + } + return workspace_bytes; +} + template void CuDNNConvolutionLayer::Reshape( const vector*>& bottom, const vector*>& top) { @@ -168,8 +188,6 @@ void CuDNNConvolutionLayer::Reshape( // Ask cuDNN to find the best algorithm if (use_algo_seeker_) { - size_t workspace_limit_bytes, total_memory; - GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory); // FindEx: A workspace of size workspace_bytes is allocated for FindEx. // Besides, workspace, a buffer is allocated for the output of // FindEx-backward-filter. The size of buffer is as big as weights. @@ -181,19 +199,12 @@ void CuDNNConvolutionLayer::Reshape( // most of memory for allocating layer blobs. workspace_bytes = INITIAL_WORKSPACE_SIZE; } else { - // Use 95% of available memory. - // Using all of memory may result in failure of workspace.reserve. 
- // TODO: Since 95% of memory might be too large, we can allocate - // exactly how much FindEx needs by taking the maximum - // workspace among all algorithms (requires an initial call - // to FindEx with workspace size 0). - workspace_bytes = workspace_limit_bytes * MAX_WORKSPACE_RATIO; + workspace_bytes = ComputeFindExWorkspaceSize(); // Sometimes closer to zero we might have memory info diverged from // reality. If try_reserve fails, it updates the info internally and // we have to re-evaluate the workspace size. if (!WORKSPACE.try_reserve(workspace_bytes)) { - GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory); - workspace_bytes = workspace_limit_bytes * MAX_WORKSPACE_RATIO; + workspace_bytes = ComputeFindExWorkspaceSize(); } // Avoid seeking for an algorithm in subsequent iterations use_algo_seeker_ = false; @@ -296,7 +307,7 @@ void CuDNNConvolutionLayer::FindExConvAlgo( // Allocate temporary buffer for weights used for backward filter FindEx void *tmp_weights; - const int tmp_weights_size = sizeof(Dtype) * weight_offset_; + const int tmp_weights_size = sizeof(Dtype) * this->weight_offset_; GPUMemory::allocate(&tmp_weights, tmp_weights_size); for (int i = 0; i < bottom.size(); i++) { @@ -469,7 +480,7 @@ void CuDNNConvolutionLayer::UpdateWorkspaceDemand(int size) { } // We might grab too much before calling Get/FindEx. // Reserve the only amount needed. 
- if (WORKSPACE_SIZE < WORKSPACE.size()) { + if (WORKSPACE_SIZE < WORKSPACE.size() && !use_modest_workspace_) { WORKSPACE.release(); WORKSPACE.reserve(WORKSPACE_SIZE); } // else: reserve in Fwd/Bwd calls From 0f1194b40232ddb3518c29b1f618f4196ae4a2cb Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Tue, 28 Jun 2016 22:17:13 -0700 Subject: [PATCH 4/5] Mark 0.15.6 --- CMakeLists.txt | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index caca9029dc7..6f1a9674e7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ endif() project(Caffe C CXX) # ---[ Caffe version -set(CAFFE_TARGET_VERSION "0.15.5") +set(CAFFE_TARGET_VERSION "0.15.6") set(CAFFE_TARGET_SOVERSION "0.15") add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION}) diff --git a/Makefile b/Makefile index c07b5d6175c..e18321bb6b4 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 0 DYNAMIC_VERSION_MINOR := 15 -DYNAMIC_VERSION_REVISION := 5 +DYNAMIC_VERSION_REVISION := 6 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION) From 911baa43cf4b0cab0055006a3669b6a73b82e167 Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Wed, 29 Jun 2016 20:09:15 -0700 Subject: [PATCH 5/5] Hot fix preventing redundant Find*Ex calls during TEST phase --- src/caffe/layers/cudnn_conv_layer.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu index 0348e93774c..67cdc02275b 100644 --- a/src/caffe/layers/cudnn_conv_layer.cu +++ b/src/caffe/layers/cudnn_conv_layer.cu @@ -58,6 +58,8 @@ void CuDNNConvolutionLayer::Forward_gpu( // NOLINT_NEXT_LINE(whitespace/operators) 
CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy)); } + // Possibly use faster algorithms by allowing larger workspace. + use_modest_workspace_ = false; } template @@ -130,8 +132,6 @@ void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, // NOLINT_NEXT_LINE(whitespace/operators) CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy)); } - // Possibly use faster algorithms by allowing larger workspace. - use_modest_workspace_ = false; } INSTANTIATE_LAYER_GPU_FUNCS(CuDNNConvolutionLayer);