diff --git a/CMakeLists.txt b/CMakeLists.txt index 905cd7d83ef..b646e01993e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,8 @@ caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ON caffe_option(USE_NCCL "Build Caffe with NCCL library support" OFF) caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON) caffe_option(BUILD_python "Build Python wrapper" ON) -set(python_version "2" CACHE STRING "Specify which Python version to use") +# Default the wrapper to Python 3; pass -Dpython_version=2 to keep the old default. +set(python_version "3" CACHE STRING "Specify which Python version to use") caffe_option(BUILD_matlab "Build Matlab wrapper" OFF IF UNIX OR APPLE) caffe_option(BUILD_docs "Build documentation" ON IF UNIX OR APPLE) caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON) diff --git a/Makefile b/Makefile index 0188f3e64a4..308dc73ee52 100644 --- a/Makefile +++ b/Makefile @@ -427,7 +427,7 @@ CXXFLAGS += -MMD -MP # Complete build flags. 
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS) -NVCCFLAGS += -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS) +NVCCFLAGS += -D_FORCE_INLINES -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS) # mex may invoke an older gcc that is too liberal with -Wuninitalized MATLAB_CXXFLAGS := $(CXXFLAGS) -Wno-uninitialized LINKFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS) diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index d28cbeadaa1..046242f65a5 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -7,23 +7,25 @@ endif () # set(Caffe_known_gpu_archs "30 35 50 52 60 61") # set(Caffe_known_gpu_archs "20 21(20) 30 35 50 60 61") # Fermi (3.2 <= CUDA <= 8) -set(FERMI "20 21(20)") +# set(FERMI "20 21(20)") # Kepler (CUDA >= 5) -set(KEPLER "30 35 37") +set(KEPLER "35 37") # "30" dropped: sm_30 is no longer supported as of CUDA 11 # Maxwell (CUDA >= 6) set(MAXWELL "50 52 53") # Pascal (CUDA >= 8) set(PASCAL "60 61 62") # Volta (CUDA >= 9) -set(VOLTA "70") # set(VOLTA "70 71 72") # This crashes with CUDA 10 +set(VOLTA "70 72") # set(VOLTA "70 71 72") # This crashes with CUDA 10 # Turing (CUDA >= 10) set(TURING "75") +# Ampere (CUDA >= 11) +set(AMPERE "80 86") if (UNIX AND NOT APPLE) - set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING}") + set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING} ${AMPERE}") # set(Caffe_known_gpu_archs "${FERMI} ${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING}") # set(Caffe_known_gpu_archs "20 21(20) 30 35 50 52 60 61") elseif (WIN32) - set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING}") + set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING} ${AMPERE}") endif () @@ -227,7 +229,12 @@ function(detect_cuDNN) set(HAVE_CUDNN TRUE PARENT_SCOPE) set(CUDNN_FOUND TRUE PARENT_SCOPE) - file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) + # cuDNN 8 keeps the CUDNN_MAJOR/MINOR/PATCHLEVEL macros in cudnn_version.h; older releases keep them in cudnn.h. + if(EXISTS 
"${CUDNN_INCLUDE}/cudnn_version.h") + file(READ "${CUDNN_INCLUDE}/cudnn_version.h" CUDNN_VERSION_FILE_CONTENTS) + else() + file(READ "${CUDNN_INCLUDE}/cudnn.h" CUDNN_VERSION_FILE_CONTENTS) + endif() # cuDNN v3 and beyond string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" diff --git a/examples/cpp_classification/classification.cpp b/examples/cpp_classification/classification.cpp index 6b67c537a47..478355c0dbe 100644 --- a/examples/cpp_classification/classification.cpp +++ b/examples/cpp_classification/classification.cpp @@ -3,7 +3,9 @@ #include #include #include +#include "../../include/caffe/util/opencv4.hpp" #endif // USE_OPENCV + #include #include #include diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp index 97b4ee6a8c4..dc0982923dc 100644 --- a/include/caffe/data_transformer.hpp +++ b/include/caffe/data_transformer.hpp @@ -51,6 +51,7 @@ class DataTransformer { Blob* transformed_blob); #ifdef USE_OPENCV + #include "./util/opencv4.hpp" /** * @brief Applies the transformation defined in the data layer's * transform_param block to a vector of Mat. 
diff --git a/include/caffe/layers/memory_data_layer.hpp b/include/caffe/layers/memory_data_layer.hpp index 8abcc8c1b68..8eb35d3817e 100644 --- a/include/caffe/layers/memory_data_layer.hpp +++ b/include/caffe/layers/memory_data_layer.hpp @@ -32,6 +32,8 @@ class MemoryDataLayer : public BaseDataLayer { #ifdef USE_OPENCV virtual void AddMatVector(const vector& mat_vector, const vector& labels); + + #include "../util/opencv4.hpp" #endif // USE_OPENCV // Reset should accept const pointers, but can't, because the memory diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index 1a599883ca3..ba464055bbc 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -130,6 +130,7 @@ bool DecodeDatumNative(Datum* datum); bool DecodeDatum(Datum* datum, bool is_color); #ifdef USE_OPENCV + #include "./opencv4.hpp" cv::Mat ReadImageToCVMat(const string& filename, const int height, const int width, const bool is_color); diff --git a/include/caffe/util/opencv4.hpp b/include/caffe/util/opencv4.hpp new file mode 100644 index 00000000000..fe8af42b274 --- /dev/null +++ b/include/caffe/util/opencv4.hpp @@ -0,0 +1,18 @@ +#ifndef CAFFE_UTIL_OPENCV4_HPP_ +#define CAFFE_UTIL_OPENCV4_HPP_ + +// Defines CV_LOAD_IMAGE_COLOR / CV_LOAD_IMAGE_GRAYSCALE in terms of +// cv::IMREAD_* when CV_MAJOR_VERSION is greater than 3. + +#include + +#if (defined(CV_MAJOR_VERSION) && CV_MAJOR_VERSION > 3) + #define OPENCV_VERSION4 +#endif + +#ifdef OPENCV_VERSION4 + #define CV_LOAD_IMAGE_COLOR cv::IMREAD_COLOR + #define CV_LOAD_IMAGE_GRAYSCALE cv::IMREAD_GRAYSCALE +#endif + +#endif  // CAFFE_UTIL_OPENCV4_HPP_ diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index 3012251e0a5..80709ba8f28 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -1,5 +1,6 @@ #ifdef USE_OPENCV #include +#include "../../include/caffe/util/opencv4.hpp" #endif // USE_OPENCV #include diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index efc9e04e8c0..a4a69b3ac6a 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -108,9 +108,21 @@ void CuDNNConvolutionLayer::Reshape( 
const int stride_h = stride_data[0]; const int stride_w = stride_data[1]; +// Note: Copied from https://github.com/Qengineering/caffe/tree/ssd/src/caffe/layers +#if CUDNN_VERSION_MIN(8, 0, 0) + int RetCnt; + bool found_conv_algorithm; + size_t free_memory, total_memory; + cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[4]; + cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4]; + + //get memory sizes + cudaMemGetInfo(&free_memory, &total_memory); +#else // Specify workspace limit for kernels directly until we have a // planning strategy and a rewrite of Caffe's GPU memory mangagement size_t workspace_limit_bytes = 8*1024*1024; +#endif for (int i = 0; i < bottom.size(); i++) { cudnn::setTensor4dDesc(&bottom_descs_[i], @@ -127,6 +139,50 @@ void CuDNNConvolutionLayer::Reshape( filter_desc_, pad_h, pad_w, stride_h, stride_w); +// Note: Copied from https://github.com/Qengineering/caffe/tree/ssd/src/caffe/layers +#if CUDNN_VERSION_MIN(8, 0, 0) + // choose forward algorithm for filter + // in forward filter the CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in cuDNN 8 + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0], bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i], 4, &RetCnt, fwd_algo_pref_)); + + found_conv_algorithm = false; + for(int n=0;n::Reshape( CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0], filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) ); +#endif } // reduce over all workspace sizes to get a maximum to allocate / reallocate diff --git a/src/caffe/layers/cudnn_deconv_layer.cpp b/src/caffe/layers/cudnn_deconv_layer.cpp index 260da5c1ee0..86d807fb08e 100644 --- a/src/caffe/layers/cudnn_deconv_layer.cpp +++ b/src/caffe/layers/cudnn_deconv_layer.cpp @@ 
-93,6 +93,14 @@ template <typename Dtype>
 void CuDNNDeconvolutionLayer<Dtype>::Reshape(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   DeconvolutionLayer<Dtype>::Reshape(bottom, top);
+#if CUDNN_VERSION_MIN(8, 0, 0)
+  // The selection code below calls cudnnGetConvolutionForwardAlgorithm(),
+  // cudnnGetConvolutionBackwardFilterAlgorithm() and
+  // cudnnGetConvolutionBackwardDataAlgorithm(), which cuDNN 8 removed.
+  // Fail loudly instead of leaving the layer unconfigured.
+  LOG(FATAL) << "CuDNNDeconvolutionLayer is not supported with cuDNN >= 8. "
+             << "Use 'engine: CAFFE' for deconvolution layers.";
+#else
   CHECK_EQ(2, this->num_spatial_axes_)
       << "CuDNNDeconvolutionLayer input must have 2 spatial axes "
       << "(e.g., height and width). "
@@ -287,6 +295,7 @@ void CuDNNDeconvolutionLayer<Dtype>::Reshape(
     cudnn::setTensor4dDesc(
         &bias_desc_, 1, this->num_output_ / this->group_, 1, 1);
   }
+#endif  // CUDNN_VERSION_MIN(8, 0, 0)
 }
 
 template <typename Dtype>
diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 0f1296bbc77..b87c38449f0 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -1,6 +1,8 @@ #ifdef USE_OPENCV #include +#include "../../../include/caffe/util/opencv4.hpp" #endif // USE_OPENCV + #include #include diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index ec0fc5b0383..e355b70f594 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -1,4 +1,5 @@ #ifdef USE_OPENCV + #include "../../../include/caffe/util/opencv4.hpp" #include #include // NOLINT(readability/streams) diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index 975f4841723..43fd0d343b8 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -1,5 +1,6 @@ #ifdef USE_OPENCV #include +#include 
"../../../include/caffe/util/opencv4.hpp" #endif // USE_OPENCV #include diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index 1bf3760e9fd..e6f41b9028b 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -1,4 +1,6 @@ #ifdef USE_OPENCV + #include "../../../include/caffe/util/opencv4.hpp" + #include #include diff --git a/src/caffe/test/test_data_layer.cpp b/src/caffe/test/test_data_layer.cpp index 3835af1f173..72679b90a4a 100644 --- a/src/caffe/test/test_data_layer.cpp +++ b/src/caffe/test/test_data_layer.cpp @@ -1,4 +1,5 @@ #ifdef USE_OPENCV + #include "../../../include/caffe/util/opencv4.hpp" #include #include diff --git a/src/caffe/test/test_data_transformer.cpp b/src/caffe/test/test_data_transformer.cpp index 31bf1c1fb14..77d6b1fe92c 100644 --- a/src/caffe/test/test_data_transformer.cpp +++ b/src/caffe/test/test_data_transformer.cpp @@ -1,4 +1,5 @@ #ifdef USE_OPENCV + #include "../../../include/caffe/util/opencv4.hpp" #include #include diff --git a/src/caffe/test/test_image_data_layer.cpp b/src/caffe/test/test_image_data_layer.cpp index ce5e0bc62d6..06afe585913 100644 --- a/src/caffe/test/test_image_data_layer.cpp +++ b/src/caffe/test/test_image_data_layer.cpp @@ -1,4 +1,5 @@ #ifdef USE_OPENCV + #include "../../../include/caffe/util/opencv4.hpp" #include #include #include diff --git a/src/caffe/test/test_io.cpp b/src/caffe/test/test_io.cpp index c2c919e90dc..91c1a7fca22 100644 --- a/src/caffe/test/test_io.cpp +++ b/src/caffe/test/test_io.cpp @@ -1,4 +1,5 @@ #ifdef USE_OPENCV + #include "../../../include/caffe/util/opencv4.hpp" #include #include #include diff --git a/src/caffe/test/test_memory_data_layer.cpp b/src/caffe/test/test_memory_data_layer.cpp index 7998bc18262..20b46178ce0 100644 --- a/src/caffe/test/test_memory_data_layer.cpp +++ b/src/caffe/test/test_memory_data_layer.cpp @@ -1,5 +1,6 @@ #ifdef USE_OPENCV #include +#include 
"../../../include/caffe/util/opencv4.hpp" #endif // USE_OPENCV #include diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index 5295d9dddb9..5c2880e5f54 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -7,7 +7,9 @@ #include #include #include + #include "../../../include/caffe/util/opencv4.hpp" #endif // USE_OPENCV + #include #include