Skip to content

Add cmake opencv 4 support #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ if(POLICY CMP0054)
cmake_policy(SET CMP0054 NEW)
endif()

set(CMAKE_INSTALL_PREFIX /usr/local/include/caffe)

# ---[ Caffe project
project(Caffe C CXX)

Expand All @@ -32,7 +34,7 @@ caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ON
caffe_option(USE_NCCL "Build Caffe with NCCL library support" OFF)
caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
caffe_option(BUILD_python "Build Python wrapper" ON)
set(python_version "2" CACHE STRING "Specify which Python version to use")
set(python_version "3" CACHE STRING "Specify which Python version to use")
caffe_option(BUILD_matlab "Build Matlab wrapper" OFF IF UNIX OR APPLE)
caffe_option(BUILD_docs "Build documentation" ON IF UNIX OR APPLE)
caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON)
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ CXXFLAGS += -MMD -MP
# Complete build flags.
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS)
NVCCFLAGS += -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
NVCCFLAGS += -D_FORCE_INLINES -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
# mex may invoke an older gcc that is too liberal with -Wuninitialized
MATLAB_CXXFLAGS := $(CXXFLAGS) -Wno-uninitialized
LINKFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS)
Expand Down
18 changes: 12 additions & 6 deletions cmake/Cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,25 @@ endif ()
# set(Caffe_known_gpu_archs "30 35 50 52 60 61")
# set(Caffe_known_gpu_archs "20 21(20) 30 35 50 60 61")
# Fermi (3.2 <= CUDA <= 8)
set(FERMI "20 21(20)")
# set(FERMI "20 21(20)")
# Kepler (CUDA >= 5)
set(KEPLER "30 35 37")
set(KEPLER "35 37") # set(KEPLER "30 35 37") # This crashes with CUDA 10
# Maxwell (CUDA >= 6)
set(MAXWELL "50 52 53")
# Pascal (CUDA >= 8)
set(PASCAL "60 61 62")
# Volta (CUDA >= 9)
set(VOLTA "70") # set(VOLTA "70 71 72") # This crashes with CUDA 10
set(VOLTA "70 72") # set(VOLTA "70 71 72") # This crashes with CUDA 10
# Turing (CUDA >= 10)
set(TURING "75")
# Ampere (CUDA >= 11)
set(AMPERE "80 86")
if (UNIX AND NOT APPLE)
set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING}")
set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING} ${AMPERE}")
# set(Caffe_known_gpu_archs "${FERMI} ${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING}")
# set(Caffe_known_gpu_archs "20 21(20) 30 35 50 52 60 61")
elseif (WIN32)
set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING}")
set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING} ${AMPERE}")
endif ()


Expand Down Expand Up @@ -227,7 +229,11 @@ function(detect_cuDNN)
set(HAVE_CUDNN TRUE PARENT_SCOPE)
set(CUDNN_FOUND TRUE PARENT_SCOPE)

file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
if(EXISTS "${CUDNN_INCLUDE}/cudnn_version.h")
file(READ ${CUDNN_INCLUDE}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS)
else()
file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
endif()

# cuDNN v3 and beyond
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
Expand Down
2 changes: 2 additions & 0 deletions examples/cpp_classification/classification.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "../../include/caffe/util/opencv4.hpp"
#endif // USE_OPENCV

#include <algorithm>
#include <iosfwd>
#include <memory>
Expand Down
1 change: 1 addition & 0 deletions include/caffe/data_transformer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class DataTransformer {
Blob<Dtype>* transformed_blob);

#ifdef USE_OPENCV
#include "./util/opencv4.hpp"
/**
* @brief Applies the transformation defined in the data layer's
* transform_param block to a vector of Mat.
Expand Down
2 changes: 2 additions & 0 deletions include/caffe/layers/memory_data_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class MemoryDataLayer : public BaseDataLayer<Dtype> {
#ifdef USE_OPENCV
virtual void AddMatVector(const vector<cv::Mat>& mat_vector,
const vector<int>& labels);

#include "../util/opencv4.hpp"
#endif // USE_OPENCV

// Reset should accept const pointers, but can't, because the memory
Expand Down
1 change: 1 addition & 0 deletions include/caffe/util/io.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ bool DecodeDatumNative(Datum* datum);
bool DecodeDatum(Datum* datum, bool is_color);

#ifdef USE_OPENCV
#include "./opencv4.hpp"
cv::Mat ReadImageToCVMat(const string& filename,
const int height, const int width, const bool is_color);

Expand Down
10 changes: 10 additions & 0 deletions include/caffe/util/opencv4.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Compatibility shim for OpenCV 4.
//
// OpenCV 4 removed the legacy C-API constants CV_LOAD_IMAGE_COLOR and
// CV_LOAD_IMAGE_GRAYSCALE that Caffe's image-loading code uses.  This
// header maps those names onto the cv::ImreadModes enum values so the
// same call sites compile against both OpenCV 3 and OpenCV 4.
//
// Include guard added: this header is pulled in from several other
// headers (io.hpp, data_transformer.hpp, ...), so it must be safe to
// include more than once in a translation unit.
#ifndef CAFFE_UTIL_OPENCV4_HPP_
#define CAFFE_UTIL_OPENCV4_HPP_

#include <opencv2/opencv.hpp>

// CV_MAJOR_VERSION is defined by the OpenCV headers included above.
#if (defined(CV_MAJOR_VERSION) && CV_MAJOR_VERSION > 3)
#define OPENCV_VERSION4
#endif

#ifdef OPENCV_VERSION4
#define CV_LOAD_IMAGE_COLOR cv::IMREAD_COLOR
#define CV_LOAD_IMAGE_GRAYSCALE cv::IMREAD_GRAYSCALE
#endif

#endif  // CAFFE_UTIL_OPENCV4_HPP_
1 change: 1 addition & 0 deletions src/caffe/data_transformer.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#ifdef USE_OPENCV
#include <opencv2/core/core.hpp>
#include "../../include/caffe/util/opencv4.hpp"
#endif // USE_OPENCV

#include <string>
Expand Down
58 changes: 57 additions & 1 deletion src/caffe/layers/cudnn_conv_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,21 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
const int stride_h = stride_data[0];
const int stride_w = stride_data[1];

// Note: Copied from https://github.com/Qengineering/caffe/tree/ssd/src/caffe/layers
#if CUDNN_VERSION_MIN(8, 0, 0)
int RetCnt;
bool found_conv_algorithm;
size_t free_memory, total_memory;
cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[4];
cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];

//get memory sizes
cudaMemGetInfo(&free_memory, &total_memory);
#else
// Specify workspace limit for kernels directly until we have a
// planning strategy and a rewrite of Caffe's GPU memory management
size_t workspace_limit_bytes = 8*1024*1024;
#endif

for (int i = 0; i < bottom.size(); i++) {
cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
Expand All @@ -127,6 +139,50 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
filter_desc_, pad_h, pad_w,
stride_h, stride_w);

// Note: Copied from https://github.com/Qengineering/caffe/tree/ssd/src/caffe/layers
#if CUDNN_VERSION_MIN(8, 0, 0)
// choose forward algorithm for filter
// CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented for the forward pass in cuDNN 8, so skip it
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0], bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i], 4, &RetCnt, fwd_algo_pref_));

found_conv_algorithm = false;
for(int n=0;n<RetCnt;n++){
if (fwd_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
fwd_algo_pref_[n].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
fwd_algo_pref_[n].memory < free_memory){
found_conv_algorithm = true;
fwd_algo_[i] = fwd_algo_pref_[n].algo;
workspace_fwd_sizes_[i] = fwd_algo_pref_[n].memory;
break;
}
}
if(!found_conv_algorithm) LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
else{
// choose backward algorithm for filter
// for better or worse, just a fixed constant due to the missing
// cudnnGetConvolutionBackwardFilterAlgorithm in cuDNN version 8.0
bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
// twice the amount of the forward search to be safe
workspace_bwd_filter_sizes_[i] = 2*workspace_fwd_sizes_[i];
}

// choose backward algo for data
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle_[0], filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], 4, &RetCnt, bwd_data_algo_pref_));

found_conv_algorithm = false;
for(int n=0;n<RetCnt;n++){
if (bwd_data_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD &&
bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
bwd_data_algo_pref_[n].memory < free_memory){
found_conv_algorithm = true;
bwd_data_algo_[i] = bwd_data_algo_pref_[n].algo;
workspace_bwd_data_sizes_[i] = bwd_data_algo_pref_[n].memory;
break;
}
}
if(!found_conv_algorithm) LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
#else
// choose forward and backward algorithms + workspace(s)
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
bottom_descs_[i],
Expand Down Expand Up @@ -166,6 +222,7 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) );
#endif
}

// reduce over all workspace sizes to get a maximum to allocate / reallocate
Expand Down Expand Up @@ -252,7 +309,6 @@ CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() {
}

cudaFree(workspaceData);
delete [] workspace;
delete [] stream_;
delete [] handle_;
delete [] fwd_algo_;
Expand Down
Loading