diff --git a/CMakeLists.txt b/CMakeLists.txt
index 905cd7d83ef..b646e01993e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,8 @@ if(POLICY CMP0054)
   cmake_policy(SET CMP0054 NEW)
 endif()
 
+set(CMAKE_INSTALL_PREFIX "/usr/local/include/caffe" CACHE PATH "Default install prefix; a value given with -DCMAKE_INSTALL_PREFIX takes precedence")
+
 # ---[ Caffe project
 project(Caffe C CXX)
 
@@ -32,7 +34,7 @@ caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ON
 caffe_option(USE_NCCL "Build Caffe with NCCL library support" OFF)
 caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 caffe_option(BUILD_python "Build Python wrapper" ON)
-set(python_version "2" CACHE STRING "Specify which Python version to use")
+set(python_version "3" CACHE STRING "Specify which Python version to use")
 caffe_option(BUILD_matlab "Build Matlab wrapper" OFF IF UNIX OR APPLE)
 caffe_option(BUILD_docs   "Build documentation" ON IF UNIX OR APPLE)
 caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON)
diff --git a/Makefile b/Makefile
index 0188f3e64a4..308dc73ee52 100644
--- a/Makefile
+++ b/Makefile
@@ -427,7 +427,7 @@ CXXFLAGS += -MMD -MP
 # Complete build flags.
 COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
 CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS)
-NVCCFLAGS += -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
+NVCCFLAGS += -D_FORCE_INLINES -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
 # mex may invoke an older gcc that is too liberal with -Wuninitalized
 MATLAB_CXXFLAGS := $(CXXFLAGS) -Wno-uninitialized
 LINKFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS)
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index d28cbeadaa1..046242f65a5 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -7,23 +7,25 @@ endif ()
 # set(Caffe_known_gpu_archs "30 35 50 52 60 61")
 # set(Caffe_known_gpu_archs "20 21(20) 30 35 50 60 61")
 # Fermi (3.2 <= CUDA <= 8)
-set(FERMI "20 21(20)")
+# set(FERMI "20 21(20)")
 # Kepler (CUDA >= 5)
-set(KEPLER "30 35 37")
+set(KEPLER "35 37") # set(KEPLER "30 35 37") # This crashes with CUDA 10
 # Maxwell (CUDA >= 6)
 set(MAXWELL "50 52 53")
 # Pascal (CUDA >= 8)
 set(PASCAL "60 61 62")
 # Volta (CUDA >= 9)
-set(VOLTA "70") # set(VOLTA "70 71 72") # This crashes with CUDA 10
+set(VOLTA "70 72") # set(VOLTA "70 71 72") # This crashes with CUDA 10
 # Turing (CUDA >= 10)
 set(TURING "75")
+# Ampere (CUDA >= 11)
+set(AMPERE "80 86")
 if (UNIX AND NOT APPLE)
-  set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING}")
+  set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING} ${AMPERE}")
   # set(Caffe_known_gpu_archs "${FERMI} ${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING}")
   # set(Caffe_known_gpu_archs "20 21(20) 30 35 50 52 60 61")
 elseif (WIN32)
-  set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING}")
+  set(Caffe_known_gpu_archs "${KEPLER} ${MAXWELL} ${PASCAL} ${VOLTA} ${TURING} ${AMPERE}")
 endif ()
 
 
@@ -227,7 +229,11 @@ function(detect_cuDNN)
     set(HAVE_CUDNN  TRUE PARENT_SCOPE)
     set(CUDNN_FOUND TRUE PARENT_SCOPE)
 
-    file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+    if(EXISTS "${CUDNN_INCLUDE}/cudnn_version.h")  # prefer the dedicated version header when it is present
+      file(READ "${CUDNN_INCLUDE}/cudnn_version.h" CUDNN_VERSION_FILE_CONTENTS)
+    else()  # otherwise the version macros are read from cudnn.h, as before
+      file(READ "${CUDNN_INCLUDE}/cudnn.h" CUDNN_VERSION_FILE_CONTENTS)
+    endif()
 
     # cuDNN v3 and beyond
     string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
diff --git a/examples/cpp_classification/classification.cpp b/examples/cpp_classification/classification.cpp
index 6b67c537a47..478355c0dbe 100644
--- a/examples/cpp_classification/classification.cpp
+++ b/examples/cpp_classification/classification.cpp
@@ -3,7 +3,9 @@
 #include <opencv2/core/core.hpp>
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/imgproc/imgproc.hpp>
+#include "../../include/caffe/util/opencv4.hpp"
 #endif  // USE_OPENCV
+
 #include <algorithm>
 #include <iosfwd>
 #include <memory>
diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp
index 97b4ee6a8c4..dc0982923dc 100644
--- a/include/caffe/data_transformer.hpp
+++ b/include/caffe/data_transformer.hpp
@@ -51,6 +51,7 @@ class DataTransformer {
                 Blob<Dtype>* transformed_blob);
 
 #ifdef USE_OPENCV
+  #include "./util/opencv4.hpp"
   /**
    * @brief Applies the transformation defined in the data layer's
    * transform_param block to a vector of Mat.
diff --git a/include/caffe/layers/memory_data_layer.hpp b/include/caffe/layers/memory_data_layer.hpp
index 8abcc8c1b68..8eb35d3817e 100644
--- a/include/caffe/layers/memory_data_layer.hpp
+++ b/include/caffe/layers/memory_data_layer.hpp
@@ -32,6 +32,8 @@ class MemoryDataLayer : public BaseDataLayer<Dtype> {
 #ifdef USE_OPENCV
   virtual void AddMatVector(const vector<cv::Mat>& mat_vector,
       const vector<int>& labels);
+
+  #include "../util/opencv4.hpp"
 #endif  // USE_OPENCV
 
   // Reset should accept const pointers, but can't, because the memory
diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp
index 1a599883ca3..ba464055bbc 100644
--- a/include/caffe/util/io.hpp
+++ b/include/caffe/util/io.hpp
@@ -130,6 +130,7 @@ bool DecodeDatumNative(Datum* datum);
 bool DecodeDatum(Datum* datum, bool is_color);
 
 #ifdef USE_OPENCV
+  #include "./opencv4.hpp"
 cv::Mat ReadImageToCVMat(const string& filename,
     const int height, const int width, const bool is_color);
 
diff --git a/include/caffe/util/opencv4.hpp b/include/caffe/util/opencv4.hpp
new file mode 100644
index 00000000000..fe8af42b274
--- /dev/null
+++ b/include/caffe/util/opencv4.hpp
@@ -0,0 +1,19 @@
+#ifndef CAFFE_UTIL_OPENCV4_HPP_
+#define CAFFE_UTIL_OPENCV4_HPP_
+
+// Compatibility shim for OpenCV 4: when CV_MAJOR_VERSION is greater than 3,
+// map the legacy CV_LOAD_IMAGE_* names onto the cv::IMREAD_* constants.
+// The include guard makes repeated inclusion a no-op.
+
+#include <opencv2/opencv.hpp>
+
+#if (defined(CV_MAJOR_VERSION) && CV_MAJOR_VERSION > 3)
+    #define OPENCV_VERSION4
+#endif
+
+#ifdef OPENCV_VERSION4
+    #define CV_LOAD_IMAGE_COLOR cv::IMREAD_COLOR
+    #define CV_LOAD_IMAGE_GRAYSCALE cv::IMREAD_GRAYSCALE
+#endif
+
+#endif  // CAFFE_UTIL_OPENCV4_HPP_
diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp
index 3012251e0a5..80709ba8f28 100644
--- a/src/caffe/data_transformer.cpp
+++ b/src/caffe/data_transformer.cpp
@@ -1,5 +1,6 @@
 #ifdef USE_OPENCV
 #include <opencv2/core/core.hpp>
+#include "../../include/caffe/util/opencv4.hpp"
 #endif  // USE_OPENCV
 
 #include <string>
diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
index efc9e04e8c0..a4a69b3ac6a 100644
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -108,9 +108,21 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
   const int stride_h = stride_data[0];
   const int stride_w = stride_data[1];
 
+// Note: Copied from https://github.com/Qengineering/caffe/tree/ssd/src/caffe/layers
+#if CUDNN_VERSION_MIN(8, 0, 0)
+  int RetCnt;
+  bool found_conv_algorithm;
+  size_t free_memory, total_memory;
+  cudnnConvolutionFwdAlgoPerf_t     fwd_algo_pref_[4];
+  cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];
+
+  // get memory sizes; abort on failure so free_memory is never read uninitialized
+  CUDA_CHECK(cudaMemGetInfo(&free_memory, &total_memory));
+#else
   // Specify workspace limit for kernels directly until we have a
   // planning strategy and a rewrite of Caffe's GPU memory mangagement
   size_t workspace_limit_bytes = 8*1024*1024;
+#endif
 
   for (int i = 0; i < bottom.size(); i++) {
     cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
@@ -127,6 +139,50 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
         filter_desc_, pad_h, pad_w,
         stride_h, stride_w);
 
+// Note: Copied from https://github.com/Qengineering/caffe/tree/ssd/src/caffe/layers
+#if CUDNN_VERSION_MIN(8, 0, 0)
+    // choose forward algorithm for filter
+    // in forward filter the CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in cuDNN 8
+    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0], bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i], 4, &RetCnt, fwd_algo_pref_));
+
+    found_conv_algorithm = false;
+    for(int n=0;n<RetCnt;n++){
+      if (fwd_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
+          fwd_algo_pref_[n].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+          fwd_algo_pref_[n].memory < free_memory){
+        found_conv_algorithm = true;
+        fwd_algo_[i] = fwd_algo_pref_[n].algo;
+        workspace_fwd_sizes_[i] = fwd_algo_pref_[n].memory;
+        break;
+      }
+    }
+    if(!found_conv_algorithm) LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
+    // choose backward algorithm for filter: a fixed constant, because
+    // cudnnGetConvolutionBackwardFilterAlgorithm is missing in cuDNN 8.0.
+    // Set it even if the forward search failed, so it is never left unset.
+    bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+    // ask cuDNN for the exact workspace instead of guessing 2x the forward one
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
+        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
+        bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
+
+    // choose backward algo for data
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle_[0], filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], 4, &RetCnt, bwd_data_algo_pref_));
+
+    found_conv_algorithm = false;
+    for(int n=0;n<RetCnt;n++){
+      if (bwd_data_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
+          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD &&
+          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
+          bwd_data_algo_pref_[n].memory < free_memory){
+        found_conv_algorithm = true;
+        bwd_data_algo_[i]              = bwd_data_algo_pref_[n].algo;
+        workspace_bwd_data_sizes_[i]   = bwd_data_algo_pref_[n].memory;
+        break;
+      }
+    }
+    if(!found_conv_algorithm) LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
+#else
     // choose forward and backward algorithms + workspace(s)
     CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
       bottom_descs_[i],
@@ -166,6 +222,7 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
     CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
           filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
           bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) );
+#endif
   }
 
   // reduce over all workspace sizes to get a maximum to allocate / reallocate
@@ -252,7 +309,8 @@ CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() {
   }
 
   cudaFree(workspaceData);
+  // NOTE(review): this change does not touch how workspace is allocated, so the array must still be released here
   delete [] workspace;
   delete [] stream_;
   delete [] handle_;
   delete [] fwd_algo_;
diff --git a/src/caffe/layers/cudnn_deconv_layer.cpp b/src/caffe/layers/cudnn_deconv_layer.cpp
index 260da5c1ee0..86d807fb08e 100644
--- a/src/caffe/layers/cudnn_deconv_layer.cpp
+++ b/src/caffe/layers/cudnn_deconv_layer.cpp
@@ -141,6 +141,86 @@ void CuDNNDeconvolutionLayer<Dtype>::Reshape(
                                      stride_h,
                                      stride_w);
 
+#if CUDNN_VERSION_MIN(8, 0, 0)
+    // cuDNN 8 removed the cudnnGetConvolution*Algorithm() calls used in the
+    // #else branch. The *_v7 replacements fill an array of candidates; keep
+    // the first one that reports success and whose workspace fits within
+    // workspace_limit_bytes.
+    const int kRequestedAlgoCount = 8;
+    int returned_algo_count = 0;
+    bool found_algo = false;
+
+    // choose forward algorithm; IMPLICIT_PRECOMP_GEMM is skipped for the
+    // reason given in the #else branch.
+    cudnnConvolutionFwdAlgoPerf_t fwd_perf[kRequestedAlgoCount];
+    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(
+        handle_[0],
+        top_descs_[i],
+        filter_desc_,
+        conv_descs_[i],
+        bottom_descs_[i],
+        kRequestedAlgoCount,
+        &returned_algo_count,
+        fwd_perf));
+    for (int n = 0; n < returned_algo_count && !found_algo; n++) {
+      if (fwd_perf[n].status == CUDNN_STATUS_SUCCESS &&
+          fwd_perf[n].algo !=
+              CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
+          fwd_perf[n].memory <= workspace_limit_bytes) {
+        fwd_algo_[i] = fwd_perf[n].algo;
+        workspace_fwd_sizes_[i] = fwd_perf[n].memory;
+        found_algo = true;
+      }
+    }
+    CHECK(found_algo) << "cuDNN returned no usable forward algorithm "
+                      << "for CuDNNDeconvolutionLayer.";
+
+    // choose backward algorithm for filter
+    cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_perf[kRequestedAlgoCount];
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm_v7(
+        handle_[0],
+        top_descs_[i],
+        bottom_descs_[i],
+        conv_descs_[i],
+        filter_desc_,
+        kRequestedAlgoCount,
+        &returned_algo_count,
+        bwd_filter_perf));
+    found_algo = false;
+    for (int n = 0; n < returned_algo_count && !found_algo; n++) {
+      if (bwd_filter_perf[n].status == CUDNN_STATUS_SUCCESS &&
+          bwd_filter_perf[n].memory <= workspace_limit_bytes) {
+        bwd_filter_algo_[i] = bwd_filter_perf[n].algo;
+        workspace_bwd_filter_sizes_[i] = bwd_filter_perf[n].memory;
+        found_algo = true;
+      }
+    }
+    CHECK(found_algo) << "cuDNN returned no usable backward filter "
+                      << "algorithm for CuDNNDeconvolutionLayer.";
+
+    // choose backward algo for data
+    cudnnConvolutionBwdDataAlgoPerf_t bwd_data_perf[kRequestedAlgoCount];
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(
+        handle_[0],
+        filter_desc_,
+        bottom_descs_[i],
+        conv_descs_[i],
+        top_descs_[i],
+        kRequestedAlgoCount,
+        &returned_algo_count,
+        bwd_data_perf));
+    found_algo = false;
+    for (int n = 0; n < returned_algo_count && !found_algo; n++) {
+      if (bwd_data_perf[n].status == CUDNN_STATUS_SUCCESS &&
+          bwd_data_perf[n].memory <= workspace_limit_bytes) {
+        bwd_data_algo_[i] = bwd_data_perf[n].algo;
+        workspace_bwd_data_sizes_[i] = bwd_data_perf[n].memory;
+        found_algo = true;
+      }
+    }
+    CHECK(found_algo) << "cuDNN returned no usable backward data "
+                      << "algorithm for CuDNNDeconvolutionLayer.";
+#else
     // choose forward and backward algorithms + workspace(s)
     CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
         handle_[0],
@@ -224,6 +304,7 @@ void CuDNNDeconvolutionLayer<Dtype>::Reshape(
         top_descs_[i],
         bwd_data_algo_[i],
         &workspace_bwd_data_sizes_[i]));
+#endif
   }
 
   // reduce over all workspace sizes to get a maximum to allocate / reallocate
diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp
index 0f1296bbc77..b87c38449f0 100644
--- a/src/caffe/layers/data_layer.cpp
+++ b/src/caffe/layers/data_layer.cpp
@@ -1,6 +1,8 @@
 #ifdef USE_OPENCV
 #include <opencv2/core/core.hpp>
+#include "../../../include/caffe/util/opencv4.hpp"
 #endif  // USE_OPENCV
+
 #include <stdint.h>
 
 #include <vector>
diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp
index ec0fc5b0383..e355b70f594 100644
--- a/src/caffe/layers/image_data_layer.cpp
+++ b/src/caffe/layers/image_data_layer.cpp
@@ -1,4 +1,5 @@
 #ifdef USE_OPENCV
+  #include "../../../include/caffe/util/opencv4.hpp"
 #include <opencv2/core/core.hpp>
 
 #include <fstream>  // NOLINT(readability/streams)
diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp
index 975f4841723..43fd0d343b8 100644
--- a/src/caffe/layers/memory_data_layer.cpp
+++ b/src/caffe/layers/memory_data_layer.cpp
@@ -1,5 +1,6 @@
 #ifdef USE_OPENCV
 #include <opencv2/core/core.hpp>
+#include "../../../include/caffe/util/opencv4.hpp"
 #endif  // USE_OPENCV
 
 #include <vector>
diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp
index 1bf3760e9fd..e6f41b9028b 100644
--- a/src/caffe/layers/window_data_layer.cpp
+++ b/src/caffe/layers/window_data_layer.cpp
@@ -1,4 +1,6 @@
 #ifdef USE_OPENCV
+  #include "../../../include/caffe/util/opencv4.hpp"
+  
 #include <opencv2/highgui/highgui_c.h>
 #include <stdint.h>
 
diff --git a/src/caffe/test/test_data_layer.cpp b/src/caffe/test/test_data_layer.cpp
index 3835af1f173..72679b90a4a 100644
--- a/src/caffe/test/test_data_layer.cpp
+++ b/src/caffe/test/test_data_layer.cpp
@@ -1,4 +1,5 @@
 #ifdef USE_OPENCV
+  #include "../../../include/caffe/util/opencv4.hpp"
 #include <string>
 #include <vector>
 
diff --git a/src/caffe/test/test_data_transformer.cpp b/src/caffe/test/test_data_transformer.cpp
index 31bf1c1fb14..77d6b1fe92c 100644
--- a/src/caffe/test/test_data_transformer.cpp
+++ b/src/caffe/test/test_data_transformer.cpp
@@ -1,4 +1,5 @@
 #ifdef USE_OPENCV
+  #include "../../../include/caffe/util/opencv4.hpp"
 #include <string>
 #include <vector>
 
diff --git a/src/caffe/test/test_image_data_layer.cpp b/src/caffe/test/test_image_data_layer.cpp
index ce5e0bc62d6..06afe585913 100644
--- a/src/caffe/test/test_image_data_layer.cpp
+++ b/src/caffe/test/test_image_data_layer.cpp
@@ -1,4 +1,5 @@
 #ifdef USE_OPENCV
+  #include "../../../include/caffe/util/opencv4.hpp"
 #include <map>
 #include <string>
 #include <vector>
diff --git a/src/caffe/test/test_io.cpp b/src/caffe/test/test_io.cpp
index c2c919e90dc..91c1a7fca22 100644
--- a/src/caffe/test/test_io.cpp
+++ b/src/caffe/test/test_io.cpp
@@ -1,4 +1,5 @@
 #ifdef USE_OPENCV
+  #include "../../../include/caffe/util/opencv4.hpp"
 #include <opencv2/core/core.hpp>
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/highgui/highgui_c.h>
diff --git a/src/caffe/test/test_memory_data_layer.cpp b/src/caffe/test/test_memory_data_layer.cpp
index 7998bc18262..20b46178ce0 100644
--- a/src/caffe/test/test_memory_data_layer.cpp
+++ b/src/caffe/test/test_memory_data_layer.cpp
@@ -1,5 +1,6 @@
 #ifdef USE_OPENCV
 #include <opencv2/core/core.hpp>
+#include "../../../include/caffe/util/opencv4.hpp"
 #endif  // USE_OPENCV
 
 #include <string>
diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp
index 5295d9dddb9..5c2880e5f54 100644
--- a/src/caffe/util/io.cpp
+++ b/src/caffe/util/io.cpp
@@ -7,7 +7,9 @@
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/highgui/highgui_c.h>
 #include <opencv2/imgproc/imgproc.hpp>
+  #include "../../../include/caffe/util/opencv4.hpp"
 #endif  // USE_OPENCV
+
 #include <stdint.h>
 
 #include <algorithm>
@@ -70,6 +72,7 @@ void WriteProtoToBinaryFile(const Message& proto, const char* filename) {
 }
 
 #ifdef USE_OPENCV
+  #include "../../../include/caffe/util/opencv4.hpp"
 cv::Mat ReadImageToCVMat(const string& filename,
     const int height, const int width, const bool is_color) {
   cv::Mat cv_img;
@@ -163,6 +166,7 @@ bool ReadFileToDatum(const string& filename, const int label,
 }
 
 #ifdef USE_OPENCV
+  #include "../../../include/caffe/util/opencv4.hpp"
 cv::Mat DecodeDatumToCVMatNative(const Datum& datum) {
   cv::Mat cv_img;
   CHECK(datum.encoded()) << "Datum not encoded";