parallel roi_align on CPU

mingfeima · mingfeima · commit 1fa27d0f07d3 · 2022-09-19T10:53:41.000+08:00
diff --git a/setup.py b/setup.py
@@ -156,6 +156,8 @@ def get_extensions():
     print(f"  TORCHVISION_USE_FFMPEG: {use_ffmpeg}")
     use_video_codec = os.getenv("TORCHVISION_USE_VIDEO_CODEC", "1") == "1"
     print(f"  TORCHVISION_USE_VIDEO_CODEC: {use_video_codec}")
+    use_openmp = os.getenv("USE_OPENMP", "0") == "1"
+    print(f"  USE_OPEN: {use_openmp}")
 
     nvcc_flags = os.getenv("NVCC_FLAGS", "")
     print(f"  NVCC_FLAGS: {nvcc_flags}")
@@ -223,6 +225,9 @@ def get_extensions():
         define_macros += [("USE_PYTHON", None)]
         extra_compile_args["cxx"].append("/MP")
 
+    if use_openmp:
+        extra_compile_args["cxx"].append("-fopenmp")
+
     if debug_mode:
         print("Compiling in debug mode")
         extra_compile_args["cxx"].append("-g")
diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
@@ -1,4 +1,5 @@
 #include <ATen/ATen.h>
+#include <ATen/Parallel.h>
 #include <torch/library.h>
 
 #include "./roi_align_common.h"
@@ -25,86 +26,88 @@ void roi_align_forward_kernel_impl(
   // (n, c, ph, pw) is an element in the pooled output
   // can be parallelized using omp
   // #pragma omp parallel for num_threads(32)
-  for (int n = 0; n < n_rois; n++) {
-    int index_n = n * channels * pooled_width * pooled_height;
-
-    const T* offset_rois = rois + n * 5;
-    int roi_batch_ind = offset_rois[0];
-
-    // Do not using rounding; this implementation detail is critical
-    T offset = aligned ? (T)0.5 : (T)0.0;
-    T roi_start_w = offset_rois[1] * spatial_scale - offset;
-    T roi_start_h = offset_rois[2] * spatial_scale - offset;
-    T roi_end_w = offset_rois[3] * spatial_scale - offset;
-    T roi_end_h = offset_rois[4] * spatial_scale - offset;
-
-    T roi_width = roi_end_w - roi_start_w;
-    T roi_height = roi_end_h - roi_start_h;
-    if (!aligned) {
-      // Force malformed ROIs to be 1x1
-      roi_width = std::max(roi_width, (T)1.);
-      roi_height = std::max(roi_height, (T)1.);
-    }
-
-    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-    // We use roi_bin_grid to sample the grid and mimic integral
-    int roi_bin_grid_h = (sampling_ratio > 0)
-        ? sampling_ratio
-        : ceil(roi_height / pooled_height); // e.g., = 2
-    int roi_bin_grid_w =
-        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
-
-    // We do average (integral) pooling inside a bin
-    // When the grid is empty, output zeros.
-    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
-
-    // we want to precalculate indices and weights shared by all chanels,
-    // this is the key point of optimization
-    std::vector<detail::PreCalc<T>> pre_calc(
-        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
-    detail::pre_calc_for_bilinear_interpolate(
-        height,
-        width,
-        pooled_height,
-        pooled_width,
-        roi_start_h,
-        roi_start_w,
-        bin_size_h,
-        bin_size_w,
-        roi_bin_grid_h,
-        roi_bin_grid_w,
-        pre_calc);
-
-    for (int c = 0; c < channels; c++) {
-      int index_n_c = index_n + c * pooled_width * pooled_height;
-      const T* offset_input =
-          input + (roi_batch_ind * channels + c) * height * width;
-      int pre_calc_index = 0;
-
-      for (int ph = 0; ph < pooled_height; ph++) {
-        for (int pw = 0; pw < pooled_width; pw++) {
-          int index = index_n_c + ph * pooled_width + pw;
-
-          T output_val = 0.;
-          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-              detail::PreCalc<T> pc = pre_calc[pre_calc_index];
-              output_val += pc.w1 * offset_input[pc.pos1] +
-                  pc.w2 * offset_input[pc.pos2] +
-                  pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
-
-              pre_calc_index += 1;
+  at::parallel_for(0, n_rois, 1, [&](int begin, int end) {
+    for (int n = begin; n < end; n++) {
+      int index_n = n * channels * pooled_width * pooled_height;
+
+      const T* offset_rois = rois + n * 5;
+      int roi_batch_ind = offset_rois[0];
+
+      // Do not using rounding; this implementation detail is critical
+      T offset = aligned ? (T)0.5 : (T)0.0;
+      T roi_start_w = offset_rois[1] * spatial_scale - offset;
+      T roi_start_h = offset_rois[2] * spatial_scale - offset;
+      T roi_end_w = offset_rois[3] * spatial_scale - offset;
+      T roi_end_h = offset_rois[4] * spatial_scale - offset;
+
+      T roi_width = roi_end_w - roi_start_w;
+      T roi_height = roi_end_h - roi_start_h;
+      if (!aligned) {
+        // Force malformed ROIs to be 1x1
+        roi_width = std::max(roi_width, (T)1.);
+        roi_height = std::max(roi_height, (T)1.);
+      }
+
+      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+      // We use roi_bin_grid to sample the grid and mimic integral
+      int roi_bin_grid_h = (sampling_ratio > 0)
+          ? sampling_ratio
+          : ceil(roi_height / pooled_height); // e.g., = 2
+      int roi_bin_grid_w =
+          (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+      // We do average (integral) pooling inside a bin
+      // When the grid is empty, output zeros.
+      const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+      // we want to precalculate indices and weights shared by all chanels,
+      // this is the key point of optimization
+      std::vector<detail::PreCalc<T>> pre_calc(
+          roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+      detail::pre_calc_for_bilinear_interpolate(
+          height,
+          width,
+          pooled_height,
+          pooled_width,
+          roi_start_h,
+          roi_start_w,
+          bin_size_h,
+          bin_size_w,
+          roi_bin_grid_h,
+          roi_bin_grid_w,
+          pre_calc);
+
+      for (int c = 0; c < channels; c++) {
+        int index_n_c = index_n + c * pooled_width * pooled_height;
+        const T* offset_input =
+            input + (roi_batch_ind * channels + c) * height * width;
+        int pre_calc_index = 0;
+
+        for (int ph = 0; ph < pooled_height; ph++) {
+          for (int pw = 0; pw < pooled_width; pw++) {
+            int index = index_n_c + ph * pooled_width + pw;
+
+            T output_val = 0.;
+            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+                detail::PreCalc<T> pc = pre_calc[pre_calc_index];
+                output_val += pc.w1 * offset_input[pc.pos1] +
+                    pc.w2 * offset_input[pc.pos2] +
+                    pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
+
+                pre_calc_index += 1;
+              }
             }
-          }
-          output_val /= count; // Average pooling
-
-          output[index] = output_val;
-        } // for pw
-      } // for ph
-    } // for c
-  } // for n
+            output_val /= count; // Average pooling
+
+            output[index] = output_val;
+          } // for pw
+        } // for ph
+      } // for c
+    } // for n
+  });
 }
 
 template <typename T>