
Commit 18439b4

[moved from opencv] __shfl_up_sync with mask for CUDA >= 9
* __shfl_up_sync with proper mask value for CUDA >= 9
* BlockScanInclusive for CUDA >= 9
* compatible_shfl_up for use in integral.hpp
* Use CLAHE in cudev
* Add tests for BlockScan

original commit: opencv/opencv@970293a
1 parent 1c6b74d commit 18439b4
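For context on the change described above: CUDA 9 deprecates the implicit-convergence warp shuffles (__shfl_up and friends) in favour of the *_sync variants, which take an explicit mask naming the participating lanes. A minimal sketch of the difference, not part of this commit (the kernel name and the assumption of a fully populated warp are illustrative only):

// Hypothetical kernel: inclusive prefix sum within one full warp.
// Before CUDA 9, __shfl_up(var, delta) assumed all 32 lanes were converged;
// from CUDA 9 on, __shfl_up_sync(mask, var, delta) makes that set explicit.
__global__ void warpPrefixSumDemo(int* data)
{
    const unsigned int lane = threadIdx.x & 31;
    int val = data[threadIdx.x];

    for (int i = 1; i <= 16; i *= 2)
    {
#if __CUDACC_VER_MAJOR__ >= 9
        const int n = __shfl_up_sync(0xFFFFFFFFU, val, i); // all 32 lanes participate
#else
        const int n = __shfl_up(val, i);                   // participation is implicit
#endif
        if (lane >= i)
            val += n;
    }

    data[threadIdx.x] = val;
}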

6 files changed: 385 additions, 34 deletions

modules/cudaimgproc/src/cuda/clahe.cu

Lines changed: 9 additions & 15 deletions
@@ -42,15 +42,9 @@

 #if !defined CUDA_DISABLER

-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/scan.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/cudev.hpp"

-using namespace cv::cuda;
-using namespace cv::cuda::device;
+using namespace cv::cudev;

 namespace clahe
 {
@@ -73,7 +67,7 @@ namespace clahe
             for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
             {
                 const int data = srcPtr[j];
-                Emulation::smem::atomicAdd(&smem[data], 1);
+                ::atomicAdd(&smem[data], 1);
             }
         }

@@ -96,7 +90,7 @@

         // find number of overall clipped samples

-        reduce<256>(smem, clipped, tid, plus<int>());
+        blockReduce<256>(smem, clipped, tid, plus<int>());

         // broadcast evaluated value

@@ -128,10 +122,10 @@

        calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);

-        cudaSafeCall( cudaGetLastError() );
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
@@ -173,13 +167,13 @@ namespace clahe
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

-        cudaSafeCall( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
+        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );

        transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
-        cudaSafeCall( cudaGetLastError() );
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
 }

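The hunks above also move the host-side error checking from the cv::cuda::device cudaSafeCall macro to CV_CUDEV_SAFE_CALL from opencv2/cudev.hpp. A minimal sketch of that launch/check/synchronize pattern with a trivial, hypothetical kernel (fillKernel and fill are illustrative names, not part of the commit):

#include "opencv2/cudev.hpp"

// Hypothetical kernel: set every pixel of an 8-bit image to a constant.
__global__ void fillKernel(cv::cuda::PtrStepSzb dst, uchar value)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < dst.cols && y < dst.rows)
        dst(y, x) = value;
}

void fill(cv::cuda::PtrStepSzb dst, uchar value, cudaStream_t stream)
{
    const dim3 block(32, 8);
    const dim3 grid(cv::cudev::divUp(dst.cols, block.x), cv::cudev::divUp(dst.rows, block.y));

    fillKernel<<<grid, block, 0, stream>>>(dst, value);
    CV_CUDEV_SAFE_CALL( cudaGetLastError() );          // check the launch itself

    if (stream == 0)
        CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); // surface asynchronous kernel errors
}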

modules/cudev/include/opencv2/cudev/block/scan.hpp

Lines changed: 141 additions & 4 deletions
@@ -48,12 +48,134 @@

 #include "../common.hpp"
 #include "../warp/scan.hpp"
+#include "../warp/warp.hpp"

 namespace cv { namespace cudev {

 //! @addtogroup cudev
 //! @{

+#if __CUDACC_VER_MAJOR__ >= 9
+
+// Usage Note
+// - THREADS_NUM should be equal to the number of threads in this block.
+// - smem must be able to contain at least n elements of type T, where n is equal to the number
+//   of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
+//
+// Dev Note
+// - Starting from CUDA 9.0, support for Fermi is dropped. So CV_CUDEV_ARCH >= 300 is implied.
+// - "For Pascal and earlier architectures (CV_CUDEV_ARCH < 700), all threads in mask must execute
+//   the same warp intrinsic instruction in convergence, and the union of all values in mask must
+//   be equal to the warp's active mask."
+//   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#independent-thread-scheduling-7-x)
+// - Above restriction does not apply starting from Volta (CV_CUDEV_ARCH >= 700). We just need to
+//   take care so that "all non-exited threads named in mask must execute the same intrinsic with
+//   the same mask."
+//   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#warp-description)
+
+template <int THREADS_NUM, typename T>
+__device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
+{
+    const int residual = THREADS_NUM & (WARP_SIZE - 1);
+
+#if CV_CUDEV_ARCH < 700
+    const uint residual_mask = (1U << residual) - 1;
+#endif
+
+    if (THREADS_NUM > WARP_SIZE)
+    {
+        // bottom-level inclusive warp scan
+#if CV_CUDEV_ARCH >= 700
+        T warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+#else
+        T warpResult;
+
+        if (0 == residual)
+            warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+        else
+        {
+            const int n_warps = divUp(THREADS_NUM, WARP_SIZE);
+            const int warp_num = Warp::warpId();
+
+            if (warp_num < n_warps - 1)
+                warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+            else
+            {
+                // We are at the last threads of a block whose number of threads
+                // is not a multiple of the warp size
+                warpResult = warpScanInclusive(residual_mask, data);
+            }
+        }
+#endif
+
+        __syncthreads();
+
+        // save top elements of each warp for exclusive warp scan
+        // sync to wait for warp scans to complete (because smem is being overwritten)
+        if ((tid & (WARP_SIZE - 1)) == (WARP_SIZE - 1))
+        {
+            smem[tid >> LOG_WARP_SIZE] = warpResult;
+        }
+
+        __syncthreads();
+
+        int quot = THREADS_NUM / WARP_SIZE;
+
+        if (tid < quot)
+        {
+            // grab top warp elements
+            T val = smem[tid];
+
+            uint mask = (1LLU << quot) - 1;
+
+            if (0 == residual)
+            {
+                // calculate exclusive scan and write back to shared memory
+                smem[tid] = warpScanExclusive(mask, val);
+            }
+            else
+            {
+                // calculate inclusive scan and write back to shared memory with offset 1
+                smem[tid + 1] = warpScanInclusive(mask, val);
+
+                if (tid == 0)
+                    smem[0] = 0;
+            }
+        }
+
+        __syncthreads();
+
+        // return updated warp scans
+        return warpResult + smem[tid >> LOG_WARP_SIZE];
+    }
+    else
+    {
+#if CV_CUDEV_ARCH >= 700
+        return warpScanInclusive(0xFFFFFFFFU, data);
+#else
+        if (THREADS_NUM == WARP_SIZE)
+            return warpScanInclusive(0xFFFFFFFFU, data);
+        else
+            return warpScanInclusive(residual_mask, data);
+#endif
+    }
+}
+
+template <int THREADS_NUM, typename T>
+__device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint tid)
+{
+    return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
+}
+
+#else // __CUDACC_VER_MAJOR__ >= 9
+
+// Usage Note
+// - THREADS_NUM should be equal to the number of threads in this block.
+// - (>= Kepler) smem must be able to contain at least n elements of type T, where n is equal to the number
+//   of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
+// - (Fermi) smem must be able to contain at least n elements of type T, where n is equal to the number
+//   of threads in this block (= THREADS_NUM).
+
 template <int THREADS_NUM, typename T>
 __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
 {
@@ -73,18 +195,31 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)

        __syncthreads();

-        if (tid < (THREADS_NUM / WARP_SIZE))
+        int quot = THREADS_NUM / WARP_SIZE;
+
+        if (tid < quot)
        {
            // grab top warp elements
            T val = smem[tid];

-            // calculate exclusive scan and write back to shared memory
-            smem[tid] = warpScanExclusive(val, smem, tid);
+            if (0 == (THREADS_NUM & (WARP_SIZE - 1)))
+            {
+                // calculate exclusive scan and write back to shared memory
+                smem[tid] = warpScanExclusive(val, smem, tid);
+            }
+            else
+            {
+                // calculate inclusive scan and write back to shared memory with offset 1
+                smem[tid + 1] = warpScanInclusive(val, smem, tid);
+
+                if (tid == 0)
+                    smem[0] = 0;
+            }
        }

        __syncthreads();

-        // return updated warp scans with exclusive scan results
+        // return updated warp scans
        return warpResult + smem[tid >> LOG_WARP_SIZE];
    }
    else
@@ -99,6 +234,8 @@ __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint t
    return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
 }

+#endif // __CUDACC_VER_MAJOR__ >= 9
+
 //! @}

 }}
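The Usage Note above pins down the contract for callers: THREADS_NUM must equal the block size, and on the CUDA 9+ (Kepler and later) path the scratch buffer needs one slot per warp. A minimal caller sketch under those assumptions (the kernel name and launch configuration are illustrative, not part of the commit):

#include "opencv2/cudev.hpp"

using namespace cv::cudev;

// Hypothetical kernel: one block computes an inclusive prefix sum of its own
// THREADS_NUM inputs using blockScanInclusive from the header above.
template <int THREADS_NUM>
__global__ void blockPrefixSumDemo(const int* src, int* dst)
{
    // Per the Usage Note: one scratch element per warp in the block,
    // i.e. divUp(THREADS_NUM, WARP_SIZE) elements of the scanned type.
    __shared__ int scratch[(THREADS_NUM + WARP_SIZE - 1) / WARP_SIZE];

    const uint tid = threadIdx.x;
    dst[tid] = blockScanInclusive<THREADS_NUM>(src[tid], scratch, tid);
}

// Launch with a block size that matches THREADS_NUM, e.g.:
// blockPrefixSumDemo<256><<<1, 256>>>(d_src, d_dst);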

modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp

Lines changed: 4 additions & 4 deletions
@@ -215,7 +215,7 @@ namespace integral_detail
        #pragma unroll
        for (int i = 1; i < 32; i *= 2)
        {
-            const int n = shfl_up(sum, i, 32);
+            const int n = compatible_shfl_up(sum, i, 32);

            if (lane_id >= i)
            {
@@ -245,9 +245,9 @@
        int warp_sum = sums[lane_id];

        #pragma unroll
-        for (int i = 1; i <= 32; i *= 2)
+        for (int i = 1; i < 32; i *= 2)
        {
-            const int n = shfl_up(warp_sum, i, 32);
+            const int n = compatible_shfl_up(warp_sum, i, 32);

            if (lane_id >= i)
                warp_sum += n;
@@ -453,7 +453,7 @@ namespace integral_detail

        for (int i = 1; i <= 8; i *= 2)
        {
-            T n = shfl_up(partial_sum, i, 32);
+            T n = compatible_shfl_up(partial_sum, i, 32);

            if (lane_id >= i)
                partial_sum += n;
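compatible_shfl_up is introduced elsewhere in this commit; its definition is not in the hunks shown here. Judging from its use above, where it keeps the old shfl_up(value, delta, width) call shape, a compatibility wrapper along the following lines would fit. This is a hypothetical sketch, not the commit's actual definition:

// Hypothetical sketch of a shuffle-up compatibility wrapper. The real
// compatible_shfl_up lives in another file of this commit; this version only
// illustrates the idea: forward to the CUDA 9+ *_sync intrinsic with a
// full-warp mask, or to the legacy intrinsic on older toolkits.
template <typename T>
__device__ __forceinline__ T compatible_shfl_up_sketch(T val, unsigned int delta, int width = 32)
{
#if __CUDACC_VER_MAJOR__ >= 9
    return __shfl_up_sync(0xFFFFFFFFU, val, delta, width);
#else
    return __shfl_up(val, delta, width);
#endif
}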

modules/cudev/include/opencv2/cudev/warp/scan.hpp

Lines changed: 39 additions & 10 deletions
@@ -55,6 +55,36 @@ namespace cv { namespace cudev {
 //! @addtogroup cudev
 //! @{

+#if __CUDACC_VER_MAJOR__ >= 9
+
+// Starting from CUDA 9.0, support for Fermi is dropped.
+// So CV_CUDEV_ARCH >= 300 is implied.
+
+template <typename T>
+__device__ T warpScanInclusive(uint mask, T data)
+{
+    const uint laneId = Warp::laneId();
+
+    // scan on shufl functions
+#pragma unroll
+    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
+    {
+        const T val = shfl_up_sync(mask, data, i);
+        if (laneId >= i)
+            data += val;
+    }
+
+    return data;
+}
+
+template <typename T>
+__device__ __forceinline__ T warpScanExclusive(uint mask, T data)
+{
+    return warpScanInclusive(mask, data) - data;
+}
+
+#else // __CUDACC_VER_MAJOR__ >= 9
+
 template <typename T>
 __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
 {
@@ -75,19 +105,16 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)

    return data;
 #else
-    uint pos = 2 * tid - (tid & (WARP_SIZE - 1));
-    smem[pos] = 0;
+    const uint laneId = Warp::laneId();

-    pos += WARP_SIZE;
-    smem[pos] = data;
+    smem[tid] = data;

-    smem[pos] += smem[pos - 1];
-    smem[pos] += smem[pos - 2];
-    smem[pos] += smem[pos - 4];
-    smem[pos] += smem[pos - 8];
-    smem[pos] += smem[pos - 16];
+#pragma unroll
+    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
+        if (laneId >= i)
+            smem[tid] += smem[tid - i];

-    return smem[pos];
+    return smem[tid];
 #endif
 }

@@ -97,6 +124,8 @@ __device__ __forceinline__ T warpScanExclusive(T data, volatile T* smem, uint ti
    return warpScanInclusive(data, smem, tid) - data;
 }

+#endif // __CUDACC_VER_MAJOR__ >= 9
+
 //! @}

 }}
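A short usage sketch of the new mask-taking overload on the CUDA 9+ path, assuming a launch in which every warp of the block is fully populated (the kernel name is illustrative, not part of the commit):

#include "opencv2/cudev.hpp"

using namespace cv::cudev;

// Hypothetical kernel: each fully populated warp computes an inclusive prefix
// sum of its 32 values. With every lane active, the full mask is the right
// argument; a partially filled tail warp would instead pass
// (1U << residual) - 1, as blockScanInclusive does on pre-Volta targets.
__global__ void warpScanDemo(const int* src, int* dst)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = warpScanInclusive(0xFFFFFFFFU, src[idx]);
}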
