48
48
49
49
#include "../common.hpp"
50
50
#include "../warp/scan.hpp"
51
#include "../warp/warp.hpp"
51
52
52
53
namespace cv { namespace cudev {
53
54
54
55
//! @addtogroup cudev
55
56
//! @{
56
57
58
#if __CUDACC_VER_MAJOR__ >= 9

// Usage Note
// - THREADS_NUM should be equal to the number of threads in this block.
// - smem must be able to contain at least n elements of type T, where n is equal to the number
//   of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
//
// Dev Note
// - Starting from CUDA 9.0, support for Fermi is dropped. So CV_CUDEV_ARCH >= 300 is implied.
// - "For Pascal and earlier architectures (CV_CUDEV_ARCH < 700), all threads in mask must execute
//   the same warp intrinsic instruction in convergence, and the union of all values in mask must
//   be equal to the warp's active mask."
//   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#independent-thread-scheduling-7-x)
// - Above restriction does not apply starting from Volta (CV_CUDEV_ARCH >= 700). We just need to
//   take care so that "all non-exited threads named in mask must execute the same intrinsic with
//   the same mask."
//   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#warp-description)
76
// Inclusive block-wide prefix sum using the CUDA 9+ *_sync warp intrinsics.
//
// Preconditions (see Usage Note above):
// - THREADS_NUM equals the number of threads in the launching block.
// - smem holds at least divUp(THREADS_NUM, WARP_SIZE) elements of T.
// - tid is the flat thread index within the block.
//
// Returns, for each thread, the sum of `data` over all threads with index <= tid.
// Contains __syncthreads() — must be reached by every thread of the block.
template <int THREADS_NUM, typename T>
__device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
{
    // Number of threads in the (possibly partial) last warp; 0 when THREADS_NUM
    // is a multiple of WARP_SIZE.
    const int residual = THREADS_NUM & (WARP_SIZE - 1);

#if CV_CUDEV_ARCH < 700
    // Pre-Volta: the mask passed to a warp intrinsic must equal the warp's
    // active mask, so a partial last warp needs this narrower mask.
    const uint residual_mask = (1U << residual) - 1;
#endif

    if (THREADS_NUM > WARP_SIZE)
    {
        // bottom-level inclusive warp scan
    #if CV_CUDEV_ARCH >= 700
        // Volta+: a full mask is valid as long as all non-exited threads named
        // in the mask execute the same intrinsic with the same mask.
        T warpResult = warpScanInclusive(0xFFFFFFFFU, data);
    #else
        T warpResult;

        if (0 == residual)
            warpResult = warpScanInclusive(0xFFFFFFFFU, data);
        else
        {
            const int n_warps = divUp(THREADS_NUM, WARP_SIZE);
            const int warp_num = Warp::warpId();

            if (warp_num < n_warps - 1)
                warpResult = warpScanInclusive(0xFFFFFFFFU, data);
            else
            {
                // We are at the last threads of a block whose number of threads
                // is not a multiple of the warp size
                warpResult = warpScanInclusive(residual_mask, data);
            }
        }
    #endif

        __syncthreads();

        // save top elements of each warp for exclusive warp scan
        // sync to wait for warp scans to complete (because smem is being overwritten)
        if ((tid & (WARP_SIZE - 1)) == (WARP_SIZE - 1))
        {
            smem[tid >> LOG_WARP_SIZE] = warpResult;
        }

        __syncthreads();

        int quot = THREADS_NUM / WARP_SIZE;

        if (tid < quot)
        {
            // grab top warp elements
            T val = smem[tid];

            // 1LLU keeps the shift well-defined when quot == 32; truncating the
            // 64-bit result to uint then yields the full mask 0xFFFFFFFF.
            uint mask = (1LLU << quot) - 1;

            if (0 == residual)
            {
                // calculate exclusive scan and write back to shared memory
                smem[tid] = warpScanExclusive(mask, val);
            }
            else
            {
                // calculate inclusive scan and write back to shared memory with offset 1,
                // so each warp (including the partial last one) reads its exclusive prefix
                smem[tid + 1] = warpScanInclusive(mask, val);

                if (tid == 0)
                    smem[0] = 0;
            }
        }

        __syncthreads();

        // return updated warp scans
        return warpResult + smem[tid >> LOG_WARP_SIZE];
    }
    else
    {
        // Single-warp block: one warp scan suffices, no shared memory needed.
    #if CV_CUDEV_ARCH >= 700
        return warpScanInclusive(0xFFFFFFFFU, data);
    #else
        if (THREADS_NUM == WARP_SIZE)
            return warpScanInclusive(0xFFFFFFFFU, data);
        else
            return warpScanInclusive(residual_mask, data);
    #endif
    }
}
163
+
164
// Exclusive block-wide prefix sum (CUDA 9+ path).
//
// Same preconditions as blockScanInclusive: THREADS_NUM equals the block size,
// smem holds at least divUp(THREADS_NUM, WARP_SIZE) elements of T, tid is the
// flat thread index. Contains block-wide barriers via blockScanInclusive.
//
// Returns, for each thread, the sum of `data` over all threads with index < tid.
template <int THREADS_NUM, typename T>
__device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint tid)
{
    // Exclusive result = inclusive result minus this thread's own contribution.
    return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
}
169
+
170
#else // __CUDACC_VER_MAJOR__ >= 9

// Usage Note
// - THREADS_NUM should be equal to the number of threads in this block.
// - (>= Kepler) smem must be able to contain at least n elements of type T, where n is equal to the number
//   of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
// - (Fermi) smem must be able to contain at least n elements of type T, where n is equal to the number
//   of threads in this block (= THREADS_NUM).
57
179
template <int THREADS_NUM, typename T>
58
180
__device__ T blockScanInclusive (T data, volatile T* smem, uint tid)
59
181
{
@@ -73,18 +195,31 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
73
195
74
196
__syncthreads ();
75
197
76
- if (tid < (THREADS_NUM / WARP_SIZE))
198
+ int quot = THREADS_NUM / WARP_SIZE;
199
+
200
+ if (tid < quot)
77
201
{
78
202
// grab top warp elements
79
203
T val = smem[tid];
80
204
81
- // calculate exclusive scan and write back to shared memory
82
- smem[tid] = warpScanExclusive (val, smem, tid);
205
+ if (0 == (THREADS_NUM & (WARP_SIZE - 1 )))
206
+ {
207
+ // calculate exclusive scan and write back to shared memory
208
+ smem[tid] = warpScanExclusive (val, smem, tid);
209
+ }
210
+ else
211
+ {
212
+ // calculate inclusive scan and write back to shared memory with offset 1
213
+ smem[tid + 1 ] = warpScanInclusive (val, smem, tid);
214
+
215
+ if (tid == 0 )
216
+ smem[0 ] = 0 ;
217
+ }
83
218
}
84
219
85
220
__syncthreads ();
86
221
87
- // return updated warp scans with exclusive scan results
222
+ // return updated warp scans
88
223
return warpResult + smem[tid >> LOG_WARP_SIZE];
89
224
}
90
225
else
@@ -99,6 +234,8 @@ __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint t
99
234
return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
100
235
}
101
236
237
#endif // __CUDACC_VER_MAJOR__ >= 9
238
+
102
239
//! @}
103
240
104
241
}}