
Commit 1ed3dd2

Merge pull request #3744 from asmorkalov:as/variadic_tuple
Added CUDA 12.4+ support #3744

Tries to fix #3690 for CUDA 12.4+. Related patch to the main repo: opencv/opencv#25658

Changes:

- Added branches to support the new variadic implementation of thrust::tuple.
- Added a branch that uses std::array instead of std::tuple in split-merge and grid operations. The new branch removes the namespace clash between cv::cuda in OpenCV and ::cuda in the CUDA standard library (injected by Thrust).

The old tuple branches are kept for compatibility with existing code and CUDA versions before 12.4.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
1 parent d131e7a commit 1ed3dd2
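
At its core the patch does two things: it replaces the fixed ten-parameter tuple overloads with variadic templates (the thrust::tuple shipped with CUDA 12.4 is itself variadic), and it passes split/merge sources as a std::array so that Thrust's tuple, and the ::cuda namespace its headers inject, are not needed at those call sites. A minimal standalone sketch of both ideas, using hypothetical helper names rather than the cudev API:

```cpp
// Hypothetical illustration of the two approaches in this PR; not the actual cudev code.
#include <array>
#include <cstddef>
#include <cstdio>
#include <tuple>

// Pre-CUDA-12.4 style: a separate overload for every arity (only arity 2 shown).
template <typename P0, typename P1>
constexpr std::size_t arityFixed(const std::tuple<P0, P1>&) { return 2; }

// CUDA 12.4+ style: one variadic overload handles any arity of the new variadic tuple.
template <typename... P>
constexpr std::size_t arityVariadic(const std::tuple<P...>&) { return sizeof...(P); }

// std::array alternative: a plain aggregate of per-channel pointers; nothing from
// Thrust is needed, so the ::cuda namespace it injects never clashes with cv::cuda.
template <typename T, std::size_t N>
constexpr std::size_t arityArray(const std::array<T, N>&) { return N; }

int main()
{
    std::tuple<int, float> two{1, 2.0f};
    std::array<const float*, 3> planes{nullptr, nullptr, nullptr};
    std::printf("%zu %zu %zu\n", arityFixed(two), arityVariadic(two), arityArray(planes));
    return 0;
}
```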

File tree

12 files changed: +537 -49 lines changed


modules/cudaarithm/src/cuda/polar_cart.cu

Lines changed: 2 additions & 16 deletions
@@ -133,23 +133,9 @@ void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, Outpu
     GpuMat_<float> anglec(angle.reshape(1));
 
     if (angleInDegrees)
-    {
-        gridTransformTuple(zipPtr(xc, yc),
-                           tie(magc, anglec),
-                           make_tuple(
-                               binaryTupleAdapter<0, 1>(magnitude_func<float>()),
-                               binaryTupleAdapter<0, 1>(direction_func<float, true>())),
-                           stream);
-    }
+        gridTransformBinary(xc, yc, magc, anglec, magnitude_func<float>(), direction_func<float, true>(), stream);
     else
-    {
-        gridTransformTuple(zipPtr(xc, yc),
-                           tie(magc, anglec),
-                           make_tuple(
-                               binaryTupleAdapter<0, 1>(magnitude_func<float>()),
-                               binaryTupleAdapter<0, 1>(direction_func<float, false>())),
-                           stream);
-    }
+        gridTransformBinary(xc, yc, magc, anglec, magnitude_func<float>(), direction_func<float, false>(), stream);
 
     syncOutput(mag, _mag, stream);
     syncOutput(angle, _angle, stream);
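
The replaced branches zipped the inputs and outputs into tuples and applied tuple-adapted functors; the new gridTransformBinary call takes the two inputs, the two outputs, and the two functors directly. A rough CPU analogue of such a fused two-output transform, with hypothetical names (transformBinary2, Magnitude, Direction) that are not part of OpenCV:

```cpp
// Rough CPU analogue of a fused binary transform with two outputs; the names
// transformBinary2, Magnitude and Direction are illustrative, not OpenCV APIs.
#include <cmath>
#include <cstddef>
#include <vector>

struct Magnitude { float operator()(float x, float y) const { return std::hypot(x, y); } };
struct Direction { float operator()(float x, float y) const { return std::atan2(y, x); } };

template <class Op1, class Op2>
void transformBinary2(const std::vector<float>& x, const std::vector<float>& y,
                      std::vector<float>& out1, std::vector<float>& out2,
                      Op1 op1, Op2 op2)
{
    for (std::size_t i = 0; i < x.size(); ++i)
    {
        out1[i] = op1(x[i], y[i]);  // e.g. magnitude
        out2[i] = op2(x[i], y[i]);  // e.g. angle
    }
}
```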

modules/cudaarithm/src/cuda/split_merge.cu

Lines changed: 6 additions & 3 deletions
@@ -67,7 +67,8 @@ namespace
     {
         static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
         {
-            gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1])),
+            const std::array<GlobPtrSz<T>, 2> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1])};
+            gridMerge(d_src,
                       globPtr<typename MakeVec<T, 2>::type>(dst),
                       stream);
         }
@@ -77,7 +78,8 @@ namespace
     {
         static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
         {
-            gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])),
+            const std::array<GlobPtrSz<T>, 3> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])};
+            gridMerge(d_src,
                       globPtr<typename MakeVec<T, 3>::type>(dst),
                       stream);
         }
@@ -87,7 +89,8 @@ namespace
     {
         static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
         {
-            gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])),
+            const std::array<GlobPtrSz<T>, 4 > d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])};
+            gridMerge(d_src,
                       globPtr<typename MakeVec<T, 4>::type>(dst),
                       stream);
         }
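
Only the way the sources are handed to gridMerge changes here: a std::array of GlobPtrSz<T> replaces the zipPtr tuple, while the merge semantics stay the same. As a reminder of what that operation does, a simplified CPU sketch of interleaving N planar channels (assumed semantics, not the cudev kernel):

```cpp
// Simplified CPU sketch of merging N planar channels into one interleaved buffer.
// The semantics are assumed from the call sites above; this is not the cudev kernel.
#include <array>
#include <cstddef>
#include <vector>

template <typename T, std::size_t N>
std::vector<T> mergeChannels(const std::array<const T*, N>& src, std::size_t pixels)
{
    std::vector<T> dst(pixels * N);
    for (std::size_t p = 0; p < pixels; ++p)
        for (std::size_t c = 0; c < N; ++c)
            dst[p * N + c] = src[c][p];  // channel c of pixel p
    return dst;
}
```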

modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp

Lines changed: 46 additions & 9 deletions
@@ -154,6 +154,17 @@ namespace block_reduce_detail
         val = smem[tid];
     }
 
+
+    // merge
+
+    template <typename T, class Op>
+    __device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op)
+    {
+        T reg = smem[tid + delta];
+        smem[tid] = val = op(val, reg);
+    }
+
+#if (CUDART_VERSION < 12040)
     template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
               typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
     __device__ __forceinline__ void loadToSmem(const tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
@@ -172,15 +183,6 @@ namespace block_reduce_detail
         For<0, tuple_size<tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
     }
 
-    // merge
-
-    template <typename T, class Op>
-    __device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op)
-    {
-        T reg = smem[tid + delta];
-        smem[tid] = val = op(val, reg);
-    }
-
     template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
               typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
               class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
@@ -214,6 +216,41 @@ namespace block_reduce_detail
     }
 #endif
 
+#else
+    template <typename... P, typename... R>
+    __device__ __forceinline__ void loadToSmem(const tuple<P...>& smem, const tuple<R...>& val, uint tid)
+    {
+        For<0, tuple_size<tuple<P...> >::value>::loadToSmem(smem, val, tid);
+    }
+
+    template <typename... P, typename... R>
+    __device__ __forceinline__ void loadFromSmem(const tuple<P...>& smem, const tuple<R...>& val, uint tid)
+    {
+        For<0, tuple_size<tuple<P...> >::value>::loadFromSmem(smem, val, tid);
+    }
+
+    template <typename... P, typename... R, class... Op>
+    __device__ __forceinline__ void merge(const tuple<P...>& smem, const tuple<R...>& val, uint tid, uint delta, const tuple<Op...>& op)
+    {
+        For<0, tuple_size<tuple<P...> >::value>::merge(smem, val, tid, delta, op);
+    }
+
+    // mergeShfl
+
+    template <typename T, class Op>
+    __device__ __forceinline__ void mergeShfl(T& val, uint delta, uint width, const Op& op)
+    {
+        T reg = shfl_down(val, delta, width);
+        val = op(val, reg);
+    }
+
+    template <typename... R, class... Op>
+    __device__ __forceinline__ void mergeShfl(const tuple<R...>& val, uint delta, uint width, const tuple<Op...>& op)
+    {
+        For<0, tuple_size<tuple<R...> >::value>::mergeShfl(val, delta, width, op);
+    }
+#endif
+
     // Generic
 
     template <int N> struct Generic
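
The merge helper that moved above the new #if guard is the inner step of a shared-memory tree reduction: each active thread folds smem[tid + delta] into smem[tid]. A minimal standalone CUDA kernel built around that same step, independent of the cudev Dispatcher machinery (illustrative only; assumes a single block with a power-of-two blockDim.x):

```cpp
// Minimal standalone block-sum reduction using the same merge step as above:
// smem[tid] = val = op(val, smem[tid + delta]). Illustrative, not the cudev code.
#include <cuda_runtime.h>

__global__ void blockSum(const float* in, float* out, int n)
{
    extern __shared__ float smem[];              // launched with blockDim.x * sizeof(float)
    const unsigned int tid = threadIdx.x;
    float val = (tid < n) ? in[tid] : 0.0f;      // single-block example
    smem[tid] = val;
    __syncthreads();

    for (unsigned int delta = blockDim.x / 2; delta > 0; delta /= 2)
    {
        if (tid < delta)
        {
            float reg = smem[tid + delta];       // the "merge" step
            smem[tid] = val = val + reg;
        }
        __syncthreads();
    }

    if (tid == 0)
        out[blockIdx.x] = val;
}
```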

modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp

Lines changed: 62 additions & 0 deletions
@@ -160,6 +160,7 @@ namespace block_reduce_key_val_detail
         data = smem[tid];
     }
 
+#if (CUDART_VERSION < 12040)
     template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
               typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
     __device__ __forceinline__ void loadToSmem(const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
@@ -241,6 +242,67 @@ namespace block_reduce_key_val_detail
     {
         For<0, tuple_size<tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
     }
+#else
+    template <typename... VP, typename... VR>
+    __device__ __forceinline__ void loadToSmem(const tuple<VP...>& smem, const tuple<VR...>& data, uint tid)
+    {
+        For<0, tuple_size<tuple<VP...> >::value>::loadToSmem(smem, data, tid);
+    }
+
+    template <typename... VP, typename... VR>
+    __device__ __forceinline__ void loadFromSmem(const tuple<VP...>& smem, const tuple<VR...>& data, uint tid)
+    {
+        For<0, tuple_size<tuple<VP...> >::value>::loadFromSmem(smem, data, tid);
+    }
+
+    // copyVals
+
+    template <typename V>
+    __device__ __forceinline__ void copyVals(volatile V* svals, V& val, uint tid, uint delta)
+    {
+        svals[tid] = val = svals[tid + delta];
+    }
+
+    template <typename... VP, typename... VR>
+    __device__ __forceinline__ void copyVals(const tuple<VP...>& svals, const tuple<VR...>& val, uint tid, uint delta)
+    {
+        For<0, tuple_size<tuple<VP...> >::value>::copy(svals, val, tid, delta);
+    }
+
+    // merge
+
+    template <typename K, typename V, class Cmp>
+    __device__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, uint tid, uint delta)
+    {
+        K reg = skeys[tid + delta];
+
+        if (cmp(reg, key))
+        {
+            skeys[tid] = key = reg;
+            copyVals(svals, val, tid, delta);
+        }
+    }
+
+    template <typename K, typename... VP, typename... VR, class Cmp>
+    __device__ void merge(volatile K* skeys, K& key, const tuple<VP...>& svals, const tuple<VR...>& val, const Cmp& cmp, uint tid, uint delta)
+    {
+        K reg = skeys[tid + delta];
+
+        if (cmp(reg, key))
+        {
+            skeys[tid] = key = reg;
+            copyVals(svals, val, tid, delta);
+        }
+    }
+
+    template <typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
+    __device__ __forceinline__ void merge(const tuple<KP...>& skeys, const tuple<KR...>& key,
+                                          const tuple<VP...>& svals, const tuple<VR...>& val,
+                                          const tuple<Cmp...>& cmp, uint tid, uint delta)
+    {
+        For<0, tuple_size<tuple<VP...> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
+    }
+#endif
 
     // Generic
 
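
The key/value merge keeps whichever key the comparator prefers and copies the associated value along with it, which is the building block of argmin/argmax-style block reductions. A host-side sketch of that single step, with assumed semantics mirroring the device code above:

```cpp
// Host-side sketch of one key/value merge step, mirroring the device code above:
// if cmp(other, mine) holds, adopt the other key and copy its value. Assumed
// semantics for illustration; not the cudev implementation.
template <typename K, typename V, class Cmp>
void mergeKeyVal(K* skeys, K& key, V* svals, V& val,
                 unsigned int tid, unsigned int delta, const Cmp& cmp)
{
    K reg = skeys[tid + delta];
    if (cmp(reg, key))                           // e.g. a less-than comparator for argmin
    {
        skeys[tid] = key = reg;
        svals[tid] = val = svals[tid + delta];   // copyVals for a single value
    }
}
```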

modules/cudev/include/opencv2/cudev/block/reduce.hpp

Lines changed: 35 additions & 0 deletions
@@ -51,6 +51,7 @@
 #include "../warp/reduce.hpp"
 #include "detail/reduce.hpp"
 #include "detail/reduce_key_val.hpp"
+#include <cuda_runtime_api.h>
 
 namespace cv { namespace cudev {
 
@@ -65,6 +66,7 @@ __device__ __forceinline__ void blockReduce(volatile T* smem, T& val, uint tid,
     block_reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
 }
 
+#if (CUDART_VERSION < 12040)
 template <int N,
           typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
           typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
@@ -126,6 +128,39 @@ __device__ __forceinline__ void blockReduceKeyVal(const tuple<KP0, KP1, KP2, KP3
                      >(skeys, key, svals, val, tid, cmp);
 }
 
+#else
+
+template <int N, typename... P, typename... R, typename... Op>
+__device__ __forceinline__ void blockReduce(const tuple<P...>& smem,
+                                            const tuple<R...>& val,
+                                            uint tid,
+                                            const tuple<Op...>& op)
+{
+    block_reduce_detail::Dispatcher<N>::reductor::template reduce<const tuple<P...>&, const tuple<R...>&, const tuple<Op...>&>(smem, val, tid, op);
+}
+
+// blockReduceKeyVal
+
+template <int N, typename K, typename V, class Cmp>
+__device__ __forceinline__ void blockReduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, uint tid, const Cmp& cmp)
+{
+    block_reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
+}
+
+template <int N, typename K, typename... VP, typename... VR, class Cmp>
+__device__ __forceinline__ void blockReduceKeyVal(volatile K* skeys, K& key, const tuple<VP...>& svals, const tuple<VR...>& val, uint tid, const Cmp& cmp)
+{
+    block_reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, const tuple<VP...>&, const tuple<VR...>&, const Cmp&>(skeys, key, svals, val, tid, cmp);
+}
+
+template <int N, typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
+__device__ __forceinline__ void blockReduceKeyVal(const tuple<KP...>& skeys, const tuple<KR...>& key, const tuple<VP...>& svals, const tuple<VR...>& val, uint tid, const tuple<Cmp...>& cmp)
+{
+    block_reduce_key_val_detail::Dispatcher<N>::reductor::template reduce< const tuple<KP...>&, const tuple<KR...>&, const tuple<VP...>&, const tuple<VR...>&, const tuple<Cmp...>&>(skeys, key, svals, val, tid, cmp);
+}
+
+#endif
+
 //! @}
 
 }}

modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp

Lines changed: 22 additions & 3 deletions
@@ -157,28 +157,47 @@ namespace grid_split_merge_detail
     template <class Policy> struct MergeImpl<2, Policy>
     {
         template <class SrcPtrTuple, typename DstType, class MaskPtr>
-        __host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
         {
             mergeC2<Policy>(get<0>(src), get<1>(src), dst, mask, rows, cols, stream);
         }
+
+        template <class SrcPtrArray, typename DstType, class MaskPtr>
+        __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        {
+            mergeC2<Policy>(src[0], src[1], dst, mask, rows, cols, stream);
+        }
+
     };
 
     template <class Policy> struct MergeImpl<3, Policy>
     {
         template <class SrcPtrTuple, typename DstType, class MaskPtr>
-        __host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
        {
             mergeC3<Policy>(get<0>(src), get<1>(src), get<2>(src), dst, mask, rows, cols, stream);
         }
+
+        template <class SrcPtrArray, typename DstType, class MaskPtr>
+        __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        {
+            mergeC3<Policy>(src[0], src[1], src[2], dst, mask, rows, cols, stream);
+        }
     };
 
     template <class Policy> struct MergeImpl<4, Policy>
     {
         template <class SrcPtrTuple, typename DstType, class MaskPtr>
-        __host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
         {
             mergeC4<Policy>(get<0>(src), get<1>(src), get<2>(src), get<3>(src), dst, mask, rows, cols, stream);
         }
+
+        template <class SrcPtrArray, typename DstType, class MaskPtr>
+        __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        {
+            mergeC4<Policy>(src[0], src[1], src[2], src[3], dst, mask, rows, cols, stream);
+        }
     };
 
     // split
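
Splitting merge into mergeTuple and mergeArray lets callers use the accessor that matches the source container: get<I>(src) for a tuple, src[i] for a std::array. A tiny sketch of the same dispatch idea, with hypothetical demo functions that are not part of the cudev API:

```cpp
// Tiny sketch of selecting the accessor by source container type, in the spirit
// of mergeTuple / mergeArray above. Names and semantics are illustrative only.
#include <array>
#include <cstddef>
#include <cstdio>
#include <tuple>

template <class SrcTuple>
void mergeTupleDemo(const SrcTuple& src)
{
    std::printf("tuple sources: %d %d\n", std::get<0>(src), std::get<1>(src));
}

template <typename T, std::size_t N>
void mergeArrayDemo(const std::array<T, N>& src)
{
    std::printf("array sources: %d %d\n", src[0], src[1]);
}

int main()
{
    mergeTupleDemo(std::make_tuple(1, 2));
    mergeArrayDemo(std::array<int, 2>{3, 4});
    return 0;
}
```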
