diff --git a/SYCL/Reduction/reduction_big_data.cpp b/SYCL/Reduction/reduction_big_data.cpp index 7146975af5..ff07266b7e 100644 --- a/SYCL/Reduction/reduction_big_data.cpp +++ b/SYCL/Reduction/reduction_big_data.cpp @@ -49,8 +49,9 @@ int test(queue &Q, T Identity) { // Initialize. BinaryOperation BOp; - T CorrectOut; - initInputData(InBuf, CorrectOut, BOp, NWorkItems); + std::optional CorrectOutOpt; + initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems); + T CorrectOut = *CorrectOutOpt; // Compute. Q.submit([&](handler &CGH) { diff --git a/SYCL/Reduction/reduction_nd_N_vars.cpp b/SYCL/Reduction/reduction_nd_N_vars.cpp index 5b72d6ce34..0b8e04409f 100644 --- a/SYCL/Reduction/reduction_nd_N_vars.cpp +++ b/SYCL/Reduction/reduction_nd_N_vars.cpp @@ -48,7 +48,9 @@ struct Red { } void init() { - initInputData(InBuf, CorrectOut, BOp, NWorkItems); + std::optional CorrectOutOpt; + initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems); + CorrectOut = *CorrectOutOpt; if (!PropList.template has_property< property::reduction::initialize_to_identity>()) CorrectOut = BOp(CorrectOut, InitVal); diff --git a/SYCL/Reduction/reduction_nd_reducer_skip.cpp b/SYCL/Reduction/reduction_nd_reducer_skip.cpp new file mode 100644 index 0000000000..cf9a61d2ea --- /dev/null +++ b/SYCL/Reduction/reduction_nd_reducer_skip.cpp @@ -0,0 +1,66 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Group algorithms are not supported on Nvidia. +// XFAIL: hip_nvidia + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reductions initialized with a one element buffer. Additionally, some +// reducers will not be written to. 
+ +#include "reduction_utils.hpp" + +using namespace sycl; + +int NumErrors = 0; + +template class SkipEvenName; +template class SkipOddName; +template class SkipAllName; + +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize, + size_t NWItems) { + nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); + NumErrors += test, T>(Q, Identity, Init, BOp, NDRange, + property_list{}, SkipEvenOp{}); + NumErrors += test, T>(Q, Identity, Init, BOp, NDRange, + property_list{}, SkipOddOp{}); + NumErrors += test, T>(Q, Identity, Init, BOp, NDRange, + property_list{}, SkipAllOp{}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + + // Check some non power-of-two work-group sizes. + tests(Q, 0, 99, std::plus{}, 1, 7); + tests(Q, 0, 99, std::plus{}, 49, 49 * 5); + + // Try some power-of-two work-group sizes. + tests(Q, 0, 99, std::plus<>{}, 1, 32); + tests(Q, 1, 99, std::multiplies<>{}, 4, 32); + tests(Q, 0, 99, std::bit_or<>{}, 8, 128); + tests(Q, 0, 99, std::bit_xor<>{}, 16, 256); + tests(Q, ~0, 99, std::bit_and<>{}, 32, 256); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, 64, 256); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, 128, 256); + tests(Q, 0, 99, std::plus<>{}, 256, 256); + + // Check with various types. + tests(Q, 1, 99, std::multiplies<>{}, 8, 24); + tests(Q, 0x7fff, -99, ext::oneapi::minimum<>{}, 8, 256); + tests(Q, 0, 99, ext::oneapi::maximum<>{}, 8, 256); + + // Check with CUSTOM type. 
+ using CV = CustomVec; + tests(Q, CV(0), CV(99), CustomVecPlus{}, 8, 256); + + printFinalStatus(NumErrors); + return NumErrors; +} diff --git a/SYCL/Reduction/reduction_range_1d_reducer_skip.cpp b/SYCL/Reduction/reduction_range_1d_reducer_skip.cpp new file mode 100644 index 0000000000..12258afbd4 --- /dev/null +++ b/SYCL/Reduction/reduction_range_1d_reducer_skip.cpp @@ -0,0 +1,65 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<1>, reduction, func) +// with reductions initialized with a one element buffer. Additionally, some +// reducers will not be written to. + +#include "reduction_utils.hpp" + +using namespace sycl; + +int NumErrors = 0; + +template class SkipEvenName; +template class SkipOddName; +template class SkipAllName; + +template +void tests(ArgTys &&...Args) { + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipEvenOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipOddOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipAllOp{}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + constexpr access::mode RW = access::mode::read_write; + // Fast-reduce and Fast-atomics. Try various range types/sizes. + tests(Q, 0, 99, std::plus{}, range<1>(1)); + tests(Q, 0, 99, std::plus<>{}, range<1>(2)); + tests(Q, 0, 99, std::plus<>{}, range<1>(7)); + tests(Q, 0, 99, std::plus<>{}, range<1>(64)); + tests(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2)); + tests(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2 + 5)); + + // Check with CUSTOM type. 
+ tests>(Q, 0, 99, CustomVecPlus{}, + range<1>(256)); + tests>(Q, 0, 99, CustomVecPlus{}, + range<1>(MaxWGSize * 3)); + tests>(Q, 99, CustomVecPlus{}, + range<1>(72)); + + // Check with identityless operations. + tests(Q, 99, PlusWithoutIdentity{}, range<1>(1)); + tests(Q, 99, PlusWithoutIdentity{}, range<1>(2)); + tests(Q, 99, PlusWithoutIdentity{}, range<1>(7)); + tests(Q, 99, PlusWithoutIdentity{}, range<1>(64)); + tests(Q, 99, PlusWithoutIdentity{}, + range<1>(MaxWGSize * 2)); + tests(Q, 99, PlusWithoutIdentity{}, + range<1>(MaxWGSize * 2 + 5)); + + printFinalStatus(NumErrors); + return NumErrors; +} diff --git a/SYCL/Reduction/reduction_range_2d_dw_reducer_skip.cpp b/SYCL/Reduction/reduction_range_2d_dw_reducer_skip.cpp new file mode 100644 index 0000000000..1598a90bcb --- /dev/null +++ b/SYCL/Reduction/reduction_range_2d_dw_reducer_skip.cpp @@ -0,0 +1,69 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<2>, reduction, func) +// with reductions initialized with a one element buffer. Additionally, some +// reducers will not be written to. 
+ +#include "reduction_utils.hpp" + +using namespace sycl; + +int NumErrors = 0; + +template class SkipEvenName; +template class SkipOddName; +template class SkipAllName; + +template +void tests(ArgTys &&...Args) { + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipEvenOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipOddOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipAllOp{}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + tests(Q, 0, 99, std::plus<>{}, range<2>{1, 1}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, 2}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, 3}); + tests(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1}); + tests(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2}); + tests(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7}); + tests(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3}); + + tests>(Q, 0, 99, CustomVecPlus{}, + range<2>{33, MaxWGSize}); + tests>(Q, 99, CustomVecPlus{}, + range<2>{33, MaxWGSize}); + + tests(Q, 99, PlusWithoutIdentity{}, range<2>{1, 1}); + tests(Q, 99, PlusWithoutIdentity{}, range<2>{2, 2}); + tests(Q, 99, PlusWithoutIdentity{}, range<2>{2, 3}); + tests(Q, 99, PlusWithoutIdentity{}, + range<2>{MaxWGSize, 1}); + tests(Q, 99, PlusWithoutIdentity{}, + range<2>{1, MaxWGSize}); + tests(Q, 99, PlusWithoutIdentity{}, + range<2>{2, MaxWGSize * 2}); + tests(Q, 99, PlusWithoutIdentity{}, + range<2>{MaxWGSize * 3, 7}); + tests(Q, 99, PlusWithoutIdentity{}, + range<2>{3, MaxWGSize * 3}); + + printFinalStatus(NumErrors); + return NumErrors; +} diff --git a/SYCL/Reduction/reduction_range_3d_rw_reducer_skip.cpp b/SYCL/Reduction/reduction_range_3d_rw_reducer_skip.cpp new file mode 100644 index 0000000000..089720b17e --- /dev/null +++ b/SYCL/Reduction/reduction_range_3d_rw_reducer_skip.cpp @@ -0,0 +1,82 @@ +// RUN: %clangxx -fsycl 
-fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<3>, reduction, func) +// with reductions initialized with a one element buffer. Additionally, some +// reducers will not be written to. + +#include "reduction_utils.hpp" + +using namespace sycl; + +int NumErrors = 0; + +template class SkipEvenName; +template class SkipOddName; +template class SkipAllName; + +template +void tests(ArgTys &&...Args) { + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipEvenOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipOddOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipAllOp{}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + tests(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1}); + tests(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2}); + tests(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4}); + + tests(Q, 0, 99, std::plus<>{}, + range<3>{1, 1, MaxWGSize + 1}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{1, MaxWGSize + 1, 1}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{MaxWGSize + 1, 1, 1}); + + tests(Q, 0, 99, std::plus<>{}, + range<3>{2, 5, MaxWGSize * 2}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{3, MaxWGSize * 3, 2}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{MaxWGSize * 3, 8, 4}); + + tests>(Q, 0, 99, CustomVecPlus{}, + range<3>{2, 33, MaxWGSize}); + tests>(Q, 99, CustomVecPlus{}, + range<3>{2, 33, MaxWGSize}); + + tests(Q, 99, PlusWithoutIdentity{}, range<3>{1, 1, 1}); + tests(Q, 99, PlusWithoutIdentity{}, range<3>{2, 2, 2}); + tests(Q, 99, PlusWithoutIdentity{}, range<3>{2, 3, 4}); + + tests(Q, 99, 
PlusWithoutIdentity{}, + range<3>{1, 1, MaxWGSize + 1}); + tests(Q, 99, PlusWithoutIdentity{}, + range<3>{1, MaxWGSize + 1, 1}); + tests(Q, 99, PlusWithoutIdentity{}, + range<3>{MaxWGSize + 1, 1, 1}); + + tests(Q, 99, PlusWithoutIdentity{}, + range<3>{2, 5, MaxWGSize * 2}); + tests(Q, 99, PlusWithoutIdentity{}, + range<3>{3, MaxWGSize * 3, 2}); + tests(Q, 99, PlusWithoutIdentity{}, + range<3>{MaxWGSize * 3, 8, 4}); + + printFinalStatus(NumErrors); + return NumErrors; +} diff --git a/SYCL/Reduction/reduction_range_N_vars.cpp b/SYCL/Reduction/reduction_range_N_vars.cpp index 2902272081..ed7e8d8b83 100644 --- a/SYCL/Reduction/reduction_range_N_vars.cpp +++ b/SYCL/Reduction/reduction_range_N_vars.cpp @@ -48,7 +48,9 @@ struct Red { } void init() { - initInputData(InBuf, CorrectOut, BOp, NWorkItems); + std::optional CorrectOutOpt; + initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems); + CorrectOut = *CorrectOutOpt; if (!PropList.template has_property< property::reduction::initialize_to_identity>()) CorrectOut = BOp(CorrectOut, InitVal); diff --git a/SYCL/Reduction/reduction_usm.cpp b/SYCL/Reduction/reduction_usm.cpp index 2a53361b13..2e6f25d273 100644 --- a/SYCL/Reduction/reduction_usm.cpp +++ b/SYCL/Reduction/reduction_usm.cpp @@ -38,12 +38,12 @@ int test(queue &Q, OptionalIdentity Identity, T Init, } // Initialize. - T CorrectOut; + std::optional CorrectOutOpt; BinaryOperation BOp; buffer InBuf(NWItems); - initInputData(InBuf, CorrectOut, BOp, NWItems); - CorrectOut = BOp(CorrectOut, Init); + initInputData(InBuf, CorrectOutOpt, BOp, NWItems); + T CorrectOut = BOp(*CorrectOutOpt, Init); // Compute. Q.submit([&](handler &CGH) { diff --git a/SYCL/Reduction/reduction_usm_dw.cpp b/SYCL/Reduction/reduction_usm_dw.cpp index 1460fac390..f94c564e23 100644 --- a/SYCL/Reduction/reduction_usm_dw.cpp +++ b/SYCL/Reduction/reduction_usm_dw.cpp @@ -39,11 +39,12 @@ int test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems, } // Initialize. 
- T CorrectOut; + std::optional CorrectOutOpt; BinaryOperation BOp; buffer InBuf(NWItems); - initInputData(InBuf, CorrectOut, BOp, NWItems); + initInputData(InBuf, CorrectOutOpt, BOp, NWItems); + T CorrectOut = *CorrectOutOpt; // Compute. Q.submit([&](handler &CGH) { diff --git a/SYCL/Reduction/reduction_utils.hpp b/SYCL/Reduction/reduction_utils.hpp index d5875fbbba..a326609553 100644 --- a/SYCL/Reduction/reduction_utils.hpp +++ b/SYCL/Reduction/reduction_utils.hpp @@ -4,11 +4,29 @@ using namespace sycl; +struct AllIdOp { + constexpr bool operator()(size_t Idx) const { return true; } +}; + +struct SkipAllOp { + constexpr bool operator()(size_t Idx) const { return false; } +}; + +struct SkipEvenOp { + constexpr bool operator()(size_t Idx) const { return Idx % 2; } +}; + +struct SkipOddOp { + constexpr bool operator()(size_t Idx) const { return (Idx + 1) % 2; } +}; + /// Initializes the buffer<1> \p 'InBuf' buffer with pseudo-random values, -/// computes the write the reduction value \p 'ExpectedOut'. -template -void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, - range<1> Range) { +/// computes the write the reduction value \p 'ExpectedOut'. Linearized IDs are +/// filtered in \p 'ExpectedOut' using \p 'IdFilterFunc'. +template +void initInputData(buffer &InBuf, std::optional &ExpectedOut, + BinaryOperation BOp, range<1> Range, + IdFilterFuncT IdFilterFunc = {}) { size_t N = Range.size(); assert(N != 0); auto In = InBuf.template get_access(); @@ -27,15 +45,18 @@ void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, In[I] = I; else In[I] = ((I + 1) % 5) + 1.1; - ExpectedOut = I == 0 ? In[I] : BOp(ExpectedOut, In[I]); + if (IdFilterFunc(I)) + ExpectedOut = ExpectedOut ? BOp(*ExpectedOut, In[I]) : In[I]; } }; /// Initializes the buffer<2> \p 'InBuf' buffer with pseudo-random values, -/// computes the write the reduction value \p 'ExpectedOut'. 
-template -void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, - range<2> Range) { +/// computes the write the reduction value \p 'ExpectedOut'. Linearized IDs are +/// filtered in \p 'ExpectedOut' using \p 'IdFilterFunc'. +template +void initInputData(buffer &InBuf, std::optional &ExpectedOut, + BinaryOperation BOp, range<2> Range, + IdFilterFuncT IdFilterFunc = {}) { assert(Range.size() != 0); auto In = InBuf.template get_access(); for (int J = 0; J < Range[0]; ++J) { @@ -54,16 +75,19 @@ void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, In[J][I] = I + J; else In[J][I] = ((I + 1 + J) % 5) + 1.1; - ExpectedOut = (I == 0 && J == 0) ? In[J][I] : BOp(ExpectedOut, In[J][I]); + if (IdFilterFunc(I + J * Range[1])) + ExpectedOut = ExpectedOut ? BOp(*ExpectedOut, In[J][I]) : In[J][I]; } } }; /// Initializes the buffer<3> \p 'InBuf' buffer with pseudo-random values, -/// computes the write the reduction value \p 'ExpectedOut'. -template -void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, - range<3> Range) { +/// computes the write the reduction value \p 'ExpectedOut'. Linearized IDs are +/// filtered in \p 'ExpectedOut' using \p 'IdFilterFunc'. +template +void initInputData(buffer &InBuf, std::optional &ExpectedOut, + BinaryOperation BOp, range<3> Range, + IdFilterFuncT IdFilterFunc = {}) { assert(Range.size() != 0); auto In = InBuf.template get_access(); for (int K = 0; K < Range[0]; ++K) { @@ -83,9 +107,9 @@ void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, In[K][J][I] = I + J + K; else In[K][J][I] = ((I + 1 + J + K * 3) % 5) + 1.1; - ExpectedOut = (I == 0 && J == 0 && K == 0) - ? In[K][J][I] - : BOp(ExpectedOut, In[K][J][I]); + if (IdFilterFunc(I + J * Range[2] + K * Range[1] * Range[2])) + ExpectedOut = + ExpectedOut ? 
BOp(*ExpectedOut, In[K][J][I]) : In[K][J][I]; } } } @@ -283,10 +307,10 @@ auto init_to_identity() { template typename RangeTy, int Dims, - typename PropListTy = property_list> + typename PropListTy = property_list, typename IdFilterFuncT = AllIdOp> int testInner(queue &Q, OptionalIdentity Identity, T Init, BinaryOperation BOp, const RangeTy &Range, - PropListTy PropList = {}) { + PropListTy PropList = {}, IdFilterFuncT IdFilterFunc = {}) { constexpr bool IsRange = std::is_same_v, RangeTy>; constexpr bool IsNDRange = std::is_same_v, RangeTy>; static_assert(IsRange || IsNDRange); @@ -326,11 +350,11 @@ int testInner(queue &Q, OptionalIdentity Identity, T Init, buffer OutBuf(1); // Initialize. - T CorrectOut; - initInputData(InBuf, CorrectOut, BOp, GlobalRange); + std::optional CorrectOut; + initInputData(InBuf, CorrectOut, BOp, GlobalRange, IdFilterFunc); if (!PropList.template has_property< property::reduction::initialize_to_identity>()) { - CorrectOut = BOp(CorrectOut, Init); + CorrectOut = CorrectOut ? BOp(*CorrectOut, Init) : Init; } // The value assigned here must be discarded (if IsReadWrite is true). @@ -352,42 +376,48 @@ int testInner(queue &Q, OptionalIdentity Identity, T Init, auto In = InBuf.template get_access(CGH); auto Redu = CreateReduction(); if constexpr (IsRange) - CGH.parallel_for( - Range, Redu, [=](id Id, auto &Sum) { Sum.combine(In[Id]); }); + CGH.parallel_for(Range, Redu, [=](item Id, auto &Sum) { + if (IdFilterFunc(Id.get_linear_id())) + Sum.combine(In[Id]); + }); else CGH.parallel_for(Range, Redu, [=](nd_item NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_id()]); + if (IdFilterFunc(NDIt.get_global_linear_id())) + Sum.combine(In[NDIt.get_global_linear_id()]); }); }); // Check correctness. 
  auto Out = OutBuf.template get_access();
   T ComputedOut = *(Out.get_pointer());
-  return checkResults(Q, BOp, Range, ComputedOut, CorrectOut);
+  return checkResults(Q, BOp, Range, ComputedOut, *CorrectOut);
 }
 
 template typename RangeTy, int Dims,
-          typename PropListTy = property_list>
+          typename PropListTy = property_list, typename IdFilterFuncT = AllIdOp>
 int test(queue &Q, T Identity, T Init, BinaryOperation BOp,
-         const RangeTy &Range, PropListTy PropList = {}) {
+         const RangeTy &Range, PropListTy PropList = {},
+         IdFilterFuncT IdFilterFunc = {}) {
-  return testInner(Q, OptionalIdentity(Identity), Init, BOp, Range, PropList);
+  return testInner(Q, OptionalIdentity(Identity), Init, BOp, Range, PropList,
+                   IdFilterFunc);
 }
 
 template typename RangeTy, int Dims,
-          typename PropListTy = property_list>
+          typename PropListTy = property_list, typename IdFilterFuncT = AllIdOp>
 int test(queue &Q, T Init, BinaryOperation BOp, const RangeTy &Range,
-         PropListTy PropList = {}) {
+         PropListTy PropList = {}, IdFilterFuncT IdFilterFunc = {}) {
-  return testInner(Q, OptionalIdentity(), Init, BOp, Range, PropList);
+  return testInner(Q, OptionalIdentity(), Init, BOp, Range, PropList,
+                   IdFilterFunc);
 }
 
 template
+          int Dims, typename PropListTy = property_list,
+          typename IdFilterFuncT = AllIdOp>
 int testUSMInner(queue &Q, OptionalIdentity Identity, T Init,
                  BinaryOperation BOp, const range &Range,
-                 usm::alloc AllocType, PropListTy PropList = {}) {
+                 usm::alloc AllocType, PropListTy PropList = {},
+                 IdFilterFuncT IdFilterFunc = {}) {
   printTestLabel(Range);
 
   auto Dev = Q.get_device();
@@ -434,12 +464,12 @@ int testUSMInner(queue &Q, OptionalIdentity Identity, T Init,
   }
 
   // Initialize.
-  T CorrectOut;
+  std::optional CorrectOut;
   buffer InBuf(Range);
-  initInputData(InBuf, CorrectOut, BOp, Range);
+  initInputData(InBuf, CorrectOut, BOp, Range, IdFilterFunc);
   if (!PropList.template has_property<
           property::reduction::initialize_to_identity>()) {
-    CorrectOut = BOp(CorrectOut, Init);
+    CorrectOut = CorrectOut ? BOp(*CorrectOut, Init) : Init;
   }
 
   // Compute.
@@ -457,7 +487,10 @@ int testUSMInner(queue &Q, OptionalIdentity Identity, T Init,
         auto In = InBuf.template get_access(CGH);
         auto Redu = CreateReduction();
         CGH.parallel_for>(
-            Range, Redu, [=](id Id, auto &Sum) { Sum.combine(In[Id]); });
+            Range, Redu, [=](item Id, auto &Sum) {
+              if (IdFilterFunc(Id.get_linear_id()))
+                Sum.combine(In[Id]);
+            });
       }).wait();
 
   // Check correctness.
@@ -476,24 +509,25 @@
   std::string AllocStr =
       "AllocMode=" + std::to_string(static_cast(AllocType));
-  int Error = checkResults(Q, BOp, Range, ComputedOut, CorrectOut, AllocStr);
+  int Error = checkResults(Q, BOp, Range, ComputedOut, *CorrectOut, AllocStr);
   free(ReduVarPtr, Q.get_context());
   return Error;
 }
 
 template
+          typename PropListTy = property_list, typename IdFilterFuncT = AllIdOp>
 int testUSM(queue &Q, T Identity, T Init, BinaryOperation BOp,
             const range &Range, usm::alloc AllocType,
-            PropListTy PropList = {}) {
+            PropListTy PropList = {}, IdFilterFuncT IdFilterFunc = {}) {
-  return testUSMInner(Q, OptionalIdentity(Identity), Init, BOp, Range,
-                      AllocType, PropList);
+  return testUSMInner(Q, OptionalIdentity(Identity), Init, BOp, Range,
+                      AllocType, PropList, IdFilterFunc);
 }
 
 template
+          typename PropListTy = property_list, typename IdFilterFuncT = AllIdOp>
 int testUSM(queue &Q, T Init, BinaryOperation BOp, const range &Range,
-            usm::alloc AllocType, PropListTy PropList = {}) {
+            usm::alloc AllocType, PropListTy PropList = {},
+            IdFilterFuncT IdFilterFunc = {}) {
-  return testUSMInner(Q, OptionalIdentity(), Init, BOp, Range, AllocType,
-                      PropList);
+  return testUSMInner(Q, OptionalIdentity(), Init, BOp, Range, AllocType,
+                      PropList, IdFilterFunc);
 }