diff --git a/SYCL/Reduction/reduction_big_data.cpp b/SYCL/Reduction/reduction_big_data.cpp index 7146975af5..ff07266b7e 100644 --- a/SYCL/Reduction/reduction_big_data.cpp +++ b/SYCL/Reduction/reduction_big_data.cpp @@ -49,8 +49,9 @@ int test(queue &Q, T Identity) { // Initialize. BinaryOperation BOp; - T CorrectOut; - initInputData(InBuf, CorrectOut, BOp, NWorkItems); + std::optional CorrectOutOpt; + initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems); + T CorrectOut = *CorrectOutOpt; // Compute. Q.submit([&](handler &CGH) { diff --git a/SYCL/Reduction/reduction_nd_N_vars.cpp b/SYCL/Reduction/reduction_nd_N_vars.cpp index 5b72d6ce34..0b8e04409f 100644 --- a/SYCL/Reduction/reduction_nd_N_vars.cpp +++ b/SYCL/Reduction/reduction_nd_N_vars.cpp @@ -48,7 +48,9 @@ struct Red { } void init() { - initInputData(InBuf, CorrectOut, BOp, NWorkItems); + std::optional CorrectOutOpt; + initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems); + CorrectOut = *CorrectOutOpt; if (!PropList.template has_property< property::reduction::initialize_to_identity>()) CorrectOut = BOp(CorrectOut, InitVal); diff --git a/SYCL/Reduction/reduction_nd_reducer_skip.cpp b/SYCL/Reduction/reduction_nd_reducer_skip.cpp new file mode 100644 index 0000000000..cf9a61d2ea --- /dev/null +++ b/SYCL/Reduction/reduction_nd_reducer_skip.cpp @@ -0,0 +1,66 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Group algorithms are not supported on Nvidia. +// XFAIL: hip_nvidia + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reductions initialized with a one element buffer. Additionally, some +// reducers will not be written to. 
+ +#include "reduction_utils.hpp" + +using namespace sycl; + +int NumErrors = 0; + +template class SkipEvenName; +template class SkipOddName; +template class SkipAllName; + +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize, + size_t NWItems) { + nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); + NumErrors += test, T>(Q, Identity, Init, BOp, NDRange, + property_list{}, SkipEvenOp{}); + NumErrors += test, T>(Q, Identity, Init, BOp, NDRange, + property_list{}, SkipOddOp{}); + NumErrors += test, T>(Q, Identity, Init, BOp, NDRange, + property_list{}, SkipAllOp{}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + + // Check some non power-of-two work-group sizes. + tests(Q, 0, 99, std::plus{}, 1, 7); + tests(Q, 0, 99, std::plus{}, 49, 49 * 5); + + // Try some power-of-two work-group sizes. + tests(Q, 0, 99, std::plus<>{}, 1, 32); + tests(Q, 1, 99, std::multiplies<>{}, 4, 32); + tests(Q, 0, 99, std::bit_or<>{}, 8, 128); + tests(Q, 0, 99, std::bit_xor<>{}, 16, 256); + tests(Q, ~0, 99, std::bit_and<>{}, 32, 256); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, 64, 256); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, 128, 256); + tests(Q, 0, 99, std::plus<>{}, 256, 256); + + // Check with various types. + tests(Q, 1, 99, std::multiplies<>{}, 8, 24); + tests(Q, 0x7fff, -99, ext::oneapi::minimum<>{}, 8, 256); + tests(Q, 0, 99, ext::oneapi::maximum<>{}, 8, 256); + + // Check with CUSTOM type. 
+ using CV = CustomVec; + tests(Q, CV(0), CV(99), CustomVecPlus{}, 8, 256); + + printFinalStatus(NumErrors); + return NumErrors; +} diff --git a/SYCL/Reduction/reduction_range_1d_reducer_skip.cpp b/SYCL/Reduction/reduction_range_1d_reducer_skip.cpp new file mode 100644 index 0000000000..12258afbd4 --- /dev/null +++ b/SYCL/Reduction/reduction_range_1d_reducer_skip.cpp @@ -0,0 +1,65 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<1>, reduction, func) +// with reductions initialized with a one element buffer. Additionally, some +// reducers will not be written to. + +#include "reduction_utils.hpp" + +using namespace sycl; + +int NumErrors = 0; + +template class SkipEvenName; +template class SkipOddName; +template class SkipAllName; + +template +void tests(ArgTys &&...Args) { + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipEvenOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipOddOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipAllOp{}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + constexpr access::mode RW = access::mode::read_write; + // Fast-reduce and Fast-atomics. Try various range types/sizes. + tests(Q, 0, 99, std::plus{}, range<1>(1)); + tests(Q, 0, 99, std::plus<>{}, range<1>(2)); + tests(Q, 0, 99, std::plus<>{}, range<1>(7)); + tests(Q, 0, 99, std::plus<>{}, range<1>(64)); + tests(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2)); + tests(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2 + 5)); + + // Check with CUSTOM type. 
+ tests>(Q, 0, 99, CustomVecPlus{}, + range<1>(256)); + tests>(Q, 0, 99, CustomVecPlus{}, + range<1>(MaxWGSize * 3)); + tests>(Q, 99, CustomVecPlus{}, + range<1>(72)); + + // Check with identityless operations. + tests(Q, 99, PlusWithoutIdentity{}, range<1>(1)); + tests(Q, 99, PlusWithoutIdentity{}, range<1>(2)); + tests(Q, 99, PlusWithoutIdentity{}, range<1>(7)); + tests(Q, 99, PlusWithoutIdentity{}, range<1>(64)); + tests(Q, 99, PlusWithoutIdentity{}, + range<1>(MaxWGSize * 2)); + tests(Q, 99, PlusWithoutIdentity{}, + range<1>(MaxWGSize * 2 + 5)); + + printFinalStatus(NumErrors); + return NumErrors; +} diff --git a/SYCL/Reduction/reduction_range_2d_dw_reducer_skip.cpp b/SYCL/Reduction/reduction_range_2d_dw_reducer_skip.cpp new file mode 100644 index 0000000000..1598a90bcb --- /dev/null +++ b/SYCL/Reduction/reduction_range_2d_dw_reducer_skip.cpp @@ -0,0 +1,69 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<2>, reduction, func) +// with reductions initialized with a one element buffer. Additionally, some +// reducers will not be written to. 
+ +#include "reduction_utils.hpp" + +using namespace sycl; + +int NumErrors = 0; + +template class SkipEvenName; +template class SkipOddName; +template class SkipAllName; + +template +void tests(ArgTys &&...Args) { + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipEvenOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipOddOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipAllOp{}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + tests(Q, 0, 99, std::plus<>{}, range<2>{1, 1}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, 2}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, 3}); + tests(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1}); + tests(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2}); + tests(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7}); + tests(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3}); + + tests>(Q, 0, 99, CustomVecPlus{}, + range<2>{33, MaxWGSize}); + tests>(Q, 99, CustomVecPlus{}, + range<2>{33, MaxWGSize}); + + tests(Q, 99, PlusWithoutIdentity{}, range<2>{1, 1}); + tests(Q, 99, PlusWithoutIdentity{}, range<2>{2, 2}); + tests(Q, 99, PlusWithoutIdentity{}, range<2>{2, 3}); + tests(Q, 99, PlusWithoutIdentity{}, + range<2>{MaxWGSize, 1}); + tests(Q, 99, PlusWithoutIdentity{}, + range<2>{1, MaxWGSize}); + tests(Q, 99, PlusWithoutIdentity{}, + range<2>{2, MaxWGSize * 2}); + tests(Q, 99, PlusWithoutIdentity{}, + range<2>{MaxWGSize * 3, 7}); + tests(Q, 99, PlusWithoutIdentity{}, + range<2>{3, MaxWGSize * 3}); + + printFinalStatus(NumErrors); + return NumErrors; +} diff --git a/SYCL/Reduction/reduction_range_3d_rw_reducer_skip.cpp b/SYCL/Reduction/reduction_range_3d_rw_reducer_skip.cpp new file mode 100644 index 0000000000..089720b17e --- /dev/null +++ b/SYCL/Reduction/reduction_range_3d_rw_reducer_skip.cpp @@ -0,0 +1,82 @@ +// RUN: %clangxx -fsycl 
-fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<3>, reduction, func) +// with reductions initialized with a one element buffer. Additionally, some +// reducers will not be written to. + +#include "reduction_utils.hpp" + +using namespace sycl; + +int NumErrors = 0; + +template class SkipEvenName; +template class SkipOddName; +template class SkipAllName; + +template +void tests(ArgTys &&...Args) { + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipEvenOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipOddOp{}); + NumErrors += test, T>(std::forward(Args)..., + property_list{}, SkipAllOp{}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + tests(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1}); + tests(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2}); + tests(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4}); + + tests(Q, 0, 99, std::plus<>{}, + range<3>{1, 1, MaxWGSize + 1}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{1, MaxWGSize + 1, 1}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{MaxWGSize + 1, 1, 1}); + + tests(Q, 0, 99, std::plus<>{}, + range<3>{2, 5, MaxWGSize * 2}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{3, MaxWGSize * 3, 2}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{MaxWGSize * 3, 8, 4}); + + tests>(Q, 0, 99, CustomVecPlus{}, + range<3>{2, 33, MaxWGSize}); + tests>(Q, 99, CustomVecPlus{}, + range<3>{2, 33, MaxWGSize}); + + tests(Q, 99, PlusWithoutIdentity{}, range<3>{1, 1, 1}); + tests(Q, 99, PlusWithoutIdentity{}, range<3>{2, 2, 2}); + tests(Q, 99, PlusWithoutIdentity{}, range<3>{2, 3, 4}); + + tests(Q, 99, 
PlusWithoutIdentity{}, + range<3>{1, 1, MaxWGSize + 1}); + tests(Q, 99, PlusWithoutIdentity{}, + range<3>{1, MaxWGSize + 1, 1}); + tests(Q, 99, PlusWithoutIdentity{}, + range<3>{MaxWGSize + 1, 1, 1}); + + tests(Q, 99, PlusWithoutIdentity{}, + range<3>{2, 5, MaxWGSize * 2}); + tests(Q, 99, PlusWithoutIdentity{}, + range<3>{3, MaxWGSize * 3, 2}); + tests(Q, 99, PlusWithoutIdentity{}, + range<3>{MaxWGSize * 3, 8, 4}); + + printFinalStatus(NumErrors); + return NumErrors; +} diff --git a/SYCL/Reduction/reduction_range_N_vars.cpp b/SYCL/Reduction/reduction_range_N_vars.cpp index 2902272081..ed7e8d8b83 100644 --- a/SYCL/Reduction/reduction_range_N_vars.cpp +++ b/SYCL/Reduction/reduction_range_N_vars.cpp @@ -48,7 +48,9 @@ struct Red { } void init() { - initInputData(InBuf, CorrectOut, BOp, NWorkItems); + std::optional CorrectOutOpt; + initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems); + CorrectOut = *CorrectOutOpt; if (!PropList.template has_property< property::reduction::initialize_to_identity>()) CorrectOut = BOp(CorrectOut, InitVal); diff --git a/SYCL/Reduction/reduction_usm.cpp b/SYCL/Reduction/reduction_usm.cpp index 2a53361b13..2e6f25d273 100644 --- a/SYCL/Reduction/reduction_usm.cpp +++ b/SYCL/Reduction/reduction_usm.cpp @@ -38,12 +38,12 @@ int test(queue &Q, OptionalIdentity Identity, T Init, } // Initialize. - T CorrectOut; + std::optional CorrectOutOpt; BinaryOperation BOp; buffer InBuf(NWItems); - initInputData(InBuf, CorrectOut, BOp, NWItems); - CorrectOut = BOp(CorrectOut, Init); + initInputData(InBuf, CorrectOutOpt, BOp, NWItems); + T CorrectOut = BOp(*CorrectOutOpt, Init); // Compute. Q.submit([&](handler &CGH) { diff --git a/SYCL/Reduction/reduction_usm_dw.cpp b/SYCL/Reduction/reduction_usm_dw.cpp index 1460fac390..f94c564e23 100644 --- a/SYCL/Reduction/reduction_usm_dw.cpp +++ b/SYCL/Reduction/reduction_usm_dw.cpp @@ -39,11 +39,12 @@ int test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems, } // Initialize. 
- T CorrectOut; + std::optional CorrectOutOpt; BinaryOperation BOp; buffer InBuf(NWItems); - initInputData(InBuf, CorrectOut, BOp, NWItems); + initInputData(InBuf, CorrectOutOpt, BOp, NWItems); + T CorrectOut = *CorrectOutOpt; // Compute. Q.submit([&](handler &CGH) { diff --git a/SYCL/Reduction/reduction_utils.hpp b/SYCL/Reduction/reduction_utils.hpp index d5875fbbba..a326609553 100644 --- a/SYCL/Reduction/reduction_utils.hpp +++ b/SYCL/Reduction/reduction_utils.hpp @@ -4,11 +4,29 @@ using namespace sycl; +struct AllIdOp { + constexpr bool operator()(size_t Idx) const { return true; } +}; + +struct SkipAllOp { + constexpr bool operator()(size_t Idx) const { return false; } +}; + +struct SkipEvenOp { + constexpr bool operator()(size_t Idx) const { return Idx % 2; } +}; + +struct SkipOddOp { + constexpr bool operator()(size_t Idx) const { return (Idx + 1) % 2; } +}; + /// Initializes the buffer<1> \p 'InBuf' buffer with pseudo-random values, -/// computes the write the reduction value \p 'ExpectedOut'. -template -void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, - range<1> Range) { +/// computes the write the reduction value \p 'ExpectedOut'. Linearized IDs are +/// filtered in \p 'ExpectedOut' using \p 'IdFilterFunc'. +template +void initInputData(buffer &InBuf, std::optional &ExpectedOut, + BinaryOperation BOp, range<1> Range, + IdFilterFuncT IdFilterFunc = {}) { size_t N = Range.size(); assert(N != 0); auto In = InBuf.template get_access(); @@ -27,15 +45,18 @@ void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, In[I] = I; else In[I] = ((I + 1) % 5) + 1.1; - ExpectedOut = I == 0 ? In[I] : BOp(ExpectedOut, In[I]); + if (IdFilterFunc(I)) + ExpectedOut = ExpectedOut ? BOp(*ExpectedOut, In[I]) : In[I]; } }; /// Initializes the buffer<2> \p 'InBuf' buffer with pseudo-random values, -/// computes the write the reduction value \p 'ExpectedOut'. 
-template -void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, - range<2> Range) { +/// computes the write the reduction value \p 'ExpectedOut'. Linearized IDs are +/// filtered in \p 'ExpectedOut' using \p 'IdFilterFunc'. +template +void initInputData(buffer &InBuf, std::optional &ExpectedOut, + BinaryOperation BOp, range<2> Range, + IdFilterFuncT IdFilterFunc = {}) { assert(Range.size() != 0); auto In = InBuf.template get_access(); for (int J = 0; J < Range[0]; ++J) { @@ -54,16 +75,19 @@ void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, In[J][I] = I + J; else In[J][I] = ((I + 1 + J) % 5) + 1.1; - ExpectedOut = (I == 0 && J == 0) ? In[J][I] : BOp(ExpectedOut, In[J][I]); + if (IdFilterFunc(I + J * Range[1])) + ExpectedOut = ExpectedOut ? BOp(*ExpectedOut, In[J][I]) : In[J][I]; } } }; /// Initializes the buffer<3> \p 'InBuf' buffer with pseudo-random values, -/// computes the write the reduction value \p 'ExpectedOut'. -template -void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, - range<3> Range) { +/// computes the write the reduction value \p 'ExpectedOut'. Linearized IDs are +/// filtered in \p 'ExpectedOut' using \p 'IdFilterFunc'. +template +void initInputData(buffer &InBuf, std::optional &ExpectedOut, + BinaryOperation BOp, range<3> Range, + IdFilterFuncT IdFilterFunc = {}) { assert(Range.size() != 0); auto In = InBuf.template get_access(); for (int K = 0; K < Range[0]; ++K) { @@ -83,9 +107,9 @@ void initInputData(buffer &InBuf, T &ExpectedOut, BinaryOperation BOp, In[K][J][I] = I + J + K; else In[K][J][I] = ((I + 1 + J + K * 3) % 5) + 1.1; - ExpectedOut = (I == 0 && J == 0 && K == 0) - ? In[K][J][I] - : BOp(ExpectedOut, In[K][J][I]); + if (IdFilterFunc(I + J * Range[2] + K * Range[1] * Range[2])) + ExpectedOut = + ExpectedOut ? 
BOp(*ExpectedOut, In[K][J][I]) : In[K][J][I]; } } } @@ -283,10 +307,10 @@ auto init_to_identity() { template typename RangeTy, int Dims, - typename PropListTy = property_list> + typename PropListTy = property_list, typename IdFilterFuncT = AllIdOp> int testInner(queue &Q, OptionalIdentity Identity, T Init, BinaryOperation BOp, const RangeTy &Range, - PropListTy PropList = {}) { + PropListTy PropList = {}, IdFilterFuncT IdFilterFunc = {}) { constexpr bool IsRange = std::is_same_v, RangeTy>; constexpr bool IsNDRange = std::is_same_v, RangeTy>; static_assert(IsRange || IsNDRange); @@ -326,11 +350,11 @@ int testInner(queue &Q, OptionalIdentity Identity, T Init, buffer OutBuf(1); // Initialize. - T CorrectOut; - initInputData(InBuf, CorrectOut, BOp, GlobalRange); + std::optional CorrectOut; + initInputData(InBuf, CorrectOut, BOp, GlobalRange, IdFilterFunc); if (!PropList.template has_property< property::reduction::initialize_to_identity>()) { - CorrectOut = BOp(CorrectOut, Init); + CorrectOut = CorrectOut ? BOp(*CorrectOut, Init) : Init; } // The value assigned here must be discarded (if IsReadWrite is true). @@ -352,42 +376,48 @@ int testInner(queue &Q, OptionalIdentity Identity, T Init, auto In = InBuf.template get_access(CGH); auto Redu = CreateReduction(); if constexpr (IsRange) - CGH.parallel_for( - Range, Redu, [=](id Id, auto &Sum) { Sum.combine(In[Id]); }); + CGH.parallel_for(Range, Redu, [=](item Id, auto &Sum) { + if (IdFilterFunc(Id.get_linear_id())) + Sum.combine(In[Id]); + }); else CGH.parallel_for(Range, Redu, [=](nd_item NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_id()]); + if (IdFilterFunc(NDIt.get_global_linear_id())) + Sum.combine(In[NDIt.get_global_linear_id()]); }); }); // Check correctness. 
  auto Out = OutBuf.template get_access();
   T ComputedOut = *(Out.get_pointer());
-  return checkResults(Q, BOp, Range, ComputedOut, CorrectOut);
+  return checkResults(Q, BOp, Range, ComputedOut, *CorrectOut);
 }
 
 template typename RangeTy, int Dims,
-          typename PropListTy = property_list>
+          typename PropListTy = property_list, typename IdFilterFuncT = AllIdOp>
 int test(queue &Q, T Identity, T Init, BinaryOperation BOp,
-         const RangeTy &Range, PropListTy PropList = {}) {
+         const RangeTy &Range, PropListTy PropList = {},
+         IdFilterFuncT IdFilterFunc = {}) {
-  return testInner(Q, OptionalIdentity(Identity), Init, BOp, Range, PropList);
+  return testInner(Q, OptionalIdentity(Identity), Init, BOp, Range, PropList,
+                   IdFilterFunc);
 }
 
 template typename RangeTy, int Dims,
-          typename PropListTy = property_list>
+          typename PropListTy = property_list, typename IdFilterFuncT = AllIdOp>
 int test(queue &Q, T Init, BinaryOperation BOp, const RangeTy &Range,
-         PropListTy PropList = {}) {
+         PropListTy PropList = {}, IdFilterFuncT IdFilterFunc = {}) {
-  return testInner(Q, OptionalIdentity(), Init, BOp, Range, PropList);
+  return testInner(Q, OptionalIdentity(), Init, BOp, Range, PropList,
+                   IdFilterFunc);
 }
 
 template
+          int Dims, typename PropListTy = property_list,
+          typename IdFilterFuncT = AllIdOp>
 int testUSMInner(queue &Q, OptionalIdentity Identity, T Init,
                  BinaryOperation BOp, const range &Range,
-                 usm::alloc AllocType, PropListTy PropList = {}) {
+                 usm::alloc AllocType, PropListTy PropList = {},
+                 IdFilterFuncT IdFilterFunc = {}) {
   printTestLabel(Range);
 
   auto Dev = Q.get_device();
@@ -434,12 +464,12 @@ int testUSMInner(queue &Q, OptionalIdentity Identity, T Init,
   }
 
   // Initialize.
-  T CorrectOut;
+  std::optional CorrectOut;
   buffer InBuf(Range);
-  initInputData(InBuf, CorrectOut, BOp, Range);
+  initInputData(InBuf, CorrectOut, BOp, Range, IdFilterFunc);
   if (!PropList.template has_property<
           property::reduction::initialize_to_identity>()) {
-    CorrectOut = BOp(CorrectOut, Init);
+    CorrectOut = CorrectOut ? BOp(*CorrectOut, Init) : Init;
   }
 
   // Compute.
@@ -457,7 +487,10 @@ int testUSMInner(queue &Q, OptionalIdentity Identity, T Init,
         auto In = InBuf.template get_access(CGH);
         auto Redu = CreateReduction();
         CGH.parallel_for>(
-            Range, Redu, [=](id Id, auto &Sum) { Sum.combine(In[Id]); });
+            Range, Redu, [=](item Id, auto &Sum) {
+              if (IdFilterFunc(Id.get_linear_id()))
+                Sum.combine(In[Id]);
+            });
       }).wait();
 
   // Check correctness.
@@ -476,24 +509,25 @@
   std::string AllocStr =
       "AllocMode=" + std::to_string(static_cast(AllocType));
-  int Error = checkResults(Q, BOp, Range, ComputedOut, CorrectOut, AllocStr);
+  int Error = checkResults(Q, BOp, Range, ComputedOut, *CorrectOut, AllocStr);
   free(ReduVarPtr, Q.get_context());
   return Error;
 }
 
 template
+          typename PropListTy = property_list, typename IdFilterFuncT = AllIdOp>
 int testUSM(queue &Q, T Identity, T Init, BinaryOperation BOp,
             const range &Range, usm::alloc AllocType,
-            PropListTy PropList = {}) {
+            PropListTy PropList = {}, IdFilterFuncT IdFilterFunc = {}) {
-  return testUSMInner(Q, OptionalIdentity(Identity), Init, BOp, Range,
-                      AllocType, PropList);
+  return testUSMInner(Q, OptionalIdentity(Identity), Init, BOp, Range,
+                      AllocType, PropList, IdFilterFunc);
 }
 
 template
+          typename PropListTy = property_list, typename IdFilterFuncT = AllIdOp>
 int testUSM(queue &Q, T Init, BinaryOperation BOp, const range &Range,
-            usm::alloc AllocType, PropListTy PropList = {}) {
+            usm::alloc AllocType, PropListTy PropList = {},
+            IdFilterFuncT IdFilterFunc = {}) {
-  return testUSMInner(Q, OptionalIdentity(), Init, BOp, Range, AllocType,
-                      PropList);
+  return testUSMInner(Q, OptionalIdentity(), Init, BOp, Range, AllocType,
+                      PropList, IdFilterFunc);
 }