diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml index 4cc0fe4249..a01e148dc3 100644 --- a/.github/workflows/intel_test.yml +++ b/.github/workflows/intel_test.yml @@ -57,7 +57,7 @@ jobs: name: Run Intel ${{ matrix.compiler }} tests on ${{ matrix.gpu }} with intel-graphics ${{ matrix.intel_graphics }} runs-on: ${{ matrix.runner }} - timeout-minutes: 30 + timeout-minutes: 45 steps: - name: Checkout repository @@ -96,7 +96,7 @@ jobs: -DCUTLASS_ENABLE_SYCL=ON \ -DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \ -DCUTLASS_SYCL_RUNNING_CI=ON - cmake --build . + cmake --build . -j 6 - name: Unit test shell: bash run: | diff --git a/examples/common/sycl_common.hpp b/examples/common/sycl_common.hpp index e9376c6c84..eadf5024f4 100644 --- a/examples/common/sycl_common.hpp +++ b/examples/common/sycl_common.hpp @@ -158,6 +158,6 @@ void random_fill(T *src, int seed, size_t N, float max, float min) { syclcompat::memcpy(src, buff.data(), N); syclcompat::wait(); } else { - assert(0 & "Not supported dtype"); + assert(0 && "Not supported dtype"); } } diff --git a/include/cute/arch/copy_xe_U16.hpp b/include/cute/arch/copy_xe_U16.hpp index 3e8a579a78..f88e7e8e46 100644 --- a/include/cute/arch/copy_xe_U16.hpp +++ b/include/cute/arch/copy_xe_U16.hpp @@ -99,6 +99,7 @@ struct XE_2D_U16x8x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -128,6 +129,7 @@ struct XE_2D_U16x16x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -157,6 +159,7 @@ struct XE_2D_U16x32x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -186,6 +189,7 @@ struct XE_2D_U16x1x32_LD_N { } struct PREFETCH { 
+ using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -215,6 +219,7 @@ struct XE_2D_U16x2x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -244,6 +249,7 @@ struct XE_2D_U16x4x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -273,6 +279,7 @@ struct XE_2D_U16x8x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -302,6 +309,7 @@ struct XE_2D_U16x16x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -331,6 +339,7 @@ struct XE_2D_U16x32x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -360,6 +369,7 @@ struct XE_2D_U16x16x16_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -389,6 +399,7 @@ struct XE_2D_U16x32x16_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -418,6 +429,7 @@ struct XE_2D_U16x16x32_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -447,6 +459,7 @@ struct XE_2D_U16x32x32_LD_V { } struct PREFETCH { + using BlockShape = 
BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { diff --git a/include/cute/arch/copy_xe_U32.hpp b/include/cute/arch/copy_xe_U32.hpp index 8572bc4362..e4f344003d 100644 --- a/include/cute/arch/copy_xe_U32.hpp +++ b/include/cute/arch/copy_xe_U32.hpp @@ -307,6 +307,7 @@ struct XE_2D_TF32x16x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -406,6 +407,7 @@ struct XE_2D_U32x16x8_LD_T { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { diff --git a/include/cute/arch/copy_xe_U8.hpp b/include/cute/arch/copy_xe_U8.hpp index 35612eb467..4b0d0ffe08 100644 --- a/include/cute/arch/copy_xe_U8.hpp +++ b/include/cute/arch/copy_xe_U8.hpp @@ -52,6 +52,7 @@ struct XE_2D_Packed_U8x1x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -82,6 +83,7 @@ struct XE_2D_Packed_U8x2x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -127,6 +129,7 @@ struct XE_2D_Packed_U8x4x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -156,6 +159,7 @@ struct XE_2D_Packed_U8x8x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -185,6 +189,7 @@ struct XE_2D_Packed_U8x16x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void 
copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -249,6 +254,7 @@ struct XE_2D_Packed_U8x1x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -278,6 +284,7 @@ struct XE_2D_Packed_U8x2x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -307,6 +314,7 @@ struct XE_2D_Packed_U8x4x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -336,6 +344,7 @@ struct XE_2D_Packed_U8x8x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -365,6 +374,7 @@ struct XE_2D_Packed_U8x16x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -394,6 +404,7 @@ struct XE_2D_Packed_U8x32x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -425,6 +436,7 @@ struct XE_2D_U8x32x16_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { diff --git a/include/cute/atom/copy_traits.hpp b/include/cute/atom/copy_traits.hpp index 9117a1fb13..ea4a3913fc 100644 --- a/include/cute/atom/copy_traits.hpp +++ b/include/cute/atom/copy_traits.hpp @@ -153,9 +153,19 @@ namespace detail { template constexpr bool is_prefetch = false; +#ifdef SYCL_INTEL_TARGET + +template +constexpr bool 
is_prefetch> = true; + +#else + +// TODO(Codeplay): Enable for SYCL_INTEL_TARGET. template constexpr bool is_prefetch> = is_same_v; +#endif + } // end namespace detail diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index 8ba3344bdd..4cd09c4d99 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -214,6 +214,7 @@ template; constexpr bool is_tensor_M_major = detail::is_stride_leftmost; using CopyThreadShape = std::conditional_t(TileShape{}) : size<1>(TileShape{}); constexpr int tile_non_contig_size = is_tensor_M_major ? size<1>(TileShape{}) : size<0>(TileShape{}); - // block here is what is prefetched in one atom execution + // block here is what is prefetched in one atom execution - width of one cacheline // min(32,32)-> 32 (256, 32) -> 32 - static constexpr auto block_contig_size = cute::min(tile_contig_size, cacheline_bytes / sizeof(dtype)); + static constexpr auto block_contig_size = cute::min(tile_contig_size, cacheline_elements); // A: 1 -> trans or B 256/32 = 8 static constexpr auto nums_blocks_contig = ceil_div(tile_contig_size, block_contig_size); @@ -310,7 +311,14 @@ struct XE_2D_LD_Unpack { uint32_t pitch; uint32_t stride_l = 0; - + // Construct prefetch from equivalent copy + template + XE_2D_LD_Unpack(XE_2D_LD_Unpack const& copy_op) : + base_ptr(copy_op.base_ptr), width(copy_op.width), height(copy_op.height), + pitch(copy_op.pitch), stride_l(copy_op.stride_l) { + static_assert(std::is_same_v, + "Prefetch can only be constructed from equivalent copy"); + } XE_2D_LD_Unpack(const void *ptr, uint32_t y, uint32_t x, uint32_t p = 0) : base_ptr(ptr) { @@ -358,53 +366,68 @@ struct XE_2D_LD_Unpack { CUTE_HOST_DEVICE friend constexpr void copy_unpack(Traits_LD_t const &traits, Tensor const &src, Tensor &dst) { - using dtype = typename Tensor::value_type; - constexpr int dtype_bits = sizeof_bits_v; - - static_assert(is_rmem::value); - static_assert(size(SLayout{}) * dtype_bits == 
size<1>(typename Traits_LD_t::SrcLayout{}), - "Src tensor size does not match copy atom size."); - static_assert(size(DLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::DstLayout{}), - "Dst tensor size does not match copy atom size."); - - dtype *base_addr = (dtype *)traits.base_ptr; - - auto [m, n, l] = src.data().coord_; - int x = is_tensor_M_major ? m : n; - int y = is_tensor_M_major ? n : m; + if constexpr(detail::is_prefetch){ + prefetch_unpack(traits, src); + } else{ + using dtype = typename Tensor::value_type; + constexpr int dtype_bits = sizeof_bits_v; + + static_assert(is_rmem::value); + static_assert(size(SLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::SrcLayout{}), + "Src tensor size does not match copy atom size."); + static_assert(size(DLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::DstLayout{}), + "Dst tensor size does not match copy atom size."); + + dtype *base_addr = (dtype *)traits.base_ptr; + + auto [m, n, l] = src.data().coord_; + int x = is_tensor_M_major ? m : n; + int y = is_tensor_M_major ? 
n : m; - constexpr auto inst_size_bits = detail::size_of_inst_bits; + constexpr auto inst_size_bits = detail::size_of_inst_bits; - CopyOp::copy(base_addr + static_cast(l) * traits.stride_l, - (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height, - (traits.pitch * sizeof_bits_v) / sizeof_bits_v, - intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}, - raw_pointer_cast(&((&*dst.data())[0]))); + CopyOp::copy(base_addr + static_cast(l) * traits.stride_l, + (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height, + (traits.pitch * sizeof_bits_v) / sizeof_bits_v, + intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}, + raw_pointer_cast(&((&*dst.data())[0]))); + } } template CUTE_HOST_DEVICE friend constexpr void - prefetch(Copy_Atom const &atom, + prefetch_unpack(Traits_LD_t const &traits, Tensor const &src) { - using dtype = typename Copy_Atom::ValType; - - static_assert(detail::has_prefetch); - static_assert(size(SLayout{}) * sizeof_bits_v == size<1>(typename Traits_LD_t::SrcLayout{}), + // We do not have exact dtype available here, only size of the prefetch, assume that is the size of the element as well + constexpr int dtype_size_bits = size<1,0>(typename Traits_LD_t::SrcLayout{}); + constexpr int dtype_size = dtype_size_bits / 8; + using dtype_proxy = sycl::vec; + + // This asserts checks for the most common usecase - using prefetch atom with the same type as the data + // However prefetch atoms can also be used for other datatypes, such as U8 prefetch for U4 data, which need a special case in the assert below + // If we have more such cases we could remove the assert altogether + static_assert(size(SLayout{}) * dtype_size_bits == size<1>(typename Traits_LD_t::SrcLayout{}) || + // Prefetching U4 using U8 prefetch, we can not distinguish this case without actual type + dtype_size_bits == 8 && size(SLayout{}) * 4 == size<1>(typename Traits_LD_t::SrcLayout{}), "Src tensor size does not match copy atom for prefetch size"); - dtype 
*base_addr = (dtype *)atom.base_ptr; + char *base_addr = (char *)traits.base_ptr; auto [m, n, l] = src.data().coord_; int x = is_tensor_M_major ? m : n; int y = is_tensor_M_major ? n : m; - constexpr auto inst_size_bits = detail::size_of_inst_bits; + constexpr auto inst_size_bits = detail::size_of_inst_bits; + // The assert checks that assert is implemented using prefetch instruction with matching datatype size + // Currently this is the case for all prefetch atoms, but we might need to remove the assert to add + // any prefetch atom with size of datatype different from the underlying instruction. + static_assert(inst_size_bits == dtype_size_bits, "Instruction size does not match prefetch size."); - CopyOp::PREFETCH::copy(base_addr + l * atom.stride_l, - (atom.width * sizeof_bits_v) / sizeof_bits_v, atom.height, - (atom.pitch * sizeof_bits_v) / sizeof_bits_v, - intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}); + CopyOp::PREFETCH::copy(base_addr + l * traits.stride_l * dtype_size, + (traits.width * dtype_size_bits) / sizeof_bits_v, traits.height, + (traits.pitch * dtype_size_bits) / sizeof_bits_v, + intel::coord_t{(int)(x * dtype_size_bits / inst_size_bits), y}); } template @@ -559,6 +582,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, + Stride< _0,_1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout, + Stride<_16, _1>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -577,6 +615,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) 
{} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -596,23 +649,53 @@ struct Copy_Traits_ }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, + using SrcLayout = Layout>, Stride< _0,Stride< _1,_128,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, + using DstLayout = Layout>, Stride<_16,Stride< _1,_128,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride<_1, _8,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride<_1, _8,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; template Copy_Traits_(ArgT... args) : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -649,6 +732,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) 
{} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -743,11 +841,11 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout, - Stride< _0,_1>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; @@ -763,13 +861,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256>>>; + Stride<_0,Stride<_1,_8,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -798,13 +897,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -832,13 +932,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using 
SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -849,8 +950,8 @@ struct Copy_Traits_ using SrcLayout = Layout>, Stride< _0,Stride< _1,_512,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; @@ -866,13 +967,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; @@ -884,8 +986,8 @@ struct Copy_Traits_ using SrcLayout = Layout>, Stride< _0,Stride< _1,_512,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; template @@ -898,13 +1000,14 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from 
(thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -929,13 +1032,14 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1050,17 +1154,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1083,17 +1188,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1116,17 +1222,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + 
Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1149,17 +1256,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1182,17 +1290,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + Stride<_0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1215,17 +1324,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + Stride<_0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1249,18 +1359,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread 
idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1286,18 +1397,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1320,18 +1432,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride<_0, Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16, Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1696,7 +1809,7 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : 
XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit @@ -1707,6 +1820,7 @@ struct Copy_Traits_ Stride< _8,Stride<_1,_128>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1824,6 +1938,22 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + // Logical thread id to thread idx + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -1842,6 +1972,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -1861,6 +2006,22 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) 
{} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + // Logical thread id to thread idx + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride<_0,Stride< _1,_512,_256,_1024>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_512,_256,_1024>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -1880,6 +2041,22 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + // Logical thread id to thread idx + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride<_0,Stride< _1,_512,_256,_1024>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_512,_256,_1024>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -1992,17 +2169,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout,Shape <_32, _16>>, - Stride,Stride< _1,_256>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_32>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout,Shape <_32, _16>>, - Stride,Stride< _1,_256>>>; + using DstLayout = Layout>, + Stride<_128,Stride< _1,_32>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -2498,6 +2676,11 @@ COPY_TRAIT_LD_DEF(XE_2D_U4x32x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U4x16x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U4x32x16_LD_T) COPY_TRAIT_LD_DEF(XE_2D_U4x16x16_LD_T) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x1x32_LD_N::PREFETCH) 
+COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x2x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x4x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x8x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x16x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x1x64_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x2x64_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x4x64_LD_N::PREFETCH) @@ -2510,6 +2693,10 @@ COPY_TRAIT_LD_DEF(XE_2D_U16x2x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x4x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x8x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U8x32x16_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x16x16_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x32x16_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x16x32_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x32x32_LD_V::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U32x16x8_LD_T::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x16x16_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x32x16_LD_N::PREFETCH) diff --git a/tools/copy_debug/copy_debug.cpp b/tools/copy_debug/copy_debug.cpp index 9a170bc4d0..69f83acdd6 100644 --- a/tools/copy_debug/copy_debug.cpp +++ b/tools/copy_debug/copy_debug.cpp @@ -55,12 +55,20 @@ void copy_kernel(TensorS S) { } syncthreads(); - using CopyThreadShape = Shape<_1, Int>; using traits_load = Copy_Traits; using Atom_load = Copy_Atom; + constexpr int vector_size = size<1,0>(typename traits_load::SrcLayout{}) / sizeof_bits_v; + using VectorShape = std::conditional_t, _1>, + Shape<_1, Int>>; + using CopyThreadShape = std::conditional_t, _1>, + Shape<_1, Int>>; + using ScalarBlockShape = Shape(typename traits_load::BlockShape{}) * get<0>(VectorShape{})>, + Int(typename traits_load::BlockShape{}) * get<1>(VectorShape{})>>; auto tiled_copy_load = make_tiled_copy(Atom_load{}.with(S), Layout{}, - make_layout(shape_div(typename traits_load::BlockShape{}, CopyThreadShape{}))); + make_layout(shape_div(ScalarBlockShape{}, CopyThreadShape{}))); auto thr_copy_load = tiled_copy_load.get_slice(ThreadIdxX()); @@ 
-89,15 +97,16 @@ void copy_kernel(TensorS S) { } } for(int i = 0;i < size(fragment); i++){ + int val = static_cast(static_cast(fragment(i))); if(thread(0)){ print("\n "); } for(int j=0;j(fragment(i))); print(" "); + if(val<10) print(" "); + if(val<100) print(" "); + if(val<1000) print(" "); + print(val); print(" "); } } } @@ -115,7 +124,7 @@ void copy(int global_M, int global_N) { int tensor_size = size(tensor_shape); cutlass::DeviceAllocation src(tensor_size); - Tensor tensor_S = make_tensor(make_gmem_ptr(src.get()), make_layout(tensor_shape, LayoutLeft{})); + Tensor tensor_S = make_tensor(make_gmem_ptr(src.get()), make_layout(tensor_shape, LayoutRight{})); auto gridDim = syclcompat::dim3(1); auto blockDim = syclcompat::dim3(SUBGROUP_SIZE); @@ -130,7 +139,8 @@ void copy(int global_M, int global_N) { int main(){ // for 16b copies use integers as floating point types could lose precision for bigger indices // for 8b copies you have to work with overflow - //copy(32, 32); - copy(256, 256); + // TODO(Codeplay): for 4b types the initialization does not correctly access subbyte types and only initializes every other element + copy(64, 64); + copy(64, 64); return 0; }