diff --git a/include/cute/arch/xe_copy_1B.hpp b/include/cute/arch/xe_copy_1B.hpp index acb9bc52e8..9242798d1a 100644 --- a/include/cute/arch/xe_copy_1B.hpp +++ b/include/cute/arch/xe_copy_1B.hpp @@ -276,6 +276,7 @@ struct XE_2D_U8x1x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -309,6 +310,7 @@ struct XE_2D_U8x2x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -343,6 +345,7 @@ struct XE_2D_U8x4x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -377,6 +380,7 @@ struct XE_2D_U8x8x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -410,6 +414,7 @@ struct XE_2D_U8x16x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -602,6 +607,7 @@ struct XE_2D_U8x1x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -634,6 +640,7 @@ struct XE_2D_U8x2x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -666,6 +673,7 @@ struct XE_2D_U8x4x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -698,6 +706,7 @@ struct XE_2D_U8x8x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -730,6 +739,7 @@ struct XE_2D_U8x16x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -763,6 +773,7 @@ struct XE_2D_U8x32x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -798,6 +809,7 @@ struct XE_2D_U8x32x16_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { diff --git a/include/cute/arch/xe_copy_2B.hpp b/include/cute/arch/xe_copy_2B.hpp index 36468fd75e..fc87f38a14 100644 --- a/include/cute/arch/xe_copy_2B.hpp +++ b/include/cute/arch/xe_copy_2B.hpp @@ -327,6 +327,7 @@ struct XE_2D_U16x8x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -360,6 +361,7 @@ struct XE_2D_U16x16x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -393,6 +395,7 @@ struct XE_2D_U16x32x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -426,6 +429,7 @@ struct XE_2D_U16x1x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -458,6 +462,7 @@ struct XE_2D_U16x2x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -490,6 +495,7 @@ struct XE_2D_U16x4x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -522,6 +528,7 @@ struct XE_2D_U16x8x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -555,6 +562,7 @@ struct XE_2D_U16x16x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -588,6 +596,7 @@ struct XE_2D_U16x32x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -620,21 +629,6 @@ struct XE_2D_U16x16x16_LD_V { CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); #endif } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; }; struct XE_2D_U16x32x16_LD_V { @@ -653,21 +647,6 @@ struct XE_2D_U16x32x16_LD_V { CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); #endif } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; }; struct XE_2D_U16x16x32_LD_V { @@ -686,21 +665,6 @@ struct XE_2D_U16x16x32_LD_V { CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); #endif } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; }; struct XE_2D_U16x32x32_LD_V { @@ -719,21 +683,6 @@ struct XE_2D_U16x32x32_LD_V { CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); #endif } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; }; struct XE_2D_U16x16x8_LD_T { diff --git a/include/cute/arch/xe_copy_4B.hpp b/include/cute/arch/xe_copy_4B.hpp index 0c4b1b53c8..8aa57c3034 100644 --- a/include/cute/arch/xe_copy_4B.hpp +++ b/include/cute/arch/xe_copy_4B.hpp @@ -584,6 +584,7 @@ struct XE_2D_TF32x16x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -696,6 +697,7 @@ struct XE_2D_U32x16x8_LD_T { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { diff --git a/include/cute/atom/copy_traits.hpp b/include/cute/atom/copy_traits.hpp index 9117a1fb13..32e972622c 100644 --- a/include/cute/atom/copy_traits.hpp +++ b/include/cute/atom/copy_traits.hpp @@ -153,8 +153,11 @@ namespace detail { template constexpr bool is_prefetch = false; +//template +//constexpr bool is_prefetch> = is_same_v; + template -constexpr bool is_prefetch> = is_same_v; +constexpr bool is_prefetch> = true; } // end namespace detail diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index 75f1c13ba7..e2f59633fe 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -90,6 +90,7 @@ template; constexpr bool is_need_reversed = detail::is_stride_leftmost; using CopyThreadShape = std::conditional_t(TileShape{}) : size<1>(TileShape{}); constexpr int tile_non_contig_size = is_need_reversed ? size<1>(TileShape{}) : size<0>(TileShape{}); - // block here is what is prefetched in one atom execution + // block here is what is prefetched in one atom execution - width of one cacheline // min(32,32)-> 32 (256, 32) -> 32 static constexpr auto block_contig_size = cute::min(tile_contig_size, cacheline_bytes / sizeof(dtype)); // A: 1 -> trans or B 256/32 = 8 @@ -109,8 +110,12 @@ CUTE_HOST_DEVICE auto prefetch_selector(Tensor const& tensor) { // A shape<32,1> / trans or B shape<4,8> constexpr int sgs_contig = cute::gcd(Num_SGs, nums_blocks_contig); constexpr int sgs_non_contig = Num_SGs / sgs_contig; + + constexpr int iters_contig = nums_blocks_contig / sgs_contig; constexpr auto block_non_contig_size = tile_non_contig_size / sgs_non_contig; + constexpr int nums_blocks_non_contig = ceil_div(tile_non_contig_size, block_non_contig_size); + constexpr int iters_non_contig = nums_blocks_non_contig / sgs_non_contig; using PrefetchTilingLayout = std::conditional_t, Int>, Int>, @@ -217,7 +222,14 @@ struct XE_2D_LD_Unpack { uint32_t pitch; uint32_t stride_l = 0; - + // Construct prefetch from equivalent copy + template + XE_2D_LD_Unpack(XE_2D_LD_Unpack const& copy_op) : + base_ptr(copy_op.base_ptr), width(copy_op.width), height(copy_op.height), + pitch(copy_op.pitch), stride_l(copy_op.stride_l) { + static_assert(std::is_same_v, + "Prefetch can only be constructed from equivalent copy"); + } XE_2D_LD_Unpack(const void *ptr, uint32_t y, uint32_t x, uint32_t p = 0) : base_ptr(ptr) { @@ -265,53 +277,59 @@ struct XE_2D_LD_Unpack { CUTE_HOST_DEVICE friend constexpr void copy_unpack(Traits_LD_t const &traits, Tensor const &src, Tensor &dst) { - using dtype = typename Tensor::value_type; - constexpr int dtype_bits = sizeof_bits_v; - - static_assert(is_rmem::value); - static_assert(size(SLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::SrcLayout{}), - "Src tensor size does not match copy atom size."); - static_assert(size(DLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::DstLayout{}), - "Dst tensor size does not match copy atom size."); - - dtype *base_addr = (dtype *)traits.base_ptr; - - auto [m, n, l] = src.data().coord_; - int x = is_need_reversed ? m : n; - int y = is_need_reversed ? n : m; + if constexpr(detail::is_prefetch){ + prefetch_unpack(traits, src); + } else{ + using dtype = typename Tensor::value_type; + constexpr int dtype_bits = sizeof_bits_v; + + static_assert(is_rmem::value); + static_assert(size(SLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::SrcLayout{}), + "Src tensor size does not match copy atom size."); + static_assert(size(DLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::DstLayout{}), + "Dst tensor size does not match copy atom size."); + + dtype *base_addr = (dtype *)traits.base_ptr; + + auto [m, n, l] = src.data().coord_; + int x = is_need_reversed ? m : n; + int y = is_need_reversed ? n : m; - constexpr auto inst_size_bits = detail::size_of_inst_bits; + constexpr auto inst_size_bits = detail::size_of_inst_bits; - CopyOp::copy(base_addr + static_cast(l) * traits.stride_l, - (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height, - (traits.pitch * sizeof_bits_v) / sizeof_bits_v, - intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}, - raw_pointer_cast(&((&*dst.data())[0]))); + CopyOp::copy(base_addr + static_cast(l) * traits.stride_l, + (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height, + (traits.pitch * sizeof_bits_v) / sizeof_bits_v, + intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}, + raw_pointer_cast(&((&*dst.data())[0]))); + } } template CUTE_HOST_DEVICE friend constexpr void - prefetch(Copy_Atom const &atom, + prefetch_unpack(Traits_LD_t const &traits, Tensor const &src) { - using dtype = typename Copy_Atom::ValType; + // we do not have exact dtype available here, only size + constexpr int dtype_size_bits = size<1,0>(typename Traits_LD_t::SrcLayout{}); + constexpr int dtype_size = dtype_size_bits / 8; + using dtype_proxy = sycl::vec; - static_assert(detail::has_prefetch); - static_assert(size(SLayout{}) * sizeof_bits_v == size<1>(typename Traits_LD_t::SrcLayout{}), + static_assert(size(SLayout{}) * dtype_size_bits == size<1>(typename Traits_LD_t::SrcLayout{}), "Src tensor size does not match copy atom for prefetch size"); - dtype *base_addr = (dtype *)atom.base_ptr; + char *base_addr = (char *)traits.base_ptr; auto [m, n, l] = src.data().coord_; int x = is_need_reversed ? m : n; int y = is_need_reversed ? n : m; - constexpr auto inst_size_bits = detail::size_of_inst_bits; + constexpr auto inst_size_bits = detail::size_of_inst_bits; - CopyOp::PREFETCH::copy(base_addr + l * atom.stride_l, - (atom.width * sizeof_bits_v) / sizeof_bits_v, atom.height, - (atom.pitch * sizeof_bits_v) / sizeof_bits_v, - intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}); + CopyOp::PREFETCH::copy(base_addr + l * traits.stride_l * dtype_size, + (traits.width * dtype_size_bits) / sizeof_bits_v, traits.height, + (traits.pitch * dtype_size_bits) / sizeof_bits_v, + intel::coord_t{(int)(x * dtype_size_bits / inst_size_bits), y}); } template @@ -467,6 +485,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, + Stride< _0,_1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout, + Stride<_16, _1>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -485,6 +518,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -503,6 +551,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -521,6 +584,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -539,6 +617,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -633,11 +726,11 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout, - Stride< _0,_1>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; @@ -653,13 +746,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256>>>; + Stride<_0,Stride<_1,_8,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -667,11 +761,11 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout, - Stride< _0,_1>>; + using SrcLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; @@ -688,13 +782,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -702,11 +797,11 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (dst-thr,dst-val) to bit - using SrcLayout = Layout>, - Stride<_0,Stride< _1,_256,_512>>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; @@ -722,13 +817,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -736,11 +832,11 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout, - Stride< _0,_1>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; @@ -756,13 +852,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; @@ -771,11 +868,11 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout, - Stride< _0,_1>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; template @@ -788,13 +885,14 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -802,11 +900,11 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout, - Stride< _0,_1>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; template @@ -819,13 +917,14 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -903,17 +1002,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -936,17 +1036,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -969,17 +1070,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1002,17 +1104,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1035,17 +1138,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + Stride<_0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1068,17 +1172,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + Stride<_0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1102,18 +1207,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1139,18 +1245,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1173,18 +1280,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride<_0, Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16, Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1531,7 +1639,7 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit @@ -1542,6 +1650,7 @@ struct Copy_Traits_ Stride< _8,Stride<_1,_128>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1767,17 +1876,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout,Shape <_32, _16>>, - Stride,Stride< _1,_256>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_32>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout,Shape <_32, _16>>, - Stride,Stride< _1,_256>>>; + using DstLayout = Layout>, + Stride<_128,Stride< _1,_32>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -2283,6 +2393,11 @@ COPY_TRAIT_LD_DEF(XE_2D_U4x32x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U4x16x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U4x32x16_LD_T) COPY_TRAIT_LD_DEF(XE_2D_U4x16x16_LD_T) +COPY_TRAIT_LD_DEF(XE_2D_U8x1x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U8x2x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U8x4x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U8x8x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U8x16x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U8x1x64_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U8x2x64_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U8x4x64_LD_N::PREFETCH)