diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml index 926ab19360..0240befab8 100644 --- a/.github/workflows/intel_test.yml +++ b/.github/workflows/intel_test.yml @@ -57,7 +57,7 @@ jobs: name: Run Intel ${{ matrix.compiler }} tests on ${{ matrix.gpu }} with intel-graphics ${{ matrix.intel_graphics }} runs-on: ${{ matrix.runner }} - timeout-minutes: 120 + timeout-minutes: 45 steps: - name: Checkout repository @@ -104,12 +104,7 @@ jobs: -DCUTLASS_ENABLE_SYCL=ON \ -DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \ -DCUTLASS_SYCL_RUNNING_CI=ON - # Use reduced parallelism for BMG runners, full for PVC - if [[ "${{ matrix.gpu }}" == "BMG" ]]; then - cmake --build . -j 2 - else - cmake --build . - fi + cmake --build . -j 6 - name: Unit test shell: bash run: | diff --git a/examples/common/sycl_common.hpp b/examples/common/sycl_common.hpp index 9593e37d27..bd9931fa4c 100644 --- a/examples/common/sycl_common.hpp +++ b/examples/common/sycl_common.hpp @@ -58,7 +58,7 @@ void random_fill(T *src, int seed, size_t N, float max, float min) { syclcompat::memcpy(src, buff.data(), N); syclcompat::wait(); } else { - assert(0 & "Not supported dtype"); + assert(0 && "Not supported dtype"); } } diff --git a/include/cute/arch/copy_xe_U16.hpp b/include/cute/arch/copy_xe_U16.hpp index 4bb3b404bf..e4c8e72e8c 100644 --- a/include/cute/arch/copy_xe_U16.hpp +++ b/include/cute/arch/copy_xe_U16.hpp @@ -100,6 +100,7 @@ struct XE_2D_U16x8x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -129,6 +130,7 @@ struct XE_2D_U16x16x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -158,6 +160,7 @@ struct XE_2D_U16x32x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const 
void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -188,6 +191,7 @@ struct XE_2D_U16x1x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -218,6 +222,7 @@ struct XE_2D_U16x2x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -278,6 +283,7 @@ struct XE_2D_U16x4x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -308,6 +314,7 @@ struct XE_2D_U16x8x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -338,6 +345,7 @@ struct XE_2D_U16x16x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -368,6 +376,7 @@ struct XE_2D_U16x32x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -426,6 +435,7 @@ struct XE_2D_U16x16x16_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -455,6 +465,7 @@ struct XE_2D_U16x32x16_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -484,6 +495,7 @@ struct XE_2D_U16x16x32_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int 
height, int pitch, intel::coord_t coord) { @@ -513,6 +525,7 @@ struct XE_2D_U16x32x32_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { diff --git a/include/cute/arch/copy_xe_U32.hpp b/include/cute/arch/copy_xe_U32.hpp index c78f9d0588..32f655eefb 100644 --- a/include/cute/arch/copy_xe_U32.hpp +++ b/include/cute/arch/copy_xe_U32.hpp @@ -308,6 +308,7 @@ struct XE_2D_TF32x16x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -407,6 +408,7 @@ struct XE_2D_U32x16x8_LD_T { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { diff --git a/include/cute/arch/copy_xe_U8.hpp b/include/cute/arch/copy_xe_U8.hpp index 411e6cb5d8..7b1af33809 100644 --- a/include/cute/arch/copy_xe_U8.hpp +++ b/include/cute/arch/copy_xe_U8.hpp @@ -98,6 +98,7 @@ struct XE_2D_Packed_U8x1x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -128,6 +129,7 @@ struct XE_2D_Packed_U8x2x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -173,6 +175,7 @@ struct XE_2D_Packed_U8x4x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -202,6 +205,7 @@ struct XE_2D_Packed_U8x8x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -231,6 
+235,7 @@ struct XE_2D_Packed_U8x16x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -325,6 +330,7 @@ struct XE_2D_Packed_U8x1x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -354,6 +360,7 @@ struct XE_2D_Packed_U8x2x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -383,6 +390,7 @@ struct XE_2D_Packed_U8x4x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -412,6 +420,7 @@ struct XE_2D_Packed_U8x8x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -441,6 +450,7 @@ struct XE_2D_Packed_U8x16x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -470,6 +480,7 @@ struct XE_2D_Packed_U8x32x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -501,6 +512,7 @@ struct XE_2D_U8x32x16_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { diff --git a/include/cute/atom/copy_traits.hpp b/include/cute/atom/copy_traits.hpp index 9117a1fb13..ea4a3913fc 100644 --- a/include/cute/atom/copy_traits.hpp +++ b/include/cute/atom/copy_traits.hpp @@ -153,9 +153,19 @@ 
namespace detail { template constexpr bool is_prefetch = false; +#ifdef SYCL_INTEL_TARGET + +template +constexpr bool is_prefetch> = true; + +#else + +// TODO(Codeplay): Enable for SYCL_INTEL_TARGET. template constexpr bool is_prefetch> = is_same_v; +#endif + } // end namespace detail diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index 5697d3e9a1..fb7721d60b 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -228,7 +228,7 @@ CUTE_HOST_DEVICE auto prefetch_selector(Tensor const& tensor) { constexpr int tile_contig_size = is_tensor_M_major ? size<0>(TileShape{}) : size<1>(TileShape{}); constexpr int tile_non_contig_size = is_tensor_M_major ? size<1>(TileShape{}) : size<0>(TileShape{}); - // block here is what is prefetched in one atom execution + // block here is what is prefetched in one atom execution - width of one cacheline // min(32,32)-> 32 (256, 32) -> 32 static constexpr auto block_contig_size = cute::min(tile_contig_size, cacheline_bytes * sizeof_bits_v / sizeof_bits_v); // A: 1 -> trans or B 256/32 = 8 @@ -321,6 +321,14 @@ struct XE_2D_LD_Unpack { uint32_t pitch; uint32_t stride_l = 0; + // Construct prefetch from equivalent copy + template + XE_2D_LD_Unpack(XE_2D_LD_Unpack const& copy_op) : + base_ptr(copy_op.base_ptr), width(copy_op.width), height(copy_op.height), + pitch(copy_op.pitch), stride_l(copy_op.stride_l) { + static_assert(std::is_same_v, + "Prefetch can only be constructed from equivalent copy"); + } XE_2D_LD_Unpack(const void *ptr, uint32_t y, uint32_t x, uint32_t p = 0) : base_ptr(ptr) { @@ -368,48 +376,63 @@ struct XE_2D_LD_Unpack { CUTE_HOST_DEVICE friend constexpr void copy_unpack(Traits_LD_t const &traits, Tensor const &src, Tensor &dst) { - using dtype = typename Tensor::value_type; - constexpr int dtype_bits = sizeof_bits_v; - - static_assert(is_rmem::value); - static_assert(size(SLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::SrcLayout{}), - 
"Src tensor size does not match copy atom size."); - static_assert(size(DLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::DstLayout{}), - "Dst tensor size does not match copy atom size."); - - dtype *base_addr = (dtype *)traits.base_ptr; - - auto [m, n, l] = src.data().coord_; - int x = is_tensor_M_major ? m : n; - int y = is_tensor_M_major ? n : m; + if constexpr(detail::is_prefetch){ + prefetch_unpack(traits, src); + } else{ + using dtype = typename Tensor::value_type; + constexpr int dtype_bits = sizeof_bits_v; + + static_assert(is_rmem::value); + static_assert(size(SLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::SrcLayout{}), + "Src tensor size does not match copy atom size."); + static_assert(size(DLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::DstLayout{}), + "Dst tensor size does not match copy atom size."); + + dtype *base_addr = (dtype *)traits.base_ptr; + + auto [m, n, l] = src.data().coord_; + int x = is_tensor_M_major ? m : n; + int y = is_tensor_M_major ? 
n : m; - constexpr auto inst_size_bits = detail::size_of_inst_bits; + constexpr auto inst_size_bits = detail::size_of_inst_bits; - CopyOp::copy(((uint8_t*)base_addr) + static_cast(l) * traits.stride_l * sizeof_bits_v / 8, - (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height, - (traits.pitch * sizeof_bits_v) / sizeof_bits_v, - intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}, - raw_pointer_cast(&((&*dst.data())[0]))); + CopyOp::copy(((uint8_t*)base_addr) + static_cast(l) * traits.stride_l * sizeof_bits_v / 8, + (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height, + (traits.pitch * sizeof_bits_v) / sizeof_bits_v, + intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}, + raw_pointer_cast(&((&*dst.data())[0]))); + } } template CUTE_HOST_DEVICE friend constexpr void - prefetch(Copy_Atom const &atom, + prefetch_unpack(Traits_LD_t const &traits, Tensor const &src) { - using dtype = typename Copy_Atom::ValType; - - static_assert(detail::has_prefetch); - static_assert(size(SLayout{}) * sizeof_bits_v == size<1>(typename Traits_LD_t::SrcLayout{}), + // We do not have exact dtype available here, only size of the prefetch, assume that is the size of the element as well + constexpr int dtype_size_bits = size<1,0>(typename Traits_LD_t::SrcLayout{}); + constexpr int dtype_size = dtype_size_bits / 8; + using dtype_proxy = sycl::vec; + + // This assert checks for the most common use case - using a prefetch atom with the same type as the data + // However prefetch atoms can also be used for other datatypes, such as U8 prefetch for U4 data, which need a special case in the assert below + // If we have more such cases we could remove the assert altogether + static_assert(size(SLayout{}) * dtype_size_bits == size<1>(typename Traits_LD_t::SrcLayout{}) || + // Prefetching U4 using U8 prefetch, we cannot distinguish this case without the actual type + dtype_size_bits == 8 && size(SLayout{}) * 4 == size<1>(typename Traits_LD_t::SrcLayout{}), "Src tensor 
size does not match copy atom for prefetch size"); - dtype *base_addr = (dtype *)atom.base_ptr; + char *base_addr = (char *)traits.base_ptr; auto [m, n, l] = src.data().coord_; int x = is_tensor_M_major ? m : n; int y = is_tensor_M_major ? n : m; - constexpr auto inst_size_bits = detail::size_of_inst_bits; + constexpr auto inst_size_bits = detail::size_of_inst_bits; + // The assert checks that the prefetch is implemented using a prefetch instruction with a matching datatype size + // Currently this is the case for all prefetch atoms, but we might need to remove the assert to add + // any prefetch atom with size of datatype different from the underlying instruction. + static_assert(inst_size_bits == dtype_size_bits, "Instruction size does not match prefetch size."); - CopyOp::PREFETCH::copy(((uint8_t*)(base_addr)) + static_cast(l) * atom.stride_l * sizeof_bits_v / 8, - (atom.width * sizeof_bits_v) / sizeof_bits_v, atom.height, + CopyOp::PREFETCH::copy(((uint8_t*)(base_addr)) + static_cast(l) * traits.stride_l * sizeof_bits_v / 8, + (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height, @@ -611,6 +634,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, + Stride< _0,_1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout, + Stride<_16, _1>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -629,6 +667,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) 
{} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -648,17 +701,32 @@ struct Copy_Traits_ }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, + using SrcLayout = Layout>, Stride< _0,Stride< _1,_128,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, + using DstLayout = Layout>, Stride<_16,Stride< _1,_128,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride<_1, _8,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride<_1, _8,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; template Copy_Traits_(ArgT... args) @@ -685,6 +753,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) 
{} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -722,8 +805,23 @@ struct Copy_Traits_ }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, @@ -851,11 +949,11 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout, - Stride< _0,_1>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; @@ -871,13 +969,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256>>>; + Stride<_0,Stride<_1,_8,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + 
using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -906,13 +1005,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -940,13 +1040,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -957,8 +1058,8 @@ struct Copy_Traits_ using SrcLayout = Layout>, Stride< _0,Stride< _1,_512,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; @@ -974,13 +1075,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; @@ -992,8 +1094,8 @@ struct Copy_Traits_ using SrcLayout = Layout>, Stride< _0,Stride< _1,_512,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = 
Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; template @@ -1006,13 +1108,14 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1037,13 +1140,14 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1158,17 +1262,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1209,17 +1314,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = 
Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1242,17 +1348,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1275,17 +1382,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1308,17 +1416,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + Stride<_0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1341,17 +1450,18 @@ struct Copy_Traits_ 
template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + Stride<_0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1375,18 +1485,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1412,18 +1523,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1464,18 +1576,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to 
thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride<_0, Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16, Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1840,7 +1953,7 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit @@ -1851,6 +1964,7 @@ struct Copy_Traits_ Stride< _8,Stride<_1,_128>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1968,6 +2082,22 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + // Logical thread id to thread idx + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -1986,6 +2116,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) 
{} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -2005,6 +2150,22 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + // Logical thread id to thread idx + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride<_0,Stride< _1,_512,_256,_1024>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_512,_256,_1024>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -2024,6 +2185,22 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) 
{} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + // Logical thread id to thread idx + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride<_0,Stride< _1,_512,_256,_1024>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_512,_256,_1024>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -2136,17 +2313,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout,Shape <_32, _16>>, - Stride,Stride< _1,_256>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_32>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout,Shape <_32, _16>>, - Stride,Stride< _1,_256>>>; + using DstLayout = Layout>, + Stride<_128,Stride< _1,_32>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -2649,6 +2827,11 @@ COPY_TRAIT_LD_DEF(XE_2D_U4x16x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U4x32x16_LD_T) COPY_TRAIT_LD_DEF(XE_2D_U4x16x8_LD_T) COPY_TRAIT_LD_DEF(XE_2D_U4x16x16_LD_T) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x1x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x2x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x4x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x8x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x16x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x1x64_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x2x64_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x4x64_LD_N::PREFETCH) @@ -2661,6 +2844,10 @@ COPY_TRAIT_LD_DEF(XE_2D_U16x2x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x4x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x8x32_LD_N::PREFETCH) 
COPY_TRAIT_LD_DEF(XE_2D_U8x32x16_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x16x16_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x32x16_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x16x32_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x32x32_LD_V::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U32x16x8_LD_T::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x16x16_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x32x16_LD_N::PREFETCH) diff --git a/tools/copy_debug/copy_debug.cpp b/tools/copy_debug/copy_debug.cpp index e8be50b08e..db8a90dd27 100644 --- a/tools/copy_debug/copy_debug.cpp +++ b/tools/copy_debug/copy_debug.cpp @@ -55,8 +55,20 @@ void copy_kernel(TensorS S) { } syncthreads(); - using Copy = typename Copy_Traits::template DefaultTiledCopy; - Copy tiled_copy_load{Copy{}.with(S)}; + using traits_load = Copy_Traits; + using Atom_load = Copy_Atom; + constexpr int vector_size = size<1,0>(typename traits_load::SrcLayout{}) / sizeof_bits_v; + using VectorShape = std::conditional_t, _1>, + Shape<_1, Int>>; + using CopyThreadShape = std::conditional_t, _1>, + Shape<_1, Int>>; + using ScalarBlockShape = Shape(typename traits_load::BlockShape{}) * get<0>(VectorShape{})>, + Int(typename traits_load::BlockShape{}) * get<1>(VectorShape{})>>; + auto tiled_copy_load = make_tiled_copy(Atom_load{}.with(S), + Layout{}, + make_layout(shape_div(ScalarBlockShape{}, CopyThreadShape{}))); auto thr_copy_load = tiled_copy_load.get_slice(ThreadIdxX()); @@ -85,15 +97,16 @@ void copy_kernel(TensorS S) { } } for(int i = 0;i < size(fragment); i++){ + int val = static_cast(static_cast(fragment(i))); if(thread(0)){ print("\n "); } for(int j=0;j(fragment(i))); print(" "); + if(val<10) print(" "); + if(val<100) print(" "); + if(val<1000) print(" "); + print(val); print(" "); } } } @@ -111,7 +124,7 @@ void copy(int global_M, int global_N) { int tensor_size = size(tensor_shape); cutlass::DeviceAllocation src(tensor_size); - Tensor tensor_S = make_tensor(make_gmem_ptr(src.get()), make_layout(tensor_shape, LayoutLeft{})); + 
Tensor tensor_S = make_tensor(make_gmem_ptr(src.get()), make_layout(tensor_shape, LayoutRight{})); auto gridDim = syclcompat::dim3(1); auto blockDim = syclcompat::dim3(SUBGROUP_SIZE); @@ -126,7 +139,8 @@ int main(){ // for 16b copies use integers as floating point types could lose precision for bigger indices // for 8b copies you have to work with overflow - //copy(32, 32); - copy(256, 256); + // TODO(Codeplay): for 4b types the initialization does not correctly access subbyte types and only initializes every other element + copy(64, 64); + copy(64, 64); return 0; }