diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml index 4cc0fe4249..a01e148dc3 100644 --- a/.github/workflows/intel_test.yml +++ b/.github/workflows/intel_test.yml @@ -57,7 +57,7 @@ jobs: name: Run Intel ${{ matrix.compiler }} tests on ${{ matrix.gpu }} with intel-graphics ${{ matrix.intel_graphics }} runs-on: ${{ matrix.runner }} - timeout-minutes: 30 + timeout-minutes: 45 steps: - name: Checkout repository @@ -96,7 +96,7 @@ jobs: -DCUTLASS_ENABLE_SYCL=ON \ -DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \ -DCUTLASS_SYCL_RUNNING_CI=ON - cmake --build . + cmake --build . -j 6 - name: Unit test shell: bash run: | diff --git a/examples/common/sycl_common.hpp b/examples/common/sycl_common.hpp index e9376c6c84..eadf5024f4 100644 --- a/examples/common/sycl_common.hpp +++ b/examples/common/sycl_common.hpp @@ -158,6 +158,6 @@ void random_fill(T *src, int seed, size_t N, float max, float min) { syclcompat::memcpy(src, buff.data(), N); syclcompat::wait(); } else { - assert(0 & "Not supported dtype"); + assert(0 && "Not supported dtype"); } } diff --git a/include/cute/arch/copy_xe_U16.hpp b/include/cute/arch/copy_xe_U16.hpp index 3e8a579a78..f88e7e8e46 100644 --- a/include/cute/arch/copy_xe_U16.hpp +++ b/include/cute/arch/copy_xe_U16.hpp @@ -99,6 +99,7 @@ struct XE_2D_U16x8x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -128,6 +129,7 @@ struct XE_2D_U16x16x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -157,6 +159,7 @@ struct XE_2D_U16x32x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -186,6 +189,7 @@ struct XE_2D_U16x1x32_LD_N { } struct PREFETCH { 
+ using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -215,6 +219,7 @@ struct XE_2D_U16x2x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -244,6 +249,7 @@ struct XE_2D_U16x4x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -273,6 +279,7 @@ struct XE_2D_U16x8x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -302,6 +309,7 @@ struct XE_2D_U16x16x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -331,6 +339,7 @@ struct XE_2D_U16x32x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -360,6 +369,7 @@ struct XE_2D_U16x16x16_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -389,6 +399,7 @@ struct XE_2D_U16x32x16_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -418,6 +429,7 @@ struct XE_2D_U16x16x32_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -447,6 +459,7 @@ struct XE_2D_U16x32x32_LD_V { } struct PREFETCH { + using BlockShape = 
BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { diff --git a/include/cute/arch/copy_xe_U32.hpp b/include/cute/arch/copy_xe_U32.hpp index 8572bc4362..e4f344003d 100644 --- a/include/cute/arch/copy_xe_U32.hpp +++ b/include/cute/arch/copy_xe_U32.hpp @@ -307,6 +307,7 @@ struct XE_2D_TF32x16x16_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -406,6 +407,7 @@ struct XE_2D_U32x16x8_LD_T { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { diff --git a/include/cute/arch/copy_xe_U8.hpp b/include/cute/arch/copy_xe_U8.hpp index 35612eb467..4b0d0ffe08 100644 --- a/include/cute/arch/copy_xe_U8.hpp +++ b/include/cute/arch/copy_xe_U8.hpp @@ -52,6 +52,7 @@ struct XE_2D_Packed_U8x1x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -82,6 +83,7 @@ struct XE_2D_Packed_U8x2x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -127,6 +129,7 @@ struct XE_2D_Packed_U8x4x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -156,6 +159,7 @@ struct XE_2D_Packed_U8x8x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -185,6 +189,7 @@ struct XE_2D_Packed_U8x16x32_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void 
copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -249,6 +254,7 @@ struct XE_2D_Packed_U8x1x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -278,6 +284,7 @@ struct XE_2D_Packed_U8x2x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -307,6 +314,7 @@ struct XE_2D_Packed_U8x4x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -336,6 +344,7 @@ struct XE_2D_Packed_U8x8x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -365,6 +374,7 @@ struct XE_2D_Packed_U8x16x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -394,6 +404,7 @@ struct XE_2D_Packed_U8x32x64_LD_N { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { @@ -425,6 +436,7 @@ struct XE_2D_U8x32x16_LD_V { } struct PREFETCH { + using BlockShape = BlockShape; CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { diff --git a/include/cute/atom/copy_traits.hpp b/include/cute/atom/copy_traits.hpp index 9117a1fb13..ea4a3913fc 100644 --- a/include/cute/atom/copy_traits.hpp +++ b/include/cute/atom/copy_traits.hpp @@ -153,9 +153,19 @@ namespace detail { template constexpr bool is_prefetch = false; +#ifdef SYCL_INTEL_TARGET + +template +constexpr bool 
is_prefetch> = true; + +#else + +// TODO(Codeplay): Enable for SYCL_INTEL_TARGET. template constexpr bool is_prefetch> = is_same_v; +#endif + } // end namespace detail diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index 8ba3344bdd..4cd09c4d99 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -214,6 +214,7 @@ template; constexpr bool is_tensor_M_major = detail::is_stride_leftmost; using CopyThreadShape = std::conditional_t(TileShape{}) : size<1>(TileShape{}); constexpr int tile_non_contig_size = is_tensor_M_major ? size<1>(TileShape{}) : size<0>(TileShape{}); - // block here is what is prefetched in one atom execution + // block here is what is prefetched in one atom execution - width of one cacheline // min(32,32)-> 32 (256, 32) -> 32 - static constexpr auto block_contig_size = cute::min(tile_contig_size, cacheline_bytes / sizeof(dtype)); + static constexpr auto block_contig_size = cute::min(tile_contig_size, cacheline_elements); // A: 1 -> trans or B 256/32 = 8 static constexpr auto nums_blocks_contig = ceil_div(tile_contig_size, block_contig_size); @@ -310,7 +311,14 @@ struct XE_2D_LD_Unpack { uint32_t pitch; uint32_t stride_l = 0; - + // Construct prefetch from equivalent copy + template + XE_2D_LD_Unpack(XE_2D_LD_Unpack const& copy_op) : + base_ptr(copy_op.base_ptr), width(copy_op.width), height(copy_op.height), + pitch(copy_op.pitch), stride_l(copy_op.stride_l) { + static_assert(std::is_same_v, + "Prefetch can only be constructed from equivalent copy"); + } XE_2D_LD_Unpack(const void *ptr, uint32_t y, uint32_t x, uint32_t p = 0) : base_ptr(ptr) { @@ -358,53 +366,68 @@ struct XE_2D_LD_Unpack { CUTE_HOST_DEVICE friend constexpr void copy_unpack(Traits_LD_t const &traits, Tensor const &src, Tensor &dst) { - using dtype = typename Tensor::value_type; - constexpr int dtype_bits = sizeof_bits_v; - - static_assert(is_rmem::value); - static_assert(size(SLayout{}) * dtype_bits == 
size<1>(typename Traits_LD_t::SrcLayout{}), - "Src tensor size does not match copy atom size."); - static_assert(size(DLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::DstLayout{}), - "Dst tensor size does not match copy atom size."); - - dtype *base_addr = (dtype *)traits.base_ptr; - - auto [m, n, l] = src.data().coord_; - int x = is_tensor_M_major ? m : n; - int y = is_tensor_M_major ? n : m; + if constexpr(detail::is_prefetch){ + prefetch_unpack(traits, src); + } else{ + using dtype = typename Tensor::value_type; + constexpr int dtype_bits = sizeof_bits_v; + + static_assert(is_rmem::value); + static_assert(size(SLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::SrcLayout{}), + "Src tensor size does not match copy atom size."); + static_assert(size(DLayout{}) * dtype_bits == size<1>(typename Traits_LD_t::DstLayout{}), + "Dst tensor size does not match copy atom size."); + + dtype *base_addr = (dtype *)traits.base_ptr; + + auto [m, n, l] = src.data().coord_; + int x = is_tensor_M_major ? m : n; + int y = is_tensor_M_major ? 
n : m; - constexpr auto inst_size_bits = detail::size_of_inst_bits; + constexpr auto inst_size_bits = detail::size_of_inst_bits; - CopyOp::copy(base_addr + static_cast(l) * traits.stride_l, - (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height, - (traits.pitch * sizeof_bits_v) / sizeof_bits_v, - intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}, - raw_pointer_cast(&((&*dst.data())[0]))); + CopyOp::copy(base_addr + static_cast(l) * traits.stride_l, + (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height, + (traits.pitch * sizeof_bits_v) / sizeof_bits_v, + intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}, + raw_pointer_cast(&((&*dst.data())[0]))); + } } template CUTE_HOST_DEVICE friend constexpr void - prefetch(Copy_Atom const &atom, + prefetch_unpack(Traits_LD_t const &traits, Tensor const &src) { - using dtype = typename Copy_Atom::ValType; - - static_assert(detail::has_prefetch); - static_assert(size(SLayout{}) * sizeof_bits_v == size<1>(typename Traits_LD_t::SrcLayout{}), + // We do not have exact dtype available here, only size of the prefetch, assume that is the size of the element as well + constexpr int dtype_size_bits = size<1,0>(typename Traits_LD_t::SrcLayout{}); + constexpr int dtype_size = dtype_size_bits / 8; + using dtype_proxy = sycl::vec; + + // This asserts checks for the most common usecase - using prefetch atom with the same type as the data + // However prefetch atoms can also be used for other datatypes, such as U8 prefetch for U4 data, which need a special case in the assert below + // If we have more such cases we could remove the assert altogether + static_assert(size(SLayout{}) * dtype_size_bits == size<1>(typename Traits_LD_t::SrcLayout{}) || + // Prefetching U4 using U8 prefetch, we can not distinguish this case without actual type + dtype_size_bits == 8 && size(SLayout{}) * 4 == size<1>(typename Traits_LD_t::SrcLayout{}), "Src tensor size does not match copy atom for prefetch size"); - dtype 
*base_addr = (dtype *)atom.base_ptr; + char *base_addr = (char *)traits.base_ptr; auto [m, n, l] = src.data().coord_; int x = is_tensor_M_major ? m : n; int y = is_tensor_M_major ? n : m; - constexpr auto inst_size_bits = detail::size_of_inst_bits; + constexpr auto inst_size_bits = detail::size_of_inst_bits; + // The assert checks that assert is implemented using prefetch instruction with matching datatype size + // Currently this is the case for all prefetch atoms, but we might need to remove the assert to add + // any prefetch atom with size of datatype different from the underlying instruction. + static_assert(inst_size_bits == dtype_size_bits, "Instruction size does not match prefetch size."); - CopyOp::PREFETCH::copy(base_addr + l * atom.stride_l, - (atom.width * sizeof_bits_v) / sizeof_bits_v, atom.height, - (atom.pitch * sizeof_bits_v) / sizeof_bits_v, - intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}); + CopyOp::PREFETCH::copy(base_addr + l * traits.stride_l * dtype_size, + (traits.width * dtype_size_bits) / sizeof_bits_v, traits.height, + (traits.pitch * dtype_size_bits) / sizeof_bits_v, + intel::coord_t{(int)(x * dtype_size_bits / inst_size_bits), y}); } template @@ -559,6 +582,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, + Stride< _0,_1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout, + Stride<_16, _1>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -577,6 +615,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) 
{} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -596,23 +649,53 @@ struct Copy_Traits_ }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, + using SrcLayout = Layout>, Stride< _0,Stride< _1,_128,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, + using DstLayout = Layout>, Stride<_16,Stride< _1,_128,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride<_1, _8,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride<_1, _8,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; template Copy_Traits_(ArgT... args) : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -649,6 +732,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) 
{} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -743,11 +841,11 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout, - Stride< _0,_1>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; @@ -763,13 +861,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256>>>; + Stride<_0,Stride<_1,_8,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -798,13 +897,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -832,13 +932,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using 
SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -849,8 +950,8 @@ struct Copy_Traits_ using SrcLayout = Layout>, Stride< _0,Stride< _1,_512,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; @@ -866,13 +967,14 @@ struct Copy_Traits_ using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride<_1,_8,_256,_512>>>; + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; @@ -884,8 +986,8 @@ struct Copy_Traits_ using SrcLayout = Layout>, Stride< _0,Stride< _1,_512,_256>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; template @@ -898,13 +1000,14 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from 
(thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -929,13 +1032,14 @@ struct Copy_Traits_ : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using SrcLayout = Layout>, + Stride<_0,Stride<_1,_8,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride<_1,_8,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1050,17 +1154,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1083,17 +1188,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1116,17 +1222,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + 
Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1149,17 +1256,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256>>>; + Stride<_0,Stride< _1,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1182,17 +1290,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + Stride<_0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1215,17 +1324,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, - Stride<_16,Stride< _1,_256,_512>>>; + Stride<_0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1249,18 +1359,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread 
idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1286,18 +1397,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1320,18 +1432,19 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using SrcLayout = Layout>, + Stride<_0, Stride< _1,_256,_512>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout>, - Stride<_32,Stride< _1,_512>>>; + using DstLayout = Layout>, + Stride<_16, Stride< _1,_256,_512>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; using CopyInternalType = cute::intel::ushort; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1696,7 +1809,7 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : 
XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit @@ -1707,6 +1820,7 @@ struct Copy_Traits_ Stride< _8,Stride<_1,_128>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -1824,6 +1938,22 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + // Logical thread id to thread idx + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -1842,6 +1972,21 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_256,_512>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_256,_512>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -1861,6 +2006,22 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) 
{} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + // Logical thread id to thread idx + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride<_0,Stride< _1,_512,_256,_1024>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_512,_256,_1024>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -1880,6 +2041,22 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + // Logical thread id to thread idx + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride<_0,Stride< _1,_512,_256,_1024>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_512,_256,_1024>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -1992,17 +2169,18 @@ struct Copy_Traits_ template struct Copy_Traits_ - : XE_2D_LD_Unpack { + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit - using SrcLayout = Layout,Shape <_32, _16>>, - Stride,Stride< _1,_256>>>; + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_32>>>; // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout,Shape <_32, _16>>, - Stride,Stride< _1,_256>>>; + using DstLayout = Layout>, + Stride<_128,Stride< _1,_32>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout; + using XE_2D_LD_Unpack::XE_2D_LD_Unpack; }; template @@ -2498,6 +2676,11 @@ COPY_TRAIT_LD_DEF(XE_2D_U4x32x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U4x16x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U4x32x16_LD_T) COPY_TRAIT_LD_DEF(XE_2D_U4x16x16_LD_T) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x1x32_LD_N::PREFETCH) 
+COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x2x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x4x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x8x32_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x16x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x1x64_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x2x64_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x4x64_LD_N::PREFETCH) @@ -2510,6 +2693,10 @@ COPY_TRAIT_LD_DEF(XE_2D_U16x2x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x4x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x8x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U8x32x16_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x16x16_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x32x16_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x16x32_LD_V::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_U16x32x32_LD_V::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U32x16x8_LD_T::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x16x16_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x32x16_LD_N::PREFETCH) diff --git a/tools/copy_debug/copy_debug.cpp b/tools/copy_debug/copy_debug.cpp index 9a170bc4d0..69f83acdd6 100644 --- a/tools/copy_debug/copy_debug.cpp +++ b/tools/copy_debug/copy_debug.cpp @@ -55,12 +55,20 @@ void copy_kernel(TensorS S) { } syncthreads(); - using CopyThreadShape = Shape<_1, Int>; using traits_load = Copy_Traits; using Atom_load = Copy_Atom; + constexpr int vector_size = size<1,0>(typename traits_load::SrcLayout{}) / sizeof_bits_v; + using VectorShape = std::conditional_t, _1>, + Shape<_1, Int>>; + using CopyThreadShape = std::conditional_t, _1>, + Shape<_1, Int>>; + using ScalarBlockShape = Shape(typename traits_load::BlockShape{}) * get<0>(VectorShape{})>, + Int(typename traits_load::BlockShape{}) * get<1>(VectorShape{})>>; auto tiled_copy_load = make_tiled_copy(Atom_load{}.with(S), Layout{}, - make_layout(shape_div(typename traits_load::BlockShape{}, CopyThreadShape{}))); + make_layout(shape_div(ScalarBlockShape{}, CopyThreadShape{}))); auto thr_copy_load = tiled_copy_load.get_slice(ThreadIdxX()); @@ 
-89,15 +97,16 @@ void copy_kernel(TensorS S) { } } for(int i = 0;i < size(fragment); i++){ + int val = static_cast(static_cast(fragment(i))); if(thread(0)){ print("\n "); } for(int j=0;j(fragment(i))); print(" "); + if(val<10) print(" "); + if(val<100) print(" "); + if(val<1000) print(" "); + print(val); print(" "); } } } @@ -115,7 +124,7 @@ void copy(int global_M, int global_N) { int tensor_size = size(tensor_shape); cutlass::DeviceAllocation src(tensor_size); - Tensor tensor_S = make_tensor(make_gmem_ptr(src.get()), make_layout(tensor_shape, LayoutLeft{})); + Tensor tensor_S = make_tensor(make_gmem_ptr(src.get()), make_layout(tensor_shape, LayoutRight{})); auto gridDim = syclcompat::dim3(1); auto blockDim = syclcompat::dim3(SUBGROUP_SIZE); @@ -130,7 +139,8 @@ void copy(int global_M, int global_N) { int main(){ // for 16b copies use integers as floating point types could lose precision for bigger indices // for 8b copies you have to work with overflow - //copy(32, 32); - copy(256, 256); + // TODO(Codeplay): for 4b types the initialization does not correctly access subbyte types and only initializes every other element + copy(64, 64); + copy(64, 64); return 0; }