diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d913fed5e..cbe7e09923 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,6 +108,8 @@ option(CUTLASS_SYCL_PROFILING_ENABLED "Use SYCL events to calculate device execu option(CUTLASS_SYCL_RUNNING_CI "Enable this option when building in a CI environment. It activates CI specific configurations, such as additional checks or selectively disabling tests that cannot run in CI." OFF) +option(CUTLASS_SYCL_BUILTIN_ENABLE "Enable this option to use builtin functions instead of SPIR-V" OFF) + if (CUTLASS_ENABLE_SYCL) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) @@ -130,6 +132,10 @@ if (CUTLASS_ENABLE_SYCL) add_compile_definitions(SYCLCOMPAT_PROFILING_ENABLED) endif() + if (CUTLASS_SYCL_BUILTIN_ENABLE) + add_compile_definitions(CUTLASS_SYCL_BUILTIN_ENABLE) + endif() + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/onemkl.cmake) endif() find_package(Doxygen QUIET) diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake index f14a62b267..9f45285cdc 100644 --- a/cmake/FindDPCPP.cmake +++ b/cmake/FindDPCPP.cmake @@ -62,8 +62,11 @@ endif() if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc" OR "${DPCPP_SYCL_TARGET}" STREQUAL "spir64" OR "${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_bmg_g21") - list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier") - + if ((CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2025.2) OR CUTLASS_SYCL_BUILTIN_ENABLE) + list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier") + else() + list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate") + endif() if(DPCPP_DISABLE_ITT_FOR_CUTLASS) list(APPEND DPCPP_FLAGS "-fno-sycl-instrument-device-code") endif() diff --git a/include/cute/arch/copy_xe.hpp b/include/cute/arch/copy_xe.hpp index fa1fb4ad51..c36befa7c8 100644 --- a/include/cute/arch/copy_xe.hpp +++ 
b/include/cute/arch/copy_xe.hpp @@ -29,48 +29,22 @@ * **************************************************************************************************/ #pragma once -#include -#include -#include -#include -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x -#else -#define SYCL_DEVICE_BUILTIN(x) inline x { assert(false); } + +#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_INTEL_TARGET) +#define CUTE_ARCH_COPY_XE_ENABLED +#endif + +#if defined(CUTE_ARCH_COPY_XE_ENABLED) && ((defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER < 20250200)) || defined(CUTLASS_SYCL_BUILTIN_ENABLE)) +#include +#elif defined(CUTE_ARCH_COPY_XE_ENABLED) +#include #endif -// prefetch -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_uchar( - const __attribute__((opencl_global)) uint8_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_ushort( - const __attribute__((opencl_global)) uint16_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_uint( - const __attribute__((opencl_global)) uint32_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_uint2( - const __attribute__((opencl_global)) uint32_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_uint4( - const __attribute__((opencl_global)) uint32_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_uint8( - const __attribute__((opencl_global)) uint32_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_ulong( - const __attribute__((opencl_global)) uint64_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_ulong2( - const __attribute__((opencl_global)) 
uint64_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_ulong4( - const __attribute__((opencl_global)) uint64_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_ulong8( - const __attribute__((opencl_global)) uint64_t *base, int immElemOff, - enum CacheControl cacheOpt)); -#undef SYCL_DEVICE_BUILTIN +#include +#include +#include +#include +#include #ifdef __SYCL_DEVICE_ONLY__ SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierWaitINTEL(int execution_scope, int memory_scope, int memory_semantics); @@ -142,49 +116,6 @@ struct XE_1D_LDSM { } }; -template -struct PREFETCH { - using SRegisters = S[1]; - using DRegisters = D[1]; - - template - CUTE_HOST_DEVICE static void copy(const S_ &src, D_ &dst) { -#if defined(SYCL_INTEL_TARGET) - if constexpr(sizeof(D) == 1) { - __builtin_IB_lsc_prefetch_global_uchar( - (const __attribute__((opencl_global)) uint8_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 2) { - __builtin_IB_lsc_prefetch_global_ushort( - (const __attribute__((opencl_global)) uint16_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 4) { - __builtin_IB_lsc_prefetch_global_uint( - (const __attribute__((opencl_global)) uint32_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 8) { - __builtin_IB_lsc_prefetch_global_uint2( - (const __attribute__((opencl_global)) uint32_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 16) { - __builtin_IB_lsc_prefetch_global_uint4( - (const __attribute__((opencl_global)) uint32_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 32) { - __builtin_IB_lsc_prefetch_global_uint8( - (const __attribute__((opencl_global)) uint32_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 64) { - 
__builtin_IB_lsc_prefetch_global_ulong8( - (const __attribute__((opencl_global)) uint64_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } -}; - template struct XE_1D_LOAD_GLOBAL { using SRegisters = S[1]; @@ -212,9 +143,6 @@ struct XE_1D_LOAD_GLOBAL { CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); #endif } - - using PREFETCH = PREFETCH; - }; template diff --git a/include/cute/arch/copy_xe_U16.hpp b/include/cute/arch/copy_xe_U16.hpp new file mode 100644 index 0000000000..1a14351fbb --- /dev/null +++ b/include/cute/arch/copy_xe_U16.hpp @@ -0,0 +1,564 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include "cute/config.hpp" + +namespace cute +{ +struct XE_2D_U16x1x16_LD_N { + using BlockShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x2x16_LD_N { + using BlockShape = Shape<_2, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x4x16_LD_N { + using BlockShape = Shape<_4, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + 
static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x8x16_LD_N { + using BlockShape = Shape<_8, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x16x16_LD_N { + using BlockShape = Shape<_16, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch 
on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x32x16_LD_N { + using BlockShape = Shape<_32, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x1x32_LD_N { + using BlockShape = Shape<_1, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 1, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x2x32_LD_N { + using BlockShape = Shape<_2, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, 
int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 2, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x4x32_LD_N { + using BlockShape = Shape<_4, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 4, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x8x32_LD_N { + using BlockShape = Shape<_8, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 8, 2>{}(baseoffset, width, 
height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 8, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x16x32_LD_N { + using BlockShape = Shape<_16, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x32x32_LD_N { + using BlockShape = Shape<_32, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, 
int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x16x16_LD_V { + using BlockShape = Shape<_16, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockTransform<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x32x16_LD_V { + using BlockShape = Shape<_32, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockTransform<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); 
+#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x16x32_LD_V { + using BlockShape = Shape<_16, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockTransform<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x32x32_LD_V { + using BlockShape = Shape<_32, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockTransform<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x16x8_LD_T { + using BlockShape = Shape<_8, _16>; + using 
inst_dtype = uint32_t; + + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); /* T is the 2-byte element type; inst_dtype above is the 4-byte uint32_t */ + detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x16x16_LD_T { + using BlockShape = Shape<_16, _16>; + using inst_dtype = uint32_t; + + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockTranspose<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x1x16_ST_N { + using BlockShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockStore<2, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x2x16_ST_N { + using BlockShape = Shape<_2, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockStore<2, 16, 2, 1>{}(baseoffset, 
width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x4x16_ST_N { + using BlockShape = Shape<_4, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockStore<2, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x8x16_ST_N { + using BlockShape = Shape<_8, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockStore<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; +} // end namespace cute diff --git a/include/cute/arch/copy_xe_U32.hpp b/include/cute/arch/copy_xe_U32.hpp new file mode 100644 index 0000000000..8802b0868e --- /dev/null +++ b/include/cute/arch/copy_xe_U32.hpp @@ -0,0 +1,496 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include "cute/config.hpp" + +namespace cute +{ +struct XE_2D_U32x1x16_LD_N { + using BlockShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x2x16_LD_N { + using BlockShape = Shape<_2, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x4x16_LD_N { + using BlockShape = Shape<_4, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x8x16_LD_N { + using BlockShape = Shape<_8, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T 
to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x16x16_LD_N { + using BlockShape = Shape<_16, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x32x16_LD_N { + using BlockShape = Shape<_32, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x1x8_LD_N { + using BlockShape = Shape<_1, _8>; /* matches the 1x8 in the struct name, as the sibling TF32 structs do, and the <4, 8, 1, 1> block load below */ + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x2x8_LD_N { + using BlockShape = Shape<_2, _8>; + using ValueShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if 
defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x4x8_LD_N { + using BlockShape = Shape<_4, _8>; + using ValueShape = Shape<_2, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x8x8_LD_N { + using BlockShape = Shape<_8, _8>; + using ValueShape = Shape<_4, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x16x8_LD_N { + using BlockShape = Shape<_16, _8>; + using ValueShape = Shape<_8, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x32x8_LD_N { + using BlockShape = Shape<_32, _8>; + using 
ValueShape = Shape<_16, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x1x16_LD_N { + using BlockShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x2x16_LD_N { + using BlockShape = Shape<_2, _16>; + using ValueShape = Shape<_1, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x4x16_LD_N { + using BlockShape = Shape<_4, _16>; + using ValueShape = Shape<_2, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + 
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x8x16_LD_N { + using BlockShape = Shape<_8, _16>; + using ValueShape = Shape<_4, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x16x16_LD_N { /* 2D block load op: copy() forwards to detail::XeSubgroup2DBlockLoad<4, 8, 16, 2> when CUTE_ARCH_COPY_XE_ENABLED is defined, otherwise it hits CUTE_INVALID_CONTROL_PATH. */ + using BlockShape = Shape<_16, _16>; + using ValueShape = Shape<_8, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { /* NOTE(review): the prefetch is instantiated as <4, 8, 16, 1> while the load above uses <4, 8, 16, 2>. If the last template argument is the block count, this presumably prefetches only half of the columns the load reads -- confirm against detail::XeSubgroup2DBlockPrefetch. */ + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_TF32x32x16_LD_N { + using BlockShape = Shape<_32, _16>; + using ValueShape = Shape<_16, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 32, 2>{}(baseoffset, width, height, 
pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + + +struct XE_2D_U32x16x1_LD_T { + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockTranspose<4, 1, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x16x2_LD_T { + using BlockShape = Shape<_2, _16>; + + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockTranspose<4, 2, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x16x4_LD_T { + using BlockShape = Shape<_4, _16>; + + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x16x8_LD_T { + using BlockShape = Shape<_8, _16>; + + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, 
intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockTranspose<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U32x1x16_ST_N { /* 2D block store op: copy() forwards src to detail::XeSubgroup2DBlockStore<4, 16, 1, 1> when CUTE_ARCH_COPY_XE_ENABLED is defined, otherwise it hits CUTE_INVALID_CONTROL_PATH. NOTE(review): the sizeof(T) == 4 check is commented out here but active on XE_2D_U32x2x16_ST_N -- confirm this is intentional. */ + using BlockShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockStore<4, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x2x16_ST_N { /* 2D block store op: copy() requires sizeof(T) == 4 and forwards src to detail::XeSubgroup2DBlockStore<4, 16, 2, 1> when CUTE_ARCH_COPY_XE_ENABLED is defined, otherwise it hits CUTE_INVALID_CONTROL_PATH. */ + using BlockShape = Shape<_2, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockStore<4, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x4x16_ST_N { + using BlockShape = Shape<_4, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + 
static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockStore<4, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x8x16_ST_N { /* 2D block store op: copy() forwards src to detail::XeSubgroup2DBlockStore<4, 16, 8, 1> when CUTE_ARCH_COPY_XE_ENABLED is defined, otherwise it hits CUTE_INVALID_CONTROL_PATH. NOTE(review): the sizeof(T) == 4 check is commented out here but active on the 4x16 store just above -- confirm this is intentional. */ + using BlockShape = Shape<_8, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockStore<4, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +} // end namespace cute diff --git a/include/cute/arch/copy_xe_U4.hpp b/include/cute/arch/copy_xe_U4.hpp new file mode 100644 index 0000000000..f253dd4249 --- /dev/null +++ b/include/cute/arch/copy_xe_U4.hpp @@ -0,0 +1,173 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include "cute/config.hpp" +#include "cute/pointer.hpp" + +namespace cute +{ + +struct XE_2D_U4x16x16_LD_T { + using BlockShape = Shape<_16, _16>; + using inst_dtype = uint32_t; + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockTranspose<4, 2, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U4x32x16_LD_T { + using BlockShape = Shape<_32, _16>; + using inst_dtype = uint32_t; + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + 
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U4x32x64_LD_N { + using BlockShape = Shape<_32, _64>; + using inst_dtype = int8_t; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); + + // ================= shuffle begin ================= + // FIXME: the performance of shuffle algorithm here is too bad, we are working with + // compiler/IGC team to optimize it. + + static constexpr auto subgroup_size = 16; + static constexpr auto copy_W = decltype(size<1>(BlockShape{}))::value / subgroup_size; + static constexpr auto copy_H = decltype(size<0>(BlockShape{}))::value; + + auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto id = int(ThreadIdxX()) % subgroup_size; + + cute::subbyte_iterator dst_iter(dst); + cute::array_subbyte dst_tmp{}; + + #pragma unroll + for (int cw = 0; cw < copy_W; cw++) { + auto remote_id = (id + cw * subgroup_size) / copy_W; + + // TODO: select 'ushort32' will cause compiling error, use 'ushort16' instead, why? 
+ intel::ushort16 remote_dst[2]; + remote_dst[0] = sycl::select_from_group(sg, *(reinterpret_cast(dst)), remote_id); + remote_dst[1] = sycl::select_from_group(sg, *((reinterpret_cast(dst)) + 1), remote_id); + + cute::subbyte_iterator remote_dst_iter(remote_dst); + + #pragma unroll + for (int row = 0; row < copy_H; row++) { + dst_tmp[row + cw * copy_H] = remote_dst_iter[row * copy_W + id % copy_W].get(); + } + } + + *reinterpret_cast(cute::raw_pointer_cast(dst_iter)) = *reinterpret_cast(cute::raw_pointer_cast(dst_tmp.begin())); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U4x16x64_LD_N { + using BlockShape = Shape<_16, _64>; + using inst_dtype = int8_t; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); + + // ================= shuffle begin ================= + // FIXME: the performance of shuffle algorithm here is too bad, we are working with + // compiler/IGC team to optimize it. 
+ + static constexpr auto subgroup_size = 16; + static constexpr auto copy_W = decltype(size<1>(BlockShape{}))::value / subgroup_size; + static constexpr auto copy_H = decltype(size<0>(BlockShape{}))::value; + + auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto id = int(ThreadIdxX()) % subgroup_size; + + cute::subbyte_iterator dst_iter(dst); + cute::array_subbyte dst_tmp{}; + + #pragma unroll + for (int cw = 0; cw < copy_W; cw++) { + auto remote_id = (id + cw * subgroup_size) / copy_W; + + intel::ushort16 remote_dst; + remote_dst = sycl::select_from_group(sg, *(reinterpret_cast(dst)), remote_id); + + cute::subbyte_iterator remote_dst_iter(&remote_dst); + + + #pragma unroll + for (int row = 0; row < copy_H; row++) { + dst_tmp[row + cw * copy_H] = remote_dst_iter[row * copy_W + id % copy_W].get(); + } + } + + *reinterpret_cast(cute::raw_pointer_cast(dst_iter)) = *reinterpret_cast(cute::raw_pointer_cast(dst_tmp.begin())); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +} // end namespace cute diff --git a/include/cute/arch/xe_copy_8B.hpp b/include/cute/arch/copy_xe_U64.hpp similarity index 51% rename from include/cute/arch/xe_copy_8B.hpp rename to include/cute/arch/copy_xe_U64.hpp index 733fefdb5d..49a984a789 100644 --- a/include/cute/arch/xe_copy_8B.hpp +++ b/include/cute/arch/copy_xe_U64.hpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -30,59 +30,8 @@ **************************************************************************************************/ #pragma once -#include -#include #include - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x -#else -#define SYCL_DEVICE_BUILTIN(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x -#else -#define SYCL_DEVICE_OCL(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -// 64bits No transform Transpose -SYCL_DEVICE_BUILTIN( - cute::intel::ulong __builtin_IB_subgroup_block_read_flat_transpose_u64_k1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ulong2 __builtin_IB_subgroup_block_read_flat_transpose_u64_k2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ulong4 __builtin_IB_subgroup_block_read_flat_transpose_u64_k4( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -#undef SYCL_DEVICE_BUILTIN - -#undef __global -#define __global __attribute__((opencl_global)) - -// 64bits No transform Transpose -SYCL_DEVICE_OCL(cute::intel::ulong intel_sub_group_block_read_transpose_64b_8r1c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ulong2 intel_sub_group_block_read_transpose_64b_8r2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ulong4 
intel_sub_group_block_read_transpose_64b_8r4c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -#undef SYCL_DEVICE_OCL +#include "cute/config.hpp" namespace cute { @@ -93,11 +42,9 @@ struct XE_2D_U64x8x1_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u64_k1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTranspose<8, 1, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); #endif @@ -111,11 +58,9 @@ struct XE_2D_U64x8x2_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u64_k2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTranspose<8, 2, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); #endif @@ -129,11 +74,9 @@ struct XE_2D_U64x8x4_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u64_k4( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + 
detail::XeSubgroup2DBlockTranspose<8, 4, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); #endif diff --git a/include/cute/arch/copy_xe_U8.hpp b/include/cute/arch/copy_xe_U8.hpp new file mode 100644 index 0000000000..f3a2d574ab --- /dev/null +++ b/include/cute/arch/copy_xe_U8.hpp @@ -0,0 +1,529 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include "cute/config.hpp" + +namespace cute +{ +struct XE_2D_U8x1x32_LD_N { + using BlockShape = Shape<_1, _32>; + using inst_dtype = int8_t; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 1, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x2x32_LD_N { + using BlockShape = Shape<_2, _32>; + using inst_dtype = int8_t; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + 
detail::XeSubgroup2DBlockLoad<1, 32, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 2, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x2x32_ST_N { /* 2D block store op: copy() requires sizeof(T) == 1 and forwards src to detail::XeSubgroup2DBlockStore<2, 16, 2, 1> when CUTE_ARCH_COPY_XE_ENABLED is defined, otherwise it hits CUTE_INVALID_CONTROL_PATH. NOTE(review): unlike the other *_ST_N ops in this patch, this one takes `const void *baseoffset` and a non-const `T *src`; the signature is kept as-is for caller compatibility -- confirm whether it should match the siblings. */ + using BlockShape = Shape<_2, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x4x32_LD_N { + using BlockShape = Shape<_4, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 4, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct 
XE_2D_U8x8x32_LD_N { + using BlockShape = Shape<_8, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 8, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x16x32_LD_N { /* 2D block load op: copy() requires sizeof(T) == 1 and forwards to detail::XeSubgroup2DBlockLoad<1, 32, 16, 1> when CUTE_ARCH_COPY_XE_ENABLED is defined, otherwise it hits CUTE_INVALID_CONTROL_PATH. */ + using BlockShape = Shape<_16, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { /* NOTE(review): this prefetch is instantiated as <2, 16, 16, 1>, whereas the 1/2/4/8-row U8 siblings use <1, 32, H, 1>. If the first two arguments are element size in bytes and width in elements, both describe a 32-byte row, so this is presumably a deliberate substitution -- confirm against detail::XeSubgroup2DBlockPrefetch. */ + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x32x32_LD_N { + using BlockShape = Shape<_32, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if 
defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x1x64_LD_N { + using BlockShape = Shape<_1, _64>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 1, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x2x64_LD_N { + using BlockShape = Shape<_2, _64>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 2, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( 
+ "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x4x64_LD_N { + using BlockShape = Shape<_4, _64>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 4, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x8x64_LD_N { + using BlockShape = Shape<_8, _64>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 8, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x16x64_LD_N { + using BlockShape = Shape<_16, _64>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, 
int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x32x64_LD_N { + using BlockShape = Shape<_32, _64>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + + + +struct XE_2D_U8x32x16_LD_V { + using BlockShape = Shape<_32, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + 
detail::XeSubgroup2DBlockTransform<1, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x32x32_LD_V { + using BlockShape = Shape<_32, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockTransform<1, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x32x64_LD_V { + using BlockShape = Shape<_32, _64>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockTransform<1, 16, 32, 4>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x1x16_ST_N { + using BlockShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<1, 16, 1, 1>{}(baseoffset, width, 
height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x2x16_ST_N { /* Store atom for a 2x16 block (see BlockShape) of 1-byte elements: copy() forwards its arguments unchanged to detail::XeSubgroup2DBlockStore<1, 16, 2, 1> when CUTE_ARCH_COPY_XE_ENABLED is defined, otherwise it reaches CUTE_INVALID_CONTROL_PATH. */ + using BlockShape = Shape<_2, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<1, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x4x16_ST_N { /* Store atom for a 4x16 block (see BlockShape) of 1-byte elements; forwards to detail::XeSubgroup2DBlockStore<1, 16, 4, 1>. */ + using BlockShape = Shape<_4, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<1, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x8x16_ST_N { /* Store atom for 1-byte elements forwarding to detail::XeSubgroup2DBlockStore<1, 16, 8, 1>. NOTE(review): unlike the 1/2/4-row store atoms, no BlockShape alias is declared here - confirm that is intended. */ + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<1, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x8x32_ST_N { /* Store atom for 1-byte elements forwarding to detail::XeSubgroup2DBlockStore<1, 16, 8, 2>; no BlockShape alias is declared here either. */ + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<1, 16, 8, 2>{}(baseoffset, width, height, pitch, coord, src); +#else + 
CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; +} // end namespace cute diff --git a/include/cute/arch/copy_xe_builtin.hpp b/include/cute/arch/copy_xe_builtin.hpp new file mode 100644 index 0000000000..a6404475eb --- /dev/null +++ b/include/cute/arch/copy_xe_builtin.hpp @@ -0,0 +1,1409 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include "cute/config.hpp" +/* SYCL_DEVICE_BUILTIN(x): in device compilation declares x as SYCL_EXTERNAL extern "C"; otherwise defines x inline with a body that only invokes CUTE_INVALID_CONTROL_PATH. NOTE(review): that host body has no return statement even when x returns a value - confirm CUTE_INVALID_CONTROL_PATH never returns. */ +#ifdef __SYCL_DEVICE_ONLY__ +#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x +#else +#define SYCL_DEVICE_BUILTIN(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } +#endif +/* SYCL_DEVICE_OCL(x): same scheme as SYCL_DEVICE_BUILTIN but without the extern "C" linkage specifier. */ +#ifdef __SYCL_DEVICE_ONLY__ +#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x +#else +#define SYCL_DEVICE_OCL(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } +#endif + +/* Redefine __global as the opencl_global address-space attribute for the declarations in this header. */ +#undef __global +#define __global __attribute__((opencl_global)) + +/* NOTE(review): copy_xe.hpp tests CUTE_ARCH_COPY_XE_ENABLED, while the macro defined here is spelled CUTE_ARCH_XE_COPY_ENABLED - confirm which spelling is intended. */ +#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_INTEL_TARGET) +#define CUTE_ARCH_XE_COPY_ENABLED +#endif + +namespace cute::detail +{ /* Primary templates: each body is a single static_assert, so presumably only the explicit specializations defined later in this header are usable. */ +template +struct XeSubgroup2DBlockPrefetch { + static_assert(dependent_false<>, "Unsupported 2D Block Prefetch Configuration."); +}; +template +struct XeSubgroup2DBlockLoad { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +template +struct XeSubgroup2DBlockTransform { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +template +struct XeSubgroup2DBlockTranspose { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +template +struct XeSubgroup2DBlockStore { + static_assert(dependent_false<>, "Unsupported 2D 
Block Store Configuration."); +}; +} +/* Cache-policy selector; taken as the last parameter by the *_read_prefetch_* builtin declarations in this header. */ +enum class CacheControl { + kDefault = 0, + kL1UC_L3UC = 1, // Override to L1 uncached and L3 uncached + kL1UC_L3C = 2, // Override to L1 uncached and L3 cached + kL1C_L3UC = 3, // Override to L1 cached and L3 uncached + kL1C_L3C = 4, // Override to L1 cached and L3 cached + kL1S_L3UC = 5, // Override to L1 streaming load and L3 uncached + kL1S_L3C = 6, // Override to L1 streaming load and L3 cached + kL1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached +}; + +// 8bits No transform No transpose +SYCL_DEVICE_BUILTIN(cute::intel::ushort __builtin_IB_subgroup_block_read_flat_u8_m1k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u8_m2k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u8_m4k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u8_m8k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u8_m16k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +SYCL_DEVICE_BUILTIN( + cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u8_m1k32v2( + intptr_t baseoffset, int width_minus_one, int 
height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u8_m2k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u8_m4k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u8_m8k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u8_m16k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort64 __builtin_IB_subgroup_block_read_flat_u8_m32k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + + +// 8bits VNNI transform No transpose +SYCL_DEVICE_BUILTIN( + cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transform_u8_k32( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_transform_u8_k32v4( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 8bits No transform No transpose +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m1k16v1( + intptr_t baseoffset, int 
width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m2k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar2 data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m4k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar4)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar8)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar8)); +/* 8-bit block prefetch builtins; the final argument is a CacheControl value. NOTE(review): these four take `long baseoffset` whereas the read/write builtins declared above take `intptr_t` - confirm the difference is intentional. */ +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +// 2D prefetch +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_1r32x2c( + 
__global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_2r32x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_4r32x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_8r32x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_32r16x1c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); + +SYCL_DEVICE_BUILTIN(cute::intel::ushort16 intel_subgroup_block_read_u16_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +SYCL_DEVICE_BUILTIN(cute::intel::int8 intel_subgroup_block_read_transform_u16_k16( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// U16 prefetch +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( + intptr_t baseoffset, int 
width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); + +// 16 bits No transform No transpose +SYCL_DEVICE_BUILTIN(cute::intel::ushort __builtin_IB_subgroup_block_read_flat_u16_m1k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u16_m2k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u16_m4k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u16_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u16_m16k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u16_m32k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +SYCL_DEVICE_BUILTIN( + cute::intel::ushort2 
__builtin_IB_subgroup_block_read_flat_u16_m1k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u16_m2k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u16_m4k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u16_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u16_m16k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort64 __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 16bits VNNI transform No transpose +SYCL_DEVICE_BUILTIN( + cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transform_u16_k16( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k32( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint32 
__builtin_IB_subgroup_block_read_flat_transform_u16_k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 16bits +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m1k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort2 data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m4k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort4 data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort8 data)); + +// 2D prefetch +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_1r16x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_2r16x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_4r16x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); + + +// 32bits specific for tf32 No transform No transpose +SYCL_DEVICE_BUILTIN( + cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m1k8v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m2k8v1( + intptr_t baseoffset, int 
width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m4k8v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_u32_m8k8v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m16k8v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m32k8v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +SYCL_DEVICE_BUILTIN( + cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m1k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t 
coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_u32_m32k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 32bits No transform No transpose +SYCL_DEVICE_BUILTIN(cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_u32_m32k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 32bits No transform Transpose +SYCL_DEVICE_BUILTIN(cute::intel::uint __builtin_IB_subgroup_block_read_flat_transpose_u32_k1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t 
coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 32bits +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint2 data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint4 data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint8 data)); + +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_32b_16r8x1c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); + +// 64bits No transform Transpose +SYCL_DEVICE_BUILTIN( + cute::intel::ulong __builtin_IB_subgroup_block_read_flat_transpose_u64_k1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ulong2 __builtin_IB_subgroup_block_read_flat_transpose_u64_k2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); 
+SYCL_DEVICE_BUILTIN( + cute::intel::ulong4 __builtin_IB_subgroup_block_read_flat_transpose_u64_k4( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +namespace cute::detail +{ +template<> +struct XeSubgroup2DBlockLoad<1, 32, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m1k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m2k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m4k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m8k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 
32, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m16k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 32, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 1, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m1k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 2, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m2k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 4, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m4k32v2( + 
(intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 8, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m8k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 16, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m16k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m32k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<1, 16, 32, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u8_k32( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<1, 16, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, 
int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<1, 16, 32, 4> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u8_k32v4( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m1k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m2k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar2 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m4k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar4 *)(srcPointer)); + } +}; + 
+template<> +struct XeSubgroup2DBlockStore<1, 16, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m8k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar8 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 8, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar8 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 1, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 2, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 4, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 
1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 8, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { /* Forwards to the u8 m8k32v1 prefetch builtin with the L1-cached / L3-cached hint. */ + __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( /* builtin is declared with width/height/pitch *minus one*, unlike the intel_sub_group_2d_block_prefetch_* calls */ + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 1, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_1r32x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 2, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_2r32x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 4, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_4r32x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 8, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_8r32x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 16, 32, 1> { + CUTE_HOST_DEVICE void + operator()(const void* 
srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_32r16x1c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m1k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m2k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m4k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m8k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct 
XeSubgroup2DBlockLoad<2, 16, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m16k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 32, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m32k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 1, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m1k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 2, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m2k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 4, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = 
__builtin_IB_subgroup_block_read_flat_u16_m4k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 8, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m8k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 16, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m16k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k16( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 32, 1> { + template + CUTE_HOST_DEVICE void + operator()(const 
void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k32( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 16, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u16_m1k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(ushort *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, 
coordinate, *(intel::ushort2 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u16_m4k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort4 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u16_m8k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort8 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 8, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 16, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 32, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( + 
(intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 1, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_1r16x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 2, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_2r16x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 4, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_4r16x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 8, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 16, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, 
CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 32, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m4k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + 
*reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 32, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 4, 1> { + template + 
CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m4k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 32, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 1, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k8v2( + 
reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 2, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 4, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m4k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 8, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 16, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int 
memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 1, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 2, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 4, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 8, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( + reinterpret_cast(srcBasePointer), memoryWidth - 1, 
memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m1k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(uint *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m2k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint2 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m4k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint4 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint8 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<4, 8, 16, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) 
{ + intel_sub_group_2d_block_prefetch_32b_16r8x1c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<8, 1, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u64_k1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<8, 2, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u64_k2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<8, 4, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u64_k4( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +} // namespace cute::detail diff --git a/include/cute/arch/copy_xe_spirv.hpp b/include/cute/arch/copy_xe_spirv.hpp new file mode 100644 index 0000000000..d37db7cbe0 --- /dev/null +++ b/include/cute/arch/copy_xe_spirv.hpp @@ -0,0 +1,504 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include "cute/config.hpp" + +// TODO(Codeplay): These builtins are not available on SPIRV +SYCL_EXTERNAL extern "C" +cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord); + +SYCL_EXTERNAL extern "C" +cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord); + +enum class CacheControl { + kDefault = 0, + kL1UC_L3UC = 1, // Override to L1 uncached and L3 uncached + kL1UC_L3C = 2, // Override to L1 uncached and L3 cached + kL1C_L3UC = 3, // Override to L1 cached and L3 uncached + kL1C_L3C = 4, // Override to L1 cached and L3 cached + kL1S_L3UC = 5, // Override to L1 streaming load and L3 uncached + kL1S_L3C = 6, // Override to L1 streaming load and L3 cached + kL1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached +}; + +// U16 prefetch +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( + intptr_t baseoffset, 
int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_8b_1r32x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_8b_2r32x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_8b_4r32x2c( + 
__attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_8b_8r32x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_8b_32r16x1c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL +void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_16b_1r16x2c( + 
__attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_16b_2r16x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_16b_4r16x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_32b_16r8x1c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +// SPIRV copy definitions +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void *src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate, void *dst_pointer); + +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadTransformINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void *src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate, void *dst_pointer); + +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadTransposeINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void *src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate, void *dst_pointer); + +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockStoreINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + void *src_pointer, const void *dst_base_pointer, int memory_width, + int memory_height, int memory_pitch, cute::intel::coord_t coordinate); + +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockPrefetchINTEL( + int 
ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void *src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate); + +namespace cute::detail { +template +struct XeSubgroup2DBlockLoad { + template + CUTE_HOST_DEVICE + void operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T *dstPointer) { + __spirv_Subgroup2DBlockLoadINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, + memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); + } +}; + +template +struct XeSubgroup2DBlockTransform { + template + CUTE_HOST_DEVICE + void operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T *dstPointer) { + __spirv_Subgroup2DBlockLoadTransformINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, + memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); + } +}; + +template +struct XeSubgroup2DBlockTranspose { + template + CUTE_HOST_DEVICE + void operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T *dstPointer) { + __spirv_Subgroup2DBlockLoadTransposeINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, + memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); + } +}; + +template +struct XeSubgroup2DBlockPrefetch { + CUTE_HOST_DEVICE + void operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __spirv_Subgroup2DBlockPrefetchINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, + memoryHeight, memoryPitch, coordinate); + } +}; + +template +struct XeSubgroup2DBlockStore { + template + CUTE_HOST_DEVICE + void operator()(const void *dstBasePointer, int memoryWidth, 
int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T *srcPointer) { + __spirv_Subgroup2DBlockStoreINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + (void *)(srcPointer), dstBasePointer, + memoryWidth, memoryHeight, + memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 2, 16, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T *dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 4, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T *dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 1, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 2, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> 
+struct XeSubgroup2DBlockPrefetch<1, 32, 4, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 8, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 1, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_1r32x2c( + (__attribute__((opencl_global)) void*)(srcBasePointer), memoryWidth, memoryHeight, + memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 2, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_2r32x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, + memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 4, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_4r32x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> +struct 
XeSubgroup2DBlockPrefetch<1, 32, 8, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_8r32x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 16, 32, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_32r16x1c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 8, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 16, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 32, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> 
+struct XeSubgroup2DBlockPrefetch<2, 16, 1, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_1r16x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 2, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_2r16x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 4, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_4r16x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 8, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 16, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct 
XeSubgroup2DBlockPrefetch<2, 16, 32, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<4, 8, 16, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_32b_16r8x1c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; +} // namespace cute::detail end diff --git a/include/cute/arch/mma_xe.hpp b/include/cute/arch/mma_xe.hpp index ccd03eedc7..763da5020f 100644 --- a/include/cute/arch/mma_xe.hpp +++ b/include/cute/arch/mma_xe.hpp @@ -30,54 +30,107 @@ **************************************************************************************************/ #pragma once -#include -#include -#include +#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_INTEL_TARGET) +#define CUTE_ARCH_MMA_XE_ENABLED +#endif -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x -#else -#define SYCL_DEVICE_OCL(x) inline x { CUTE_INVALID_CONTROL_PATH("Trying to use XE built-in on non-XE hardware"); } +#if defined(CUTE_ARCH_MMA_XE_ENABLED) && ((defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER < 20250200)) || defined(CUTLASS_SYCL_BUILTIN_ENABLE)) +#include +#elif defined(CUTE_ARCH_MMA_XE_ENABLED) +#include #endif -// mma_bf16 with float acc -SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc)); -SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 
acc)); -SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc)); -SYCL_DEVICE_OCL(float intel_sub_group_bf16_bf16_matrix_mad_k16(short a, cute::intel::int8 b, float acc)); -// mma bf16 with bf16 acc -SYCL_DEVICE_OCL(cute::intel::short8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::short8 acc)); -SYCL_DEVICE_OCL(cute::intel::short4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::short4 acc)); -SYCL_DEVICE_OCL(cute::intel::short2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::short2 acc)); -SYCL_DEVICE_OCL(short intel_sub_group_bf16_bf16_matrix_mad_k16(short a, cute::intel::int8 b, short acc)); -// mma_half -SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc)); -SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc)); -SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc)); -SYCL_DEVICE_OCL(float intel_sub_group_f16_f16_matrix_mad_k16(short a, cute::intel::int8 b, float acc)); -// mma_s8 -SYCL_DEVICE_OCL(cute::intel::int8 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short8 a, cute::intel::int8 b, cute::intel::int8 acc)); -SYCL_DEVICE_OCL(cute::intel::int4 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short4 a, cute::intel::int8 b, cute::intel::int4 acc)); -SYCL_DEVICE_OCL(cute::intel::int2 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short2 a, cute::intel::int8 b, cute::intel::int2 acc)); -SYCL_DEVICE_OCL(int intel_sub_group_i8_i8_matrix_mad_k32(short a, cute::intel::int8 b, int acc)); -// mma_u8 -SYCL_DEVICE_OCL(cute::intel::int8 
intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort8 a, cute::intel::uint8 b, cute::intel::int8 acc)); -SYCL_DEVICE_OCL(cute::intel::int4 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort4 a, cute::intel::uint8 b, cute::intel::int4 acc)); -SYCL_DEVICE_OCL(cute::intel::int2 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort2 a, cute::intel::uint8 b, cute::intel::int2 acc)); -SYCL_DEVICE_OCL(int intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort a, cute::intel::uint8 b, int acc)); -// mma_tf32 -SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float4 a, cute::intel::float8 b, cute::intel::float8 acc)); -SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float2 a, cute::intel::float8 b, cute::intel::float4 acc)); -SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_tf32_tf32_matrix_mad_k8(float a, cute::intel::float8 b, cute::intel::float2 acc)); -SYCL_DEVICE_OCL(float intel_sub_group_tf32_tf32_matrix_mad_k8(float a, cute::intel::float8 b, float acc)); - -#undef SYCL_DEVICE_OCL +#include +#include +#include namespace cute { //MxNxK_D,A,B,C //# of vector component of a x subgroup-size x function name //float8 intel_sub_group_bf16_bf16_matrix_mad_k16(short8 a, int8 b, float8 acc); //TODO: Is A really not transposed? 
Maybe better a macro than separate define for 1,2,4,8 +struct XE_8x16x16_F32BF16BF16F32_TT +{ + using DRegisters = intel::float8[1]; + using ARegisters = intel::short8[1]; + using BRegisters = intel::int8[1]; + using CRegisters = intel::float8[1]; + + CUTE_HOST_DEVICE static void + fma(intel::float8 & d, + intel::short8 const& a, + intel::int8 const& b, + intel::float8 const& c) + { +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-Xe hardware"); +#endif + } +}; +struct XE_4x16x16_F32BF16BF16F32_TT +{ + using DRegisters = intel::float4[1]; + using ARegisters = intel::short4[1]; + using BRegisters = intel::int8[1]; + using CRegisters = intel::float4[1]; + + CUTE_HOST_DEVICE static void + fma(intel::float4 & d, + intel::short4 const& a, + intel::int8 const& b, + intel::float4 const& c) + { +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_F32BF16BF16F32_TT on non-Xe hardware"); +#endif + } +}; +struct XE_2x16x16_F32BF16BF16F32_TT +{ + using DRegisters = intel::float2[1]; + using ARegisters = intel::short2[1]; + using BRegisters = intel::int8[1]; + using CRegisters = intel::float2[1]; + + CUTE_HOST_DEVICE static void + fma(intel::float2 & d, + intel::short2 const& a, + intel::int8 const& b, + intel::float2 const& c) + { +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_F32BF16BF16F32_TT on non-Xe hardware"); +#endif + } +}; +//float intel_sub_group_bf16_bf16_matrix_mad_k16(short a, int8 b, float acc) +struct XE_1x16x16_F32BF16BF16F32_TT +{ + using DRegisters = float[1]; + using ARegisters = short[1]; + using BRegisters = intel::int8[1]; + using CRegisters = float[1]; + + CUTE_HOST_DEVICE static void + 
fma(float & d, + short const& a, + intel::int8 const& b, + float const& c) + { +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F32BF16BF16F32_TT on non-Xe hardware"); +#endif + } +}; + struct XE_8x16x16_BF16BF16BF16BF16_TT { using DRegisters = intel::short8[1]; @@ -91,10 +144,10 @@ struct XE_8x16x16_BF16BF16BF16BF16_TT intel::int8 const& b, intel::short8 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_BF16BF16BF16BF16_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); #endif } }; @@ -111,10 +164,10 @@ struct XE_4x16x16_BF16BF16BF16BF16_TT intel::int8 const& b, intel::short4 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_BF16BF16BF16BF16_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); #endif } }; @@ -131,10 +184,10 @@ struct XE_2x16x16_BF16BF16BF16BF16_TT intel::int8 const& b, intel::short2 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_BF16BF16BF16BF16_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); #endif } }; @@ -146,20 +199,23 @@ struct XE_1x16x16_BF16BF16BF16BF16_TT using 
CRegisters = short[1]; CUTE_HOST_DEVICE static void - fma(short & d, - short const & a, + fma(short & d, + short const& a, intel::int8 const& b, - short const & c) + short const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_BF16BF16BF16BF16_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); #endif } }; - -struct XE_8x16x16_F32BF16BF16F32_TT +//MxNxK_A,B,C,D +//# of vector component of a x subgroup-size x function name +//float8 intel_sub_group_f16_f16_matrix_mad_k16(short8 a, int8 b, float8 acc); +//TODO: Is A really not transposed? Maybe better a macro than separate define for 1,2,4,8 +struct XE_8x16x16_F32F16F16F32_TT { using DRegisters = intel::float8[1]; using ARegisters = intel::short8[1]; @@ -172,14 +228,15 @@ struct XE_8x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float8 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32F16F16F32_TT on non-Xe hardware"); #endif } }; -struct XE_4x16x16_F32BF16BF16F32_TT + +struct XE_4x16x16_F32F16F16F32_TT { using DRegisters = intel::float4[1]; using ARegisters = intel::short4[1]; @@ -192,14 +249,15 @@ struct XE_4x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float4 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use 
XE_8x16x16_F32BF16BF16F32_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_F32F16F16F32_TT on non-Xe hardware"); #endif } }; -struct XE_2x16x16_F32BF16BF16F32_TT + +struct XE_2x16x16_F32F16F16F32_TT { using DRegisters = intel::float2[1]; using ARegisters = intel::short2[1]; @@ -212,15 +270,15 @@ struct XE_2x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float2 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_F32F16F16F32_TT on non-Xe hardware"); #endif } }; -//float intel_sub_group_bf16_bf16_matrix_mad_k16(short a, int8 b, float acc) -struct XE_1x16x16_F32BF16BF16F32_TT + +struct XE_1x16x16_F32F16F16F32_TT { using DRegisters = float[1]; using ARegisters = short[1]; @@ -228,107 +286,102 @@ struct XE_1x16x16_F32BF16BF16F32_TT using CRegisters = float[1]; CUTE_HOST_DEVICE static void - fma(float & d, - short const& a, - intel::int8 const& b, - float const& c) + fma(float & d, + short const& a, + intel::int8 const& b, + float const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F32BF16BF16F32_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F32F16F16F32_TT on non-Xe hardware"); #endif } }; -//MxNxK_D,A,B,C -//# of vector component of a x subgroup-size x function name -//float8 intel_sub_group_f16_f16_matrix_mad_k16(short8 a, int8 b, int8 acc); -//TODO: Is A really not transposed? 
Maybe better a macro than separate define for 1,2,4,8 -struct XE_8x16x16_F32F16F16F32_TT +struct XE_8x16x16_F16F16F16F16_TT { - using DRegisters = intel::float8[1]; + using DRegisters = intel::half8[1]; using ARegisters = intel::short8[1]; using BRegisters = intel::int8[1]; - using CRegisters = intel::float8[1]; + using CRegisters = intel::half8[1]; CUTE_HOST_DEVICE static void - fma(intel::float8 & d, + fma(intel::half8 & d, intel::short8 const& a, intel::int8 const& b, - intel::float8 const& c) + intel::half8 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32F16F16F32_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F16F16F16F16_TT on non-PVC hardware"); #endif } }; -struct XE_4x16x16_F32F16F16F32_TT +struct XE_4x16x16_F16F16F16F16_TT { - using DRegisters = intel::float4[1]; + using DRegisters = intel::half4[1]; using ARegisters = intel::short4[1]; using BRegisters = intel::int8[1]; - using CRegisters = intel::float4[1]; + using CRegisters = intel::half4[1]; CUTE_HOST_DEVICE static void - fma(intel::float4 & d, + fma(intel::half4 & d, intel::short4 const& a, intel::int8 const& b, - intel::float4 const& c) + intel::half4 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_F32F16F16F32_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_F16F16F16F16_TT on non-PVC hardware"); #endif } }; -struct XE_2x16x16_F32F16F16F32_TT +struct XE_2x16x16_F16F16F16F16_TT { - using DRegisters = intel::float2[1]; + using DRegisters = intel::half2[1]; using ARegisters = intel::short2[1]; using 
BRegisters = intel::int8[1]; - using CRegisters = intel::float2[1]; + using CRegisters = intel::half2[1]; CUTE_HOST_DEVICE static void - fma(intel::float2 & d, + fma(intel::half2 & d, intel::short2 const& a, intel::int8 const& b, - intel::float2 const& c) + intel::half2 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_F32F16F16F32_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_F16F16F16F16_TT on non-PVC hardware"); #endif } }; -struct XE_1x16x16_F32F16F16F32_TT +struct XE_1x16x16_F16F16F16F16_TT { - using DRegisters = float[1]; + using DRegisters = intel::half[1]; using ARegisters = short[1]; using BRegisters = intel::int8[1]; - using CRegisters = float[1]; + using CRegisters = intel::half[1]; CUTE_HOST_DEVICE static void - fma(float & d, - short const& a, - intel::int8 const& b, - float const& c) + fma(intel::half & d, + short const& a, + intel::int8 const& b, + intel::half const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F32F16F16F32_TT on non-Xe hardware"); + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F16F16F16F16_TT on non-PVC hardware"); #endif } }; - //MxNxK_A,B,C,D //# of vector component of a x subgroup-size x function name //float8 intel_sub_group_i8_i8_matrix_mad_k16(short8 a, int8 b, float8 acc); @@ -346,8 +399,8 @@ struct XE_8x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int8 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else 
CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x32_S32S8S8S32_TT on non-Xe hardware"); #endif @@ -367,8 +420,8 @@ struct XE_4x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int4 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x32_S32S8S8S32_TT on non-Xe hardware"); #endif @@ -388,8 +441,8 @@ struct XE_2x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int2 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x32_S32S8S8S32_TT on non-Xe hardware"); #endif @@ -409,8 +462,8 @@ struct XE_1x16x32_S32S8S8S32_TT intel::int8 const& b, int const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x32_S32S8S8S32_TT on non-Xe hardware"); #endif @@ -430,8 +483,8 @@ struct XE_8x16x32_S32U8U8S32_TT intel::uint8 const& b, intel::int8 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x32_S32U8U8S32_TT on non-Xe hardware"); #endif @@ -451,8 +504,8 @@ struct XE_4x16x32_S32U8U8S32_TT intel::uint8 const& b, intel::int4 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use 
XE_4x16x32_S32U8U8S32_TT on non-Xe hardware"); #endif @@ -472,8 +525,8 @@ struct XE_2x16x32_S32U8U8S32_TT intel::uint8 const& b, intel::int2 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x32_S32U8U8S32_TT on non-Xe hardware"); #endif @@ -493,8 +546,8 @@ struct XE_1x16x32_S32U8U8S32_TT intel::uint8 const& b, int const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x32_S32U8U8S32_TT on non-Xe hardware"); #endif @@ -514,8 +567,8 @@ struct XE_8x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float8 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x8_F32TF32TF32F32_TT on non-Xe hardware"); #endif @@ -535,8 +588,8 @@ struct XE_4x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float4 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x8_F32TF32TF32F32_TT on non-Xe hardware"); #endif @@ -556,8 +609,8 @@ struct XE_2x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float2 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x8_F32TF32TF32F32_TT 
on non-Xe hardware"); #endif @@ -577,8 +630,8 @@ struct XE_1x16x8_F32TF32TF32F32_TT intel::float8 const& b, float const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x8_F32TF32TF32F32_TT on non-Xe hardware"); #endif diff --git a/include/cute/arch/mma_xe_builtin.hpp b/include/cute/arch/mma_xe_builtin.hpp new file mode 100644 index 0000000000..504872f656 --- /dev/null +++ b/include/cute/arch/mma_xe_builtin.hpp @@ -0,0 +1,140 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once +#include + +// mma_bf16 +SYCL_EXTERNAL cute::intel::float8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc); +SYCL_EXTERNAL cute::intel::float4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc); +SYCL_EXTERNAL cute::intel::float2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc); +SYCL_EXTERNAL float intel_sub_group_bf16_bf16_matrix_mad_k16( short a, cute::intel::int8 b, float acc); +// mma_half +SYCL_EXTERNAL cute::intel::float8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc); +SYCL_EXTERNAL cute::intel::float4 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc); +SYCL_EXTERNAL cute::intel::float2 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc); +SYCL_EXTERNAL float intel_sub_group_f16_f16_matrix_mad_k16( short a, cute::intel::int8 b, float acc); +// mma_s8 +SYCL_EXTERNAL cute::intel::int8 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short8 a, cute::intel::int8 b, cute::intel::int8 acc); +SYCL_EXTERNAL cute::intel::int4 
intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short4 a, cute::intel::int8 b, cute::intel::int4 acc); +SYCL_EXTERNAL cute::intel::int2 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short2 a, cute::intel::int8 b, cute::intel::int2 acc); +SYCL_EXTERNAL int intel_sub_group_i8_i8_matrix_mad_k32( short a, cute::intel::int8 b, int acc); +// mma_u8 +SYCL_EXTERNAL cute::intel::int8 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort8 a, cute::intel::uint8 b, cute::intel::int8 acc); +SYCL_EXTERNAL cute::intel::int4 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort4 a, cute::intel::uint8 b, cute::intel::int4 acc); +SYCL_EXTERNAL cute::intel::int2 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort2 a, cute::intel::uint8 b, cute::intel::int2 acc); +SYCL_EXTERNAL int intel_sub_group_u8_u8_matrix_mad_k32( ushort a, cute::intel::uint8 b, int acc); +// mma_tf32 +SYCL_EXTERNAL cute::intel::float8 intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float4 a, cute::intel::float8 b, cute::intel::float8 acc); +SYCL_EXTERNAL cute::intel::float4 intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float2 a, cute::intel::float8 b, cute::intel::float4 acc); +SYCL_EXTERNAL cute::intel::float2 intel_sub_group_tf32_tf32_matrix_mad_k8( float a, cute::intel::float8 b, cute::intel::float2 acc); +SYCL_EXTERNAL float intel_sub_group_tf32_tf32_matrix_mad_k8( float a, cute::intel::float8 b, float acc); +// mma_bfloat16 with bfloat16 accumulator: +SYCL_EXTERNAL cute::intel::short8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::short8 acc); +SYCL_EXTERNAL cute::intel::short4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::short4 acc); +SYCL_EXTERNAL cute::intel::short2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::short2 acc); +SYCL_EXTERNAL short intel_sub_group_bf16_bf16_matrix_mad_k16( short a, cute::intel::int8 b, short 
acc); +// mma_half with half accumulator: +SYCL_EXTERNAL cute::intel::half8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::half8 acc); +SYCL_EXTERNAL cute::intel::half4 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::half4 acc); +SYCL_EXTERNAL cute::intel::half2 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::half2 acc); +SYCL_EXTERNAL sycl::half intel_sub_group_f16_f16_matrix_mad_k16( short a, cute::intel::int8 b, sycl::half acc); + +namespace cute::detail +{ + +template +struct XeSubgroupMatrixMultiplyAccumulate { + static_assert(dependent_false<>, "Unsupported MMA Configuration."); +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return 
intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); + } +}; +} // namespace cute::detail end diff --git a/include/cute/arch/mma_xe_spirv.hpp b/include/cute/arch/mma_xe_spirv.hpp new file mode 100644 index 0000000000..2d512bb685 --- /dev/null +++ b/include/cute/arch/mma_xe_spirv.hpp @@ -0,0 +1,157 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once +#include + +SYCL_EXTERNAL cute::intel::float8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::float8, int32_t); +SYCL_EXTERNAL cute::intel::float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::float4, int32_t); +SYCL_EXTERNAL cute::intel::float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::float2, int32_t); +SYCL_EXTERNAL float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, float, int32_t); + +SYCL_EXTERNAL cute::intel::int8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::int8, int32_t); +SYCL_EXTERNAL cute::intel::int4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::int4, int32_t); +SYCL_EXTERNAL cute::intel::int2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::int2, int32_t); +SYCL_EXTERNAL int __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, int, int32_t); + +SYCL_EXTERNAL cute::intel::int8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort8, cute::intel::uint8, 
cute::intel::int8, int32_t); +SYCL_EXTERNAL cute::intel::int4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort4, cute::intel::uint8, cute::intel::int4, int32_t); +SYCL_EXTERNAL cute::intel::int2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort2, cute::intel::uint8, cute::intel::int2, int32_t); +SYCL_EXTERNAL int __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, ushort, cute::intel::uint8, int, int32_t); + +SYCL_EXTERNAL cute::intel::float8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::float4, cute::intel::float8, cute::intel::float8, int32_t); +SYCL_EXTERNAL cute::intel::float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::float2, cute::intel::float8, cute::intel::float4, int32_t); +SYCL_EXTERNAL cute::intel::float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, float, cute::intel::float8, cute::intel::float2, int32_t); +SYCL_EXTERNAL float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, float, cute::intel::float8, float, int32_t); + +SYCL_EXTERNAL cute::intel::short8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::short8, int32_t); +SYCL_EXTERNAL cute::intel::short4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::short4, int32_t); +SYCL_EXTERNAL cute::intel::short2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::short2, int32_t); +SYCL_EXTERNAL short __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, short, int32_t); + +SYCL_EXTERNAL cute::intel::half8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::half8, int32_t); +SYCL_EXTERNAL cute::intel::half4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::half4, int32_t); 
+SYCL_EXTERNAL cute::intel::half2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::half2, int32_t); +SYCL_EXTERNAL cute::intel::half __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, cute::intel::half, int32_t); + +struct SPIRV_MMAOperands { + static constexpr int SPIRV_MatrixASigned = 0x1; + static constexpr int SPIRV_MatrixBSigned = 0x2; + static constexpr int SPIRV_MatrixAInt8 = 0x10; + static constexpr int SPIRV_MatrixBInt8 = 0x20; + static constexpr int SPIRV_MatrixAFp16 = 0x400; + static constexpr int SPIRV_MatrixBFp16 = 0x800; + static constexpr int SPIRV_MatrixABf16 = 0x1000; + static constexpr int SPIRV_MatrixBBf16 = 0x2000; + static constexpr int SPIRV_MatrixCBf16 = 0xC; + static constexpr int SPIRV_MatrixATf32 = 0x100; + static constexpr int SPIRV_MatrixBTf32 = 0x200; +}; + +namespace cute::detail +{ + +template +struct XeSubgroupMatrixMultiplyAccumulate { + static_assert(dependent_false<>, "Unsupported MMA Configuration."); +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixABf16 | SPIRV_MMAOperands::SPIRV_MatrixBBf16 ); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixABf16 | SPIRV_MMAOperands::SPIRV_MatrixBBf16 | + SPIRV_MMAOperands::SPIRV_MatrixCBf16); 
+ } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixASigned | SPIRV_MMAOperands::SPIRV_MatrixBSigned | + SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(8, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixATf32 | SPIRV_MMAOperands::SPIRV_MatrixBTf32); + } +}; +} // namespace cute::detail end diff --git a/include/cute/arch/xe_copy_1B.hpp b/include/cute/arch/xe_copy_1B.hpp deleted file mode 100644 index acb9bc52e8..0000000000 --- a/include/cute/arch/xe_copy_1B.hpp +++ /dev/null @@ -1,936 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ -#pragma once - -#include -#include -#include -#include -#include "cute/pointer.hpp" - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x -#else -#define SYCL_DEVICE_BUILTIN(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x -#else -#define SYCL_DEVICE_OCL(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -// 8bits No transform No transpose -SYCL_DEVICE_BUILTIN(cute::intel::ushort __builtin_IB_subgroup_block_read_flat_u8_m1k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u8_m2k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u8_m4k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u8_m8k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u8_m16k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - 
-SYCL_DEVICE_BUILTIN( - cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u8_m1k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u8_m2k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u8_m4k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u8_m8k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u8_m16k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort64 __builtin_IB_subgroup_block_read_flat_u8_m32k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - - -// 8bits VNNI transform No transpose -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transform_u8_k32( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_transform_u8_k32v4( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// 
8bits No transform No transpose -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m1k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m2k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar2 data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m4k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar4)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar8)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar8)); - -// U8 prefetch -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int 
pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); - -#undef SYCL_DEVICE_BUILTIN - -#undef __global -#define __global __attribute__((opencl_global)) -// 8 bits No transform No transpose -SYCL_DEVICE_OCL(cute::intel::ushort intel_sub_group_block_read_8b_1r32c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort2 intel_sub_group_block_read_8b_2r32c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort4 intel_sub_group_block_read_8b_4r32c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort8 intel_sub_group_block_read_8b_8r32c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort16 intel_sub_group_block_read_8b_16r32c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -SYCL_DEVICE_OCL(cute::intel::ushort2 intel_sub_group_block_read_8b_1r32x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort4 intel_sub_group_block_read_8b_2r32x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort8 intel_sub_group_block_read_8b_4r32x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort16 intel_sub_group_block_read_8b_8r32x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort32 intel_sub_group_block_read_8b_16r32x2c( - const __global void *base_address, int width, int height, int pitch, - 
cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort64 intel_sub_group_block_read_8b_32r32x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -// 8bits VNNI transform No transpose -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_transform_8b_32r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 intel_sub_group_block_read_transform_8b_32r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint32 intel_sub_group_block_read_transform_8b_32r16x4c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -// 8bits store -SYCL_DEVICE_OCL(void intel_sub_group_block_write_8b_1r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uchar data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_8b_2r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uchar2 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_8b_4r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uchar4 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_8b_8r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uchar8 data)); - - -// 2D prefetch -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_1r32x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_2r32x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void 
intel_sub_group_2d_block_prefetch_8b_4r32x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_8r32x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_32r16x1c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -#undef SYCL_DEVICE_OCL - -namespace cute -{ -struct XE_2D_U8x2x32_ST_N { - using BlockShape = Shape<_2, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *src) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::ushort2 *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x1x32_LD_N { - using BlockShape = Shape<_1, _32>; - using inst_dtype = int8_t; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m1k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, CacheControl::kL1C_L3C); -#else - 
CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x2x32_LD_N { - using BlockShape = Shape<_2, _32>; - using inst_dtype = int8_t; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m2k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; - -}; - -struct XE_2D_U8x4x32_LD_N { - using BlockShape = Shape<_4, _32>; - using inst_dtype = int8_t; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m4k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - 
CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x8x32_LD_N { - using BlockShape = Shape<_8, _32>; - using inst_dtype = int8_t; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m8k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x16x32_LD_N { - using BlockShape = Shape<_16, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m16k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - 
CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x32x32_LD_N { - using BlockShape = Shape<_32, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U4x16x16_LD_T { - using BlockShape = Shape<_16, _16>; - using inst_dtype = uint32_t; - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U4x32x16_LD_T { - using BlockShape = Shape<_32, _16>; - using inst_dtype = uint32_t; - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U4x32x64_LD_N { - using BlockShape = Shape<_32, _64>; - using inst_dtype = int8_t; - - template - CUTE_HOST_DEVICE static void copy(const void 
*baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); - - // ================= shuffle begin ================= - // FIXME: the performance of shuffle algorithm here is too bad, we are working with - // compiler/IGC team to optimize it. - - static constexpr auto subgroup_size = 16; - static constexpr auto copy_W = decltype(size<1>(BlockShape{}))::value / subgroup_size; - static constexpr auto copy_H = decltype(size<0>(BlockShape{}))::value; - - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); - auto id = int(ThreadIdxX()) % subgroup_size; - - cute::subbyte_iterator dst_iter(dst); - cute::array_subbyte dst_tmp{}; - - #pragma unroll - for (int cw = 0; cw < copy_W; cw++) { - auto remote_id = (id + cw * subgroup_size) / copy_W; - - // TODO: select 'ushort32' will cause compiling error, use 'ushort16' instead, why? 
- intel::ushort16 remote_dst[2]; - remote_dst[0] = sycl::select_from_group(sg, *(reinterpret_cast(dst)), remote_id); - remote_dst[1] = sycl::select_from_group(sg, *((reinterpret_cast(dst)) + 1), remote_id); - - cute::subbyte_iterator remote_dst_iter(remote_dst); - - #pragma unroll - for (int row = 0; row < copy_H; row++) { - dst_tmp[row + cw * copy_H] = remote_dst_iter[row * copy_W + id % copy_W].get(); - } - } - - *reinterpret_cast(cute::raw_pointer_cast(dst_iter)) = *reinterpret_cast(cute::raw_pointer_cast(dst_tmp.begin())); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U4x16x64_LD_N { - using BlockShape = Shape<_16, _64>; - using inst_dtype = int8_t; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m16k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); - - // ================= shuffle begin ================= - // FIXME: the performance of shuffle algorithm here is too bad, we are working with - // compiler/IGC team to optimize it. 
- - static constexpr auto subgroup_size = 16; - static constexpr auto copy_W = decltype(size<1>(BlockShape{}))::value / subgroup_size; - static constexpr auto copy_H = decltype(size<0>(BlockShape{}))::value; - - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); - auto id = int(ThreadIdxX()) % subgroup_size; - - cute::subbyte_iterator dst_iter(dst); - cute::array_subbyte dst_tmp{}; - - #pragma unroll - for (int cw = 0; cw < copy_W; cw++) { - auto remote_id = (id + cw * subgroup_size) / copy_W; - - intel::ushort16 remote_dst; - remote_dst = sycl::select_from_group(sg, *(reinterpret_cast(dst)), remote_id); - - cute::subbyte_iterator remote_dst_iter(&remote_dst); - - - #pragma unroll - for (int row = 0; row < copy_H; row++) { - dst_tmp[row + cw * copy_H] = remote_dst_iter[row * copy_W + id % copy_W].get(); - } - } - - *reinterpret_cast(cute::raw_pointer_cast(dst_iter)) = *reinterpret_cast(cute::raw_pointer_cast(dst_tmp.begin())); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x1x64_LD_N { - using BlockShape = Shape<_1, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m1k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_8b_1r32x2c( - (__global void*)baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct 
XE_2D_U8x2x64_LD_N { - using BlockShape = Shape<_2, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m2k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_8b_2r32x2c( - (__global void*)baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x4x64_LD_N { - using BlockShape = Shape<_4, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m4k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_8b_4r32x2c( - (__global void*)baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x8x64_LD_N { - using BlockShape = Shape<_8, _64>; - - template - CUTE_HOST_DEVICE static void copy(const 
void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m8k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_8b_8r32x2c( - (__global void*)baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x16x64_LD_N { - using BlockShape = Shape<_16, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m16k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x32x64_LD_N { - using BlockShape = Shape<_32, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, 
intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m32k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - - - -struct XE_2D_U8x32x16_LD_V { - using BlockShape = Shape<_32, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u8_k32( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_8b_32r16x1c( - (__global void*)baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x32x32_LD_V { - using BlockShape = Shape<_32, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if 
defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x32x64_LD_V { - using BlockShape = Shape<_32, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u8_k32v4( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x1x16_ST_N { - using BlockShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - __builtin_IB_subgroup_block_write_flat_u8_m1k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uchar *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x2x16_ST_N { - using BlockShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - __builtin_IB_subgroup_block_write_flat_u8_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uchar2 *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); 
-#endif - } -}; - -struct XE_2D_U8x4x16_ST_N { - using BlockShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - __builtin_IB_subgroup_block_write_flat_u8_m4k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uchar4 *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x8x16_ST_N { - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - __builtin_IB_subgroup_block_write_flat_u8_m8k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uchar8 *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x8x32_ST_N { - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uchar8 *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; -} // end namespace cute diff --git a/include/cute/arch/xe_copy_2B.hpp b/include/cute/arch/xe_copy_2B.hpp deleted file mode 100644 index 36468fd75e..0000000000 --- a/include/cute/arch/xe_copy_2B.hpp +++ /dev/null @@ -1,852 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. 
All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ -#pragma once - -#include -#include -#include -#include - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x -#else -#define SYCL_DEVICE_BUILTIN(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x -#else -#define SYCL_DEVICE_OCL(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -SYCL_DEVICE_BUILTIN(cute::intel::ushort16 intel_subgroup_block_read_u16_m8k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -SYCL_DEVICE_BUILTIN(cute::intel::int8 intel_subgroup_block_read_transform_u16_k16( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// U16 prefetch -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); 
-SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); - -// 16 bits No transform No transpose -SYCL_DEVICE_BUILTIN(cute::intel::ushort __builtin_IB_subgroup_block_read_flat_u16_m1k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u16_m2k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u16_m4k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u16_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u16_m16k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u16_m32k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -SYCL_DEVICE_BUILTIN( - cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u16_m1k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, 
cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u16_m2k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u16_m4k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u16_m8k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u16_m16k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort64 __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// 16bits VNNI transform No transpose -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transform_u16_k16( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k32( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int 
pitch_minus_one, cute::intel::coord_t coord)); - -// 16bits -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m1k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort2 data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m4k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort4 data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort8 data)); -#undef SYCL_DEVICE_BUILTIN - -#undef __global__ -#define __global __attribute__((opencl_global)) -// 16bits No transform No transpose -SYCL_DEVICE_OCL(cute::intel::ushort intel_sub_group_block_read_16b_1r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort2 intel_sub_group_block_read_16b_2r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort4 intel_sub_group_block_read_16b_4r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort8 intel_sub_group_block_read_16b_8r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort16 intel_sub_group_block_read_16b_16r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); 
-SYCL_DEVICE_OCL(cute::intel::ushort32 intel_sub_group_block_read_16b_32r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -SYCL_DEVICE_OCL(cute::intel::ushort2 intel_sub_group_block_read_16b_1r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort4 intel_sub_group_block_read_16b_2r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort8 intel_sub_group_block_read_16b_4r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort16 intel_sub_group_block_read_16b_8r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort32 intel_sub_group_block_read_16b_16r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort64 intel_sub_group_block_read_16b_32r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -// 16bits VNNI transform No transpose -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_transform_16b_16r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 intel_sub_group_block_read_transform_16b_32r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 intel_sub_group_block_read_transform_16b_16r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint32 intel_sub_group_block_read_transform_16b_32r16x2c( - const __global void 
*base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -// 16bits store -SYCL_DEVICE_OCL(void intel_sub_group_block_write_16b_1r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::ushort data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_16b_2r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::ushort2 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_16b_4r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::ushort4 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_16b_8r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::ushort8 data)); - -// 2D prefetch -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_1r16x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_2r16x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_4r16x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_8r16x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_16r16x1c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -#undef SYCL_DEVICE_OCL - -namespace cute -{ -struct XE_2D_U16x1x16_LD_N { - using BlockShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) 
- static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m1k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x2x16_LD_N { - using BlockShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x4x16_LD_N { - using BlockShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m4k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x8x16_LD_N { - using BlockShape = Shape<_8, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m8k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - 
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x16x16_LD_N { - using BlockShape = Shape<_16, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m16k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x32x16_LD_N { - using BlockShape = Shape<_32, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m32k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE 
static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x1x32_LD_N { - using BlockShape = Shape<_1, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m1k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_16b_1r16x2c( - (__global void*)baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x2x32_LD_N { - using BlockShape = Shape<_2, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m2k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, 
int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_16b_2r16x2c( - (__global void*)baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x4x32_LD_N { - using BlockShape = Shape<_4, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m4k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_16b_4r16x2c( - (__global void*)baseoffset, width, height, pitch , coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x8x32_LD_N { - using BlockShape = Shape<_8, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m8k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - 
__builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x16x32_LD_N { - using BlockShape = Shape<_16, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m16k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x32x32_LD_N { - using BlockShape = Shape<_32, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - // 
__builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( - __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x16x16_LD_V { - using BlockShape = Shape<_16, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u16_k16( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x32x16_LD_V { - using BlockShape = Shape<_32, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u16_k32( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - 
intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x16x32_LD_V { - using BlockShape = Shape<_16, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x32x32_LD_V { - using BlockShape = Shape<_32, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - 
intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x16x8_LD_T { - using BlockShape = Shape<_8, _16>; - using inst_dtype = uint32_t; - - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x16x16_LD_T { - using BlockShape = Shape<_16, _16>; - using inst_dtype = uint32_t; - - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x1x16_ST_N { - using BlockShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - __builtin_IB_subgroup_block_write_flat_u16_m1k16v1( - (intptr_t)(baseoffset), width - 1, height 
- 1, pitch - 1, coord, - *(cute::intel::ushort *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x2x16_ST_N { - using BlockShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::ushort2 *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x4x16_ST_N { - using BlockShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - __builtin_IB_subgroup_block_write_flat_u16_m4k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::ushort4 *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x8x16_ST_N { - using BlockShape = Shape<_8, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - __builtin_IB_subgroup_block_write_flat_u16_m8k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::ushort8 *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; -} // end namespace cute diff --git a/include/cute/arch/xe_copy_4B.hpp b/include/cute/arch/xe_copy_4B.hpp deleted file mode 100644 index 0c4b1b53c8..0000000000 --- 
a/include/cute/arch/xe_copy_4B.hpp +++ /dev/null @@ -1,785 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ -#pragma once - -#include -#include -#include - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x -#else -#define SYCL_DEVICE_BUILTIN(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x -#else -#define SYCL_DEVICE_OCL(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -enum class CacheControl { - kDefault = 0, - kL1UC_L3UC = 1, // Override to L1 uncached and L3 uncached - kL1UC_L3C = 2, // Override to L1 uncached and L3 cached - kL1C_L3UC = 3, // Override to L1 cached and L3 uncached - kL1C_L3C = 4, // Override to L1 cached and L3 cached - kL1S_L3UC = 5, // Override to L1 streaming load and L3 uncached - kL1S_L3C = 6, // Override to L1 streaming load and L3 cached - kL1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached -}; - -// 32bits specific for tf32 No transform No transpose -SYCL_DEVICE_BUILTIN( - cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m1k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m2k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m4k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_u32_m8k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, 
cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m16k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m32k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -SYCL_DEVICE_BUILTIN( - cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m1k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_u32_m32k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// 32bits No transform No transpose -SYCL_DEVICE_BUILTIN(cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); 
-SYCL_DEVICE_BUILTIN( - cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_u32_m32k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// 32bits No transform Transpose -SYCL_DEVICE_BUILTIN(cute::intel::uint __builtin_IB_subgroup_block_read_flat_transpose_u32_k1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// 32bits 
-SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint2 data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint4 data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint8 data)); - -#undef SYCL_DEVICE_BUILTIN - -#undef __global -#define __global __attribute__((opencl_global)) -// 32bits specific for tf32 No transform No transpose -SYCL_DEVICE_OCL(cute::intel::uint intel_sub_group_block_read_32b_1r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint intel_sub_group_block_read_32b_2r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint2 intel_sub_group_block_read_32b_4r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint4 intel_sub_group_block_read_32b_8r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_32b_16r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 intel_sub_group_block_read_32b_32r8c( - 
const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -SYCL_DEVICE_OCL(cute::intel::uint2 intel_sub_group_block_read_32b_1r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint2 intel_sub_group_block_read_32b_2r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint4 intel_sub_group_block_read_32b_4r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_32b_8r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 intel_sub_group_block_read_32b_16r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint32 intel_sub_group_block_read_32b_32r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -// 32bits No transform No transpose -SYCL_DEVICE_OCL(cute::intel::uint intel_sub_group_block_read_32b_1r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint2 intel_sub_group_block_read_32b_2r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint4 intel_sub_group_block_read_32b_4r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_32b_8r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 intel_sub_group_block_read_32b_16r16c( - 
const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint32 intel_sub_group_block_read_32b_32r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -// 32bits No transform Transpose -SYCL_DEVICE_OCL(cute::intel::uint intel_sub_group_block_read_transpose_32b_16r1c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint2 intel_sub_group_block_read_transpose_32b_16r2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint4 intel_sub_group_block_read_transpose_32b_16r4c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_transpose_32b_16r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -// 32bits store -SYCL_DEVICE_OCL(void intel_sub_group_block_write_32b_1r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uint data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_32b_2r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uint2 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_32b_4r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uint4 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_32b_8r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uint8 data)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_32b_16r8x1c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t 
coord)); -#undef SYCL_DEVICE_OCL - -namespace cute -{ -struct XE_2D_U32x1x16_LD_N { - using BlockShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x2x16_LD_N { - using BlockShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x4x16_LD_N { - using BlockShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m4k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x8x16_LD_N { - using BlockShape = Shape<_8, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected 
T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m8k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x16x16_LD_N { - using BlockShape = Shape<_16, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x32x16_LD_N { - using BlockShape = Shape<_32, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m32k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x1x8_LD_N { - using BlockShape = Shape<_32, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m1k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x2x8_LD_N { - using BlockShape = Shape<_2, 
_8>; - using ValueShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m2k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x4x8_LD_N { - using BlockShape = Shape<_4, _8>; - using ValueShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m4k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x8x8_LD_N { - using BlockShape = Shape<_8, _8>; - using ValueShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m8k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x16x8_LD_N { - using BlockShape = Shape<_16, _8>; - using ValueShape = Shape<_8, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - 
static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m16k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x32x8_LD_N { - using BlockShape = Shape<_32, _8>; - using ValueShape = Shape<_16, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m32k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x1x16_LD_N { - using BlockShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m1k8v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x2x16_LD_N { - using BlockShape = Shape<_2, _16>; - using ValueShape = Shape<_1, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m2k8v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use 
block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x4x16_LD_N { - using BlockShape = Shape<_4, _16>; - using ValueShape = Shape<_2, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m4k8v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x8x16_LD_N { - using BlockShape = Shape<_8, _16>; - using ValueShape = Shape<_4, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m8k8v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x16x16_LD_N { - using BlockShape = Shape<_16, _16>; - using ValueShape = Shape<_8, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m16k8v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - 
intel_sub_group_2d_block_prefetch_32b_16r8x1c( - (__global void*)baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_TF32x32x16_LD_N { - using BlockShape = Shape<_32, _16>; - using ValueShape = Shape<_16, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m32k8v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - - -struct XE_2D_U32x16x1_LD_T { - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x16x2_LD_T { - using BlockShape = Shape<_2, _16>; - - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct 
XE_2D_U32x16x4_LD_T { - using BlockShape = Shape<_4, _16>; - - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x16x8_LD_T { - using BlockShape = Shape<_8, _16>; - - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_32b_16r8x1c( - (__global void*)baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U32x1x16_ST_N { - using BlockShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - // static_assert(sizeof(T) == 4, "Expected T to have size 4"); - __builtin_IB_subgroup_block_write_flat_u32_m1k16v1( - (intptr_t)(baseoffset), width - 1, 
height - 1, pitch - 1, coord, - *(cute::intel::uint *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x2x16_ST_N { - using BlockShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - __builtin_IB_subgroup_block_write_flat_u32_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uint2 *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x4x16_ST_N { - using BlockShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - __builtin_IB_subgroup_block_write_flat_u32_m4k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uint4 *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x8x16_ST_N { - using BlockShape = Shape<_8, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(SYCL_INTEL_TARGET) - // static_assert(sizeof(T) == 4, "Expected T to have size 4"); - __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uint8 *)(src)); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -} // end namespace cute diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index 75f1c13ba7..83cc6f63ae 100644 --- 
a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -2140,23 +2140,6 @@ struct Copy_Traits> { using RefLayout = DstLayout; }; -template -struct Copy_Traits> { - // Logical thread id to thread idx - using ThrID = Layout<_16>; - // Map from (src-thr,src-val) to bit - using SrcLayout = Layout::value>>, Stride<_0, _1>>; - // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout::value>>, - Stride::value>, _1>>; - // Reference map from (thr,val) to bit - using RefLayout = DstLayout; - - template - CUTE_HOST_DEVICE - Copy_Traits(Copy_Traits const& traits) {} -}; - template struct Copy_Traits> { // Logical thread id to thread idx diff --git a/include/cute/atom/mma_traits_xe.hpp b/include/cute/atom/mma_traits_xe.hpp index 661302049c..f99e171954 100644 --- a/include/cute/atom/mma_traits_xe.hpp +++ b/include/cute/atom/mma_traits_xe.hpp @@ -165,7 +165,6 @@ struct MMA_Traits using CLayout = Layout, Stride<_1, _1>>; }; - template <> struct MMA_Traits { @@ -226,6 +225,66 @@ struct MMA_Traits using CLayout = Layout, Stride<_1, _1>>; }; +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_8,_16,_16>; + using ThrID = Layout<_16>; + using ALayout = Layout, Stride<_8, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_8, _1>>; +}; + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_4,_16,_16>; + using ThrID = Layout<_16>; + using ALayout = Layout, Stride<_4, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_4, _1>>; +}; + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_2,_16,_16>; + using ThrID = Layout<_16>; + using 
ALayout = Layout, Stride<_2, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_2, _1>>; +}; + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_1,_16,_16>; + using ThrID = Layout<_16>; + using ALayout = Layout, Stride<_1, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_1, _1>>; +}; + template <> struct MMA_Traits { diff --git a/include/cute/util/sycl_vec.hpp b/include/cute/util/sycl_vec.hpp index fdaba345a0..ba82d7a37a 100644 --- a/include/cute/util/sycl_vec.hpp +++ b/include/cute/util/sycl_vec.hpp @@ -57,6 +57,11 @@ using float2 = vector_t; using float4 = vector_t; using float8 = vector_t; +using half = _Float16; +using half2 = vector_t<_Float16, 2>; +using half4 = vector_t<_Float16, 4>; +using half8 = vector_t<_Float16, 8>; + using short2 = vector_t; using short4 = vector_t; using short8 = vector_t; diff --git a/include/cutlass/epilogue/collective/builders/xe_builder.inl b/include/cutlass/epilogue/collective/builders/xe_builder.inl index 08683b0aec..a50b41bcb4 100644 --- a/include/cutlass/epilogue/collective/builders/xe_builder.inl +++ b/include/cutlass/epilogue/collective/builders/xe_builder.inl @@ -165,7 +165,8 @@ template < "Trying to use Intel pipeline on Non Intel hardware"); #endif static_assert(is_static::value); - static_assert(cute::is_any_of_v, "ElementC needs to be float or bfloat for the Intel pipeline"); + static_assert(cute::is_any_of_v, + "ElementC needs to be float or bfloat for the Intel pipeline"); using EpilogueSchedule = std::conditional_t, IntelXeXMX16, diff --git a/include/cutlass/epilogue/collective/xe_epilogue.hpp b/include/cutlass/epilogue/collective/xe_epilogue.hpp index 0d9352a72c..9949482d78 100644 --- a/include/cutlass/epilogue/collective/xe_epilogue.hpp +++ b/include/cutlass/epilogue/collective/xe_epilogue.hpp @@ -295,8 +295,6 @@ class 
CollectiveEpilogue< auto sg_m_coord = m_coord * ATOM_M + sg_local_m_coord; auto sg_n_coord = n_coord * ATOM_N + sg_local_n_coord; auto sg_coord = make_coord(sg_m_coord, sg_n_coord, k_coord, l_coord); - - bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); // Represent the full output tensor Tensor mD_mnl = cute::get_xe_tensor(make_shape(M,N,L)); @@ -364,12 +362,12 @@ class CollectiveEpilogue< for (int epi_m = 0; epi_m < FragsM; epi_m++) { cst_callbacks.begin_loop(epi_m, epi_n); - if (is_C_load_needed) { + if (is_source_supported && fusion_callbacks.is_C_load_needed()) { //cordinates for C and D are the same copy(params.xe_load_c, tCgD(_, epi_m, epi_n), trC); } - cst_callbacks.previsit(epi_m, epi_n, 0, is_C_load_needed); + cst_callbacks.previsit(epi_m, epi_n, 0, is_source_supported && fusion_callbacks.is_C_load_needed()); auto acc_frag_mn = acc_frag(_, epi_m, epi_n); diff --git a/include/cutlass/gemm/collective/builders/xe_mma_builder.inl b/include/cutlass/gemm/collective/builders/xe_mma_builder.inl index 27cddc4ad7..7c6364e028 100644 --- a/include/cutlass/gemm/collective/builders/xe_mma_builder.inl +++ b/include/cutlass/gemm/collective/builders/xe_mma_builder.inl @@ -129,6 +129,7 @@ template <> struct pick_mma_atom { \ PICK_MMA(bfloat16_t, float, XE_8x16x16_F32BF16BF16F32_TT); PICK_MMA(bfloat16_t, bfloat16_t, XE_8x16x16_BF16BF16BF16BF16_TT); PICK_MMA(half_t, float, XE_8x16x16_F32F16F16F32_TT); +PICK_MMA(half_t, half_t, XE_8x16x16_F16F16F16F16_TT); #undef PICK_MMA } @@ -171,7 +172,8 @@ struct CollectiveBuilder< "Trying to use Intel pipeline on Non Intel hardware"); #endif static_assert(is_static::value); - static_assert(cute::is_any_of_v, "Intel multi-stage pipeline requires ElementC to be of type float or bfloat"); + static_assert(cute::is_any_of_v, + "Intel multi-stage pipeline requires ElementC to be of type float, bfloat or half"); using MMAAtom = typename pick_mma_atom::atom; diff --git a/python/cutlass/backend/compiler.py 
b/python/cutlass/backend/compiler.py index 62585afc78..c559273942 100644 --- a/python/cutlass/backend/compiler.py +++ b/python/cutlass/backend/compiler.py @@ -159,10 +159,12 @@ def __init__(self) -> None: "--expt-relaxed-constexpr", "-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored", ] + # TODO(Codeplay): remove CUTLASS_SYCL_BUILTIN_ENABLE when the spirv functions are available for PVC self._dpcpp_compile_options = ["-fsycl", "-std=c++17", "-DCUTLASS_ENABLE_SYCL", "-fsycl-rtc-mode", "-DSYCL_INTEL_TARGET", + "-DCUTLASS_SYCL_BUILTIN_ENABLE", "-shared", "-fPIC", "-fno-sycl-dead-args-optimization", "-Xspirv-translator -spirv-ext=+SPV_INTEL_split_barrier", diff --git a/test/unit/cute/intel_xe/mma.cpp b/test/unit/cute/intel_xe/mma.cpp index 1c0e3d8a61..5589310f61 100755 --- a/test/unit/cute/intel_xe/mma.cpp +++ b/test/unit/cute/intel_xe/mma.cpp @@ -263,6 +263,26 @@ TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_F32BF16BF16F32_TT) { bfloat16_t, float>(512, 512, 256); } +TEST(PVC_CuTe_Xe, MMA_XE_8x16x16_BF16BF16BF16BF16_TT) { + MMA_Test(512, 512, 256); +} + +TEST(PVC_CuTe_Xe, MMA_XE_4x16x16_BF16BF16BF16BF16_TT) { + MMA_Test(512, 512, 256); +} + +TEST(PVC_CuTe_Xe, MMA_XE_2x16x16_BF16BF16BF16BF16_TT) { + MMA_Test(512, 512, 256); +} + +TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_BF16BF16BF16BF16_TT) { + MMA_Test(512, 512, 256); +} + TEST(PVC_CuTe_Xe, MMA_XE_8x16x16_F32F16F16F32_TT) { MMA_Test(512, 512, 256); @@ -279,8 +299,28 @@ TEST(PVC_CuTe_Xe, MMA_XE_2x16x16_F32F16F16F32_TT) { } TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_F32F16F16F32_TT) { - MMA_Test( - 512, 512, 256); + MMA_Test + (512, 512, 256); +} +#if defined(CUTE_ARCH_MMA_XE_SPIRV_ENABLED) +TEST(PVC_CuTe_Xe, MMA_XE_8x16x16_F16F16F16F16_TT) { + MMA_Test + (512, 512, 256); +} + +TEST(PVC_CuTe_Xe, MMA_XE_4x16x16_F16F16F16F16_TT) { + MMA_Test + (512, 512, 256); +} + +TEST(PVC_CuTe_Xe, MMA_XE_2x16x16_F16F16F16F16_TT) { + MMA_Test + (512, 512, 256); +} +#endif +TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_F16F16F16F16_TT) { + MMA_Test + (512, 512, 256); } 
TEST(PVC_CuTe_Xe, FMA_XE_UniversalFMA_F32F32F32F32) { diff --git a/test/unit/cute/intel_xe/utils.hpp b/test/unit/cute/intel_xe/utils.hpp index e109d9fe27..48973a0de9 100755 --- a/test/unit/cute/intel_xe/utils.hpp +++ b/test/unit/cute/intel_xe/utils.hpp @@ -59,10 +59,10 @@ void verify(uint32_t m, uint32_t n, uint32_t k, atype *A, btype *B, ctype *C, bool row_a = true, bool row_b = true) { int cnt = 0; bool is_normal = true; - + using accum_type = conditional_t == 32, ctype, float>; for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { - ctype expect = ctype(0); + accum_type expect = accum_type(0); for (int z = 0; z < k; z++) { auto a = row_a ? A[i * k + z] : A[i + z * m]; auto b = row_b ? B[z * n + j] : B[z + j * k]; @@ -71,15 +71,10 @@ void verify(uint32_t m, uint32_t n, uint32_t k, atype *A, btype *B, ctype *C, ctype val = C[i * n + j]; - if constexpr(std::is_floating_point_v) { - if (isnormal(val) && isnormal(expect)) { - auto error = std::abs((expect - val) / val); - if (error > 0.01f) { - cnt++; - } - } else { - // TODO(codeplay): Assert that at least some values are non-zero. 
- if(!(expect == 0 && val == 0)) is_normal = false; + if (isnormal(val) && isnormal(expect)) { + auto error = std::abs((expect - val) / val); + if (error > 0.02f) { + cnt++; } } else { if (val != expect) { diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 787416c9e1..52922b2126 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -31,6 +31,7 @@ if(CUTLASS_ENABLE_SYCL) cutlass_test_unit_add_executable( cutlass_test_unit_gemm_device_tensorop_xe xe_gemm_bf16_bf16_bf16_tensor_op_bf16.cpp + xe_gemm_fp16_fp16_fp16_tensor_op_fp16.cpp xe_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp xe_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp xe_gemm_s8_s8_s32_tensor_op_s32.cpp diff --git a/test/unit/gemm/device/default_gemm_configuration.hpp b/test/unit/gemm/device/default_gemm_configuration.hpp index f48f291fb9..06e235cf38 100644 --- a/test/unit/gemm/device/default_gemm_configuration.hpp +++ b/test/unit/gemm/device/default_gemm_configuration.hpp @@ -1706,6 +1706,68 @@ struct DefaultGemmConfigurationToCutlass3Types< /////////////////////////////////////////////////////////////////////////////// +// Intel XE MMA F32F16 +template +struct DefaultGemmConfigurationToCutlass3Types< + arch::OpClassTensorOp, arch::IntelXe, + half_t, LayoutA, + half_t, LayoutB, + half_t, LayoutC, + half_t> +{ + using TileShape = Shape<_256, _256, _32>; + + using TiledMma = + typename TiledMMAHelper, + Layout, + Layout, Stride<_4, _1, _0>>>::TiledMMA; + + // A + static constexpr int kAlignmentA = 32; + using DefaultOperandA = detail::DefaultGemm_TensorOpXe_OperandA< + half_t, LayoutA, kAlignmentA, 32>; + using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy; + + // B + static constexpr int kAlignmentB = 32; + using DefaultOperandB = detail::DefaultGemm_TensorOpXe_OperandB< + half_t, LayoutB, kAlignmentB, 32>; + using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy; + + // Mainloop + using CollectiveMainloop = typename 
cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::IntelXe, cutlass::arch::OpClassTensorOp, + half_t, LayoutA, 1, + half_t, LayoutB, 1, + half_t, + TileShape, Shape<_1, _1, _1>, + cutlass::gemm::collective::StageCountAuto, + cutlass::gemm::collective::KernelScheduleAuto + >::CollectiveOp; + + using EpilogueOp = epilogue::fusion::LinearCombination; + + using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks< + epilogue::IntelXeXMX16, + EpilogueOp, + TileShape, + decltype(tile_shape(TiledMma())) + >; + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::IntelXe, cutlass::arch::OpClassTensorOp, + TileShape, Shape<_1, _1, _1>, + cutlass::epilogue::collective::EpilogueTileAuto, + half_t, half_t, + half_t, LayoutC, 1, + half_t, LayoutC, 1, + cutlass::epilogue::collective::EpilogueScheduleAuto, + EpilogueOp + >::CollectiveOp; +}; + +/////////////////////////////////////////////////////////////////////////////// + // Intel XE MMA S32S8 template struct DefaultGemmConfigurationToCutlass3Types< diff --git a/test/unit/gemm/device/gemm_testbed_3x.hpp b/test/unit/gemm/device/gemm_testbed_3x.hpp index 015846bd07..2b3b465c95 100644 --- a/test/unit/gemm/device/gemm_testbed_3x.hpp +++ b/test/unit/gemm/device/gemm_testbed_3x.hpp @@ -307,7 +307,7 @@ bool initialize_tensor( } } - else if (std::is_same_v) { + else if (cute::is_any_of_v) { scope_max = 1; scope_min = -1; } diff --git a/test/unit/gemm/device/xe_gemm_fp16_fp16_fp16_tensor_op_fp16.cpp b/test/unit/gemm/device/xe_gemm_fp16_fp16_fp16_tensor_op_fp16.cpp new file mode 100644 index 0000000000..38bd731980 --- /dev/null +++ b/test/unit/gemm/device/xe_gemm_fp16_fp16_fp16_tensor_op_fp16.cpp @@ -0,0 +1,92 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Tests for Xe fp16_fp16_fp16 +*/ + + +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "default_gemm_configuration.hpp" + +#include "gemm_testbed_3x.hpp" + +namespace cutlass { +namespace { +template +struct XE_Device_Gemm_fp16_fp16_fp16_tensor_op_fp16 { + using Config = + gemm::device::DefaultGemmConfigurationToCutlass3Types< + arch::OpClassTensorOp, arch::IntelXe, + cute::half_t, LayoutA, + cute::half_t, LayoutB, + cute::half_t, layout::RowMajor, + cute::half_t>; + + using Gemm = gemm::device::GemmUniversalAdapter< + gemm::kernel::GemmUniversal< + cute::Shape, + typename Config::CollectiveMainloop, + typename Config::CollectiveEpilogue>>; +}; + +TEST(XE_Device_Gemm_fp16t_fp16t_fp16t_tensor_op_fp16, 256x256x32) { + using LayoutA = layout::RowMajor; + using LayoutB = layout::RowMajor; + using Gemm = XE_Device_Gemm_fp16_fp16_fp16_tensor_op_fp16::Gemm; + EXPECT_TRUE(test::gemm::device::TestXe()); +} + +TEST(XE_Device_Gemm_fp16n_fp16t_fp16t_tensor_op_fp16, 256x256x32) { + using LayoutA = layout::ColumnMajor; + using LayoutB = layout::RowMajor; + using Gemm = XE_Device_Gemm_fp16_fp16_fp16_tensor_op_fp16::Gemm; + EXPECT_TRUE(test::gemm::device::TestXe()); +} + +TEST(XE_Device_Gemm_fp16t_fp16n_fp16t_tensor_op_fp16, 256x256x32) { + using LayoutA = layout::RowMajor; + using LayoutB = layout::ColumnMajor; + using Gemm = XE_Device_Gemm_fp16_fp16_fp16_tensor_op_fp16::Gemm; + EXPECT_TRUE(test::gemm::device::TestXe()); +} + +TEST(XE_Device_Gemm_fp16n_fp16n_fp16t_tensor_op_fp16, 256x256x32) { + using LayoutA = layout::ColumnMajor; + using LayoutB = layout::ColumnMajor; + using Gemm = XE_Device_Gemm_fp16_fp16_fp16_tensor_op_fp16::Gemm; + EXPECT_TRUE(test::gemm::device::TestXe()); +} +} +} // namespace cutlass