Skip to content

Commit fb7df75

Browse files
committed
enable more pvc test cases
1 parent 724a206 commit fb7df75

File tree

8 files changed

+386
-160
lines changed

8 files changed

+386
-160
lines changed

include/cute/arch/xe_copy_1B.hpp

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -363,9 +363,49 @@ struct XE_2D_U8x32x32_LD_N {
363363
}
364364
};
365365

366+
struct XE_2D_U8x32x16_LD_T {
367+
using BlockShape = Shape<_32, _16>;
368+
using inst_dtype = uint32_t;
369+
static constexpr bool is_transpose = true;
370+
371+
template <class T>
372+
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
373+
int height, int pitch, intel::coord_t coord,
374+
T *dst) {
375+
#if defined(SYCL_INTEL_TARGET)
376+
static_assert(sizeof(T) == 1, "Expected T to have size 2");
377+
*reinterpret_cast<intel::uint8 *>(dst) =
378+
__builtin_IB_subgroup_block_read_flat_transpose_u32_k8(
379+
(long)(baseoffset), width - 1, height - 1, pitch - 1, coord);
380+
#else
381+
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware");
382+
#endif
383+
}
384+
};
385+
386+
struct XE_2D_U8x32x32_LD_T {
387+
using BlockShape = Shape<_32, _32>;
388+
using inst_dtype = uint32_t;
389+
static constexpr bool is_transpose = true;
390+
391+
template <class T>
392+
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
393+
int height, int pitch, intel::coord_t coord,
394+
T *dst) {
395+
#if defined(SYCL_INTEL_TARGET)
396+
static_assert(sizeof(T) == 1, "Expected T to have size 2");
397+
*reinterpret_cast<intel::uint16 *>(dst) =
398+
__builtin_IB_subgroup_block_read_cacheopts_transpose_u32_m32k8(
399+
(long)(baseoffset), width - 1, height - 1, pitch - 1, coord, 0);
400+
#else
401+
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware");
402+
#endif
403+
}
404+
};
405+
366406
struct XE_2D_U8x1x64_LD_N {
367407
using BlockShape = Shape<_1, _64>;
368-
408+
369409
template <class T>
370410
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
371411
int height, int pitch, intel::coord_t coord,

include/cute/arch/xe_copy_4B.hpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,11 @@ SYCL_DEVICE_BUILTIN(
160160
long baseoffset, int width_minus_one, int height_minus_one,
161161
int pitch_minus_one, intel::coord_t coord));
162162

163+
// Declaration of the cache-option variant of the transposed u32 block read
// (m32k8). Returns an intel::uint16 vector; width, height and pitch are passed
// as value-minus-one, and `cache` is an integer cache selector.
// NOTE(review): the valid values of `cache` are not visible here; confirm.
SYCL_DEVICE_BUILTIN(
164+
intel::uint16 __builtin_IB_subgroup_block_read_cacheopts_transpose_u32_m32k8(
165+
long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one,
166+
intel::coord_t coord, int cache));
167+
163168
// 32bits
164169
SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1(
165170
long baseoffset, int width_minus_one, int height_minus_one,
@@ -174,6 +179,11 @@ SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1(
174179
long baseoffset, int width_minus_one, int height_minus_one,
175180
int pitch_minus_one, intel::coord_t coord, intel::uint8 data));
176181

182+
// 32-bit block prefetch (m16k16v1). Returns nothing; width, height and pitch
// are passed as value-minus-one, and the cache behaviour is selected through a
// CacheControl enumerator.
183+
SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u32_m16k16v1(
184+
long baseoffset, int width_minus_one, int height_minus_one,
185+
int pitch_minus_one, intel::coord_t coord, enum CacheControl cache_control));
186+
177187
#undef SYCL_DEVICE_BUILTIN
178188

179189
#undef __global
@@ -264,6 +274,8 @@ SYCL_DEVICE_OCL(void intel_sub_group_block_write_32b_4r16c(
264274
SYCL_DEVICE_OCL(void intel_sub_group_block_write_32b_8r16c(
265275
const __global void *base_address, int width, int height, int pitch,
266276
intel::coord_t coord, intel::uint8 data));
277+
278+
267279
SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_32b_16r8x1c(
268280
__global void* base_address, int width, int height, int pitch,
269281
intel::coord_t coord));
@@ -359,6 +371,21 @@ struct XE_2D_U32x16x16_LD_N {
359371
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware");
360372
#endif
361373
}
374+
375+
struct PREFETCH {
376+
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
377+
int height, int pitch,
378+
intel::coord_t coord) {
379+
#if defined(SYCL_INTEL_TARGET)
380+
__builtin_IB_subgroup_block_read_prefetch_u32_m16k16v1(
381+
(long)baseoffset, width - 1, height - 1, pitch - 1, coord,
382+
CacheControl::kL1C_L3C);
383+
#else
384+
CUTE_INVALID_CONTROL_PATH(
385+
"Trying to use block prefetch on non-PVC hardware");
386+
#endif
387+
}
388+
};
362389
};
363390

364391
struct XE_2D_U32x32x16_LD_N {
@@ -698,6 +725,41 @@ struct XE_2D_U32x16x8_LD_T {
698725
};
699726
};
700727

728+
struct XE_2D_TF32x16x8_LD_T {
729+
using BlockShape = Shape<_8, _16>;
730+
using ValueShape = Shape<_4, _32>;
731+
732+
static constexpr bool is_transpose = true;
733+
734+
template <class T>
735+
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
736+
int height, int pitch, intel::coord_t coord,
737+
T *dst) {
738+
#if defined(SYCL_INTEL_TARGET)
739+
static_assert(sizeof(T) == 4, "Expected T to have size 4");
740+
*reinterpret_cast<intel::uint8 *>(dst) =
741+
__builtin_IB_subgroup_block_read_flat_transpose_u32_k8(
742+
(long)(baseoffset), width - 1, height - 1, pitch - 1, coord);
743+
#else
744+
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware");
745+
#endif
746+
}
747+
748+
struct PREFETCH {
749+
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
750+
int height, int pitch,
751+
intel::coord_t coord) {
752+
#if defined(SYCL_INTEL_TARGET)
753+
intel_sub_group_2d_block_prefetch_32b_16r8x1c(
754+
(__global void*)baseoffset, width - 1, height - 1, pitch - 1, coord);
755+
#else
756+
CUTE_INVALID_CONTROL_PATH(
757+
"Trying to use block prefetch on non-PVC hardware");
758+
#endif
759+
}
760+
};
761+
};
762+
701763
struct XE_2D_U32x1x16_ST_N {
702764
using BlockShape = Shape<_1, _16>;
703765

include/cute/atom/copy_traits_xe.hpp

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -660,8 +660,8 @@ struct Copy_Traits<XE_2D_U8x16x64_LD_N, args_t...>
660660
using SrcLayout = Layout<Shape <_16,_8>,
661661
Stride< _0,_1>>;
662662
// Map from (dst-thr,dst-val) to bit
663-
using DstLayout = Layout<Shape <_16,Shape <_16, _2, _16>>,
664-
Stride<_16,Stride< _1,_256,_512>>>;
663+
using DstLayout = Layout<Shape <_16,Shape <_16, _2, _2, _8>>,
664+
Stride<_0 ,Stride<_16, _8, _256, _1>>>;
665665
// Reference map from (thr,val) to bit
666666
using RefLayout = DstLayout;
667667
template <class... ArgT>
@@ -1303,6 +1303,60 @@ struct Copy_Traits<XE_2D_U32x1x16_LD_N, args_t...>
13031303
: XE_2D_LD_Unpack<XE_2D_U32x1x16_LD_N, args_t...>(args...) {}
13041304
};
13051305

1306+
template <class... args_t>
1307+
struct Copy_Traits<XE_2D_TF32x16x8_LD_T, args_t...>
1308+
: XE_2D_LD_Unpack<XE_2D_TF32x16x8_LD_T, args_t...> {
1309+
using ThrID = Layout<_16>;
1310+
// Map from (src-thr,src-val) to bit
1311+
using SrcLayout = Layout<Shape <_16,_32>,
1312+
Stride< _0, _1>>;
1313+
// Map from (dst-thr,dst-val) to bit
1314+
using DstLayout = Layout<Shape <_16, Shape <_4, _2, _32>>,
1315+
Stride< _0, Stride<_512, Int<512 *4>, _1>>>;
1316+
// Reference map from (thr,val) to bit
1317+
using RefLayout = DstLayout;
1318+
1319+
template <class... ArgTs>
1320+
Copy_Traits(ArgTs... args)
1321+
: XE_2D_LD_Unpack<XE_2D_TF32x16x8_LD_T, args_t...>(args...) {}
1322+
};
1323+
1324+
template <class... args_t>
1325+
struct Copy_Traits<XE_2D_U8x32x16_LD_T, args_t...>
1326+
: XE_2D_LD_Unpack<XE_2D_U8x32x16_LD_T, args_t...> {
1327+
using ThrID = Layout<_16>;
1328+
// Map from (src-thr,src-val) to bit
1329+
using SrcLayout = Layout<Shape <_16,_16>,
1330+
Stride< _0, _1>>;
1331+
// Map from (dst-thr,dst-val) to bit
1332+
using DstLayout = Layout<Shape < _16,Shape <_16,_16>>,
1333+
Stride<_256,Stride< _1,_16>>>;
1334+
// Reference map from (thr,val) to bit
1335+
using RefLayout = DstLayout;
1336+
1337+
template <class... ArgT>
1338+
Copy_Traits(ArgT... args)
1339+
: XE_2D_LD_Unpack<XE_2D_U8x32x16_LD_T, args_t...>(args...) {}
1340+
};
1341+
1342+
template <class... args_t>
1343+
struct Copy_Traits<XE_2D_U8x32x32_LD_T, args_t...>
1344+
: XE_2D_LD_Unpack<XE_2D_U8x32x32_LD_T, args_t...> {
1345+
using ThrID = Layout<_16>;
1346+
// Map from (src-thr,src-val) to bit
1347+
using SrcLayout = Layout<Shape <_16,_16>,
1348+
Stride< _0, _1>>;
1349+
// Map from (dst-thr,dst-val) to bit
1350+
using DstLayout = Layout<Shape < _16,Shape <_32,_16>>,
1351+
Stride<_0, Stride< _256,_1>>>;
1352+
// Reference map from (thr,val) to bit
1353+
using RefLayout = DstLayout;
1354+
1355+
template <class... ArgT>
1356+
Copy_Traits(ArgT... args)
1357+
: XE_2D_LD_Unpack<XE_2D_U8x32x32_LD_T, args_t...>(args...) {}
1358+
};
1359+
13061360
template <class... args_t>
13071361
struct Copy_Traits<XE_2D_U32x2x16_LD_N, args_t...>
13081362
: XE_2D_LD_Unpack<XE_2D_U32x2x16_LD_N, args_t...> {
@@ -2160,6 +2214,14 @@ template <>\
21602214
struct XePrefetchConstructor<int8_t, row> {\
21612215
using type_t = TYPE_BITS_int8_t(row);\
21622216
};\
2217+
template <>\
2218+
struct XePrefetchConstructor<uint8_t, row> {\
2219+
using type_t = TYPE_BITS_int8_t(row);\
2220+
};\
2221+
template <>\
2222+
struct XePrefetchConstructor<tfloat32_t, row> {\
2223+
using type_t = TYPE_BITS_float(row);\
2224+
};\
21632225

21642226
BUILD_XE_NAME(1)
21652227
BUILD_XE_NAME(2)

test/unit/cute/intel_xe/CMakeLists.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,7 @@ cutlass_test_unit_add_executable(
3737
gemm_partition_src_dst.cpp
3838
gemm_partition_fragment_abc.cpp
3939
gemm_tiled_copy_abc.cpp
40-
gemm_layout.cpp
41-
gemm_data_type.cpp
40+
gemm_layout_data_type.cpp
4241
)
4342
else()
4443
cutlass_test_unit_add_executable(

test/unit/cute/intel_xe/gemm_data_type.cpp

Lines changed: 0 additions & 85 deletions
This file was deleted.

0 commit comments

Comments
 (0)