Skip to content

Enable prefetch iteration #382

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 34 commits into
base: sycl-develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
9dea3d5
working, slower npot prefetch
t4c1 May 19, 2025
496ed2b
cleanup
t4c1 May 19, 2025
0de9565
fix u8 prefetch layouts
t4c1 May 19, 2025
af92057
Merge branch 'sycl-develop' into npot_prefetch
t4c1 May 19, 2025
8f59745
implement missing copy traits
t4c1 May 19, 2025
eb905d2
more fixes to layouts including missing copy_traits and duplicated pr…
t4c1 May 20, 2025
ace6a3e
fix prefetch for int4 and some other small fixes
t4c1 May 21, 2025
6578bdc
revert some changes
t4c1 May 21, 2025
f285a07
Merge branch 'sycl-develop' into npot_prefetch
t4c1 May 21, 2025
4d50128
fix erroneous revert
t4c1 May 21, 2025
4e992c8
applied review suggestion for is_prefetch
t4c1 May 22, 2025
80f0851
address review comment about unused variable
t4c1 May 22, 2025
b6dd21a
Merge branch 'sycl-develop' into npot_prefetch
t4c1 May 28, 2025
b095c26
add missing prefetch copy traits
t4c1 May 28, 2025
696b7dc
more missing prefetch copy traits
t4c1 May 28, 2025
ad84fd0
fix copy-paste errors
t4c1 May 30, 2025
4576dad
Merge branch 'sycl-develop' into npot_prefetch
t4c1 May 30, 2025
18ea014
another copy-paste error
t4c1 May 30, 2025
5f3198b
removed unused variables
t4c1 Jun 2, 2025
0832a7b
Merge branch 'sycl-develop' into npot_prefetch
t4c1 Jun 2, 2025
3733ee0
Merge remote-tracking branch 'origin/sycl-develop' into npot_prefetch
t4c1 Jun 2, 2025
3d0b699
fix merge
t4c1 Jun 2, 2025
9a449e4
extend CI timeout
t4c1 Jun 3, 2025
b3bb1c4
remove unused variable
t4c1 Jun 3, 2025
8e275c1
Merge branch 'sycl-develop' into npot_prefetch
t4c1 Jun 11, 2025
6eea5b1
Merge branch 'sycl-develop' into npot_prefetch
t4c1 Jun 11, 2025
0a2986a
added assert and some comments
t4c1 Jun 13, 2025
386d4d2
Merge branch 'sycl-develop' into npot_prefetch
t4c1 Jun 13, 2025
b813f47
Merge remote-tracking branch 'origin/sycl-develop' into npot_prefetch
t4c1 Jun 16, 2025
4681d8c
Revert "extend CI timeout"
t4c1 Jun 16, 2025
ddd294f
reduce the number of build threads
t4c1 Jun 18, 2025
9c42dae
increase timeout
t4c1 Jun 18, 2025
e557c12
further reduce the number of parallel jobs
t4c1 Jun 19, 2025
978ee33
Merge branch 'sycl-develop' into npot_prefetch
t4c1 Jun 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/intel_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ jobs:

name: Run Intel ${{ matrix.compiler }} tests on ${{ matrix.gpu }} with intel-graphics ${{ matrix.intel_graphics }}
runs-on: ${{ matrix.runner }}
timeout-minutes: 30
timeout-minutes: 45

steps:
- name: Checkout repository
Expand Down Expand Up @@ -96,7 +96,7 @@ jobs:
-DCUTLASS_ENABLE_SYCL=ON \
-DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \
-DCUTLASS_SYCL_RUNNING_CI=ON
cmake --build .
cmake --build . -j 6
- name: Unit test
shell: bash
run: |
Expand Down
2 changes: 1 addition & 1 deletion examples/common/sycl_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,6 @@ void random_fill(T *src, int seed, size_t N, float max, float min) {
syclcompat::memcpy<T>(src, buff.data(), N);
syclcompat::wait();
} else {
assert(0 & "Not supported dtype");
assert(0 && "Not supported dtype");
}
}
13 changes: 13 additions & 0 deletions include/cute/arch/copy_xe_U16.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ struct XE_2D_U16x8x16_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -128,6 +129,7 @@ struct XE_2D_U16x16x16_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -157,6 +159,7 @@ struct XE_2D_U16x32x16_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -186,6 +189,7 @@ struct XE_2D_U16x1x32_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -215,6 +219,7 @@ struct XE_2D_U16x2x32_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -244,6 +249,7 @@ struct XE_2D_U16x4x32_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -273,6 +279,7 @@ struct XE_2D_U16x8x32_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -302,6 +309,7 @@ struct XE_2D_U16x16x32_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -331,6 +339,7 @@ struct XE_2D_U16x32x32_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -360,6 +369,7 @@ struct XE_2D_U16x16x16_LD_V {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -389,6 +399,7 @@ struct XE_2D_U16x32x16_LD_V {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -418,6 +429,7 @@ struct XE_2D_U16x16x32_LD_V {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -447,6 +459,7 @@ struct XE_2D_U16x32x32_LD_V {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down
2 changes: 2 additions & 0 deletions include/cute/arch/copy_xe_U32.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ struct XE_2D_TF32x16x16_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -406,6 +407,7 @@ struct XE_2D_U32x16x8_LD_T {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down
12 changes: 12 additions & 0 deletions include/cute/arch/copy_xe_U8.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ struct XE_2D_Packed_U8x1x32_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -82,6 +83,7 @@ struct XE_2D_Packed_U8x2x32_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -127,6 +129,7 @@ struct XE_2D_Packed_U8x4x32_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -156,6 +159,7 @@ struct XE_2D_Packed_U8x8x32_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -185,6 +189,7 @@ struct XE_2D_Packed_U8x16x32_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -249,6 +254,7 @@ struct XE_2D_Packed_U8x1x64_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -278,6 +284,7 @@ struct XE_2D_Packed_U8x2x64_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -307,6 +314,7 @@ struct XE_2D_Packed_U8x4x64_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -336,6 +344,7 @@ struct XE_2D_Packed_U8x8x64_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -365,6 +374,7 @@ struct XE_2D_Packed_U8x16x64_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -394,6 +404,7 @@ struct XE_2D_Packed_U8x32x64_LD_N {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down Expand Up @@ -425,6 +436,7 @@ struct XE_2D_U8x32x16_LD_V {
}

struct PREFETCH {
using BlockShape = BlockShape;
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch,
intel::coord_t coord) {
Expand Down
10 changes: 10 additions & 0 deletions include/cute/atom/copy_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,19 @@ namespace detail {
// Compile-time flag telling whether CopyOp is a prefetch operation.
// Primary template: evaluates to false unless one of the partial
// specializations below is selected for CopyOp.
template <class CopyOp, class = void>
constexpr bool is_prefetch = false;

#ifdef SYCL_INTEL_TARGET

// Selected (and true) whenever the expression
// `CopyOp{}.copy(nullptr, 0, 0, 0, {0, 0})` is well-formed; otherwise the
// void_t argument fails substitution and the primary template is used.
// NOTE(review): the five-argument call looks like it is meant to match the
// `copy(baseoffset, width, height, pitch, coord)` signature of the nested
// PREFETCH structs in cute/arch/copy_xe_*.hpp -- confirm that no non-prefetch
// operation exposes a `copy` callable with these arguments.
template <class CopyOp>
constexpr bool is_prefetch<CopyOp, void_t<decltype(CopyOp{}.copy(nullptr, 0,0,0, {0,0}))>> = true;

#else

// TODO(Codeplay): Enable for SYCL_INTEL_TARGET.
// Selected when CopyOp declares a nested `PREFETCH` type. The value is true
// only when CopyOp is the very same type as its own `CopyOp::PREFETCH`;
// a CopyOp whose PREFETCH names a different type yields false.
template <class CopyOp>
constexpr bool is_prefetch<CopyOp, void_t<typename CopyOp::PREFETCH>> = is_same_v<CopyOp, typename CopyOp::PREFETCH>;

#endif

} // end namespace detail


Expand Down
Loading
Loading