From c2ce5e863435633a0a741febfdaa0f208b5c8a61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 6 May 2025 12:58:13 +0100 Subject: [PATCH 1/9] reenable tests --- .github/workflows/intel_test.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml index cbd8a8311c..fb6a5aaf54 100644 --- a/.github/workflows/intel_test.yml +++ b/.github/workflows/intel_test.yml @@ -97,7 +97,4 @@ jobs: - name: Examples shell: bash run: | - # Disable test examples on BMG due to flakiness - if [[ "${{ matrix.GPU }}" != "BMG" ]]; then - cmake --build . --target test_examples -j $(nproc) - fi + cmake --build . --target test_examples -j $(nproc) From 364083c90590213873d13e6eed86643dc88185cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 6 May 2025 13:13:16 +0100 Subject: [PATCH 2/9] keep the runner running --- .github/workflows/intel_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml index fb6a5aaf54..1b5feceb7d 100644 --- a/.github/workflows/intel_test.yml +++ b/.github/workflows/intel_test.yml @@ -97,4 +97,5 @@ jobs: - name: Examples shell: bash run: | - cmake --build . --target test_examples -j $(nproc) + cmake --build . --target test_examples -j $(nproc) || true + sleep 3600 From f02d113779d180bcc7572a591ba75fbff51d9257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 6 May 2025 15:15:07 +0100 Subject: [PATCH 3/9] rerun --- .github/workflows/intel_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml index 1b5feceb7d..e6a7fbb5b7 100644 --- a/.github/workflows/intel_test.yml +++ b/.github/workflows/intel_test.yml @@ -99,3 +99,4 @@ jobs: run: | cmake --build . --target test_examples -j $(nproc) || true sleep 3600 + From 2261c61d31b05a9ee74aec531dfe3f11258e7b0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Wed, 7 May 2025 08:54:40 +0100 Subject: [PATCH 4/9] rerun --- .github/workflows/intel_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml index e6a7fbb5b7..1b5feceb7d 100644 --- a/.github/workflows/intel_test.yml +++ b/.github/workflows/intel_test.yml @@ -99,4 +99,3 @@ jobs: run: | cmake --build . --target test_examples -j $(nproc) || true sleep 3600 - From b3357f82a0a4c11d7beb1e3fa9044429ff2db951 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 8 May 2025 09:02:53 +0100 Subject: [PATCH 5/9] run examples sequentially --- .github/workflows/intel_test.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml index 1b5feceb7d..0232258b05 100644 --- a/.github/workflows/intel_test.yml +++ b/.github/workflows/intel_test.yml @@ -97,5 +97,4 @@ jobs: - name: Examples shell: bash run: | - cmake --build . --target test_examples -j $(nproc) || true - sleep 3600 + cmake --build . --target test_examples -j 1 From ca79d8acfbe4f3ff7b13c41f11e2d8144e71d36b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 8 May 2025 09:04:53 +0100 Subject: [PATCH 6/9] detect and print about oom in examples --- examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp | 15 ++++--- .../01_pvc_gemm_with_collective_builder.cpp | 15 ++++--- .../02_pvc_gemm_mixed_dtype.cpp | 23 ++++++----- .../03_pvc_gemm_streamk.cpp | 11 +++-- .../04_pvc_grouped_gemm.cpp | 19 +++++---- ...05_pvc_gemm_single_b_with_per_col_bias.cpp | 17 +++++--- .../05_pvc_gemm_with_epilogue_gelu.cpp | 15 ++++--- ...pvc_gemm_with_epilogue_lincombdeeltact.cpp | 17 +++++--- .../05_pvc_gemm_with_epilogue_relu.cpp | 15 ++++--- .../05_pvc_gemm_with_epilogue_silu.cpp | 15 ++++--- .../05_pvc_gemm_with_epilogue_softmax.cpp | 15 ++++--- .../05_pvc_gemm_with_per_row_bias.cpp | 17 +++++--- .../05_pvc_gemm_with_topk_and_softmax.cpp | 13 ++++-- .../pvc_flash_attn_runner.hpp | 29 ++++++++++---- .../pvc_flash_decode_runner.hpp | 40 ++++++++++++++----- .../07_pvc_dual_gemm/07_pvc_dual_gemm.cpp | 27 ++++++++----- .../sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp | 15 ++++--- .../09_pvc_gemm_with_sycl_queue.cpp | 15 ++++--- .../pvc_flash_attn_cachedKV_runner.hpp | 33 ++++++++++----- 19 files changed, 248 insertions(+), 118 deletions(-) diff --git a/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp b/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp index 617dbd395c..d0beff4765 100644 --- a/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp +++ b/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp @@ -231,11 +231,16 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); + try{ + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp b/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp index 259f913b90..160d6b97e2 100644 --- a/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp +++ b/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp @@ -228,11 +228,16 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); + try{ + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + } catch(...){ + std::cerr << "Failed to allocate memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp b/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp index 617f1eca64..3ee9c17064 100644 --- a/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp +++ b/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp @@ -391,15 +391,20 @@ struct ExampleRunner { stride_D = cutlass::make_cute_packed_stride(StrideD{}, shape_CD); stride_S = cutlass::make_cute_packed_stride(StrideScale{}, shape_scale_zero); - block_A.reset(static_cast(M) * K * L); - block_A_dq.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_B_dq.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - block_scale.reset(static_cast(scale_k) * L * dq_mn_size); - block_zero.reset(static_cast(scale_k) * L * dq_mn_size); + try{ + block_A.reset(static_cast(M) * K * L); + block_A_dq.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_B_dq.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + block_scale.reset(static_cast(scale_k) * L * dq_mn_size); + block_zero.reset(static_cast(scale_k) * L * dq_mn_size); + } catch(...){ + std::cerr << "Failed to allocate memory. Aborting." << std::endl; + std::exit(1); + } initialize_mixed_dtype_block(block_A, block_A_dq, seed + 2023); initialize_mixed_dtype_block(block_B, block_B_dq, seed + 2022); diff --git a/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp b/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp index b87cf1ce30..48dee9a268 100644 --- a/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp +++ b/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp @@ -252,9 +252,14 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); + try{ + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); initialize_block(block_C, seed + 2021); diff --git a/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp b/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp index 4b3f4bc8ca..41ee7197a6 100644 --- a/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp +++ b/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp @@ -337,13 +337,18 @@ void allocate(const Options &options) { } - block_A.reset(total_elements_A); - block_B.reset(total_elements_B); - block_C.reset(total_elements_C); - block_D.reset(total_elements_D); - block_ref_D.reset(total_elements_D); - block_alpha.reset(options.groups); - block_beta.reset(options.groups); + try{ + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + block_ref_D.reset(total_elements_D); + block_alpha.reset(options.groups); + block_beta.reset(options.groups); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } } /// Initialize operands to be used in the GEMM and reference GEMM diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp index 97b9885afe..cc92a9a498 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp @@ -246,12 +246,17 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(cute::cosize(make_layout(cute::make_shape(M, K, L), stride_A))); - block_B.reset(cute::cosize(make_layout(cute::make_shape(N, K, L), stride_B))); - block_C.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_C))); - block_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D))); - block_ref_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D))); - block_bias.reset(N * L); + try{ + block_A.reset(cute::cosize(make_layout(cute::make_shape(M, K, L), stride_A))); + block_B.reset(cute::cosize(make_layout(cute::make_shape(N, K, L), stride_B))); + block_C.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_C))); + block_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D))); + block_ref_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D))); + block_bias.reset(N * L); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp index ae8b3dfecc..c5b38b1fa6 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp @@ -232,11 +232,16 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); + try{ + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp index 802d02d29b..7ffabe30ca 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp @@ -268,12 +268,17 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - block_Aux.reset(static_cast(M) * N * L); + try{ + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + block_Aux.reset(static_cast(M) * N * L); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp index 3bf1180067..fb3a87bba8 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp @@ -232,11 +232,16 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); + try{ + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp index 93a294659f..746c44bfdd 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp @@ -231,11 +231,16 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(M * K * L); - block_B.reset(K * N * L); - block_C.reset(M * N * L); - block_D.reset(M * N * L); - block_ref_D.reset(M * N * L); + try{ + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp index 5ff68c93d1..0af52e6c56 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp @@ -281,11 +281,16 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); + try{ + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } cutlass::reference::device::BlockFillRandomUniform( block_A.get(), block_A.size(), seed + 2023, (ElementA)1, (ElementA)0, 0); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp index 63ad03f705..824fab209e 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp @@ -240,12 +240,17 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - block_bias.reset(static_cast(M) * L); + try{ + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + block_bias.reset(static_cast(M) * L); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp index b4d39816cd..87f3dbb560 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp @@ -303,10 +303,15 @@ struct Result { auto c_coord = cutlass::make_Coord(options.m * options.l, options.n); auto b_coord = cutlass::make_Coord(options.k, options.n * options.l); - tensor_A.resize(a_coord); - tensor_B.resize(b_coord); - tensor_D.resize(c_coord); - tensor_ref_D.resize(c_coord); + try{ + tensor_A.resize(a_coord); + tensor_B.resize(b_coord); + tensor_D.resize(c_coord); + tensor_ref_D.resize(c_coord); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_tensor(tensor_A.host_view(), seed + 2022); initialize_tensor(tensor_B.host_view(), seed + 2023); diff --git a/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp b/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp index b79948960b..abb0bcb333 100644 --- a/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp +++ b/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp @@ -417,23 +417,38 @@ template struct ExampleRunner { stride_V = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv, batch * num_heads_kv)); stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q)); - block_Q.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_qk); - block_K.reset(static_cast(batch) * num_heads_kv * seq_len_kv * head_size_qk); - block_V.reset(static_cast(batch) * num_heads_kv * seq_len_kv * head_size_vo); - block_O.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_vo); - block_ref_O.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_vo); + try{ + block_Q.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_qk); + block_K.reset(static_cast(batch) * num_heads_kv * seq_len_kv * head_size_qk); + block_V.reset(static_cast(batch) * num_heads_kv * seq_len_kv * head_size_vo); + block_O.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_vo); + block_ref_O.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_vo); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_Q, seed + 2023); initialize_block(block_K, seed + 2022); initialize_block(block_V, seed + 2021); if (!cumulative_seqlen_q.empty()) { - device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); + try{ + device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } device_cumulative_seqlen_q.copy_from_host( cumulative_seqlen_q.data(), cumulative_seqlen_q.size()); } if (!cumulative_seqlen_kv.empty()) { - device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); + try{ + device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } device_cumulative_seqlen_kv.copy_from_host( cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size()); } diff --git a/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp b/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp index 81f54b13b2..3dc509b2f0 100644 --- a/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp +++ b/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp @@ -483,13 +483,18 @@ template struct ExampleRunner { stride_V_cache = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv_cache, batch * num_heads_kv)); stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q)); - block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk); - block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk); - block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo); - block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk); - block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo); - block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); - block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); + try{ + block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk); + block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk); + block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo); + block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk); + block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo); + block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); + block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_Q, seed + 2023); initialize_block(block_K, seed + 2022); @@ -498,18 +503,33 @@ template struct ExampleRunner { initialize_block(block_V_cache, seed + 2025); if (!cumulative_seqlen_q.empty()) { - device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); + try{ + device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } device_cumulative_seqlen_q.copy_from_host( cumulative_seqlen_q.data(), cumulative_seqlen_q.size()); } if (!cumulative_seqlen_kv.empty()) { - device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); + try{ + device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } device_cumulative_seqlen_kv.copy_from_host( cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size()); } if (!cumulative_seqlen_kv_cache.empty()) { - device_cumulative_seqlen_kv_cache.reset(cumulative_seqlen_kv_cache.size()); + try{ + device_cumulative_seqlen_kv_cache.reset(cumulative_seqlen_kv_cache.size()); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } device_cumulative_seqlen_kv_cache.copy_from_host( cumulative_seqlen_kv_cache.data(), cumulative_seqlen_kv_cache.size()); } diff --git a/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp b/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp index a3a4cd5316..ea523f2a2b 100644 --- a/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp +++ b/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp @@ -286,17 +286,22 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(static_cast(M) * K * L); - block_B0.reset(static_cast(K) * N * L); - block_B1.reset(static_cast(K) * N * L); - block_C0.reset(static_cast(M) * N * L); - block_C1.reset(static_cast(M) * N * L); - block_D0.reset(static_cast(M) * N * L); - block_D1.reset(static_cast(M) * N * L); - block_D2.reset(static_cast(M) * N * L); - block_ref_D0.reset(static_cast(M) * N * L); - block_ref_D1.reset(static_cast(M) * N * L); - block_ref_D2.reset(static_cast(M) * N * L); + try{ + block_A.reset(static_cast(M) * K * L); + block_B0.reset(static_cast(K) * N * L); + block_B1.reset(static_cast(K) * N * L); + block_C0.reset(static_cast(M) * N * L); + block_C1.reset(static_cast(M) * N * L); + block_D0.reset(static_cast(M) * N * L); + block_D1.reset(static_cast(M) * N * L); + block_D2.reset(static_cast(M) * N * L); + block_ref_D0.reset(static_cast(M) * N * L); + block_ref_D1.reset(static_cast(M) * N * L); + block_ref_D2.reset(static_cast(M) * N * L); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_A, seed + 2023); initialize_block(block_B0, seed + 2022); diff --git a/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp b/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp index e2c1e64e9e..1525eb0a5e 100644 --- a/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp +++ b/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp @@ -244,11 +244,16 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); + try{ + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp b/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp index 177b738cea..06e63930ea 100644 --- a/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp +++ b/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp @@ -166,11 +166,16 @@ struct ExampleRunner { Memory(sycl::queue q, ProblemShapeType problem_shape_MNKL) : q(q) { auto [M, N, K, L] = problem_shape_MNKL; - block_A = sycl::malloc_device(static_cast(M) * K * L, q); - block_B = sycl::malloc_device(static_cast(N) * K * L, q); - block_C = sycl::malloc_device(static_cast(M) * N * L, q); - block_D = sycl::malloc_device(static_cast(M) * N * L, q); - block_ref_D = sycl::malloc_device(static_cast(M) * N * L, q); + try{ + block_A = sycl::malloc_device(static_cast(M) * K * L, q); + block_B = sycl::malloc_device(static_cast(N) * K * L, q); + block_C = sycl::malloc_device(static_cast(M) * N * L, q); + block_D = sycl::malloc_device(static_cast(M) * N * L, q); + block_ref_D = sycl::malloc_device(static_cast(M) * N * L, q); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } } ~Memory() { diff --git a/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp b/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp index a11fedbbb4..924aa9c429 100644 --- a/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp +++ b/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp @@ -478,13 +478,18 @@ template struct ExampleRunner { stride_V_cache = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv_cache, batch * num_heads_kv)); stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q)); - block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk); - block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk); - block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo); - block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk); - block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo); - block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); - block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); + try{ + block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk); + block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk); + block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo); + block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk); + block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo); + block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); + block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } initialize_block(block_Q, seed + 2023); initialize_block(block_K, seed + 2022); @@ -493,12 +498,22 @@ template struct ExampleRunner { initialize_block(block_V_cache, seed + 2025); if (!cumulative_seqlen_q.empty()) { - device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); + try{ + device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } device_cumulative_seqlen_q.copy_from_host( cumulative_seqlen_q.data(), cumulative_seqlen_q.size()); } if (!cumulative_seqlen_kv.empty()) { - device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); + try{ + device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); + } catch(...){ + std::cerr << "Failed to allocate device memory. Aborting." << std::endl; + std::exit(1); + } device_cumulative_seqlen_kv.copy_from_host( cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size()); } From 3b5009ada877e57b27de33dbdbad193993eb86eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 8 May 2025 09:40:44 +0100 Subject: [PATCH 7/9] rerun --- .github/workflows/intel_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml index 0232258b05..00fd91a77a 100644 --- a/.github/workflows/intel_test.yml +++ b/.github/workflows/intel_test.yml @@ -98,3 +98,4 @@ jobs: shell: bash run: | cmake --build . --target test_examples -j 1 + From 4a7c6c86e7a11ef77f84138686fc555c790d0fd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 8 May 2025 10:17:44 +0100 Subject: [PATCH 8/9] rerun --- .github/workflows/intel_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml index 00fd91a77a..0232258b05 100644 --- a/.github/workflows/intel_test.yml +++ b/.github/workflows/intel_test.yml @@ -98,4 +98,3 @@ jobs: shell: bash run: | cmake --build . --target test_examples -j 1 - From 81be124022449fcf18311b51f6365990fe4c1109 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 20 May 2025 09:53:37 +0100 Subject: [PATCH 9/9] Revert "detect and print about oom in examples" This reverts commit ca79d8acfbe4f3ff7b13c41f11e2d8144e71d36b. --- examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp | 15 +++---- .../01_pvc_gemm_with_collective_builder.cpp | 15 +++---- .../02_pvc_gemm_mixed_dtype.cpp | 23 +++++------ .../03_pvc_gemm_streamk.cpp | 11 ++--- .../04_pvc_grouped_gemm.cpp | 19 ++++----- ...05_pvc_gemm_single_b_with_per_col_bias.cpp | 17 +++----- .../05_pvc_gemm_with_epilogue_gelu.cpp | 15 +++---- ...pvc_gemm_with_epilogue_lincombdeeltact.cpp | 17 +++----- .../05_pvc_gemm_with_epilogue_relu.cpp | 15 +++---- .../05_pvc_gemm_with_epilogue_silu.cpp | 15 +++---- .../05_pvc_gemm_with_epilogue_softmax.cpp | 15 +++---- .../05_pvc_gemm_with_per_row_bias.cpp | 17 +++----- .../05_pvc_gemm_with_topk_and_softmax.cpp | 13 ++---- .../pvc_flash_attn_runner.hpp | 29 ++++---------- .../pvc_flash_decode_runner.hpp | 40 +++++-------------- .../07_pvc_dual_gemm/07_pvc_dual_gemm.cpp | 27 +++++-------- .../sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp | 15 +++---- .../09_pvc_gemm_with_sycl_queue.cpp | 15 +++---- .../pvc_flash_attn_cachedKV_runner.hpp | 33 +++++---------- 19 files changed, 118 insertions(+), 248 deletions(-) diff --git a/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp b/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp index d0beff4765..617dbd395c 100644 --- a/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp +++ b/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp @@ -231,16 +231,11 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp b/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp index 160d6b97e2..259f913b90 100644 --- a/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp +++ b/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp @@ -228,16 +228,11 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - } catch(...){ - std::cerr << "Failed to allocate memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp b/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp index 3ee9c17064..617f1eca64 100644 --- a/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp +++ b/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp @@ -391,20 +391,15 @@ struct ExampleRunner { stride_D = cutlass::make_cute_packed_stride(StrideD{}, shape_CD); stride_S = cutlass::make_cute_packed_stride(StrideScale{}, shape_scale_zero); - try{ - block_A.reset(static_cast(M) * K * L); - block_A_dq.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_B_dq.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - block_scale.reset(static_cast(scale_k) * L * dq_mn_size); - block_zero.reset(static_cast(scale_k) * L * dq_mn_size); - } catch(...){ - std::cerr << "Failed to allocate memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(static_cast(M) * K * L); + block_A_dq.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_B_dq.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + block_scale.reset(static_cast(scale_k) * L * dq_mn_size); + block_zero.reset(static_cast(scale_k) * L * dq_mn_size); initialize_mixed_dtype_block(block_A, block_A_dq, seed + 2023); initialize_mixed_dtype_block(block_B, block_B_dq, seed + 2022); diff --git a/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp b/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp index 48dee9a268..b87cf1ce30 100644 --- a/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp +++ b/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp @@ -252,14 +252,9 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); initialize_block(block_C, seed + 2021); diff --git a/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp b/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp index 41ee7197a6..4b3f4bc8ca 100644 --- a/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp +++ b/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp @@ -337,18 +337,13 @@ void allocate(const Options &options) { } - try{ - block_A.reset(total_elements_A); - block_B.reset(total_elements_B); - block_C.reset(total_elements_C); - block_D.reset(total_elements_D); - block_ref_D.reset(total_elements_D); - block_alpha.reset(options.groups); - block_beta.reset(options.groups); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + block_ref_D.reset(total_elements_D); + block_alpha.reset(options.groups); + block_beta.reset(options.groups); } /// Initialize operands to be used in the GEMM and reference GEMM diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp index cc92a9a498..97b9885afe 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp @@ -246,17 +246,12 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(cute::cosize(make_layout(cute::make_shape(M, K, L), stride_A))); - block_B.reset(cute::cosize(make_layout(cute::make_shape(N, K, L), stride_B))); - block_C.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_C))); - block_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D))); - block_ref_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D))); - block_bias.reset(N * L); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(cute::cosize(make_layout(cute::make_shape(M, K, L), stride_A))); + block_B.reset(cute::cosize(make_layout(cute::make_shape(N, K, L), stride_B))); + block_C.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_C))); + block_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D))); + block_ref_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D))); + block_bias.reset(N * L); initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp index c5b38b1fa6..ae8b3dfecc 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp @@ -232,16 +232,11 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp index 7ffabe30ca..802d02d29b 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp @@ -268,17 +268,12 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - block_Aux.reset(static_cast(M) * N * L); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + block_Aux.reset(static_cast(M) * N * L); initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp index fb3a87bba8..3bf1180067 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp @@ -232,16 +232,11 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp index 746c44bfdd..93a294659f 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp @@ -231,16 +231,11 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(M * K * L); - block_B.reset(K * N * L); - block_C.reset(M * N * L); - block_D.reset(M * N * L); - block_ref_D.reset(M * N * L); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp index 0af52e6c56..5ff68c93d1 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp @@ -281,16 +281,11 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); cutlass::reference::device::BlockFillRandomUniform( block_A.get(), block_A.size(), seed + 2023, (ElementA)1, (ElementA)0, 0); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp index 824fab209e..63ad03f705 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp @@ -240,17 +240,12 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - block_bias.reset(static_cast(M) * L); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); + block_bias.reset(static_cast(M) * L); initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp index 87f3dbb560..b4d39816cd 100644 --- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp +++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp @@ -303,15 +303,10 @@ struct Result { auto c_coord = cutlass::make_Coord(options.m * options.l, options.n); auto b_coord = cutlass::make_Coord(options.k, options.n * options.l); - try{ - tensor_A.resize(a_coord); - tensor_B.resize(b_coord); - tensor_D.resize(c_coord); - tensor_ref_D.resize(c_coord); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + tensor_A.resize(a_coord); + tensor_B.resize(b_coord); + tensor_D.resize(c_coord); + tensor_ref_D.resize(c_coord); initialize_tensor(tensor_A.host_view(), seed + 2022); initialize_tensor(tensor_B.host_view(), seed + 2023); diff --git a/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp b/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp index abb0bcb333..b79948960b 100644 --- a/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp +++ b/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp @@ -417,38 +417,23 @@ template struct ExampleRunner { stride_V = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv, batch * num_heads_kv)); stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q)); - try{ - block_Q.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_qk); - block_K.reset(static_cast(batch) * num_heads_kv * seq_len_kv * head_size_qk); - block_V.reset(static_cast(batch) * num_heads_kv * seq_len_kv * head_size_vo); - block_O.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_vo); - block_ref_O.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_vo); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_Q.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_qk); + block_K.reset(static_cast(batch) * num_heads_kv * seq_len_kv * head_size_qk); + block_V.reset(static_cast(batch) * num_heads_kv * seq_len_kv * head_size_vo); + block_O.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_vo); + block_ref_O.reset(static_cast(batch) * num_heads_q * seq_len_qo * head_size_vo); initialize_block(block_Q, seed + 2023); initialize_block(block_K, seed + 2022); initialize_block(block_V, seed + 2021); if (!cumulative_seqlen_q.empty()) { - try{ - device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); device_cumulative_seqlen_q.copy_from_host( cumulative_seqlen_q.data(), cumulative_seqlen_q.size()); } if (!cumulative_seqlen_kv.empty()) { - try{ - device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); device_cumulative_seqlen_kv.copy_from_host( cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size()); } diff --git a/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp b/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp index 3dc509b2f0..81f54b13b2 100644 --- a/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp +++ b/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp @@ -483,18 +483,13 @@ template struct ExampleRunner { stride_V_cache = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv_cache, batch * num_heads_kv)); stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q)); - try{ - block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk); - block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk); - block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo); - block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk); - block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo); - block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); - block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk); + block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk); + block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo); + block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk); + block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo); + block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); + block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); initialize_block(block_Q, seed + 2023); initialize_block(block_K, seed + 2022); @@ -503,33 +498,18 @@ template struct ExampleRunner { initialize_block(block_V_cache, seed + 2025); if (!cumulative_seqlen_q.empty()) { - try{ - device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); device_cumulative_seqlen_q.copy_from_host( cumulative_seqlen_q.data(), cumulative_seqlen_q.size()); } if (!cumulative_seqlen_kv.empty()) { - try{ - device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); device_cumulative_seqlen_kv.copy_from_host( cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size()); } if (!cumulative_seqlen_kv_cache.empty()) { - try{ - device_cumulative_seqlen_kv_cache.reset(cumulative_seqlen_kv_cache.size()); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + device_cumulative_seqlen_kv_cache.reset(cumulative_seqlen_kv_cache.size()); device_cumulative_seqlen_kv_cache.copy_from_host( cumulative_seqlen_kv_cache.data(), cumulative_seqlen_kv_cache.size()); } diff --git a/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp b/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp index ea523f2a2b..a3a4cd5316 100644 --- a/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp +++ b/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp @@ -286,22 +286,17 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(static_cast(M) * K * L); - block_B0.reset(static_cast(K) * N * L); - block_B1.reset(static_cast(K) * N * L); - block_C0.reset(static_cast(M) * N * L); - block_C1.reset(static_cast(M) * N * L); - block_D0.reset(static_cast(M) * N * L); - block_D1.reset(static_cast(M) * N * L); - block_D2.reset(static_cast(M) * N * L); - block_ref_D0.reset(static_cast(M) * N * L); - block_ref_D1.reset(static_cast(M) * N * L); - block_ref_D2.reset(static_cast(M) * N * L); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(static_cast(M) * K * L); + block_B0.reset(static_cast(K) * N * L); + block_B1.reset(static_cast(K) * N * L); + block_C0.reset(static_cast(M) * N * L); + block_C1.reset(static_cast(M) * N * L); + block_D0.reset(static_cast(M) * N * L); + block_D1.reset(static_cast(M) * N * L); + block_D2.reset(static_cast(M) * N * L); + block_ref_D0.reset(static_cast(M) * N * L); + block_ref_D1.reset(static_cast(M) * N * L); + block_ref_D2.reset(static_cast(M) * N * L); initialize_block(block_A, seed + 2023); initialize_block(block_B0, seed + 2022); diff --git a/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp b/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp index 1525eb0a5e..e2c1e64e9e 100644 --- a/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp +++ b/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp @@ -244,16 +244,11 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - try{ - block_A.reset(static_cast(M) * K * L); - block_B.reset(static_cast(K) * N * L); - block_C.reset(static_cast(M) * N * L); - block_D.reset(static_cast(M) * N * L); - block_ref_D.reset(static_cast(M) * N * L); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A.reset(static_cast(M) * K * L); + block_B.reset(static_cast(K) * N * L); + block_C.reset(static_cast(M) * N * L); + block_D.reset(static_cast(M) * N * L); + block_ref_D.reset(static_cast(M) * N * L); initialize_block(block_A, seed + 2023); initialize_block(block_B, seed + 2022); diff --git a/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp b/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp index 06e63930ea..177b738cea 100644 --- a/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp +++ b/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp @@ -166,16 +166,11 @@ struct ExampleRunner { Memory(sycl::queue q, ProblemShapeType problem_shape_MNKL) : q(q) { auto [M, N, K, L] = problem_shape_MNKL; - try{ - block_A = sycl::malloc_device(static_cast(M) * K * L, q); - block_B = sycl::malloc_device(static_cast(N) * K * L, q); - block_C = sycl::malloc_device(static_cast(M) * N * L, q); - block_D = sycl::malloc_device(static_cast(M) * N * L, q); - block_ref_D = sycl::malloc_device(static_cast(M) * N * L, q); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_A = sycl::malloc_device(static_cast(M) * K * L, q); + block_B = sycl::malloc_device(static_cast(N) * K * L, q); + block_C = sycl::malloc_device(static_cast(M) * N * L, q); + block_D = sycl::malloc_device(static_cast(M) * N * L, q); + block_ref_D = sycl::malloc_device(static_cast(M) * N * L, q); } ~Memory() { diff --git a/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp b/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp index 924aa9c429..a11fedbbb4 100644 --- a/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp +++ b/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp @@ -478,18 +478,13 @@ template struct ExampleRunner { stride_V_cache = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv_cache, batch * num_heads_kv)); stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q)); - try{ - block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk); - block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk); - block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo); - block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk); - block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo); - block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); - block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk); + block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk); + block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo); + block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk); + block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo); + block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); + block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); initialize_block(block_Q, seed + 2023); initialize_block(block_K, seed + 2022); @@ -498,22 +493,12 @@ template struct ExampleRunner { initialize_block(block_V_cache, seed + 2025); if (!cumulative_seqlen_q.empty()) { - try{ - device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); device_cumulative_seqlen_q.copy_from_host( cumulative_seqlen_q.data(), cumulative_seqlen_q.size()); } if (!cumulative_seqlen_kv.empty()) { - try{ - device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); - } catch(...){ - std::cerr << "Failed to allocate device memory. Aborting." << std::endl; - std::exit(1); - } + device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); device_cumulative_seqlen_kv.copy_from_host( cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size()); }