From c2ce5e863435633a0a741febfdaa0f208b5c8a61 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.ciglaric@codeplay.com>
Date: Tue, 6 May 2025 12:58:13 +0100
Subject: [PATCH 1/9] reenable tests

---
 .github/workflows/intel_test.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml
index cbd8a8311c..fb6a5aaf54 100644
--- a/.github/workflows/intel_test.yml
+++ b/.github/workflows/intel_test.yml
@@ -97,7 +97,4 @@ jobs:
       - name: Examples
         shell: bash
         run: |
-          # Disable test examples on BMG due to flakiness 
-          if [[ "${{ matrix.GPU }}" != "BMG" ]]; then
-            cmake --build . --target test_examples -j $(nproc)
-          fi
+          cmake --build . --target test_examples -j $(nproc)

From 364083c90590213873d13e6eed86643dc88185cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.ciglaric@codeplay.com>
Date: Tue, 6 May 2025 13:13:16 +0100
Subject: [PATCH 2/9] keep the runner running

---
 .github/workflows/intel_test.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml
index fb6a5aaf54..1b5feceb7d 100644
--- a/.github/workflows/intel_test.yml
+++ b/.github/workflows/intel_test.yml
@@ -97,4 +97,5 @@ jobs:
       - name: Examples
         shell: bash
         run: |
-          cmake --build . --target test_examples -j $(nproc)
+          cmake --build . --target test_examples -j $(nproc) || true
+          sleep 3600

From f02d113779d180bcc7572a591ba75fbff51d9257 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.ciglaric@codeplay.com>
Date: Tue, 6 May 2025 15:15:07 +0100
Subject: [PATCH 3/9] rerun

---
 .github/workflows/intel_test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml
index 1b5feceb7d..e6a7fbb5b7 100644
--- a/.github/workflows/intel_test.yml
+++ b/.github/workflows/intel_test.yml
@@ -99,3 +99,4 @@ jobs:
         run: |
           cmake --build . --target test_examples -j $(nproc) || true
           sleep 3600
+

From 2261c61d31b05a9ee74aec531dfe3f11258e7b0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.ciglaric@codeplay.com>
Date: Wed, 7 May 2025 08:54:40 +0100
Subject: [PATCH 4/9] rerun

---
 .github/workflows/intel_test.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml
index e6a7fbb5b7..1b5feceb7d 100644
--- a/.github/workflows/intel_test.yml
+++ b/.github/workflows/intel_test.yml
@@ -99,4 +99,3 @@ jobs:
         run: |
           cmake --build . --target test_examples -j $(nproc) || true
           sleep 3600
-

From b3357f82a0a4c11d7beb1e3fa9044429ff2db951 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.ciglaric@codeplay.com>
Date: Thu, 8 May 2025 09:02:53 +0100
Subject: [PATCH 5/9] run examples sequentially

---
 .github/workflows/intel_test.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml
index 1b5feceb7d..0232258b05 100644
--- a/.github/workflows/intel_test.yml
+++ b/.github/workflows/intel_test.yml
@@ -97,5 +97,4 @@ jobs:
       - name: Examples
         shell: bash
         run: |
-          cmake --build . --target test_examples -j $(nproc) || true
-          sleep 3600
+          cmake --build . --target test_examples -j 1

From ca79d8acfbe4f3ff7b13c41f11e2d8144e71d36b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.ciglaric@codeplay.com>
Date: Thu, 8 May 2025 09:04:53 +0100
Subject: [PATCH 6/9] detect and print about oom in examples

---
 examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp     | 15 ++++---
 .../01_pvc_gemm_with_collective_builder.cpp   | 15 ++++---
 .../02_pvc_gemm_mixed_dtype.cpp               | 23 ++++++-----
 .../03_pvc_gemm_streamk.cpp                   | 11 +++--
 .../04_pvc_grouped_gemm.cpp                   | 19 +++++----
 ...05_pvc_gemm_single_b_with_per_col_bias.cpp | 17 +++++---
 .../05_pvc_gemm_with_epilogue_gelu.cpp        | 15 ++++---
 ...pvc_gemm_with_epilogue_lincombdeeltact.cpp | 17 +++++---
 .../05_pvc_gemm_with_epilogue_relu.cpp        | 15 ++++---
 .../05_pvc_gemm_with_epilogue_silu.cpp        | 15 ++++---
 .../05_pvc_gemm_with_epilogue_softmax.cpp     | 15 ++++---
 .../05_pvc_gemm_with_per_row_bias.cpp         | 17 +++++---
 .../05_pvc_gemm_with_topk_and_softmax.cpp     | 13 ++++--
 .../pvc_flash_attn_runner.hpp                 | 29 ++++++++++----
 .../pvc_flash_decode_runner.hpp               | 40 ++++++++++++++-----
 .../07_pvc_dual_gemm/07_pvc_dual_gemm.cpp     | 27 ++++++++-----
 .../sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp    | 15 ++++---
 .../09_pvc_gemm_with_sycl_queue.cpp           | 15 ++++---
 .../pvc_flash_attn_cachedKV_runner.hpp        | 33 ++++++++++-----
 19 files changed, 248 insertions(+), 118 deletions(-)

diff --git a/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp b/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp
index 617dbd395c..d0beff4765 100644
--- a/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp
+++ b/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp
@@ -231,11 +231,16 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(static_cast<std::size_t>(M) * K * L);
-    block_B.reset(static_cast<std::size_t>(K) * N * L);
-    block_C.reset(static_cast<std::size_t>(M) * N * L);
-    block_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    try{
+      block_A.reset(static_cast<std::size_t>(M) * K * L);
+      block_B.reset(static_cast<std::size_t>(K) * N * L);
+      block_C.reset(static_cast<std::size_t>(M) * N * L);
+      block_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp b/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp
index 259f913b90..160d6b97e2 100644
--- a/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp
+++ b/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp
@@ -228,11 +228,16 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(static_cast<std::size_t>(M) * K * L);
-    block_B.reset(static_cast<std::size_t>(K) * N * L);
-    block_C.reset(static_cast<std::size_t>(M) * N * L);
-    block_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    try{
+      block_A.reset(static_cast<std::size_t>(M) * K * L);
+      block_B.reset(static_cast<std::size_t>(K) * N * L);
+      block_C.reset(static_cast<std::size_t>(M) * N * L);
+      block_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    } catch(...){
+      std::cerr << "Failed to allocate memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp b/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp
index 617f1eca64..3ee9c17064 100644
--- a/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp
+++ b/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp
@@ -391,15 +391,20 @@ struct ExampleRunner {
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, shape_CD);
     stride_S = cutlass::make_cute_packed_stride(StrideScale{}, shape_scale_zero);
 
-    block_A.reset(static_cast<std::size_t>(M) * K * L);
-    block_A_dq.reset(static_cast<std::size_t>(M) * K * L);
-    block_B.reset(static_cast<std::size_t>(K) * N * L);
-    block_B_dq.reset(static_cast<std::size_t>(K) * N * L);
-    block_C.reset(static_cast<std::size_t>(M) * N * L);
-    block_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_scale.reset(static_cast<std::size_t>(scale_k) * L * dq_mn_size);
-    block_zero.reset(static_cast<std::size_t>(scale_k) * L * dq_mn_size);
+    try{
+      block_A.reset(static_cast<std::size_t>(M) * K * L);
+      block_A_dq.reset(static_cast<std::size_t>(M) * K * L);
+      block_B.reset(static_cast<std::size_t>(K) * N * L);
+      block_B_dq.reset(static_cast<std::size_t>(K) * N * L);
+      block_C.reset(static_cast<std::size_t>(M) * N * L);
+      block_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_scale.reset(static_cast<std::size_t>(scale_k) * L * dq_mn_size);
+      block_zero.reset(static_cast<std::size_t>(scale_k) * L * dq_mn_size);
+    } catch(...){
+      std::cerr << "Failed to allocate memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_mixed_dtype_block(block_A, block_A_dq, seed + 2023);
     initialize_mixed_dtype_block(block_B, block_B_dq, seed + 2022);
diff --git a/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp b/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp
index b87cf1ce30..48dee9a268 100644
--- a/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp
+++ b/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp
@@ -252,9 +252,14 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(static_cast<std::size_t>(M) * K * L);
-    block_B.reset(static_cast<std::size_t>(K) * N * L);
-    block_C.reset(static_cast<std::size_t>(M) * N * L);
+    try{
+      block_A.reset(static_cast<std::size_t>(M) * K * L);
+      block_B.reset(static_cast<std::size_t>(K) * N * L);
+      block_C.reset(static_cast<std::size_t>(M) * N * L);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
     initialize_block(block_C, seed + 2021);
diff --git a/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp b/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp
index 4b3f4bc8ca..41ee7197a6 100644
--- a/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp
+++ b/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp
@@ -337,13 +337,18 @@ void allocate(const Options &options) {
 
   }
 
-  block_A.reset(total_elements_A);
-  block_B.reset(total_elements_B);
-  block_C.reset(total_elements_C);
-  block_D.reset(total_elements_D);
-  block_ref_D.reset(total_elements_D);
-  block_alpha.reset(options.groups);
-  block_beta.reset(options.groups);
+  try{
+    block_A.reset(total_elements_A);
+    block_B.reset(total_elements_B);
+    block_C.reset(total_elements_C);
+    block_D.reset(total_elements_D);
+    block_ref_D.reset(total_elements_D);
+    block_alpha.reset(options.groups);
+    block_beta.reset(options.groups);
+  } catch(...){
+    std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+    std::exit(1);
+  }
 }
 
 /// Initialize operands to be used in the GEMM and reference GEMM
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp
index 97b9885afe..cc92a9a498 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp
@@ -246,12 +246,17 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(cute::cosize(make_layout(cute::make_shape(M, K, L), stride_A)));
-    block_B.reset(cute::cosize(make_layout(cute::make_shape(N, K, L), stride_B)));
-    block_C.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_C)));
-    block_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D)));
-    block_ref_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D)));
-    block_bias.reset(N * L);
+    try{
+      block_A.reset(cute::cosize(make_layout(cute::make_shape(M, K, L), stride_A)));
+      block_B.reset(cute::cosize(make_layout(cute::make_shape(N, K, L), stride_B)));
+      block_C.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_C)));
+      block_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D)));
+      block_ref_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D)));
+      block_bias.reset(N * L);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp
index ae8b3dfecc..c5b38b1fa6 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp
@@ -232,11 +232,16 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(static_cast<std::size_t>(M) * K * L);
-    block_B.reset(static_cast<std::size_t>(K) * N * L);
-    block_C.reset(static_cast<std::size_t>(M) * N * L);
-    block_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    try{
+      block_A.reset(static_cast<std::size_t>(M) * K * L);
+      block_B.reset(static_cast<std::size_t>(K) * N * L);
+      block_C.reset(static_cast<std::size_t>(M) * N * L);
+      block_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp
index 802d02d29b..7ffabe30ca 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp
@@ -268,12 +268,17 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(static_cast<std::size_t>(M) * K * L);
-    block_B.reset(static_cast<std::size_t>(K) * N * L);
-    block_C.reset(static_cast<std::size_t>(M) * N * L);
-    block_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_Aux.reset(static_cast<std::size_t>(M) * N * L);
+    try{
+      block_A.reset(static_cast<std::size_t>(M) * K * L);
+      block_B.reset(static_cast<std::size_t>(K) * N * L);
+      block_C.reset(static_cast<std::size_t>(M) * N * L);
+      block_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_Aux.reset(static_cast<std::size_t>(M) * N * L);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp
index 3bf1180067..fb3a87bba8 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp
@@ -232,11 +232,16 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(static_cast<std::size_t>(M) * K * L);
-    block_B.reset(static_cast<std::size_t>(K) * N * L);
-    block_C.reset(static_cast<std::size_t>(M) * N * L);
-    block_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    try{
+      block_A.reset(static_cast<std::size_t>(M) * K * L);
+      block_B.reset(static_cast<std::size_t>(K) * N * L);
+      block_C.reset(static_cast<std::size_t>(M) * N * L);
+      block_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp
index 93a294659f..746c44bfdd 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp
@@ -231,11 +231,16 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(M * K * L);
-    block_B.reset(K * N * L);
-    block_C.reset(M * N * L);
-    block_D.reset(M * N * L);
-    block_ref_D.reset(M * N * L);
+    try{
+      block_A.reset(M * K * L);
+      block_B.reset(K * N * L);
+      block_C.reset(M * N * L);
+      block_D.reset(M * N * L);
+      block_ref_D.reset(M * N * L);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp
index 5ff68c93d1..0af52e6c56 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp
@@ -281,11 +281,16 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(static_cast<std::size_t>(M) * K * L);
-    block_B.reset(static_cast<std::size_t>(K) * N * L);
-    block_C.reset(static_cast<std::size_t>(M) * N * L);
-    block_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    try{
+      block_A.reset(static_cast<std::size_t>(M) * K * L);
+      block_B.reset(static_cast<std::size_t>(K) * N * L);
+      block_C.reset(static_cast<std::size_t>(M) * N * L);
+      block_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     cutlass::reference::device::BlockFillRandomUniform(
        block_A.get(), block_A.size(), seed + 2023, (ElementA)1, (ElementA)0, 0);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp
index 63ad03f705..824fab209e 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp
@@ -240,12 +240,17 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(static_cast<std::size_t>(M) * K * L);
-    block_B.reset(static_cast<std::size_t>(K) * N * L);
-    block_C.reset(static_cast<std::size_t>(M) * N * L);
-    block_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_bias.reset(static_cast<std::size_t>(M) * L);
+    try{
+      block_A.reset(static_cast<std::size_t>(M) * K * L);
+      block_B.reset(static_cast<std::size_t>(K) * N * L);
+      block_C.reset(static_cast<std::size_t>(M) * N * L);
+      block_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_bias.reset(static_cast<std::size_t>(M) * L);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp
index b4d39816cd..87f3dbb560 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp
@@ -303,10 +303,15 @@ struct Result {
     auto c_coord = cutlass::make_Coord(options.m * options.l, options.n);
     auto b_coord = cutlass::make_Coord(options.k, options.n * options.l);
 
-    tensor_A.resize(a_coord);
-    tensor_B.resize(b_coord);
-    tensor_D.resize(c_coord);
-    tensor_ref_D.resize(c_coord);
+    try{
+      tensor_A.resize(a_coord);
+      tensor_B.resize(b_coord);
+      tensor_D.resize(c_coord);
+      tensor_ref_D.resize(c_coord);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_tensor(tensor_A.host_view(), seed + 2022);
     initialize_tensor(tensor_B.host_view(), seed + 2023);
diff --git a/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp b/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp
index b79948960b..abb0bcb333 100644
--- a/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp
+++ b/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp
@@ -417,23 +417,38 @@ template <class GemmKernel, bool isVarLen> struct ExampleRunner {
     stride_V = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv, batch * num_heads_kv));
     stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q));
 
-    block_Q.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_qk);
-    block_K.reset(static_cast<std::size_t>(batch) * num_heads_kv * seq_len_kv * head_size_qk);
-    block_V.reset(static_cast<std::size_t>(batch) * num_heads_kv * seq_len_kv * head_size_vo);
-    block_O.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_vo);
-    block_ref_O.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_vo);
+    try{
+      block_Q.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_qk);
+      block_K.reset(static_cast<std::size_t>(batch) * num_heads_kv * seq_len_kv * head_size_qk);
+      block_V.reset(static_cast<std::size_t>(batch) * num_heads_kv * seq_len_kv * head_size_vo);
+      block_O.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_vo);
+      block_ref_O.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_vo);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_Q, seed + 2023);
     initialize_block(block_K, seed + 2022);
     initialize_block(block_V, seed + 2021);
 
     if (!cumulative_seqlen_q.empty()) {
-      device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
+      try{
+        device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
+      } catch(...){
+        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+        std::exit(1);
+      }
       device_cumulative_seqlen_q.copy_from_host(
         cumulative_seqlen_q.data(), cumulative_seqlen_q.size());
     }
     if (!cumulative_seqlen_kv.empty()) {
-      device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
+      try{
+        device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
+      } catch(...){
+        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+        std::exit(1);
+      }
       device_cumulative_seqlen_kv.copy_from_host(
         cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size());
     }
diff --git a/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp b/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp
index 81f54b13b2..3dc509b2f0 100644
--- a/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp
+++ b/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp
@@ -483,13 +483,18 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
     stride_V_cache = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv_cache, batch * num_heads_kv));
     stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q));
 
-    block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk);
-    block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk);
-    block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo);
-    block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk);
-    block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo);
-    block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
-    block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
+    try{
+      block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk);
+      block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk);
+      block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo);
+      block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk);
+      block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo);
+      block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
+      block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_Q, seed + 2023);
     initialize_block(block_K, seed + 2022);
@@ -498,18 +503,33 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
     initialize_block(block_V_cache, seed + 2025);
 
     if (!cumulative_seqlen_q.empty()) {
-      device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
+      try{
+        device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
+      } catch(...){
+        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+        std::exit(1);
+      }
       device_cumulative_seqlen_q.copy_from_host(
         cumulative_seqlen_q.data(), cumulative_seqlen_q.size());
     }
     if (!cumulative_seqlen_kv.empty()) {
-      device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
+      try{
+        device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
+      } catch(...){
+        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+        std::exit(1);
+      }
       device_cumulative_seqlen_kv.copy_from_host(
         cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size());
     }
 
     if (!cumulative_seqlen_kv_cache.empty()) {
-      device_cumulative_seqlen_kv_cache.reset(cumulative_seqlen_kv_cache.size());
+      try{
+        device_cumulative_seqlen_kv_cache.reset(cumulative_seqlen_kv_cache.size());
+      } catch(...){
+        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+        std::exit(1);
+      }
       device_cumulative_seqlen_kv_cache.copy_from_host(
         cumulative_seqlen_kv_cache.data(), cumulative_seqlen_kv_cache.size());
     }
diff --git a/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp b/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp
index a3a4cd5316..ea523f2a2b 100644
--- a/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp
+++ b/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp
@@ -286,17 +286,22 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(static_cast<std::size_t>(M) * K * L);
-    block_B0.reset(static_cast<std::size_t>(K) * N * L);
-    block_B1.reset(static_cast<std::size_t>(K) * N * L);
-    block_C0.reset(static_cast<std::size_t>(M) * N * L);
-    block_C1.reset(static_cast<std::size_t>(M) * N * L);
-    block_D0.reset(static_cast<std::size_t>(M) * N * L);
-    block_D1.reset(static_cast<std::size_t>(M) * N * L);
-    block_D2.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D0.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D1.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D2.reset(static_cast<std::size_t>(M) * N * L);
+    try{
+      block_A.reset(static_cast<std::size_t>(M) * K * L);
+      block_B0.reset(static_cast<std::size_t>(K) * N * L);
+      block_B1.reset(static_cast<std::size_t>(K) * N * L);
+      block_C0.reset(static_cast<std::size_t>(M) * N * L);
+      block_C1.reset(static_cast<std::size_t>(M) * N * L);
+      block_D0.reset(static_cast<std::size_t>(M) * N * L);
+      block_D1.reset(static_cast<std::size_t>(M) * N * L);
+      block_D2.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D0.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D1.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D2.reset(static_cast<std::size_t>(M) * N * L);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B0, seed + 2022);
diff --git a/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp b/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp
index e2c1e64e9e..1525eb0a5e 100644
--- a/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp
+++ b/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp
@@ -244,11 +244,16 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    block_A.reset(static_cast<std::size_t>(M) * K * L);
-    block_B.reset(static_cast<std::size_t>(K) * N * L);
-    block_C.reset(static_cast<std::size_t>(M) * N * L);
-    block_D.reset(static_cast<std::size_t>(M) * N * L);
-    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    try{
+      block_A.reset(static_cast<std::size_t>(M) * K * L);
+      block_B.reset(static_cast<std::size_t>(K) * N * L);
+      block_C.reset(static_cast<std::size_t>(M) * N * L);
+      block_D.reset(static_cast<std::size_t>(M) * N * L);
+      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp b/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp
index 177b738cea..06e63930ea 100644
--- a/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp
+++ b/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp
@@ -166,11 +166,16 @@ struct ExampleRunner {
 
     Memory(sycl::queue q, ProblemShapeType problem_shape_MNKL) : q(q) {
       auto [M, N, K, L] = problem_shape_MNKL;
-      block_A = sycl::malloc_device<ElementA>(static_cast<std::size_t>(M) * K * L, q);
-      block_B = sycl::malloc_device<ElementB>(static_cast<std::size_t>(N) * K * L, q);
-      block_C = sycl::malloc_device<ElementC>(static_cast<std::size_t>(M) * N * L, q);
-      block_D = sycl::malloc_device<ElementOutput>(static_cast<std::size_t>(M) * N * L, q);
-      block_ref_D = sycl::malloc_device<ElementOutput>(static_cast<std::size_t>(M) * N * L, q);
+      try{
+        block_A = sycl::malloc_device<ElementA>(static_cast<std::size_t>(M) * K * L, q);
+        block_B = sycl::malloc_device<ElementB>(static_cast<std::size_t>(N) * K * L, q);
+        block_C = sycl::malloc_device<ElementC>(static_cast<std::size_t>(M) * N * L, q);
+        block_D = sycl::malloc_device<ElementOutput>(static_cast<std::size_t>(M) * N * L, q);
+        block_ref_D = sycl::malloc_device<ElementOutput>(static_cast<std::size_t>(M) * N * L, q);
+      } catch(...){
+        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+        std::exit(1);
+      }
     }
 
     ~Memory() {
diff --git a/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp b/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp
index a11fedbbb4..924aa9c429 100644
--- a/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp
+++ b/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp
@@ -478,13 +478,18 @@ template <class GemmKernel, bool isVarLen> struct ExampleRunner {
     stride_V_cache = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv_cache, batch * num_heads_kv));
     stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q));
 
-    block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk);
-    block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk);
-    block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo);
-    block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk);
-    block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo);
-    block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
-    block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
+    try{
+      block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk);
+      block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk);
+      block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo);
+      block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk);
+      block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo);
+      block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
+      block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
+    } catch(...){
+      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+      std::exit(1);
+    }
 
     initialize_block(block_Q, seed + 2023);
     initialize_block(block_K, seed + 2022);
@@ -493,12 +498,22 @@ template <class GemmKernel, bool isVarLen> struct ExampleRunner {
     initialize_block(block_V_cache, seed + 2025);
 
     if (!cumulative_seqlen_q.empty()) {
-      device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
+      try{
+        device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
+      } catch(...){
+        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+        std::exit(1);
+      }
       device_cumulative_seqlen_q.copy_from_host(
         cumulative_seqlen_q.data(), cumulative_seqlen_q.size());
     }
     if (!cumulative_seqlen_kv.empty()) {
-      device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
+      try{
+        device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
+      } catch(...){
+        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
+        std::exit(1);
+      }
       device_cumulative_seqlen_kv.copy_from_host(
         cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size());
     }

From 3b5009ada877e57b27de33dbdbad193993eb86eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.ciglaric@codeplay.com>
Date: Thu, 8 May 2025 09:40:44 +0100
Subject: [PATCH 7/9] rerun

---
 .github/workflows/intel_test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml
index 0232258b05..00fd91a77a 100644
--- a/.github/workflows/intel_test.yml
+++ b/.github/workflows/intel_test.yml
@@ -98,3 +98,4 @@ jobs:
         shell: bash
         run: |
           cmake --build . --target test_examples -j 1
+

From 4a7c6c86e7a11ef77f84138686fc555c790d0fd3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.ciglaric@codeplay.com>
Date: Thu, 8 May 2025 10:17:44 +0100
Subject: [PATCH 8/9] rerun

---
 .github/workflows/intel_test.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/intel_test.yml b/.github/workflows/intel_test.yml
index 00fd91a77a..0232258b05 100644
--- a/.github/workflows/intel_test.yml
+++ b/.github/workflows/intel_test.yml
@@ -98,4 +98,3 @@ jobs:
         shell: bash
         run: |
           cmake --build . --target test_examples -j 1
-

From 81be124022449fcf18311b51f6365990fe4c1109 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.ciglaric@codeplay.com>
Date: Tue, 20 May 2025 09:53:37 +0100
Subject: [PATCH 9/9] Revert "detect and print about oom in examples"

This reverts commit ca79d8acfbe4f3ff7b13c41f11e2d8144e71d36b.
---
 examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp     | 15 +++----
 .../01_pvc_gemm_with_collective_builder.cpp   | 15 +++----
 .../02_pvc_gemm_mixed_dtype.cpp               | 23 +++++------
 .../03_pvc_gemm_streamk.cpp                   | 11 ++---
 .../04_pvc_grouped_gemm.cpp                   | 19 ++++-----
 ...05_pvc_gemm_single_b_with_per_col_bias.cpp | 17 +++-----
 .../05_pvc_gemm_with_epilogue_gelu.cpp        | 15 +++----
 ...pvc_gemm_with_epilogue_lincombdeeltact.cpp | 17 +++-----
 .../05_pvc_gemm_with_epilogue_relu.cpp        | 15 +++----
 .../05_pvc_gemm_with_epilogue_silu.cpp        | 15 +++----
 .../05_pvc_gemm_with_epilogue_softmax.cpp     | 15 +++----
 .../05_pvc_gemm_with_per_row_bias.cpp         | 17 +++-----
 .../05_pvc_gemm_with_topk_and_softmax.cpp     | 13 ++----
 .../pvc_flash_attn_runner.hpp                 | 29 ++++----------
 .../pvc_flash_decode_runner.hpp               | 40 +++++--------------
 .../07_pvc_dual_gemm/07_pvc_dual_gemm.cpp     | 27 +++++--------
 .../sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp    | 15 +++----
 .../09_pvc_gemm_with_sycl_queue.cpp           | 15 +++----
 .../pvc_flash_attn_cachedKV_runner.hpp        | 33 +++++----------
 19 files changed, 118 insertions(+), 248 deletions(-)

diff --git a/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp b/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp
index d0beff4765..617dbd395c 100644
--- a/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp
+++ b/examples/sycl/00_pvc_gemm/00_pvc_gemm.cpp
@@ -231,16 +231,11 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(static_cast<std::size_t>(M) * K * L);
-      block_B.reset(static_cast<std::size_t>(K) * N * L);
-      block_C.reset(static_cast<std::size_t>(M) * N * L);
-      block_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(static_cast<std::size_t>(M) * K * L);
+    block_B.reset(static_cast<std::size_t>(K) * N * L);
+    block_C.reset(static_cast<std::size_t>(M) * N * L);
+    block_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp b/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp
index 160d6b97e2..259f913b90 100644
--- a/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp
+++ b/examples/sycl/01_pvc_gemm_with_collective_builder/01_pvc_gemm_with_collective_builder.cpp
@@ -228,16 +228,11 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(static_cast<std::size_t>(M) * K * L);
-      block_B.reset(static_cast<std::size_t>(K) * N * L);
-      block_C.reset(static_cast<std::size_t>(M) * N * L);
-      block_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-    } catch(...){
-      std::cerr << "Failed to allocate memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(static_cast<std::size_t>(M) * K * L);
+    block_B.reset(static_cast<std::size_t>(K) * N * L);
+    block_C.reset(static_cast<std::size_t>(M) * N * L);
+    block_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp b/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp
index 3ee9c17064..617f1eca64 100644
--- a/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp
+++ b/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp
@@ -391,20 +391,15 @@ struct ExampleRunner {
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, shape_CD);
     stride_S = cutlass::make_cute_packed_stride(StrideScale{}, shape_scale_zero);
 
-    try{
-      block_A.reset(static_cast<std::size_t>(M) * K * L);
-      block_A_dq.reset(static_cast<std::size_t>(M) * K * L);
-      block_B.reset(static_cast<std::size_t>(K) * N * L);
-      block_B_dq.reset(static_cast<std::size_t>(K) * N * L);
-      block_C.reset(static_cast<std::size_t>(M) * N * L);
-      block_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_scale.reset(static_cast<std::size_t>(scale_k) * L * dq_mn_size);
-      block_zero.reset(static_cast<std::size_t>(scale_k) * L * dq_mn_size);
-    } catch(...){
-      std::cerr << "Failed to allocate memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(static_cast<std::size_t>(M) * K * L);
+    block_A_dq.reset(static_cast<std::size_t>(M) * K * L);
+    block_B.reset(static_cast<std::size_t>(K) * N * L);
+    block_B_dq.reset(static_cast<std::size_t>(K) * N * L);
+    block_C.reset(static_cast<std::size_t>(M) * N * L);
+    block_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_scale.reset(static_cast<std::size_t>(scale_k) * L * dq_mn_size);
+    block_zero.reset(static_cast<std::size_t>(scale_k) * L * dq_mn_size);
 
     initialize_mixed_dtype_block(block_A, block_A_dq, seed + 2023);
     initialize_mixed_dtype_block(block_B, block_B_dq, seed + 2022);
diff --git a/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp b/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp
index 48dee9a268..b87cf1ce30 100644
--- a/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp
+++ b/examples/sycl/03_pvc_gemm_streamk/03_pvc_gemm_streamk.cpp
@@ -252,14 +252,9 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(static_cast<std::size_t>(M) * K * L);
-      block_B.reset(static_cast<std::size_t>(K) * N * L);
-      block_C.reset(static_cast<std::size_t>(M) * N * L);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(static_cast<std::size_t>(M) * K * L);
+    block_B.reset(static_cast<std::size_t>(K) * N * L);
+    block_C.reset(static_cast<std::size_t>(M) * N * L);
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
     initialize_block(block_C, seed + 2021);
diff --git a/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp b/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp
index 41ee7197a6..4b3f4bc8ca 100644
--- a/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp
+++ b/examples/sycl/04_pvc_grouped_gemm/04_pvc_grouped_gemm.cpp
@@ -337,18 +337,13 @@ void allocate(const Options &options) {
 
   }
 
-  try{
-    block_A.reset(total_elements_A);
-    block_B.reset(total_elements_B);
-    block_C.reset(total_elements_C);
-    block_D.reset(total_elements_D);
-    block_ref_D.reset(total_elements_D);
-    block_alpha.reset(options.groups);
-    block_beta.reset(options.groups);
-  } catch(...){
-    std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-    std::exit(1);
-  }
+  block_A.reset(total_elements_A);
+  block_B.reset(total_elements_B);
+  block_C.reset(total_elements_C);
+  block_D.reset(total_elements_D);
+  block_ref_D.reset(total_elements_D);
+  block_alpha.reset(options.groups);
+  block_beta.reset(options.groups);
 }
 
 /// Initialize operands to be used in the GEMM and reference GEMM
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp
index cc92a9a498..97b9885afe 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_single_b_with_per_col_bias.cpp
@@ -246,17 +246,12 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(cute::cosize(make_layout(cute::make_shape(M, K, L), stride_A)));
-      block_B.reset(cute::cosize(make_layout(cute::make_shape(N, K, L), stride_B)));
-      block_C.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_C)));
-      block_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D)));
-      block_ref_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D)));
-      block_bias.reset(N * L);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(cute::cosize(make_layout(cute::make_shape(M, K, L), stride_A)));
+    block_B.reset(cute::cosize(make_layout(cute::make_shape(N, K, L), stride_B)));
+    block_C.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_C)));
+    block_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D)));
+    block_ref_D.reset(cute::cosize(make_layout(cute::make_shape(M, N, L), stride_D)));
+    block_bias.reset(N * L);
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp
index c5b38b1fa6..ae8b3dfecc 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_gelu.cpp
@@ -232,16 +232,11 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(static_cast<std::size_t>(M) * K * L);
-      block_B.reset(static_cast<std::size_t>(K) * N * L);
-      block_C.reset(static_cast<std::size_t>(M) * N * L);
-      block_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(static_cast<std::size_t>(M) * K * L);
+    block_B.reset(static_cast<std::size_t>(K) * N * L);
+    block_C.reset(static_cast<std::size_t>(M) * N * L);
+    block_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp
index 7ffabe30ca..802d02d29b 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_lincombdeeltact.cpp
@@ -268,17 +268,12 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(static_cast<std::size_t>(M) * K * L);
-      block_B.reset(static_cast<std::size_t>(K) * N * L);
-      block_C.reset(static_cast<std::size_t>(M) * N * L);
-      block_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_Aux.reset(static_cast<std::size_t>(M) * N * L);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(static_cast<std::size_t>(M) * K * L);
+    block_B.reset(static_cast<std::size_t>(K) * N * L);
+    block_C.reset(static_cast<std::size_t>(M) * N * L);
+    block_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_Aux.reset(static_cast<std::size_t>(M) * N * L);
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp
index fb3a87bba8..3bf1180067 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_relu.cpp
@@ -232,16 +232,11 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(static_cast<std::size_t>(M) * K * L);
-      block_B.reset(static_cast<std::size_t>(K) * N * L);
-      block_C.reset(static_cast<std::size_t>(M) * N * L);
-      block_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(static_cast<std::size_t>(M) * K * L);
+    block_B.reset(static_cast<std::size_t>(K) * N * L);
+    block_C.reset(static_cast<std::size_t>(M) * N * L);
+    block_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp
index 746c44bfdd..93a294659f 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_silu.cpp
@@ -231,16 +231,11 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(M * K * L);
-      block_B.reset(K * N * L);
-      block_C.reset(M * N * L);
-      block_D.reset(M * N * L);
-      block_ref_D.reset(M * N * L);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(M * K * L);
+    block_B.reset(K * N * L);
+    block_C.reset(M * N * L);
+    block_D.reset(M * N * L);
+    block_ref_D.reset(M * N * L);
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp
index 0af52e6c56..5ff68c93d1 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_epilogue_softmax.cpp
@@ -281,16 +281,11 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(static_cast<std::size_t>(M) * K * L);
-      block_B.reset(static_cast<std::size_t>(K) * N * L);
-      block_C.reset(static_cast<std::size_t>(M) * N * L);
-      block_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(static_cast<std::size_t>(M) * K * L);
+    block_B.reset(static_cast<std::size_t>(K) * N * L);
+    block_C.reset(static_cast<std::size_t>(M) * N * L);
+    block_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
 
     cutlass::reference::device::BlockFillRandomUniform(
        block_A.get(), block_A.size(), seed + 2023, (ElementA)1, (ElementA)0, 0);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp
index 824fab209e..63ad03f705 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_per_row_bias.cpp
@@ -240,17 +240,12 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(static_cast<std::size_t>(M) * K * L);
-      block_B.reset(static_cast<std::size_t>(K) * N * L);
-      block_C.reset(static_cast<std::size_t>(M) * N * L);
-      block_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_bias.reset(static_cast<std::size_t>(M) * L);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(static_cast<std::size_t>(M) * K * L);
+    block_B.reset(static_cast<std::size_t>(K) * N * L);
+    block_C.reset(static_cast<std::size_t>(M) * N * L);
+    block_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_bias.reset(static_cast<std::size_t>(M) * L);
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp
index 87f3dbb560..b4d39816cd 100644
--- a/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp
+++ b/examples/sycl/05_pvc_gemm_with_epilogues/05_pvc_gemm_with_topk_and_softmax.cpp
@@ -303,15 +303,10 @@ struct Result {
     auto c_coord = cutlass::make_Coord(options.m * options.l, options.n);
     auto b_coord = cutlass::make_Coord(options.k, options.n * options.l);
 
-    try{
-      tensor_A.resize(a_coord);
-      tensor_B.resize(b_coord);
-      tensor_D.resize(c_coord);
-      tensor_ref_D.resize(c_coord);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    tensor_A.resize(a_coord);
+    tensor_B.resize(b_coord);
+    tensor_D.resize(c_coord);
+    tensor_ref_D.resize(c_coord);
 
     initialize_tensor(tensor_A.host_view(), seed + 2022);
     initialize_tensor(tensor_B.host_view(), seed + 2023);
diff --git a/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp b/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp
index abb0bcb333..b79948960b 100644
--- a/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp
+++ b/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp
@@ -417,38 +417,23 @@ template <class GemmKernel, bool isVarLen> struct ExampleRunner {
     stride_V = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv, batch * num_heads_kv));
     stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q));
 
-    try{
-      block_Q.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_qk);
-      block_K.reset(static_cast<std::size_t>(batch) * num_heads_kv * seq_len_kv * head_size_qk);
-      block_V.reset(static_cast<std::size_t>(batch) * num_heads_kv * seq_len_kv * head_size_vo);
-      block_O.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_vo);
-      block_ref_O.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_vo);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_Q.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_qk);
+    block_K.reset(static_cast<std::size_t>(batch) * num_heads_kv * seq_len_kv * head_size_qk);
+    block_V.reset(static_cast<std::size_t>(batch) * num_heads_kv * seq_len_kv * head_size_vo);
+    block_O.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_vo);
+    block_ref_O.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo * head_size_vo);
 
     initialize_block(block_Q, seed + 2023);
     initialize_block(block_K, seed + 2022);
     initialize_block(block_V, seed + 2021);
 
     if (!cumulative_seqlen_q.empty()) {
-      try{
-        device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
-      } catch(...){
-        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-        std::exit(1);
-      }
+      device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
       device_cumulative_seqlen_q.copy_from_host(
         cumulative_seqlen_q.data(), cumulative_seqlen_q.size());
     }
     if (!cumulative_seqlen_kv.empty()) {
-      try{
-        device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
-      } catch(...){
-        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-        std::exit(1);
-      }
+      device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
       device_cumulative_seqlen_kv.copy_from_host(
         cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size());
     }
diff --git a/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp b/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp
index 3dc509b2f0..81f54b13b2 100644
--- a/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp
+++ b/examples/sycl/06_pvc_flash_attention/pvc_flash_decode_runner.hpp
@@ -483,18 +483,13 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
     stride_V_cache = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv_cache, batch * num_heads_kv));
     stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q));
 
-    try{
-      block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk);
-      block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk);
-      block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo);
-      block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk);
-      block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo);
-      block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
-      block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk);
+    block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk);
+    block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo);
+    block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk);
+    block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo);
+    block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
+    block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
 
     initialize_block(block_Q, seed + 2023);
     initialize_block(block_K, seed + 2022);
@@ -503,33 +498,18 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
     initialize_block(block_V_cache, seed + 2025);
 
     if (!cumulative_seqlen_q.empty()) {
-      try{
-        device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
-      } catch(...){
-        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-        std::exit(1);
-      }
+      device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
       device_cumulative_seqlen_q.copy_from_host(
         cumulative_seqlen_q.data(), cumulative_seqlen_q.size());
     }
     if (!cumulative_seqlen_kv.empty()) {
-      try{
-        device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
-      } catch(...){
-        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-        std::exit(1);
-      }
+      device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
       device_cumulative_seqlen_kv.copy_from_host(
         cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size());
     }
 
     if (!cumulative_seqlen_kv_cache.empty()) {
-      try{
-        device_cumulative_seqlen_kv_cache.reset(cumulative_seqlen_kv_cache.size());
-      } catch(...){
-        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-        std::exit(1);
-      }
+      device_cumulative_seqlen_kv_cache.reset(cumulative_seqlen_kv_cache.size());
       device_cumulative_seqlen_kv_cache.copy_from_host(
         cumulative_seqlen_kv_cache.data(), cumulative_seqlen_kv_cache.size());
     }
diff --git a/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp b/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp
index ea523f2a2b..a3a4cd5316 100644
--- a/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp
+++ b/examples/sycl/07_pvc_dual_gemm/07_pvc_dual_gemm.cpp
@@ -286,22 +286,17 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(static_cast<std::size_t>(M) * K * L);
-      block_B0.reset(static_cast<std::size_t>(K) * N * L);
-      block_B1.reset(static_cast<std::size_t>(K) * N * L);
-      block_C0.reset(static_cast<std::size_t>(M) * N * L);
-      block_C1.reset(static_cast<std::size_t>(M) * N * L);
-      block_D0.reset(static_cast<std::size_t>(M) * N * L);
-      block_D1.reset(static_cast<std::size_t>(M) * N * L);
-      block_D2.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D0.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D1.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D2.reset(static_cast<std::size_t>(M) * N * L);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(static_cast<std::size_t>(M) * K * L);
+    block_B0.reset(static_cast<std::size_t>(K) * N * L);
+    block_B1.reset(static_cast<std::size_t>(K) * N * L);
+    block_C0.reset(static_cast<std::size_t>(M) * N * L);
+    block_C1.reset(static_cast<std::size_t>(M) * N * L);
+    block_D0.reset(static_cast<std::size_t>(M) * N * L);
+    block_D1.reset(static_cast<std::size_t>(M) * N * L);
+    block_D2.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D0.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D1.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D2.reset(static_cast<std::size_t>(M) * N * L);
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B0, seed + 2022);
diff --git a/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp b/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp
index 1525eb0a5e..e2c1e64e9e 100644
--- a/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp
+++ b/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp
@@ -244,16 +244,11 @@ struct ExampleRunner {
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 
-    try{
-      block_A.reset(static_cast<std::size_t>(M) * K * L);
-      block_B.reset(static_cast<std::size_t>(K) * N * L);
-      block_C.reset(static_cast<std::size_t>(M) * N * L);
-      block_D.reset(static_cast<std::size_t>(M) * N * L);
-      block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_A.reset(static_cast<std::size_t>(M) * K * L);
+    block_B.reset(static_cast<std::size_t>(K) * N * L);
+    block_C.reset(static_cast<std::size_t>(M) * N * L);
+    block_D.reset(static_cast<std::size_t>(M) * N * L);
+    block_ref_D.reset(static_cast<std::size_t>(M) * N * L);
 
     initialize_block(block_A, seed + 2023);
     initialize_block(block_B, seed + 2022);
diff --git a/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp b/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp
index 06e63930ea..177b738cea 100644
--- a/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp
+++ b/examples/sycl/09_pvc_gemm_with_sycl_queue/09_pvc_gemm_with_sycl_queue.cpp
@@ -166,16 +166,11 @@ struct ExampleRunner {
 
     Memory(sycl::queue q, ProblemShapeType problem_shape_MNKL) : q(q) {
       auto [M, N, K, L] = problem_shape_MNKL;
-      try{
-        block_A = sycl::malloc_device<ElementA>(static_cast<std::size_t>(M) * K * L, q);
-        block_B = sycl::malloc_device<ElementB>(static_cast<std::size_t>(N) * K * L, q);
-        block_C = sycl::malloc_device<ElementC>(static_cast<std::size_t>(M) * N * L, q);
-        block_D = sycl::malloc_device<ElementOutput>(static_cast<std::size_t>(M) * N * L, q);
-        block_ref_D = sycl::malloc_device<ElementOutput>(static_cast<std::size_t>(M) * N * L, q);
-      } catch(...){
-        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-        std::exit(1);
-      }
+      block_A = sycl::malloc_device<ElementA>(static_cast<std::size_t>(M) * K * L, q);
+      block_B = sycl::malloc_device<ElementB>(static_cast<std::size_t>(N) * K * L, q);
+      block_C = sycl::malloc_device<ElementC>(static_cast<std::size_t>(M) * N * L, q);
+      block_D = sycl::malloc_device<ElementOutput>(static_cast<std::size_t>(M) * N * L, q);
+      block_ref_D = sycl::malloc_device<ElementOutput>(static_cast<std::size_t>(M) * N * L, q);
     }
 
     ~Memory() {
diff --git a/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp b/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp
index 924aa9c429..a11fedbbb4 100644
--- a/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp
+++ b/examples/sycl/10_pvc_flash_attention_cachedKV/pvc_flash_attn_cachedKV_runner.hpp
@@ -478,18 +478,13 @@ template <class GemmKernel, bool isVarLen> struct ExampleRunner {
     stride_V_cache = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo, seq_len_kv_cache, batch * num_heads_kv));
     stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, head_size_vo, batch * num_heads_q));
 
-    try{
-      block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk);
-      block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk);
-      block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo);
-      block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk);
-      block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo);
-      block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
-      block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
-    } catch(...){
-      std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-      std::exit(1);
-    }
+    block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk);
+    block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk);
+    block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo);
+    block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk);
+    block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo);
+    block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
+    block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo);
 
     initialize_block(block_Q, seed + 2023);
     initialize_block(block_K, seed + 2022);
@@ -498,22 +493,12 @@ template <class GemmKernel, bool isVarLen> struct ExampleRunner {
     initialize_block(block_V_cache, seed + 2025);
 
     if (!cumulative_seqlen_q.empty()) {
-      try{
-        device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
-      } catch(...){
-        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-        std::exit(1);
-      }
+      device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
       device_cumulative_seqlen_q.copy_from_host(
         cumulative_seqlen_q.data(), cumulative_seqlen_q.size());
     }
     if (!cumulative_seqlen_kv.empty()) {
-      try{
-        device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
-      } catch(...){
-        std::cerr << "Failed to allocate device memory. Aborting." << std::endl;
-        std::exit(1);
-      }
+      device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
       device_cumulative_seqlen_kv.copy_from_host(
         cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size());
     }