From 8782dfadb0f1a04f135ca21cab9ae3602ebd59bb Mon Sep 17 00:00:00 2001
From: Daniel Ching <dching@nvidia.com>
Date: Mon, 14 Apr 2025 18:21:28 -0500
Subject: [PATCH 01/10] NEW: Make event timing error messages more specific and
 actionable

The CUDA driver provides different error messages for various
errors when trying to compute elapsed time, and the documentation
explains each of these scenarious. Surface each of these to Python
uses with actionable error messages.
---
 cuda_core/cuda/core/experimental/_event.py | 24 +++++++--
 cuda_core/tests/test_event.py              | 60 ++++++++++++++++++++++
 2 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
index 382384a65..e8d1699e6 100644
--- a/cuda_core/cuda/core/experimental/_event.py
+++ b/cuda_core/cuda/core/experimental/_event.py
@@ -120,9 +120,27 @@ def __sub__(self, other):
         try:
             timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle))
         except CUDAError as e:
-            raise RuntimeError(
-                "Timing capability must be enabled in order to subtract two Events; timing is disabled by default."
-            ) from e
+            error_message = str(e)
+            if "CUDA_ERROR_INVALID_HANDLE" in error_message:
+                if self.is_timing_disabled or other.is_timing_disabled:
+                    explanation = (
+                        "Both Events must be created with timing enabled in order to subtract them; "
+                        "use EventOptions(enable_timing=True) when creating both events."
+                    )
+                else:
+                    explanation = (
+                        "Both Events must be recorded before they can be subtracted; "
+                        "use Stream.record() to record both events to a stream."
+                    )
+            elif "CUDA_ERROR_NOT_READY" in error_message:
+                explanation = (
+                    "One or both events have not completed; "
+                    "use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete "
+                    "before subtracting them."
+                )
+            else:
+                raise e
+            raise RuntimeError(explanation) from e
         return timing
 
     @property
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 108b8b140..614451bad 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -80,3 +80,63 @@ def test_is_done(init_cuda):
     # Without a sync, the captured work might not have yet completed
     # Therefore this check should never raise an exception
     assert event.is_done in (True, False)
+
+
+def test_error_timing_disabled():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    disabled = EventOptions(enable_timing=False)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = stream.record(options=disabled)
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
+        event2 - event1
+
+    event1 = stream.record(options=disabled)
+    event2 = stream.record(options=disabled)
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
+        event2 - event1
+
+    event1 = stream.record(options=enabled)
+    event2 = stream.record(options=enabled)
+    stream.sync()
+    event2 - event1
+
+
+def test_error_timing_recorded():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = device.create_event(options=enabled)
+    event3 = device.create_event(options=enabled)
+
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event2 - event1
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event1 - event2
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event3 - event2
+
+
+def test_error_timing_incomplete():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = device.create_event(options=enabled)
+    stream.wait(event2)
+    event3 = stream.record(options=enabled)
+
+    # event3 will never complete because the stream is waiting on event2 which is never recorded
+    with pytest.raises(RuntimeError, match="^One or both events have not completed."):
+        event3 - event1

From e7f0f85a41f3f9b1528606c42c037d569ef338f2 Mon Sep 17 00:00:00 2001
From: Daniel Ching <dching@nvidia.com>
Date: Tue, 15 Apr 2025 16:04:55 -0500
Subject: [PATCH 02/10] REF: Use error types instead of string comparison

---
 cuda_core/cuda/core/experimental/_event.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
index e8d1699e6..4b43226b5 100644
--- a/cuda_core/cuda/core/experimental/_event.py
+++ b/cuda_core/cuda/core/experimental/_event.py
@@ -8,7 +8,13 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
-from cuda.core.experimental._utils.cuda_utils import CUDAError, check_or_create_options, driver, handle_return
+from cuda.core.experimental._utils.cuda_utils import (
+    CUDAError,
+    check_or_create_options,
+    driver,
+    handle_return,
+    _check_driver_error as raise_if_driver_error,
+)
 
 if TYPE_CHECKING:
     import cuda.bindings
@@ -117,11 +123,12 @@ def __rsub__(self, other):
 
     def __sub__(self, other):
         # return self - other (in milliseconds)
+        err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
         try:
-            timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle))
+            raise_if_driver_error(err)
+            return timing
         except CUDAError as e:
-            error_message = str(e)
-            if "CUDA_ERROR_INVALID_HANDLE" in error_message:
+            if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE:
                 if self.is_timing_disabled or other.is_timing_disabled:
                     explanation = (
                         "Both Events must be created with timing enabled in order to subtract them; "
@@ -132,7 +139,7 @@ def __sub__(self, other):
                         "Both Events must be recorded before they can be subtracted; "
                         "use Stream.record() to record both events to a stream."
                     )
-            elif "CUDA_ERROR_NOT_READY" in error_message:
+            elif err == driver.CUresult.CUDA_ERROR_NOT_READY:
                 explanation = (
                     "One or both events have not completed; "
                     "use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete "
@@ -141,7 +148,6 @@ def __sub__(self, other):
             else:
                 raise e
             raise RuntimeError(explanation) from e
-        return timing
 
     @property
     def is_timing_disabled(self) -> bool:

From d66580e64ad13c92b9deff4c2eff1a453ca8f8ab Mon Sep 17 00:00:00 2001
From: Daniel Ching <dching@nvidia.com>
Date: Tue, 15 Apr 2025 16:07:35 -0500
Subject: [PATCH 03/10] TST: Remove redundant test

---
 cuda_core/tests/test_event.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 614451bad..28f565396 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -101,11 +101,6 @@ def test_error_timing_disabled():
     with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
         event2 - event1
 
-    event1 = stream.record(options=enabled)
-    event2 = stream.record(options=enabled)
-    stream.sync()
-    event2 - event1
-
 
 def test_error_timing_recorded():
     device = Device()

From 2cf2dc892d519c863a95c6dd0ffe4352317c813a Mon Sep 17 00:00:00 2001
From: Daniel Ching <dching@nvidia.com>
Date: Tue, 15 Apr 2025 16:14:54 -0500
Subject: [PATCH 04/10] STY: Separate regular from renaming imports

---
 cuda_core/cuda/core/experimental/_event.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
index 4b43226b5..ff014a668 100644
--- a/cuda_core/cuda/core/experimental/_event.py
+++ b/cuda_core/cuda/core/experimental/_event.py
@@ -13,6 +13,8 @@
     check_or_create_options,
     driver,
     handle_return,
+)
+from cuda.core.experimental._utils.cuda_utils import (
     _check_driver_error as raise_if_driver_error,
 )
 

From 99218bb9a7d53bbe3ff2d0f0d1cc97e0547bcfd8 Mon Sep 17 00:00:00 2001
From: Daniel Ching <dching@nvidia.com>
Date: Tue, 15 Apr 2025 16:16:36 -0500
Subject: [PATCH 05/10] TST: Refactor event class tests

---
 cuda_core/tests/test_event.py | 40 ++++++++++++++---------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 28f565396..d19e3136d 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -20,37 +20,27 @@ def test_event_init_disabled():
         cuda.core.experimental._event.Event()  # Ensure back door is locked.
 
 
-@pytest.mark.parametrize("enable_timing", [True, False, None])
-def test_timing(init_cuda, enable_timing):
-    options = EventOptions(enable_timing=enable_timing)
+def test_timing_success(init_cuda):
+    options = EventOptions(enable_timing=True)
     stream = Device().create_stream()
     delay_seconds = 0.5
     e1 = stream.record(options=options)
     time.sleep(delay_seconds)
     e2 = stream.record(options=options)
     e2.sync()
-    for e in (e1, e2):
-        assert e.is_timing_disabled == (True if enable_timing is None else not enable_timing)
-    if enable_timing:
-        elapsed_time_ms = e2 - e1
-        assert isinstance(elapsed_time_ms, float)
-        # Using a generous tolerance, to avoid flaky tests:
-        # We only want to exercise the __sub__ method, this test is not meant
-        # to stress-test the CUDA driver or time.sleep().
-        delay_ms = delay_seconds * 1000
-        if os.name == "nt":  # noqa: SIM108
-            # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
-            generous_tolerance = 100
-        else:
-            # Most modern Linux kernels have a default timer resolution of 1 ms.
-            generous_tolerance = 20
-        assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
+    elapsed_time_ms = e2 - e1
+    assert isinstance(elapsed_time_ms, float)
+    # Using a generous tolerance, to avoid flaky tests:
+    # We only want to exercise the __sub__ method, this test is not meant
+    # to stress-test the CUDA driver or time.sleep().
+    delay_ms = delay_seconds * 1000
+    if os.name == "nt":  # noqa: SIM108
+        # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
+        generous_tolerance = 100
     else:
-        with pytest.raises(RuntimeError) as e:
-            elapsed_time_ms = e2 - e1
-            msg = str(e)
-            assert "disabled by default" in msg
-            assert "CUDA_ERROR_INVALID_HANDLE" in msg
+        # Most modern Linux kernels have a default timer resolution of 1 ms.
+        generous_tolerance = 20
+    assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
 
 
 def test_is_sync_busy_waited(init_cuda):
@@ -91,6 +81,8 @@ def test_error_timing_disabled():
 
     event1 = stream.record(options=enabled)
     event2 = stream.record(options=disabled)
+    assert not event1.is_timing_disabled
+    assert event2.is_timing_disabled
     stream.sync()
     with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
         event2 - event1

From 4314886115551cbf7245cc8157298845cb464b1e Mon Sep 17 00:00:00 2001
From: Daniel Ching <dching@nvidia.com>
Date: Mon, 21 Apr 2025 16:51:31 -0500
Subject: [PATCH 06/10] TST: Use infinite kernel instead of unrecorded event
 for testing event error

---
 cuda_core/tests/test_event.py | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index d19e3136d..16f4eafb1 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -12,7 +12,7 @@
 import pytest
 
 import cuda.core.experimental
-from cuda.core.experimental import Device, EventOptions
+from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch
 
 
 def test_event_init_disabled():
@@ -116,14 +116,32 @@ def test_error_timing_recorded():
 def test_error_timing_incomplete():
     device = Device()
     device.set_current()
+
+    # This kernel is designed to not complete
+    code = """
+extern "C"
+__global__ void wait() {
+    while (1 > 0) {
+    }
+}
+"""
+
+    arch = "".join(f"{i}" for i in device.compute_capability)
+    program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile(target_type="cubin")
+    ker = mod.get_kernel("wait")
+
+    config = LaunchConfig(grid=1, block=1)
+    ker_args = ()
+
     enabled = EventOptions(enable_timing=True)
     stream = device.create_stream()
 
     event1 = stream.record(options=enabled)
-    event2 = device.create_event(options=enabled)
-    stream.wait(event2)
+    launch(stream, config, ker, *ker_args)
     event3 = stream.record(options=enabled)
 
-    # event3 will never complete because the stream is waiting on event2 which is never recorded
+    # event3 will never complete because the stream is waiting on wait() to complete
     with pytest.raises(RuntimeError, match="^One or both events have not completed."):
         event3 - event1

From add0ba179a8cc9241cfb74aaca8a824072ca6ae3 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 27 Apr 2025 02:33:52 +0000
Subject: [PATCH 07/10] ensure the busy kernel can be shut down

---
 cuda_core/tests/test_event.py | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 16f4eafb1..63bd9202d 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -7,12 +7,15 @@
 # is strictly prohibited.
 
 import os
+import pathlib
 import time
 
+import numpy as np
 import pytest
 
 import cuda.core.experimental
 from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch
+from cuda.core.experimental._memory import _DefaultPinnedMemorySource
 
 
 def test_event_init_disabled():
@@ -113,27 +116,44 @@ def test_error_timing_recorded():
         event3 - event2
 
 
+# TODO: improve this once path finder can find headers
+@pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header")
 def test_error_timing_incomplete():
     device = Device()
     device.set_current()
 
-    # This kernel is designed to not complete
+    # This kernel is designed to busy loop until a signal is received
     code = """
+#include <cuda/atomic>
+
 extern "C"
-__global__ void wait() {
-    while (1 > 0) {
+__global__ void wait(int* val) {
+    cuda::atomic_ref<int, cuda::thread_scope_system> signal{*val};
+    while (true) {
+        if (signal.load(cuda::memory_order_relaxed)) {
+            break;
+        }
     }
 }
 """
 
     arch = "".join(f"{i}" for i in device.compute_capability)
-    program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}")
+    program_options = ProgramOptions(
+        std="c++17",
+        arch=f"sm_{arch}",
+        include_path=str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")),
+    )
     prog = Program(code, code_type="c++", options=program_options)
     mod = prog.compile(target_type="cubin")
     ker = mod.get_kernel("wait")
 
+    mr = _DefaultPinnedMemorySource()
+    b = mr.allocate(4)
+    arr = np.from_dlpack(b).view(np.int32)
+    arr[0] = 0
+
     config = LaunchConfig(grid=1, block=1)
-    ker_args = ()
+    ker_args = (arr.ctypes.data,)
 
     enabled = EventOptions(enable_timing=True)
     stream = device.create_stream()
@@ -145,3 +165,7 @@ def test_error_timing_incomplete():
     # event3 will never complete because the stream is waiting on wait() to complete
     with pytest.raises(RuntimeError, match="^One or both events have not completed."):
         event3 - event1
+
+    arr[0] = 1
+    event3.sync()
+    event3 - event1  # this should work

From 9f530b6e96dd77e18855dc660d58ab658c4cd7d2 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 27 Apr 2025 03:01:24 +0000
Subject: [PATCH 08/10] from_dlpack needs a recent NumPy fix

---
 cuda_core/tests/test_event.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 63bd9202d..4895c0a67 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -118,6 +118,7 @@ def test_error_timing_recorded():
 
 # TODO: improve this once path finder can find headers
 @pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header")
+@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
 def test_error_timing_incomplete():
     device = Device()
     device.set_current()

From 638990d94532d8d44f63db7aab6468530b0f5552 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 27 Apr 2025 03:10:19 +0000
Subject: [PATCH 09/10] ensure cccl headers are available (with local ctk) at
 test time

---
 .github/workflows/test-wheel-windows.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 4e48590a3..36e0b0535 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -55,9 +55,9 @@ jobs:
 
           if ('${{ inputs.local-ctk }}' -eq '1') {
             if ($TEST_CUDA_MAJOR -eq '12') {
-              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink"]'
+              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink", "cccl"]'
             } else {
-              $MINI_CTK_DEPS = '["nvcc", "nvrtc"]'
+              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "cccl"]'
             }
           }
 

From 9904f3d0183f022d1a69201b08fbd53920e8110a Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 27 Apr 2025 03:47:04 +0000
Subject: [PATCH 10/10] there is no cccl component on Windows...

---
 .github/workflows/test-wheel-windows.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 36e0b0535..9cb5f41eb 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -55,9 +55,9 @@ jobs:
 
           if ('${{ inputs.local-ctk }}' -eq '1') {
             if ($TEST_CUDA_MAJOR -eq '12') {
-              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink", "cccl"]'
+              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink", "thrust"]'
             } else {
-              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "cccl"]'
+              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "thrust"]'
             }
           }