NVIDIA · leofang · Apr 27, 2025 · Apr 14, 2025 · Apr 15, 2025 · Apr 15, 2025
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
@@ -55,9 +55,9 @@ jobs:
 
           if ('${{ inputs.local-ctk }}' -eq '1') {
             if ($TEST_CUDA_MAJOR -eq '12') {
-              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink"]'
+              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink", "thrust"]'
             } else {
-              $MINI_CTK_DEPS = '["nvcc", "nvrtc"]'
+              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "thrust"]'
             }
           }
 

diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
@@ -8,7 +8,15 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
-from cuda.core.experimental._utils.cuda_utils import CUDAError, check_or_create_options, driver, handle_return
+from cuda.core.experimental._utils.cuda_utils import (
+    CUDAError,
+    check_or_create_options,
+    driver,
+    handle_return,
+)
+from cuda.core.experimental._utils.cuda_utils import (
+    _check_driver_error as raise_if_driver_error,
+)
 
 if TYPE_CHECKING:
     import cuda.bindings
@@ -117,13 +125,31 @@ def __rsub__(self, other):
 
     def __sub__(self, other):
         # return self - other (in milliseconds)
+        err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
         try:
-            timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle))
+            raise_if_driver_error(err)
+            return timing
         except CUDAError as e:
-            raise RuntimeError(
-                "Timing capability must be enabled in order to subtract two Events; timing is disabled by default."
-            ) from e
-        return timing
+            if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE:
+                if self.is_timing_disabled or other.is_timing_disabled:
+                    explanation = (
+                        "Both Events must be created with timing enabled in order to subtract them; "
+                        "use EventOptions(enable_timing=True) when creating both events."
+                    )
+                else:
+                    explanation = (
+                        "Both Events must be recorded before they can be subtracted; "
+                        "use Stream.record() to record both events to a stream."
+                    )
+            elif err == driver.CUresult.CUDA_ERROR_NOT_READY:
+                explanation = (
+                    "One or both events have not completed; "
+                    "use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete "
+                    "before subtracting them."
+                )
+            else:
+                raise e
+            raise RuntimeError(explanation) from e
 
     @property
     def is_timing_disabled(self) -> bool:

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
@@ -7,50 +7,43 @@
 # is strictly prohibited.
 
 import os
+import pathlib
 import time
 
+import numpy as np
 import pytest
 
 import cuda.core.experimental
-from cuda.core.experimental import Device, EventOptions
+from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch
+from cuda.core.experimental._memory import _DefaultPinnedMemorySource
 
 
 def test_event_init_disabled():
     with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."):
         cuda.core.experimental._event.Event()  # Ensure back door is locked.
 
 
-@pytest.mark.parametrize("enable_timing", [True, False, None])
-def test_timing(init_cuda, enable_timing):
-    options = EventOptions(enable_timing=enable_timing)
+def test_timing_success(init_cuda):
+    options = EventOptions(enable_timing=True)
     stream = Device().create_stream()
     delay_seconds = 0.5
     e1 = stream.record(options=options)
     time.sleep(delay_seconds)
     e2 = stream.record(options=options)
     e2.sync()
-    for e in (e1, e2):
-        assert e.is_timing_disabled == (True if enable_timing is None else not enable_timing)
-    if enable_timing:
-        elapsed_time_ms = e2 - e1
-        assert isinstance(elapsed_time_ms, float)
-        # Using a generous tolerance, to avoid flaky tests:
-        # We only want to exercise the __sub__ method, this test is not meant
-        # to stress-test the CUDA driver or time.sleep().
-        delay_ms = delay_seconds * 1000
-        if os.name == "nt":  # noqa: SIM108
-            # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
-            generous_tolerance = 100
-        else:
-            # Most modern Linux kernels have a default timer resolution of 1 ms.
-            generous_tolerance = 20
-        assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
+    elapsed_time_ms = e2 - e1
+    assert isinstance(elapsed_time_ms, float)
+    # Using a generous tolerance, to avoid flaky tests:
+    # We only want to exercise the __sub__ method, this test is not meant
+    # to stress-test the CUDA driver or time.sleep().
+    delay_ms = delay_seconds * 1000
+    if os.name == "nt":  # noqa: SIM108
+        # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
+        generous_tolerance = 100
     else:
-        with pytest.raises(RuntimeError) as e:
-            elapsed_time_ms = e2 - e1
-            msg = str(e)
-            assert "disabled by default" in msg
-            assert "CUDA_ERROR_INVALID_HANDLE" in msg
+        # Most modern Linux kernels have a default timer resolution of 1 ms.
+        generous_tolerance = 20
+    assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
 
 
 def test_is_sync_busy_waited(init_cuda):
@@ -80,3 +73,100 @@ def test_is_done(init_cuda):
     # Without a sync, the captured work might not have yet completed
     # Therefore this check should never raise an exception
     assert event.is_done in (True, False)
+
+
+def test_error_timing_disabled():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    disabled = EventOptions(enable_timing=False)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = stream.record(options=disabled)
+    assert not event1.is_timing_disabled
+    assert event2.is_timing_disabled
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
+        event2 - event1
+
+    event1 = stream.record(options=disabled)
+    event2 = stream.record(options=disabled)
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
+        event2 - event1
+
+
+def test_error_timing_recorded():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = device.create_event(options=enabled)
+    event3 = device.create_event(options=enabled)
+
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event2 - event1
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event1 - event2
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event3 - event2
+
+
+# TODO: improve this once path finder can find headers
+@pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header")
+@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
+def test_error_timing_incomplete():
+    device = Device()
+    device.set_current()
+
+    # This kernel is designed to busy loop until a signal is received
+    code = """
+#include <cuda/atomic>
+
+extern "C"
+__global__ void wait(int* val) {
+    cuda::atomic_ref<int, cuda::thread_scope_system> signal{*val};
+    while (true) {
+        if (signal.load(cuda::memory_order_relaxed)) {
+            break;
+        }
+    }
+}
+"""
+
+    arch = "".join(f"{i}" for i in device.compute_capability)
+    program_options = ProgramOptions(
+        std="c++17",
+        arch=f"sm_{arch}",
+        include_path=str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")),
+    )
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile(target_type="cubin")
+    ker = mod.get_kernel("wait")
+
+    mr = _DefaultPinnedMemorySource()
+    b = mr.allocate(4)
+    arr = np.from_dlpack(b).view(np.int32)
+    arr[0] = 0
+
+    config = LaunchConfig(grid=1, block=1)
+    ker_args = (arr.ctypes.data,)
+
+    enabled = EventOptions(enable_timing=True)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    launch(stream, config, ker, *ker_args)
+    event3 = stream.record(options=enabled)
+
+    # event3 will never complete because the stream is waiting on wait() to complete
+    with pytest.raises(RuntimeError, match="^One or both events have not completed."):
+        event3 - event1
+
+    arr[0] = 1
+    event3.sync()
+    event3 - event1  # this should work