Skip to content

NEW: Make event timing error messages more specific and actionable #559

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Apr 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/test-wheel-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ jobs:

if ('${{ inputs.local-ctk }}' -eq '1') {
if ($TEST_CUDA_MAJOR -eq '12') {
$MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink"]'
$MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink", "thrust"]'
} else {
$MINI_CTK_DEPS = '["nvcc", "nvrtc"]'
$MINI_CTK_DEPS = '["nvcc", "nvrtc", "thrust"]'
}
}

Expand Down
38 changes: 32 additions & 6 deletions cuda_core/cuda/core/experimental/_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,15 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional

from cuda.core.experimental._utils.cuda_utils import CUDAError, check_or_create_options, driver, handle_return
from cuda.core.experimental._utils.cuda_utils import (
CUDAError,
check_or_create_options,
driver,
handle_return,
)
from cuda.core.experimental._utils.cuda_utils import (
_check_driver_error as raise_if_driver_error,
)

if TYPE_CHECKING:
import cuda.bindings
Expand Down Expand Up @@ -117,13 +125,31 @@ def __rsub__(self, other):

def __sub__(self, other):
# return self - other (in milliseconds)
err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
try:
timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle))
raise_if_driver_error(err)
return timing
except CUDAError as e:
raise RuntimeError(
"Timing capability must be enabled in order to subtract two Events; timing is disabled by default."
) from e
return timing
if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE:
if self.is_timing_disabled or other.is_timing_disabled:
explanation = (
"Both Events must be created with timing enabled in order to subtract them; "
"use EventOptions(enable_timing=True) when creating both events."
)
else:
explanation = (
"Both Events must be recorded before they can be subtracted; "
"use Stream.record() to record both events to a stream."
)
elif err == driver.CUresult.CUDA_ERROR_NOT_READY:
explanation = (
"One or both events have not completed; "
"use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete "
"before subtracting them."
)
else:
raise e
raise RuntimeError(explanation) from e

@property
def is_timing_disabled(self) -> bool:
Expand Down
140 changes: 115 additions & 25 deletions cuda_core/tests/test_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,50 +7,43 @@
# is strictly prohibited.

import os
import pathlib
import time

import numpy as np
import pytest

import cuda.core.experimental
from cuda.core.experimental import Device, EventOptions
from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch
from cuda.core.experimental._memory import _DefaultPinnedMemorySource


def test_event_init_disabled():
with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."):
cuda.core.experimental._event.Event() # Ensure back door is locked.


@pytest.mark.parametrize("enable_timing", [True, False, None])
def test_timing(init_cuda, enable_timing):
options = EventOptions(enable_timing=enable_timing)
def test_timing_success(init_cuda):
options = EventOptions(enable_timing=True)
stream = Device().create_stream()
delay_seconds = 0.5
e1 = stream.record(options=options)
time.sleep(delay_seconds)
e2 = stream.record(options=options)
e2.sync()
for e in (e1, e2):
assert e.is_timing_disabled == (True if enable_timing is None else not enable_timing)
if enable_timing:
elapsed_time_ms = e2 - e1
assert isinstance(elapsed_time_ms, float)
# Using a generous tolerance, to avoid flaky tests:
# We only want to exercise the __sub__ method, this test is not meant
# to stress-test the CUDA driver or time.sleep().
delay_ms = delay_seconds * 1000
if os.name == "nt": # noqa: SIM108
# For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
generous_tolerance = 100
else:
# Most modern Linux kernels have a default timer resolution of 1 ms.
generous_tolerance = 20
assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
elapsed_time_ms = e2 - e1
assert isinstance(elapsed_time_ms, float)
# Using a generous tolerance, to avoid flaky tests:
# We only want to exercise the __sub__ method, this test is not meant
# to stress-test the CUDA driver or time.sleep().
delay_ms = delay_seconds * 1000
if os.name == "nt": # noqa: SIM108
# For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
generous_tolerance = 100
else:
with pytest.raises(RuntimeError) as e:
elapsed_time_ms = e2 - e1
msg = str(e)
assert "disabled by default" in msg
assert "CUDA_ERROR_INVALID_HANDLE" in msg
# Most modern Linux kernels have a default timer resolution of 1 ms.
generous_tolerance = 20
assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance


def test_is_sync_busy_waited(init_cuda):
Expand Down Expand Up @@ -80,3 +73,100 @@ def test_is_done(init_cuda):
# Without a sync, the captured work might not have yet completed
# Therefore this check should never raise an exception
assert event.is_done in (True, False)


def test_error_timing_disabled():
device = Device()
device.set_current()
enabled = EventOptions(enable_timing=True)
disabled = EventOptions(enable_timing=False)
stream = device.create_stream()

event1 = stream.record(options=enabled)
event2 = stream.record(options=disabled)
assert not event1.is_timing_disabled
assert event2.is_timing_disabled
stream.sync()
with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
event2 - event1

event1 = stream.record(options=disabled)
event2 = stream.record(options=disabled)
stream.sync()
with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
event2 - event1


def test_error_timing_recorded():
device = Device()
device.set_current()
enabled = EventOptions(enable_timing=True)
stream = device.create_stream()

event1 = stream.record(options=enabled)
event2 = device.create_event(options=enabled)
event3 = device.create_event(options=enabled)

stream.sync()
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
event2 - event1
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
event1 - event2
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
event3 - event2


# TODO: improve this once path finder can find headers
@pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header")
@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
def test_error_timing_incomplete():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow, this is an amazingly sophisticated unit test setup.

device = Device()
device.set_current()

# This kernel is designed to busy loop until a signal is received
code = """
#include <cuda/atomic>

extern "C"
__global__ void wait(int* val) {
cuda::atomic_ref<int, cuda::thread_scope_system> signal{*val};
while (true) {
if (signal.load(cuda::memory_order_relaxed)) {
break;
}
}
}
"""

arch = "".join(f"{i}" for i in device.compute_capability)
program_options = ProgramOptions(
std="c++17",
arch=f"sm_{arch}",
include_path=str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")),
)
prog = Program(code, code_type="c++", options=program_options)
mod = prog.compile(target_type="cubin")
ker = mod.get_kernel("wait")

mr = _DefaultPinnedMemorySource()
b = mr.allocate(4)
arr = np.from_dlpack(b).view(np.int32)
arr[0] = 0

config = LaunchConfig(grid=1, block=1)
ker_args = (arr.ctypes.data,)

enabled = EventOptions(enable_timing=True)
stream = device.create_stream()

event1 = stream.record(options=enabled)
launch(stream, config, ker, *ker_args)
event3 = stream.record(options=enabled)

# event3 will never complete because the stream is waiting on wait() to complete
with pytest.raises(RuntimeError, match="^One or both events have not completed."):
event3 - event1

arr[0] = 1
event3.sync()
event3 - event1 # this should work
Loading