From 9ca8bd626d3765a237d798510b2fa1a36e857d40 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 3 Mar 2025 00:55:04 +0000 Subject: [PATCH 1/4] add event timing --- cuda_core/cuda/core/experimental/_event.py | 32 +++++++++++++++++-- cuda_core/docs/source/release/0.2.0-notes.rst | 1 + cuda_core/tests/test_event.py | 15 +++++++-- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 3b269179b..fca0a3ef4 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -47,8 +47,25 @@ class Event: the last recorded stream. Events can be used to monitor device's progress, query completion - of work up to event's record, and help establish dependencies - between GPU work submissions. + of work up to event's record, help establish dependencies + between GPU work submissions, and record the elapsed time on GPU: + + .. code-block:: python + + # To create events and record the timing: + s = Device(0).create_stream() + e1 = s.record(options={"enable_timing": True}) + # ... run some GPU works ... + e2 = s.record(options={"enable_timing": True}) + e2.sync() + print(f"time = {e2 - e1}") + + # Or, if events are already created: + s.record(e1) + # ... run some more GPU works ... + s.record(e2) + e2.sync() + print(f"time = {e2 - e1}") Directly creating an :obj:`~_event.Event` is not supported due to ambiguity, and they should instead be created through a :obj:`~_stream.Stream` object. @@ -96,6 +113,17 @@ def close(self): """Destroy the event.""" self._mnff.close() + def __isub__(self, other): + return NotImplemented + + def __rsub__(self, other): + return NotImplemented + + def __sub__(self, other): + # return self - other + timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle)) + return timing + @property def is_timing_disabled(self) -> bool: """Return True if the event does not record timing data, otherwise False.""" diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index 39cb586c1..7d872c8fd 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -25,6 +25,7 @@ New features - Expose :class:`ObjectCode` as a public API, which allows loading cubins from memory or disk. For loading other kinds of code types, please continue using :class:`Program`. - A C++ helper function ``get_cuda_native_handle()`` is provided in the new ``include/utility.cuh`` header to retrive the underlying CUDA C objects (ex: ``CUstream``) from a Python object returned by the ``.handle`` attribute (ex: :attr:`Stream.handle`). - For objects such as :class:`Program` and :class:`Linker` that could dispatch to different backends, a new ``.backend`` attribute is provided to query this information. +- Support CUDA event timing. Limitations ----------- diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index df1a7fc47..73d2d7ae6 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -6,6 +6,8 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. +import time + import pytest from cuda.core.experimental import Device, EventOptions @@ -15,8 +17,17 @@ def test_timing(init_cuda, enable_timing): options = EventOptions(enable_timing=enable_timing) stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_timing_disabled == (not enable_timing if enable_timing is not None else True) + n_seconds = 0.5 + e1 = stream.record(options=options) + time.sleep(n_seconds) + e2 = stream.record(options=options) + for e in (e1, e2): + assert e.is_timing_disabled == (not enable_timing if enable_timing is not None else True) + if enable_timing: + e2.sync() + elapsed_time = e2 - e1 + assert isinstance(elapsed_time, float) + assert n_seconds * 1000 <= elapsed_time < n_seconds * 1000 + 2 # tolerance 2 ms def test_is_sync_busy_waited(init_cuda): From 8aca468526183cae811e51307ca4a009169daa07 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 4 Mar 2025 14:25:00 +0000 Subject: [PATCH 2/4] address review comments --- cuda_core/cuda/core/experimental/_event.py | 9 +++++---- cuda_core/tests/test_event.py | 19 ++++++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index fca0a3ef4..55b5f8beb 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -48,7 +48,8 @@ class Event: Events can be used to monitor device's progress, query completion of work up to event's record, help establish dependencies - between GPU work submissions, and record the elapsed time on GPU: + between GPU work submissions, and record the elapsed time (in milliseconds) + on GPU: .. code-block:: python @@ -58,14 +59,14 @@ class Event: # ... run some GPU works ... e2 = s.record(options={"enable_timing": True}) e2.sync() - print(f"time = {e2 - e1}") + print(f"time = {e2 - e1} milliseconds") # Or, if events are already created: s.record(e1) # ... run some more GPU works ... s.record(e2) e2.sync() - print(f"time = {e2 - e1}") + print(f"time = {e2 - e1} milliseconds") Directly creating an :obj:`~_event.Event` is not supported due to ambiguity, and they should instead be created through a :obj:`~_stream.Stream` object. @@ -120,7 +121,7 @@ def __rsub__(self, other): return NotImplemented def __sub__(self, other): - # return self - other + # return self - other (in milliseconds) timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle)) return timing diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 73d2d7ae6..ec824e3d8 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -11,23 +11,28 @@ import pytest from cuda.core.experimental import Device, EventOptions +from cuda.core.experimental._utils import CUDAError @pytest.mark.parametrize("enable_timing", [True, False, None]) def test_timing(init_cuda, enable_timing): options = EventOptions(enable_timing=enable_timing) stream = Device().create_stream() - n_seconds = 0.5 + delay_seconds = 0.5 e1 = stream.record(options=options) - time.sleep(n_seconds) + time.sleep(delay_seconds) e2 = stream.record(options=options) + e2.sync() for e in (e1, e2): - assert e.is_timing_disabled == (not enable_timing if enable_timing is not None else True) + assert e.is_timing_disabled == (True if enable_timing is None else not enable_timing) if enable_timing: - e2.sync() - elapsed_time = e2 - e1 - assert isinstance(elapsed_time, float) - assert n_seconds * 1000 <= elapsed_time < n_seconds * 1000 + 2 # tolerance 2 ms + elapsed_time_ms = e2 - e1 + assert isinstance(elapsed_time_ms, float) + assert delay_seconds * 1000 <= elapsed_time_ms < delay_seconds * 1000 + 2 # tolerance 2 ms + else: + with pytest.raises(CUDAError) as e: + elapsed_time_ms = e2 - e1 + assert "CUDA_ERROR_INVALID_HANDLE" in str(e) def test_is_sync_busy_waited(init_cuda): From f0f19634c044c46bfb6fda557b2d81b44398cc09 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 6 Mar 2025 02:59:46 +0000 Subject: [PATCH 3/4] improve docs --- cuda_core/cuda/core/experimental/_event.py | 13 ++++--------- cuda_core/docs/source/release/0.2.0-notes.rst | 2 +- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index ea4750442..5df676cee 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -54,16 +54,11 @@ class Event: .. code-block:: python # To create events and record the timing: - s = Device(0).create_stream() - e1 = s.record(options={"enable_timing": True}) - # ... run some GPU works ... - e2 = s.record(options={"enable_timing": True}) - e2.sync() - print(f"time = {e2 - e1} milliseconds") - - # Or, if events are already created: + s = Device().create_stream() + e1 = Device().create_event({"enable_timing": True}) + e2 = Device().create_event({"enable_timing": True}) s.record(e1) - # ... run some more GPU works ... + # ... run some GPU works ... s.record(e2) e2.sync() print(f"time = {e2 - e1} milliseconds") diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index e0792532d..02f586d58 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -28,7 +28,7 @@ New features - A C++ helper function ``get_cuda_native_handle()`` is provided in the new ``include/utility.cuh`` header to retrive the underlying CUDA C objects (ex: ``CUstream``) from a Python object returned by the ``.handle`` attribute (ex: :attr:`Stream.handle`). - For objects such as :class:`Program` and :class:`Linker` that could dispatch to different backends, a new ``.backend`` attribute is provided to query this information. - Support CUDA event timing. -- An :class:`~_event.Event` may now be created without recording it to a :class:`Stream` using the :meth:`Device.create_event`` method. +- An :class:`~_event.Event` may now be created without recording it to a :class:`~_stream.Stream` using the :meth:`Device.create_event` method. Limitations ----------- From 268eee504be9ab11f8a3c6b1bf2a2a0e2ad5fd05 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 6 Mar 2025 03:03:54 +0000 Subject: [PATCH 4/4] chaining exception to get nicer err msg --- cuda_core/cuda/core/experimental/_event.py | 7 ++++++- cuda_core/tests/test_event.py | 7 ++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 5df676cee..10d52dda6 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -117,7 +117,12 @@ def __rsub__(self, other): def __sub__(self, other): # return self - other (in milliseconds) - timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle)) + try: + timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle)) + except CUDAError as e: + raise RuntimeError( + "Timing capability must be enabled in order to subtract two Events; timing is disabled by default." + ) from e return timing @property diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index a0ea09d67..384cf4586 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -12,7 +12,6 @@ import cuda.core.experimental from cuda.core.experimental import Device, EventOptions -from cuda.core.experimental._utils import CUDAError def test_event_init_disabled(): @@ -36,9 +35,11 @@ def test_timing(init_cuda, enable_timing): assert isinstance(elapsed_time_ms, float) assert delay_seconds * 1000 <= elapsed_time_ms < delay_seconds * 1000 + 2 # tolerance 2 ms else: - with pytest.raises(CUDAError) as e: + with pytest.raises(RuntimeError) as e: elapsed_time_ms = e2 - e1 - assert "CUDA_ERROR_INVALID_HANDLE" in str(e) + msg = str(e) + assert "disabled by default" in msg + assert "CUDA_ERROR_INVALID_HANDLE" in msg def test_is_sync_busy_waited(init_cuda):