Skip to content

Commit 314060c

Browse files
vzhurba01leofang
andauthored
Add phase 1 of CUDA Graphs support (#455)
* Draft Draft Initial weakref work Use stream as name stash * Align with design * Finish graphs APIs and add tests * Revert removed line in pyproject * Self review * Run pre-commit * Return clean errors when conditional is unsupported * Add numpy version check to tests * Add build modes * Check for both old versions of NVRTC and driver * Fix recursive is building check * Run pre-commit * Close MR allocs so that they don't GC during future Graph building * Check binding version in addition to driver version * Check NVRTC error correctly * Fix options check * Change default build mode to RELAXED * Resolve some review comments * Resolve review comments about naming * Extend test and fix spelling * Pre-commit and revert fix * use CUDA context memoized by stream * avoid erasing the type CUgraphConditionalHandle * launch_graph -> gb.add_child * add docs * fix typo --------- Co-authored-by: Leo Fang <[email protected]> Co-authored-by: Leo Fang <[email protected]>
1 parent 1cb8a6f commit 314060c

File tree

9 files changed

+1569
-0
lines changed

9 files changed

+1569
-0
lines changed

cuda_core/cuda/core/experimental/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@
55
from cuda.core.experimental import utils
66
from cuda.core.experimental._device import Device
77
from cuda.core.experimental._event import Event, EventOptions
8+
from cuda.core.experimental._graph import (
9+
Graph,
10+
GraphBuilder,
11+
GraphCompleteOptions,
12+
GraphDebugPrintOptions,
13+
)
814
from cuda.core.experimental._launch_config import LaunchConfig
915
from cuda.core.experimental._launcher import launch
1016
from cuda.core.experimental._linker import Linker, LinkerOptions

cuda_core/cuda/core/experimental/_device.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from cuda.core.experimental._context import Context, ContextOptions
99
from cuda.core.experimental._event import Event, EventOptions
10+
from cuda.core.experimental._graph import GraphBuilder
1011
from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource
1112
from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
1213
from cuda.core.experimental._utils.clear_error_support import assert_type
@@ -1298,3 +1299,15 @@ def sync(self):
12981299
12991300
"""
13001301
handle_return(runtime.cudaDeviceSynchronize())
1302+
1303+
@precondition(_check_context_initialized)
1304+
def create_graph_builder(self) -> GraphBuilder:
1305+
"""Create a new :obj:`~_graph.GraphBuilder` object.
1306+
1307+
Returns
1308+
-------
1309+
:obj:`~_graph.GraphBuilder`
1310+
Newly created graph builder object.
1311+
1312+
"""
1313+
return GraphBuilder._init(stream=self.create_stream(), is_stream_owner=True)

cuda_core/cuda/core/experimental/_graph.py

Lines changed: 774 additions & 0 deletions
Large diffs are not rendered by default.

cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import ctypes
1616
import numpy
1717

1818
from cuda.core.experimental._memory import Buffer
19+
from cuda.core.experimental._utils.cuda_utils import driver
1920

2021

2122
ctypedef cpp_complex.complex[float] cpp_single_complex
@@ -235,6 +236,10 @@ cdef class ParamHolder:
235236
if not_prepared:
236237
not_prepared = prepare_ctypes_arg(self.data, self.data_addresses, arg, i)
237238
if not_prepared:
239+
# TODO: revisit this treatment if we decide to cythonize cuda.core
240+
if isinstance(arg, driver.CUgraphConditionalHandle):
241+
prepare_arg[intptr_t](self.data, self.data_addresses, <intptr_t>int(arg), i)
242+
continue
238243
# TODO: support ctypes/numpy struct
239244
raise TypeError("the argument is of unsupported type: " + str(type(arg)))
240245

cuda_core/cuda/core/experimental/_stream.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from cuda.core.experimental._device import Device
1616
from cuda.core.experimental._context import Context
1717
from cuda.core.experimental._event import Event, EventOptions
18+
from cuda.core.experimental._graph import GraphBuilder
1819
from cuda.core.experimental._utils.clear_error_support import assert_type
1920
from cuda.core.experimental._utils.cuda_utils import (
2021
check_or_create_options,
@@ -342,6 +343,19 @@ def __cuda_stream__(self):
342343

343344
return Stream._init(obj=_stream_holder())
344345

346+
def create_graph_builder(self) -> GraphBuilder:
347+
"""Create a new :obj:`~_graph.GraphBuilder` object.
348+
349+
The new graph builder will be associated with this stream.
350+
351+
Returns
352+
-------
353+
:obj:`~_graph.GraphBuilder`
354+
Newly created graph builder object.
355+
356+
"""
357+
return GraphBuilder._init(stream=self, is_stream_owner=False)
358+
345359

346360
LEGACY_DEFAULT_STREAM = Stream._legacy_default()
347361
PER_THREAD_DEFAULT_STREAM = Stream._per_thread_default()

cuda_core/docs/source/api.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,15 @@ CUDA runtime
1717
:toctree: generated/
1818

1919
Device
20+
Graph
21+
GraphBuilder
2022
launch
2123

2224
:template: dataclass.rst
2325

2426
EventOptions
27+
GraphCompleteOptions
28+
GraphDebugPrintOptions
2529
StreamOptions
2630
LaunchConfig
2731

cuda_core/docs/source/release/0.3.0-notes.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ Highlights
1111
----------
1212

1313
- Starting this release ``cuda.core`` is licensed under Apache 2.0.
14+
- Initial support for CUDA graphs (phase 1).
15+
- In this release, we support building a CUDA graph that captures kernel launches. The captured graph can be replayed to reduce
16+
latency. Graph split/join and conditional nodes are supported.
1417

1518

1619
Breaking Changes

cuda_core/tests/test_event.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ def test_error_timing_incomplete():
165165
arr[0] = 1
166166
event3.sync()
167167
event3 - event1 # this should work
168+
b.close()
168169

169170

170171
def test_event_device(init_cuda):

0 commit comments

Comments
 (0)