Skip to content

Feature/occupancy #648

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jun 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
f79a7f0
Fix typo: stream._handle -> stream.handle
oleksandr-pavlyk May 20, 2025
de7b3c9
Move definition of LaunchConfig class to separate file
oleksandr-pavlyk May 20, 2025
5bd64a7
Introduce _module.KernelOccupancy class
oleksandr-pavlyk May 20, 2025
b89c95f
Add occupancy tests, except for cluster-related queries
oleksandr-pavlyk May 20, 2025
9679e0e
Fix typo in querying handle from Stream argument
oleksandr-pavlyk May 22, 2025
ff322ec
Add tests for cluster-related occupancy descriptors
oleksandr-pavlyk May 22, 2025
fd8302f
Introduce MaxPotentialBlockSizeOccupancyResult named tuple
oleksandr-pavlyk May 22, 2025
40d799a
KernelOccupancy.max_potential_block_size support for CUoccupancyB2DSize
oleksandr-pavlyk May 22, 2025
5968ff0
Add test for B2DSize usage in max_potential_block_size
oleksandr-pavlyk May 22, 2025
fdbad93
Merge branch 'main' into feature/occupancy
oleksandr-pavlyk May 22, 2025
436f111
Merge branch 'main' into feature/occupancy
oleksandr-pavlyk May 29, 2025
428f4fa
Improved max_potential_block_size.__doc__
oleksandr-pavlyk May 30, 2025
f1ff0f5
Add test for dynamic_shared_memory_needed arg of invalid type
oleksandr-pavlyk May 30, 2025
39a08f6
Mention feature/occupancy in 0.3.0 release notes
oleksandr-pavlyk May 30, 2025
f74dcf1
Add symbols to api_private.rst
oleksandr-pavlyk Jun 3, 2025
e2adc57
Reduce test name verbosity
oleksandr-pavlyk Jun 3, 2025
496eb5b
Add doc-strings to KernelOccupancy methods.
oleksandr-pavlyk Jun 3, 2025
f74db2c
fix rendering
leofang Jun 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cuda_core/cuda/core/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from cuda.core.experimental import utils
from cuda.core.experimental._device import Device
from cuda.core.experimental._event import Event, EventOptions
from cuda.core.experimental._launcher import LaunchConfig, launch
from cuda.core.experimental._launch_config import LaunchConfig
from cuda.core.experimental._launcher import launch
from cuda.core.experimental._linker import Linker, LinkerOptions
from cuda.core.experimental._module import ObjectCode
from cuda.core.experimental._program import Program, ProgramOptions
Expand Down
97 changes: 97 additions & 0 deletions cuda_core/cuda/core/experimental/_launch_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0

from dataclasses import dataclass
from typing import Optional, Union

from cuda.core.experimental._device import Device
from cuda.core.experimental._utils.cuda_utils import (
CUDAError,
cast_to_3_tuple,
driver,
get_binding_version,
handle_return,
)

# TODO: revisit this treatment for py313t builds
_inited = False


def _lazy_init():
    """Compute module-level capability flags on first call; no-op afterwards."""
    global _inited, _use_ex
    if _inited:
        return

    # Whether the "Ex" launch path is usable depends on both the installed
    # driver and the cuda-python binding version.
    binding_ver = get_binding_version()
    drv_ver = handle_return(driver.cuDriverGetVersion())
    _use_ex = drv_ver >= 11080 and binding_ver >= (11, 8)
    _inited = True


@dataclass
class LaunchConfig:
    """Customizable launch options.

    Attributes
    ----------
    grid : Union[tuple, int]
        Collection of threads that will execute a kernel function.
    cluster : Union[tuple, int]
        Group of blocks (Thread Block Cluster) scheduled onto the same
        GPU Processing Cluster (GPC). Blocks in one cluster have access
        to distributed shared memory and can be explicitly synchronized.
    block : Union[tuple, int]
        Group of threads (Thread Block) scheduled onto the same
        streaming multiprocessor (SM). Threads in one block have access
        to shared memory and can be explicitly synchronized.
    shmem_size : int, optional
        Dynamic shared-memory size per thread block in bytes
        (defaults to 0).

    """

    # TODO: expand LaunchConfig to include other attributes
    grid: Union[tuple, int] = None
    cluster: Union[tuple, int] = None
    block: Union[tuple, int] = None
    shmem_size: Optional[int] = None

    def __post_init__(self):
        _lazy_init()
        # Normalize grid/block to canonical 3-tuples (raises on bad input).
        self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
        self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
        if self.cluster is not None:
            # Thread block clusters need both the "Ex" launch API
            # (cuda.bindings & driver 11.8+) and H100-class hardware.
            if not _use_ex:
                err, drv_ver = driver.cuDriverGetVersion()
                version_note = f" (got driver version {drv_ver})" if err == driver.CUresult.CUDA_SUCCESS else ""
                raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{version_note}")
            cc = Device().compute_capability
            if cc < (9, 0):
                raise CUDAError(
                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                )
            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
        self.shmem_size = 0 if self.shmem_size is None else self.shmem_size


def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
    """Translate a validated :obj:`LaunchConfig` into a driver-level ``CUlaunchConfig``.

    The stream handle (``hStream``) is left unset; callers fill it in.
    """
    _lazy_init()
    native = driver.CUlaunchConfig()
    native.gridDimX, native.gridDimY, native.gridDimZ = config.grid
    native.blockDimX, native.blockDimY, native.blockDimZ = config.block
    native.sharedMemBytes = config.shmem_size
    attributes = []  # TODO: support more attributes
    if config.cluster:
        cluster_attr = driver.CUlaunchAttribute()
        cluster_attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
        cdim = cluster_attr.value.clusterDim
        cdim.x, cdim.y, cdim.z = config.cluster
        attributes.append(cluster_attr)
    native.numAttrs = len(attributes)
    native.attrs = attributes
    return native
71 changes: 4 additions & 67 deletions cuda_core/cuda/core/experimental/_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,13 @@
#
# SPDX-License-Identifier: Apache-2.0

from dataclasses import dataclass
from typing import Optional, Union

from cuda.core.experimental._device import Device
from cuda.core.experimental._kernel_arg_handler import ParamHolder
from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config
from cuda.core.experimental._module import Kernel
from cuda.core.experimental._stream import Stream
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import (
CUDAError,
cast_to_3_tuple,
check_or_create_options,
driver,
get_binding_version,
Expand All @@ -37,54 +33,6 @@ def _lazy_init():
_inited = True


@dataclass
class LaunchConfig:
"""Customizable launch options.

Attributes
----------
grid : Union[tuple, int]
Collection of threads that will execute a kernel function.
cluster : Union[tuple, int]
Group of blocks (Thread Block Cluster) that will execute on the same
GPU Processing Cluster (GPC). Blocks within a cluster have access to
distributed shared memory and can be explicitly synchronized.
block : Union[tuple, int]
Group of threads (Thread Block) that will execute on the same
streaming multiprocessor (SM). Threads within a thread blocks have
access to shared memory and can be explicitly synchronized.
shmem_size : int, optional
Dynamic shared-memory size per thread block in bytes.
(Default to size 0)

"""

# TODO: expand LaunchConfig to include other attributes
grid: Union[tuple, int] = None
cluster: Union[tuple, int] = None
block: Union[tuple, int] = None
shmem_size: Optional[int] = None

def __post_init__(self):
_lazy_init()
self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
# thread block clusters are supported starting H100
if self.cluster is not None:
if not _use_ex:
err, drvers = driver.cuDriverGetVersion()
drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
cc = Device().compute_capability
if cc < (9, 0):
raise CUDAError(
f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
)
self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
if self.shmem_size is None:
self.shmem_size = 0


def launch(stream, config, kernel, *kernel_args):
"""Launches a :obj:`~_module.Kernel`
object with launch-time configuration.
Expand Down Expand Up @@ -114,6 +62,7 @@ def launch(stream, config, kernel, *kernel_args):
f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})"
) from e
assert_type(kernel, Kernel)
_lazy_init()
config = check_or_create_options(LaunchConfig, config, "launch config")

# TODO: can we ensure kernel_args is valid/safe to use here?
Expand All @@ -127,25 +76,13 @@ def launch(stream, config, kernel, *kernel_args):
# mainly to see if the "Ex" API is available and if so we use it, as it's more feature
# rich.
if _use_ex:
drv_cfg = driver.CUlaunchConfig()
drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
drv_cfg = _to_native_launch_config(config)
drv_cfg.hStream = stream.handle
drv_cfg.sharedMemBytes = config.shmem_size
attrs = [] # TODO: support more attributes
if config.cluster:
attr = driver.CUlaunchAttribute()
attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
dim = attr.value.clusterDim
dim.x, dim.y, dim.z = config.cluster
attrs.append(attr)
drv_cfg.numAttrs = len(attrs)
drv_cfg.attrs = attrs
handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
else:
# TODO: check if config has any unsupported attrs
handle_return(
driver.cuLaunchKernel(
int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream._handle, args_ptr, 0
int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
)
)
Loading