Skip to content

Add cluster to LaunchConfig to support thread block clusters on Hopper #261

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Dec 13, 2024
Merged
30 changes: 25 additions & 5 deletions cuda_core/cuda/core/experimental/_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Optional, Union

from cuda import cuda
from cuda.core.experimental._device import Device
from cuda.core.experimental._kernel_arg_handler import ParamHolder
from cuda.core.experimental._module import Kernel
from cuda.core.experimental._stream import Stream
Expand Down Expand Up @@ -38,10 +39,14 @@ class LaunchConfig:
----------
grid : Union[tuple, int]
Collection of threads that will execute a kernel function.
cluster : Union[tuple, int]
Group of blocks (Thread Block Cluster) that will execute on the same
GPU Processing Cluster (GPC). Blocks within a cluster have access to
distributed shared memory and can be explicitly synchronized.
block : Union[tuple, int]
Group of threads (Thread Block) that will execute on the same
multiprocessor. Threads within a thread blocks have access to
shared memory and can be explicitly synchronized.
streaming multiprocessor (SM). Threads within a thread blocks have
access to shared memory and can be explicitly synchronized.
stream : :obj:`Stream`
The stream establishing the stream ordering semantic of a
launch.
Expand All @@ -53,13 +58,22 @@ class LaunchConfig:

# TODO: expand LaunchConfig to include other attributes
grid: Union[tuple, int] = None
cluster: Union[tuple, int] = None
block: Union[tuple, int] = None
stream: Stream = None
shmem_size: Optional[int] = None

def __post_init__(self):
_lazy_init()
self.grid = self._cast_to_3_tuple(self.grid)
self.block = self._cast_to_3_tuple(self.block)
# thread block clusters are supported starting H100
if self.cluster is not None:
if not _use_ex:
raise CUDAError("thread block clusters require cuda.bindings & driver 11.8+")
if Device().compute_capability < (9, 0):
raise CUDAError("thread block clusters are not supported below Hopper")
self.cluster = self._cast_to_3_tuple(self.cluster)
# we handle "stream=None" in the launch API
if self.stream is not None and not isinstance(self.stream, Stream):
try:
Expand All @@ -69,8 +83,6 @@ def __post_init__(self):
if self.shmem_size is None:
self.shmem_size = 0

_lazy_init()

def _cast_to_3_tuple(self, cfg):
if isinstance(cfg, int):
if cfg < 1:
Expand Down Expand Up @@ -133,7 +145,15 @@ def launch(kernel, config, *kernel_args):
drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
drv_cfg.hStream = config.stream.handle
drv_cfg.sharedMemBytes = config.shmem_size
drv_cfg.numAttrs = 0 # TODO
attrs = [] # TODO: support more attributes
if config.cluster:
attr = cuda.CUlaunchAttribute()
attr.id = cuda.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
dim = attr.value.clusterDim
dim.x, dim.y, dim.z = config.cluster
attrs.append(attr)
drv_cfg.numAttrs = len(attrs)
drv_cfg.attrs = attrs
handle_return(cuda.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
else:
# TODO: check if config has any unsupported attrs
Expand Down
15 changes: 15 additions & 0 deletions cuda_core/docs/source/release/0.1.1-notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,24 @@
Released on Dec XX, 2024

## Hightlights

- Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete
implementation of DLPack & CUDA Array Interface supports.

## New features

- Add `LaunchConfig.cluster` to support thread block clusters on Hopper GPUs.

## Enchancements

- Ensure "ltoir" is a valid code type to `ObjectCode`.
- Improve test coverage.
- Enforce code formatting.

## Bug fixes

- Eliminate potential class destruction issues.
- Fix circular import during handling a foreign CUDA stream.

## Limitations

Expand Down
64 changes: 64 additions & 0 deletions cuda_core/examples/thread_block_cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import os
import sys

from cuda.core.experimental import Device, LaunchConfig, Program, launch

# prepare include
cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
if cuda_path is None:
print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr)
sys.exit(0)
cuda_include_path = os.path.join(cuda_path, "include")

# print cluster info using a kernel
code = r"""
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

extern "C"
__global__ void check_cluster_info() {
auto g = cg::this_grid();
auto b = cg::this_thread_block();
if (g.cluster_rank() == 0 && g.block_rank() == 0 && g.thread_rank() == 0) {
printf("grid dim: (%u, %u, %u)\n", g.dim_blocks().x, g.dim_blocks().y, g.dim_blocks().z);
printf("cluster dim: (%u, %u, %u)\n", g.dim_clusters().x, g.dim_clusters().y, g.dim_clusters().z);
printf("block dim: (%u, %u, %u)\n", b.dim_threads().x, b.dim_threads().y, b.dim_threads().z);
}
}
"""

dev = Device()
dev.set_current()
arch = dev.compute_capability
if arch < (9, 0):
print("this demo requires a Hopper GPU (since thread block cluster is a hardware feature)", file=sys.stderr)
sys.exit(0)
arch = "".join(f"{i}" for i in arch)

# prepare program
prog = Program(code, code_type="c++")
mod = prog.compile(
target_type="cubin",
# TODO: update this after NVIDIA/cuda-python#237 is merged
options=(f"-arch=sm_{arch}", "-std=c++17", f"-I{cuda_include_path}"),
)

# run in single precision
ker = mod.get_kernel("check_cluster_info")

# prepare launch config
grid = 4
cluster = 2
block = 32
config = LaunchConfig(grid=grid, cluster=cluster, block=block, stream=dev.default_stream)

# launch kernel on the default stream
launch(ker, config)
dev.sync()

print("done!")
115 changes: 59 additions & 56 deletions cuda_core/tests/example_tests/utils.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,59 @@
# Copyright 2024 NVIDIA Corporation. All rights reserved.
#
# Please refer to the NVIDIA end user license agreement (EULA) associated
# with this source code for terms and conditions that govern your use of
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.

import gc
import os
import sys

import cupy as cp
import pytest


class SampleTestError(Exception):
pass


def parse_python_script(filepath):
if not filepath.endswith(".py"):
raise ValueError(f"{filepath} not supported")
with open(filepath, encoding="utf-8") as f:
script = f.read()
return script


def run_example(samples_path, filename, env=None):
fullpath = os.path.join(samples_path, filename)
script = parse_python_script(fullpath)
try:
old_argv = sys.argv
sys.argv = [fullpath]
old_sys_path = sys.path.copy()
sys.path.append(samples_path)
exec(script, env if env else {})
except ImportError as e:
# for samples requiring any of optional dependencies
for m in ("cupy",):
if f"No module named '{m}'" in str(e):
pytest.skip(f"{m} not installed, skipping related tests")
break
else:
raise
except Exception as e:
msg = "\n"
msg += f"Got error ({filename}):\n"
msg += str(e)
raise SampleTestError(msg) from e
finally:
sys.path = old_sys_path
sys.argv = old_argv
# further reduce the memory watermark
gc.collect()
cp.get_default_memory_pool().free_all_blocks()
# Copyright 2024 NVIDIA Corporation. All rights reserved.
#
# Please refer to the NVIDIA end user license agreement (EULA) associated
# with this source code for terms and conditions that govern your use of
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.

import gc
import os
import sys

import cupy as cp
import pytest


class SampleTestError(Exception):
pass


def parse_python_script(filepath):
if not filepath.endswith(".py"):
raise ValueError(f"{filepath} not supported")
with open(filepath, encoding="utf-8") as f:
script = f.read()
return script


def run_example(samples_path, filename, env=None):
fullpath = os.path.join(samples_path, filename)
script = parse_python_script(fullpath)
try:
old_argv = sys.argv
sys.argv = [fullpath]
old_sys_path = sys.path.copy()
sys.path.append(samples_path)
exec(script, env if env else {})
except ImportError as e:
# for samples requiring any of optional dependencies
for m in ("cupy",):
if f"No module named '{m}'" in str(e):
pytest.skip(f"{m} not installed, skipping related tests")
break
else:
raise
except SystemExit:
# for samples that early return due to any missing requirements
pytest.skip(f"skip {filename}")
except Exception as e:
msg = "\n"
msg += f"Got error ({filename}):\n"
msg += str(e)
raise SampleTestError(msg) from e
finally:
sys.path = old_sys_path
sys.argv = old_argv
# further reduce the memory watermark
gc.collect()
cp.get_default_memory_pool().free_all_blocks()