Skip to content

Commit b89c95f

Browse files
Add occupancy tests, except for cluster-related queries
1 parent 5bd64a7 commit b89c95f

File tree

1 file changed

+52
-1
lines changed

1 file changed

+52
-1
lines changed

cuda_core/tests/test_module.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from conftest import skipif_testing_with_compute_sanitizer
99

1010
import cuda.core.experimental
11-
from cuda.core.experimental import ObjectCode, Program, ProgramOptions, system
11+
from cuda.core.experimental import Device, ObjectCode, Program, ProgramOptions, system
1212
from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return
1313

1414
SAXPY_KERNEL = r"""
@@ -40,6 +40,11 @@ def test_kernel_attributes_init_disabled():
4040
cuda.core.experimental._module.KernelAttributes() # Ensure back door is locked.
4141

4242

43+
def test_kernel_occupancy_init_disabled():
    """Calling ``KernelOccupancy()`` directly must raise ``RuntimeError``."""
    expected_message = r"^KernelOccupancy cannot be instantiated directly\."
    with pytest.raises(RuntimeError, match=expected_message):
        # Ensure back door is locked.
        cuda.core.experimental._module.KernelOccupancy()
46+
47+
4348
def test_kernel_init_disabled():
4449
with pytest.raises(RuntimeError, match=r"^Kernel objects cannot be instantiated directly\."):
4550
cuda.core.experimental._module.Kernel() # Ensure back door is locked.
@@ -245,3 +250,49 @@ def test_num_args_error_handling(deinit_all_contexts_function, cuda12_prerequisi
245250
with pytest.raises(CUDAError):
246251
# assignment resolves linter error "B018: useless expression"
247252
_ = krn.num_arguments
253+
254+
255+
@pytest.mark.parametrize("block_size", [32, 64, 96, 120, 128, 256])
@pytest.mark.parametrize("smem_size_per_block", [0, 32, 4096])
def test_saxpy_occupancy_max_active_block_per_multiprocessor(get_saxpy_kernel, block_size, smem_size_per_block):
    """Check ``max_active_blocks_per_multiprocessor`` against the device limits.

    For every (block size, dynamic shared memory size) combination the query
    must return a positive ``int``, and the implied per-multiprocessor usage of
    threads, shared memory and registers must not exceed what
    ``Device().properties`` reports.
    """
    kernel, _ = get_saxpy_kernel
    dev_props = Device().properties

    # Preconditions: the parametrized launch configuration must be valid on this device.
    assert block_size <= dev_props.max_threads_per_block
    assert smem_size_per_block <= dev_props.max_shared_memory_per_block

    num_blocks_per_sm = kernel.occupancy.max_active_blocks_per_multiprocessor(block_size, smem_size_per_block)
    assert isinstance(num_blocks_per_sm, int)
    assert num_blocks_per_sm > 0

    # Resources consumed when `num_blocks_per_sm` blocks are resident at once.
    kernel_threads_per_sm = num_blocks_per_sm * block_size
    kernel_smem_size_per_sm = num_blocks_per_sm * smem_size_per_block
    assert kernel_threads_per_sm <= dev_props.max_threads_per_multiprocessor
    assert kernel_smem_size_per_sm <= dev_props.max_shared_memory_per_multiprocessor

    # NOTE(review): num_regs() is taken to be the per-thread register count
    # (CU_FUNC_ATTRIBUTE_NUM_REGS), so the per-SM register footprint scales with
    # the number of resident threads, not the number of resident blocks.
    assert kernel.attributes.num_regs() * kernel_threads_per_sm <= dev_props.max_registers_per_multiprocessor
270+
271+
272+
@pytest.mark.parametrize("block_size_limit", [32, 64, 96, 120, 128, 256])
@pytest.mark.parametrize("smem_size_per_block", [0, 32, 4096])
def test_saxpy_occupancy_max_potential_block_size(get_saxpy_kernel, block_size_limit, smem_size_per_block):
    """Check the shape and bounds of ``max_potential_block_size`` results.

    The query must return a 2-tuple of positive ``int`` values, and the
    suggested block size must not exceed the requested ``block_size_limit``.
    """
    saxpy, _ = get_saxpy_kernel
    props = Device().properties

    # Preconditions: the parametrized inputs must be valid on this device.
    assert block_size_limit <= props.max_threads_per_block
    assert smem_size_per_block <= props.max_shared_memory_per_block

    result = saxpy.occupancy.max_potential_block_size(smem_size_per_block, block_size_limit)
    assert isinstance(result, tuple)
    assert len(result) == 2

    grid_size, suggested_block_size = result
    assert isinstance(grid_size, int)
    assert isinstance(suggested_block_size, int)
    assert grid_size > 0
    assert suggested_block_size > 0
    assert suggested_block_size <= block_size_limit
288+
289+
290+
@pytest.mark.parametrize("num_blocks_per_sm, block_size", [(4, 32), (2, 64), (2, 96), (3, 120), (2, 128), (1, 256)])
def test_saxpy_occupancy_available_dynamic_shared_memory_per_block(get_saxpy_kernel, num_blocks_per_sm, block_size):
    """Check ``available_dynamic_shared_memory_per_block`` against device limits.

    The reported size must fit within the per-block shared memory limit, and
    ``num_blocks_per_sm`` times that size must fit within the
    per-multiprocessor shared memory limit.
    """
    saxpy, _ = get_saxpy_kernel
    props = Device().properties

    # Preconditions: the parametrized configuration must be valid on this device.
    assert block_size <= props.max_threads_per_block
    threads_per_sm = num_blocks_per_sm * block_size
    assert threads_per_sm <= props.max_threads_per_multiprocessor

    available_smem = saxpy.occupancy.available_dynamic_shared_memory_per_block(num_blocks_per_sm, block_size)
    assert available_smem <= props.max_shared_memory_per_block
    smem_per_sm = num_blocks_per_sm * available_smem
    assert smem_per_sm <= props.max_shared_memory_per_multiprocessor

0 commit comments

Comments
 (0)