|
8 | 8 | from conftest import skipif_testing_with_compute_sanitizer
|
9 | 9 |
|
10 | 10 | import cuda.core.experimental
|
11 |
| -from cuda.core.experimental import ObjectCode, Program, ProgramOptions, system |
| 11 | +from cuda.core.experimental import Device, ObjectCode, Program, ProgramOptions, system |
12 | 12 | from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return
|
13 | 13 |
|
14 | 14 | SAXPY_KERNEL = r"""
|
@@ -40,6 +40,11 @@ def test_kernel_attributes_init_disabled():
|
40 | 40 | cuda.core.experimental._module.KernelAttributes() # Ensure back door is locked.
|
41 | 41 |
|
42 | 42 |
|
def test_kernel_occupancy_init_disabled():
    """KernelOccupancy must reject direct construction with a RuntimeError."""
    expected_message = r"^KernelOccupancy cannot be instantiated directly\."
    with pytest.raises(RuntimeError, match=expected_message):
        # Ensure back door is locked.
        cuda.core.experimental._module.KernelOccupancy()
| 46 | + |
| 47 | + |
43 | 48 | def test_kernel_init_disabled():
|
44 | 49 | with pytest.raises(RuntimeError, match=r"^Kernel objects cannot be instantiated directly\."):
|
45 | 50 | cuda.core.experimental._module.Kernel() # Ensure back door is locked.
|
@@ -245,3 +250,49 @@ def test_num_args_error_handling(deinit_all_contexts_function, cuda12_prerequisi
|
245 | 250 | with pytest.raises(CUDAError):
|
246 | 251 | # assignment resolves linter error "B018: useless expression"
|
247 | 252 | _ = krn.num_arguments
|
| 253 | + |
| 254 | + |
@pytest.mark.parametrize("block_size", [32, 64, 96, 120, 128, 256])
@pytest.mark.parametrize("smem_size_per_block", [0, 32, 4096])
def test_saxpy_occupancy_max_active_block_per_multiprocessor(get_saxpy_kernel, block_size, smem_size_per_block):
    """Check that the reported active-blocks-per-SM value fits within device limits.

    For every (block_size, smem_size_per_block) combination the occupancy query
    must return a positive ``int``, and that many resident blocks must not
    exceed the per-multiprocessor thread, shared-memory and register budgets
    reported by ``Device().properties``.
    """
    kernel, _ = get_saxpy_kernel
    dev_props = Device().properties

    # Preconditions: the parametrized launch configuration must itself be
    # valid for the device under test.
    assert block_size <= dev_props.max_threads_per_block
    assert smem_size_per_block <= dev_props.max_shared_memory_per_block

    num_blocks_per_sm = kernel.occupancy.max_active_blocks_per_multiprocessor(block_size, smem_size_per_block)
    assert isinstance(num_blocks_per_sm, int)
    assert num_blocks_per_sm > 0

    kernel_threads_per_sm = num_blocks_per_sm * block_size
    kernel_smem_size_per_sm = num_blocks_per_sm * smem_size_per_block
    assert kernel_threads_per_sm <= dev_props.max_threads_per_multiprocessor
    assert kernel_smem_size_per_sm <= dev_props.max_shared_memory_per_multiprocessor

    # The CUDA driver documents the NUM_REGS function attribute as a
    # per-thread count, so the per-SM register footprint scales with the
    # number of resident threads, not the number of resident blocks.
    assert kernel.attributes.num_regs() * kernel_threads_per_sm <= dev_props.max_registers_per_multiprocessor
| 270 | + |
| 271 | + |
@pytest.mark.parametrize("block_size_limit", [32, 64, 96, 120, 128, 256])
@pytest.mark.parametrize("smem_size_per_block", [0, 32, 4096])
def test_saxpy_occupancy_max_potential_block_size(get_saxpy_kernel, block_size_limit, smem_size_per_block):
    """Check the shape and bounds of the max-potential-block-size query result.

    The query must return a 2-tuple of positive ints, unpacked here as
    (min_grid_size, max_block_size), and the block size must not exceed the
    requested ``block_size_limit``.
    """
    kernel, _ = get_saxpy_kernel
    props = Device().properties

    # The parametrized inputs must be valid for the device under test.
    assert block_size_limit <= props.max_threads_per_block
    assert smem_size_per_block <= props.max_shared_memory_per_block

    result = kernel.occupancy.max_potential_block_size(smem_size_per_block, block_size_limit)
    assert isinstance(result, tuple)
    assert len(result) == 2

    grid_size, block_size = result
    for value in (grid_size, block_size):
        assert isinstance(value, int)
    for value in (grid_size, block_size):
        assert value > 0
    assert block_size <= block_size_limit
| 288 | + |
| 289 | + |
@pytest.mark.parametrize("num_blocks_per_sm, block_size", [(4, 32), (2, 64), (2, 96), (3, 120), (2, 128), (1, 256)])
def test_saxpy_occupancy_available_dynamic_shared_memory_per_block(get_saxpy_kernel, num_blocks_per_sm, block_size):
    """Check that the available dynamic shared memory per block fits device limits.

    The value returned for a given (num_blocks_per_sm, block_size) pair must
    not exceed the per-block shared-memory limit, and multiplied by the block
    count it must not exceed the per-multiprocessor shared-memory limit.
    """
    kernel, _ = get_saxpy_kernel
    props = Device().properties

    # The parametrized configuration must be valid for the device under test.
    threads_per_sm = num_blocks_per_sm * block_size
    assert block_size <= props.max_threads_per_block
    assert threads_per_sm <= props.max_threads_per_multiprocessor

    available_smem = kernel.occupancy.available_dynamic_shared_memory_per_block(num_blocks_per_sm, block_size)
    assert available_smem <= props.max_shared_memory_per_block

    smem_per_sm = num_blocks_per_sm * available_smem
    assert smem_per_sm <= props.max_shared_memory_per_multiprocessor
0 commit comments