diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 3db9e8abb..6e289d49b 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -7,6 +7,7 @@ from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch from cuda.core.experimental._linker import Linker, LinkerOptions +from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions from cuda.core.experimental._system import System diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 07e87fb66..06005f95a 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -65,9 +65,7 @@ def close(self): __slots__ = ("__weakref__", "_mnff", "_timing_disabled", "_busy_waited") def __init__(self): - raise NotImplementedError( - "directly creating an Event object can be ambiguous. Please call call Stream.record()." - ) + raise NotImplementedError("directly creating an Event object can be ambiguous. Please call Stream.record().") @staticmethod def _init(options: Optional[EventOptions] = None): diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 7736d7b2d..2f84f3502 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -439,7 +439,7 @@ def link(self, target_type) -> ObjectCode: addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) code = (ctypes.c_char * size).from_address(addr) - return ObjectCode(bytes(code), target_type) + return ObjectCode._init(bytes(code), target_type) def get_error_log(self) -> str: """Get the error log generated by the linker. diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 0274b3001..c4535c279 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - +from typing import Optional, Union from warnings import warn from cuda.core.experimental._utils import driver, get_binding_version, handle_return, precondition @@ -213,47 +213,42 @@ def attributes(self): class ObjectCode: - """Represent a compiled program that was loaded onto the device. + """Represent a compiled program to be loaded onto the device. This object provides a unified interface for different types of - compiled programs that are loaded onto the device. + compiled programs that will be loaded onto the device. - Loads the module library with specified module code and JIT options. + Note + ---- + This class has no default constructor. If you already have a cubin that you would + like to load, use the :meth:`from_cubin` alternative constructor. For all other + possible code types (ex: "ptx"), only :class:`~cuda.core.experimental.Program` + accepts them and returns an :class:`ObjectCode` instance with its + :meth:`~cuda.core.experimental.Program.compile` method. Note ---- Usage under CUDA 11.x will only load to the current device context. - - Parameters - ---------- - module : Union[bytes, str] - Either a bytes object containing the module to load, or - a file path string containing that module for loading. - code_type : Any - String of the compiled type. 
- Supported options are "ptx", "cubin", "ltoir" and "fatbin". - jit_options : Optional - Mapping of JIT options to use during module loading. - (Default to no options) - symbol_mapping : Optional - Keyword argument dictionary specifying how symbol names - should be mapped before trying to retrieve them. - (Default to no mappings) - """ - __slots__ = ("_handle", "_backend_version", "_jit_options", "_code_type", "_module", "_loader", "_sym_map") + __slots__ = ("_handle", "_backend_version", "_code_type", "_module", "_loader", "_sym_map") _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin") - def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None): - if code_type not in self._supported_code_type: - raise ValueError + def __init__(self): + raise NotImplementedError( + "directly creating an ObjectCode object can be ambiguous. Please either call Program.compile() " + "or one of the ObjectCode.from_*() constructors" + ) + + @staticmethod + def _init(module, code_type, *, symbol_mapping: Optional[dict] = None): + self = ObjectCode.__new__(ObjectCode) + assert code_type in self._supported_code_type, f"{code_type=} is not supported" _lazy_init() # handle is assigned during _lazy_load self._handle = None - self._jit_options = jit_options self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old" self._loader = _backend[self._backend_version] @@ -262,42 +257,41 @@ def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None): self._module = module self._sym_map = {} if symbol_mapping is None else symbol_mapping + return self + + @staticmethod + def from_cubin(module: Union[bytes, str], *, symbol_mapping: Optional[dict] = None) -> "ObjectCode": + """Create an :class:`ObjectCode` instance from an existing cubin. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory cubin to load, or + a file path string pointing to the on-disk cubin to load. + symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + return ObjectCode._init(module, "cubin", symbol_mapping=symbol_mapping) + # TODO: do we want to unload in a finalizer? Probably not.. def _lazy_load_module(self, *args, **kwargs): if self._handle is not None: return - jit_options = self._jit_options module = self._module if isinstance(module, str): - # TODO: this option is only taken by the new library APIs, but we have - # a bug that we can't easily support it just yet (NVIDIA/cuda-python#73). 
- if jit_options is not None: - raise ValueError - self._handle = handle_return(self._loader["file"](module)) + if self._backend_version == "new": + self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) + else: # "old" backend + self._handle = handle_return(self._loader["file"](module.encode())) else: assert isinstance(module, bytes) - if jit_options is None: - jit_options = {} if self._backend_version == "new": - args = ( - module, - list(jit_options.keys()), - list(jit_options.values()), - len(jit_options), - # TODO: support library options - [], - [], - 0, - ) + self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) else: # "old" backend - args = ( - module, - len(jit_options), - list(jit_options.keys()), - list(jit_options.values()), - ) - self._handle = handle_return(self._loader["data"](*args)) + self._handle = handle_return(self._loader["data"](module, 0, [], [])) @precondition(_lazy_load_module) def get_kernel(self, name): @@ -314,6 +308,8 @@ def get_kernel(self, name): Newly created kernel object. """ + if self._code_type not in ("cubin", "ptx", "fatbin"): + raise RuntimeError(f"get_kernel() is not supported for {self._code_type}") try: name = self._sym_map[name] except KeyError: @@ -321,5 +317,3 @@ def get_kernel(self, name): data = handle_return(self._loader["kernel"](self._handle, name)) return Kernel._from_obj(data, self) - - # TODO: implement from_handle() diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index f938895ed..b1fb0d90f 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -386,7 +386,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): if not isinstance(code, str): raise TypeError("ptx Program expects code argument to be a string") self._linker = Linker( - ObjectCode(code.encode(), code_type), options=self._translate_program_options(options) + ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options) ) self._backend = "linker" else: @@ -472,7 +472,7 @@ def compile(self, target_type, name_expressions=(), logs=None): handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle) logs.write(log.decode()) - return ObjectCode(data, target_type, symbol_mapping=symbol_mapping) + return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping) if self._backend == "linker": return self._linker.link(target_type) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index f5ee30c1a..b52fda55d 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -32,6 +32,7 @@ CUDA compilation toolchain Program Linker + ObjectCode :template: dataclass.rst diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index 07d0a2527..81e4e1292 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -3,7 +3,7 @@ ``cuda.core`` 0.2.0 Release Notes ================================= -Released on , 2024 +Released on , 2025 Highlights ---------- @@ -12,14 +12,19 @@ Highlights - Add :class:`~DeviceProperties` to provide pythonic access to device properties. 
- Add kernel attributes to :class:`~Kernel` -Limitations ------------ - -- - Breaking Changes ---------------- - Change ``__cuda_stream__`` from attribute to method - The :meth:`~Program.compile` method no longer accepts the `options` argument. Instead, you can optionally pass an instance of :class:`~ProgramOptions` to the constructor of :obj:`~Program`. -- :meth: `~Device.properties` now provides an instance of :class:`~DeviceProperties` instead of a dictionary. +- :meth:`~Device.properties` now provides an instance of :class:`~DeviceProperties` instead of a dictionary. + +New features +------------ + +- Expose :class:`ObjectCode` as a public API, which allows loading cubins from memory or disk. For loading other kinds of code types, please continue using :class:`Program`. + +Limitations +----------- + +- diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index 7b83d844c..baa954777 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -34,14 +34,8 @@ } """ arch0 = "".join(f"{i}" for i in dev0.compute_capability) -prog_add = Program(code_add, code_type="c++") -mod_add = prog_add.compile( - "cubin", - options=( - "-std=c++17", - "-arch=sm_" + arch0, - ), -) +prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{arch0}"}) +mod_add = prog_add.compile("cubin") ker_add = mod_add.get_kernel("vector_add") # Set GPU 1 @@ -63,14 +57,8 @@ } """ arch1 = "".join(f"{i}" for i in dev1.compute_capability) -prog_sub = Program(code_sub, code_type="c++") -mod_sub = prog_sub.compile( - "cubin", - options=( - "-std=c++17", - "-arch=sm_" + arch1, - ), -) +prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{arch1}"}) +mod_sub = prog_sub.compile("cubin") ker_sub = mod_sub.get_kernel("vector_sub") diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 9f126fa17..f859142c9 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -10,7 +10,7 @@ import pytest from conftest import can_load_generated_ptx -from cuda.core.experimental import Program, ProgramOptions, system +from cuda.core.experimental import ObjectCode, Program, ProgramOptions, system @pytest.fixture(scope="function") @@ -37,7 +37,7 @@ def get_saxpy_kernel(init_cuda): ) # run in single precision - return mod.get_kernel("saxpy") + return mod.get_kernel("saxpy"), mod @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") @@ -72,7 +72,7 @@ def test_get_kernel(init_cuda): ], ) def test_read_only_kernel_attributes(get_saxpy_kernel, attr, expected_type): - kernel = get_saxpy_kernel + kernel, _ = get_saxpy_kernel method = getattr(kernel.attributes, attr) # get the value without providing a device ordinal value = method() @@ -82,3 +82,23 @@ def test_read_only_kernel_attributes(get_saxpy_kernel, attr, expected_type): for device in system.devices: value = method(device.device_id) assert isinstance(value, expected_type), f"Expected {attr} to be of type {expected_type}, but got {type(value)}" + + +def test_object_code_load_cubin(get_saxpy_kernel): + _, mod = get_saxpy_kernel + cubin = mod._module + sym_map = mod._sym_map + assert isinstance(cubin, bytes) + mod = ObjectCode.from_cubin(cubin, symbol_mapping=sym_map) + mod.get_kernel("saxpy") # force loading + + +def test_object_code_load_cubin_from_file(get_saxpy_kernel, tmp_path): + _, mod = get_saxpy_kernel + cubin = mod._module + sym_map = mod._sym_map + 
assert isinstance(cubin, bytes) + cubin_file = tmp_path / "test.cubin" + cubin_file.write_bytes(cubin) + mod = ObjectCode.from_cubin(str(cubin_file), symbol_mapping=sym_map) + mod.get_kernel("saxpy") # force loading
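
Usage sketch (reviewer aid, not part of the patch): the breaking change moves compile options from ``Program.compile()`` to the ``Program`` constructor, as the updated ``simple_multi_gpu_example.py`` above shows. A minimal example in the new 0.2.0 style might look as follows; the kernel, its name ``scale``, and the ``std``/``arch`` option names are illustrative assumptions mirroring the revised example and the release notes, not a definitive reference::

    # Sketch only -- assumes the cuda.core.experimental API as changed by this patch.
    from cuda.core.experimental import Device, Program, ProgramOptions

    dev = Device()
    dev.set_current()
    arch = "".join(f"{i}" for i in dev.compute_capability)

    code = r"""
    extern "C" __global__ void scale(float a, float* x, size_t n) {
        const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < n) x[tid] *= a;
    }
    """

    # Options now go to the Program constructor; compile() takes only the target type.
    prog = Program(code, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{arch}"))
    mod = prog.compile("cubin")    # returns an ObjectCode
    ker = mod.get_kernel("scale")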
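
Likewise a sketch (not part of the patch) of the newly public ``ObjectCode.from_cubin()`` path exercised by the added tests, assuming a prebuilt cubin such as the one above has been written to ``scale.cubin`` (an illustrative file name) and that its kernels are not C++-mangled::

    # Sketch only -- loads a prebuilt cubin without going through Program/NVRTC.
    from cuda.core.experimental import Device, ObjectCode

    Device().set_current()  # module loading requires a current device/context

    # From a file path on disk...
    mod = ObjectCode.from_cubin("scale.cubin")
    ker = mod.get_kernel("scale")  # the first lookup triggers the lazy module load

    # ...or from an in-memory bytes object; pass symbol_mapping={"name": "mangled name"}
    # when kernel names are C++-mangled (as the added tests do for the saxpy template).
    with open("scale.cubin", "rb") as f:
        mod_from_bytes = ObjectCode.from_cubin(f.read())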