
Various handle-related changes and improvements #463


Merged: 16 commits, Feb 25, 2025
32 changes: 28 additions & 4 deletions .github/workflows/test-wheel-linux.yml
@@ -62,8 +62,16 @@ jobs:
TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
if [[ $BUILD_CUDA_MAJOR != $TEST_CUDA_MAJOR ]]; then
SKIP_CUDA_BINDINGS_TEST=1
SKIP_CUDA_CORE_CYTHON_TEST=0
else
SKIP_CUDA_BINDINGS_TEST=0
BUILD_CUDA_MINOR="$(cut -d '.' -f 2 <<< ${{ inputs.build-ctk-ver }})"
TEST_CUDA_MINOR="$(cut -d '.' -f 2 <<< ${{ inputs.cuda-version }})"
if [[ $BUILD_CUDA_MINOR != $TEST_CUDA_MINOR ]]; then
SKIP_CUDA_CORE_CYTHON_TEST=1
else
SKIP_CUDA_CORE_CYTHON_TEST=0
fi
fi

# make outputs from the previous job as env vars
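The version-gating added above can be exercised outside of CI; a minimal bash sketch with hard-coded versions standing in for `inputs.build-ctk-ver` and `inputs.cuda-version` (the variable names come from the workflow, the values here are illustrative):

```shell
#!/usr/bin/env bash
set -euo pipefail

BUILD_CTK_VER="12.8"  # stand-in for inputs.build-ctk-ver
TEST_CTK_VER="12.6"   # stand-in for inputs.cuda-version

BUILD_CUDA_MAJOR="$(cut -d '.' -f 1 <<< "$BUILD_CTK_VER")"
TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< "$TEST_CTK_VER")"
if [[ $BUILD_CUDA_MAJOR != $TEST_CUDA_MAJOR ]]; then
  # Major mismatch: skip the bindings suite (the Cython flag is left at 0,
  # mirroring the workflow as written).
  SKIP_CUDA_BINDINGS_TEST=1
  SKIP_CUDA_CORE_CYTHON_TEST=0
else
  SKIP_CUDA_BINDINGS_TEST=0
  BUILD_CUDA_MINOR="$(cut -d '.' -f 2 <<< "$BUILD_CTK_VER")"
  TEST_CUDA_MINOR="$(cut -d '.' -f 2 <<< "$TEST_CTK_VER")"
  if [[ $BUILD_CUDA_MINOR != $TEST_CUDA_MINOR ]]; then
    # The Cython tests need matching major.minor CTK headers.
    SKIP_CUDA_CORE_CYTHON_TEST=1
  else
    SKIP_CUDA_CORE_CYTHON_TEST=0
  fi
fi
echo "SKIP_CUDA_BINDINGS_TEST=$SKIP_CUDA_BINDINGS_TEST"
echo "SKIP_CUDA_CORE_CYTHON_TEST=$SKIP_CUDA_CORE_CYTHON_TEST"
```

With 12.8 built against 12.6 under test, the majors match but the minors do not, so only the Cython suite is skipped.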
@@ -77,6 +85,7 @@ jobs:
echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV
echo "SKIP_CUDA_BINDINGS_TEST=${SKIP_CUDA_BINDINGS_TEST}" >> $GITHUB_ENV
echo "SKIP_CUDA_CORE_CYTHON_TEST=${SKIP_CUDA_CORE_CYTHON_TEST}" >> $GITHUB_ENV

- name: Install dependencies
uses: ./.github/actions/install_unix_deps
@@ -185,7 +194,7 @@ jobs:

pushd ./cuda_bindings
pip install -r requirements.txt
pytest -rxXs tests/
pytest -rxXs -v tests/

# It is a bit convoluted to run the Cython tests against CTK wheels,
# so let's just skip them.
@@ -196,9 +205,9 @@
# TODO: enable this once win-64 runners are up
exit 1
fi
pytest -rxXs tests/cython
popd
pytest -rxXs -v tests/cython
fi
popd

- name: Run cuda.core tests
run: |
@@ -220,7 +229,22 @@

pushd ./cuda_core
pip install -r "tests/requirements-cu${TEST_CUDA_MAJOR}.txt"
pytest -rxXs tests/
pytest -rxXs -v tests/

# It is a bit convoluted to run the Cython tests against CTK wheels,
# so let's just skip them. Also, currently our CI always installs the
# latest bindings (from either major version). This is not compatible
# with the test requirements.
if [[ "${{ inputs.local-ctk }}" == 1 && "${SKIP_CUDA_CORE_CYTHON_TEST}" == 0 ]]; then
pip install cython setuptools # setuptools needed starting PY312
if [[ "${{ inputs.host-platform }}" == linux* ]]; then
bash tests/cython/build_tests.sh
elif [[ "${{ inputs.host-platform }}" == win* ]]; then
# TODO: enable this once win-64 runners are up
exit 1
fi
pytest -rxXs -v tests/cython
fi
popd

- name: Ensure cuda-python installable
6 changes: 3 additions & 3 deletions .github/workflows/test-wheel-windows.yml
@@ -186,8 +186,8 @@ jobs:

Push-Location ./cuda_bindings
pip install -r requirements.txt
pytest -rxXs tests/
# skip Cython tests for now
pytest -rxXs -v tests/
# skip Cython tests for now (NVIDIA/cuda-python#466)
Pop-Location

- name: Run cuda.core tests
@@ -210,7 +210,7 @@

Push-Location ./cuda_core
pip install -r "tests/requirements-cu${TEST_CUDA_MAJOR}.txt"
pytest -rxXs tests/
pytest -rxXs -v tests/
Pop-Location

- name: Ensure cuda-python installable
11 changes: 11 additions & 0 deletions cuda_core/README.md
@@ -41,3 +41,14 @@ for more details, including how to sign your commits.
To run these tests:
* `python -m pytest tests/` against editable installations
* `pytest tests/` against installed packages

### Cython Unit Tests

Cython tests are located in `tests/cython` and need to be built. These builds have the same CUDA Toolkit header requirements as [those of cuda.bindings](https://nvidia.github.io/cuda-python/cuda-bindings/latest/install.html#requirements), where the CUDA Toolkit's major.minor version must match that of `cuda.bindings`. To build them:

1. Set the environment variable `CUDA_HOME` to the path of the CUDA Toolkit installation.
2. Run the `build_tests` script located in `tests/cython` appropriate to your platform. This will both cythonize and build the tests.

To run these tests:
* `python -m pytest tests/cython/` against editable installations
* `pytest tests/cython/` against installed packages
15 changes: 10 additions & 5 deletions cuda_core/cuda/core/experimental/_event.py
@@ -1,13 +1,18 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from __future__ import annotations

import weakref
from dataclasses import dataclass
from typing import Optional
from typing import TYPE_CHECKING, Optional

from cuda.core.experimental._utils import CUDAError, check_or_create_options, driver, handle_return

if TYPE_CHECKING:
import cuda.bindings


@dataclass
class EventOptions:
@@ -130,6 +135,6 @@ def is_done(self) -> bool:
raise CUDAError(f"unexpected error: {result}")

@property
def handle(self) -> int:
"""Return the underlying cudaEvent_t pointer address as Python int."""
return int(self._mnff.handle)
def handle(self) -> cuda.bindings.driver.CUevent:
"""Return the underlying CUevent object."""
return self._mnff.handle
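The change above swaps a pre-converted `int` for the driver-level object itself. A toy sketch of the new contract, with a hypothetical `FakeCUevent` standing in for `cuda.bindings.driver.CUevent`, shows that callers who still want the raw pointer address can recover it with `int()`:

```python
class FakeCUevent:
    """Hypothetical stand-in for cuda.bindings.driver.CUevent."""
    def __init__(self, ptr: int):
        self._ptr = ptr

    def __int__(self) -> int:
        # The real binding classes are likewise convertible to int.
        return self._ptr


class Event:
    def __init__(self, handle: FakeCUevent):
        self._handle = handle

    @property
    def handle(self) -> FakeCUevent:
        # New behavior: hand back the wrapper object itself.
        return self._handle


ev = Event(FakeCUevent(0xDEADBEEF))
assert type(ev.handle) is FakeCUevent  # callers get the object...
assert int(ev.handle) == 0xDEADBEEF    # ...and can still take the address
```

Old code that did `event.handle` expecting an `int` now needs an explicit `int(event.handle)`.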
27 changes: 23 additions & 4 deletions cuda_core/cuda/core/experimental/_linker.py
@@ -1,14 +1,19 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from __future__ import annotations

import ctypes
import weakref
from contextlib import contextmanager
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
from warnings import warn

if TYPE_CHECKING:
import cuda.bindings

from cuda.core.experimental._device import Device
from cuda.core.experimental._module import ObjectCode
from cuda.core.experimental._utils import check_or_create_options, driver, handle_return, is_sequence
@@ -323,6 +328,10 @@ def _exception_manager(self):
raise e


nvJitLinkHandleT = int
LinkerHandleT = Union[nvJitLinkHandleT, "cuda.bindings.driver.CUlinkState"]


class Linker:
"""Represent a linking machinery to link one or multiple object codes into
:obj:`~cuda.core.experimental._module.ObjectCode` with the specified options.
@@ -483,10 +492,20 @@ def _input_type_from_code_type(self, code_type: str):
return input_type

@property
def handle(self):
"""Return the linker handle object."""
def handle(self) -> LinkerHandleT:
"""Return the underlying handle object.

.. note::

The type of the returned object depends on the backend.
"""
return self._mnff.handle

@property
def backend(self) -> str:
"""Return this Linker instance's underlying backend."""
return "nvJitLink" if self._mnff.use_nvjitlink else "driver"

def close(self):
"""Destroy this linker."""
self._mnff.close()
4 changes: 2 additions & 2 deletions cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -177,11 +177,11 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
cdef object capsule
try:
capsule = obj.__dlpack__(
stream=stream_ptr,
stream=int(stream_ptr) if stream_ptr else None,
max_version=(DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION))
except TypeError:
capsule = obj.__dlpack__(
stream=stream_ptr)
stream=int(stream_ptr) if stream_ptr else None)

cdef void* data = NULL
if cpython.PyCapsule_IsValid(
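The fix above converts `stream_ptr` to a plain `int` (or `None` when it is 0/absent) before calling `__dlpack__`, keeping the existing `TypeError` fallback for producers that predate the `max_version` keyword. A pure-Python sketch of that negotiation, with dummy producers standing in for real array objects:

```python
DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION = 1, 0  # illustrative values


class LegacyProducer:
    """Stand-in whose __dlpack__ predates the max_version keyword."""
    def __dlpack__(self, stream=None):
        return ("capsule", stream)


class ModernProducer:
    """Stand-in that accepts the versioned protocol."""
    def __dlpack__(self, stream=None, max_version=None):
        return ("versioned-capsule", stream, max_version)


def view_as_dlpack(obj, stream_ptr):
    # Prefer the versioned protocol; fall back when the producer
    # rejects the max_version keyword with a TypeError.
    try:
        capsule = obj.__dlpack__(
            stream=int(stream_ptr) if stream_ptr else None,
            max_version=(DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION))
    except TypeError:
        capsule = obj.__dlpack__(
            stream=int(stream_ptr) if stream_ptr else None)
    return capsule


print(view_as_dlpack(ModernProducer(), 7))  # ('versioned-capsule', 7, (1, 0))
print(view_as_dlpack(LegacyProducer(), 0))  # ('capsule', None)
```

Note that a `stream_ptr` of 0 is deliberately mapped to `None`, matching the `if stream_ptr else None` expression in the diff.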
14 changes: 11 additions & 3 deletions cuda_core/cuda/core/experimental/_module.py
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

@@ -203,7 +203,7 @@ def _from_obj(obj, mod):
return ker

@property
def attributes(self):
def attributes(self) -> KernelAttributes:
"""Get the read-only attributes of this kernel."""
if self._attributes is None:
self._attributes = KernelAttributes._init(self._handle)
@@ -212,6 +212,9 @@ def attributes(self):
# TODO: implement from_handle()


CodeTypeT = Union[bytes, bytearray, str]


class ObjectCode:
"""Represent a compiled program to be loaded onto the device.

@@ -294,7 +297,7 @@ def _lazy_load_module(self, *args, **kwargs):
self._handle = handle_return(self._loader["data"](module, 0, [], []))

@precondition(_lazy_load_module)
def get_kernel(self, name):
def get_kernel(self, name) -> Kernel:
"""Return the :obj:`~_module.Kernel` of a specified name from this object code.

Parameters
@@ -317,3 +320,8 @@ def get_kernel(self, name):

data = handle_return(self._loader["kernel"](self._handle, name))
return Kernel._from_obj(data, self)

@property
def code(self) -> CodeTypeT:
"""Return the underlying code object."""
return self._module
37 changes: 25 additions & 12 deletions cuda_core/cuda/core/experimental/_program.py
@@ -2,13 +2,18 @@
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from __future__ import annotations

import weakref
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
from warnings import warn

if TYPE_CHECKING:
import cuda.bindings

from cuda.core.experimental._device import Device
from cuda.core.experimental._linker import Linker, LinkerOptions
from cuda.core.experimental._linker import Linker, LinkerHandleT, LinkerOptions
from cuda.core.experimental._module import ObjectCode
from cuda.core.experimental._utils import (
_handle_boolean_option,
@@ -331,6 +336,9 @@ def __repr__(self):
return self._formatted_options


ProgramHandleT = Union["cuda.bindings.nvrtc.nvrtcProgram", LinkerHandleT]


class Program:
"""Represent a compilation machinery to process programs into
:obj:`~_module.ObjectCode`.
@@ -382,7 +390,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
# TODO: allow tuples once NVIDIA/cuda-python#72 is resolved

self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
self._backend = "nvrtc"
self._backend = "NVRTC"
self._linker = None

elif code_type == "ptx":
@@ -391,7 +399,7 @@
self._linker = Linker(
ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
)
self._backend = "linker"
self._backend = self._linker.backend
else:
raise NotImplementedError

@@ -445,9 +453,9 @@ def compile(self, target_type, name_expressions=(), logs=None):

"""
if target_type not in self._supported_target_type:
raise NotImplementedError
raise ValueError(f"the target type {target_type} is not supported")

if self._backend == "nvrtc":
if self._backend == "NVRTC":
if target_type == "ptx" and not self._can_load_generated_ptx():
warn(
"The CUDA driver version is older than the backend version. "
@@ -489,15 +497,20 @@

return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping)

if self._backend == "linker":
return self._linker.link(target_type)
assert self._backend in ("nvJitLink", "driver")
return self._linker.link(target_type)

@property
def backend(self):
"""Return the backend type string associated with this program."""
def backend(self) -> str:
"""Return this Program instance's underlying backend."""
return self._backend

@property
def handle(self):
"""Return the program handle object."""
def handle(self) -> ProgramHandleT:
"""Return the underlying handle object.

.. note::

The type of the returned object depends on the backend.
"""
return self._mnff.handle
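`ProgramHandleT` and `LinkerHandleT` above rely on `TYPE_CHECKING` plus string forward references so that `cuda.bindings` is never imported at runtime merely for annotations. A minimal sketch of the pattern, with a hypothetical `heavy_sdk` module in place of `cuda.bindings`:

```python
from __future__ import annotations

from typing import TYPE_CHECKING, Union

if TYPE_CHECKING:
    import heavy_sdk  # hypothetical module; only type checkers import it

# The string forward reference keeps heavy_sdk out of the runtime import
# graph, mirroring how ProgramHandleT refers to cuda.bindings types.
HandleT = Union[int, "heavy_sdk.Handle"]


def describe(handle: HandleT) -> str:
    """Accept either backend's handle; here just report what we got."""
    return f"{type(handle).__name__}:{handle!r}"


print(describe(42))  # runs fine even though heavy_sdk does not exist
```

The forward reference stays an unresolved `ForwardRef` at runtime, so the annotation costs nothing unless a type checker (or explicit `get_type_hints` call) tries to resolve it.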
9 changes: 5 additions & 4 deletions cuda_core/cuda/core/experimental/_stream.py
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

@@ -11,6 +11,7 @@
from typing import TYPE_CHECKING, Optional, Tuple, Union

if TYPE_CHECKING:
import cuda.bindings
from cuda.core.experimental._device import Device
from cuda.core.experimental._context import Context
from cuda.core.experimental._event import Event, EventOptions
@@ -147,9 +148,9 @@ def __cuda_stream__(self) -> Tuple[int, int]:
return (0, self.handle)

@property
def handle(self) -> int:
"""Return the underlying cudaStream_t pointer address as Python int."""
return int(self._mnff.handle)
def handle(self) -> cuda.bindings.driver.CUstream:
"""Return the underlying ``CUstream`` object."""
return self._mnff.handle

@property
def is_nonblocking(self) -> bool:
23 changes: 23 additions & 0 deletions cuda_core/cuda/core/experimental/include/utility.hpp
@@ -0,0 +1,23 @@
// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
//
// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

#pragma once

#include <type_traits>

// In cuda.bindings 12.8, the private member name was renamed from "_ptr" to "_pvt_ptr".
[Review comment, Member Author]
@vzhurba01 do you recall what prompted this change when updating to 12.8?

[Reply, Collaborator]
We're using a private member? Do we own the code with the private member?

[Reply, Member Author]
Yes, we own everything; they are autogen'd, for example for CUstream:

cdef class CUstream:
    """
    CUDA stream

    Methods
    -------
    getPtr()
        Get memory address of class instance
    """
    cdef cydriver.CUstream _pvt_val
    cdef cydriver.CUstream* _pvt_ptr

hence I feel SFINAE over all possibilities is OK. I think if we accidentally change the codegen in the future, the added Cython test should be able to catch it.

// We want to have the C++ layer supporting all past 12.x versions, so some tricks are needed.
// Since there's no std::has_member<T, member_name> so we use SFINAE to create the same effect.

template <typename T,
std::enable_if_t<std::is_pointer_v<decltype(std::remove_pointer_t<T>::_pvt_ptr)>, int> = 0>
inline auto& get_cuda_native_handle(const T& obj) {
return *(obj->_pvt_ptr);
}

template <typename T,
std::enable_if_t<std::is_pointer_v<decltype(std::remove_pointer_t<T>::_ptr)>, int> = 0>
inline auto& get_cuda_native_handle(const T& obj) {
return *(obj->_ptr);
}
2 changes: 1 addition & 1 deletion cuda_core/docs/source/_templates/autosummary/class.rst
@@ -20,7 +20,7 @@
.. rubric:: {{ _('Attributes') }}

{% for item in attributes %}
.. autoattribute:: {{ item }}
.. autoproperty:: {{ item }}
{%- endfor %}
{% endif %}
{% endblock %}