NVIDIA · bernhardmgruber · Jul 22, 2025 · Jul 14, 2025 · Jul 14, 2025 · Jul 14, 2025
@@ -21,7 +21,7 @@
 #include <thrust/system/cuda/detail/core/util.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
-#include <cuda/__barrier/aligned_size.h> // cannot include <cuda/barrier> directly on CUDA_ARCH < 700
+#include <cuda/__memory/aligned_size.h>
 #include <cuda/cmath>
 #include <cuda/ptx>
 #include <cuda/std/bit>

@@ -10,7 +10,6 @@ Extended API
    extended_api/execution_model
    extended_api/memory_model
    extended_api/thread_groups
-   extended_api/shapes
    extended_api/synchronization_primitives
    extended_api/asynchronous_operations
    extended_api/memory_access_properties

@@ -91,7 +91,7 @@ namely:
      the behavior is undefined.
    - If the objects are not of `TriviallyCopyable <https://en.cppreference.com/w/cpp/named_req/TriviallyCopyable>`_
      type the program is ill-formed, no diagnostic required.
-   - If *Shape* is :ref:`cuda::aligned_size_t <libcudacxx-extended-api-memory-access-shapes-aligned-size>`, ``source``
+   - If *Shape* is :ref:`cuda::aligned_size_t <libcudacxx-extended-api-memory-aligned-size>`, ``source``
      and ``destination`` are both required to be aligned on ``cuda::aligned_size_t::align``, else the behavior is
      undefined.
    - If ``cuda::pipeline`` is in a *quitted state*
@@ -111,7 +111,7 @@ Template Parameters
      - A type satisfying the [*Group*] concept.
    * - ``Shape``
      - Either `cuda::std::size_t <https://en.cppreference.com/w/c/types/size_t>`_
-       or :ref:`cuda::aligned_size_t <libcudacxx-extended-api-memory-access-shapes-aligned-size>`.
+       or :ref:`cuda::aligned_size_t <libcudacxx-extended-api-memory-aligned-size>`.
 
 Parameters
 ----------

@@ -8,8 +8,8 @@ Functional
    :maxdepth: 1
 
    functional/proclaim_return_type
-   functional/get_device_address
    functional/maximum_minimum
+   memory/get_device_address
 
 .. list-table::
    :widths: 25 45 30 30
@@ -40,7 +40,7 @@ Functional
      - CCCL 2.8.0
      - CUDA 12.9
 
-   * - :ref:`cuda::get_device_address <libcudacxx-extended-api-functional-get-device-address>`
+   * - :ref:`cuda::get_device_address <libcudacxx-extended-api-memory-get-device-address>`
      - Returns a valid address to a device object
      - CCCL 2.8.0
      - CUDA 12.9
@@ -7,9 +7,13 @@ Memory
    :hidden:
    :maxdepth: 1
 
-   memory/is_aligned
-   memory/align_up
    memory/align_down
+   memory/align_up
+   memory/aligned_size
+   memory/discard_memory
+   memory/get_device_address
+   memory/is_address_from
+   memory/is_aligned
    memory/ptr_rebind
 
 .. list-table::
@@ -21,6 +25,26 @@ Memory
      - **CCCL Availability**
      - **CUDA Toolkit Availability**
 
+   * - :ref:`aligned_size_t <libcudacxx-extended-api-memory-aligned-size>`
+     - Defines an extent of bytes with a statically defined alignment.
+     - libcu++ 1.2.0 / CCCL 2.0.0 (in ``<cuda/memory>`` since CCCL 3.1.0)
+     - CUDA 11.1
+
+   * - :ref:`discard_memory <libcudacxx-extended-api-memory-discard-memory>`
+     - Writes indeterminate values to memory
+     - libcu++ 1.6.0 / CCCL 2.0.0 (in ``<cuda/memory>`` since CCCL 3.1.0)
+     - CUDA 11.5
+
+   * - :ref:`get_device_address <libcudacxx-extended-api-memory-get-device-address>`
+     - Returns a valid address to a device object
+     - CCCL 2.8.0 (in ``<cuda/memory>`` since CCCL 3.1.0)
+     - CUDA 12.9
+
+   * - :ref:`is_address_from <libcudacxx-extended-api-memory-is_address_from>`
+     - Check if a pointer is from a specific address space
+     - CCCL 3.0.0
+     - CUDA 13.0
+
    * - :ref:`is_aligned <libcudacxx-extended-api-memory-is_aligned>`
      - Check if a pointer is aligned
      - CCCL 3.1.0

@@ -1,9 +1,9 @@
-.. _libcudacxx-extended-api-memory-access-shapes-aligned-size:
+.. _libcudacxx-extended-api-memory-aligned-size:
 
 ``cuda::aligned_size_t``
 ========================
 
-Defined in headers ``<cuda/barrier>`` and ``<cuda/pipeline>``:
+Defined in headers ``<cuda/memory>``, ``<cuda/barrier>`` and ``<cuda/pipeline>``:
 
 .. code:: cuda
 
@@ -72,7 +72,7 @@ Example
 
 .. code:: cuda
 
-   #include <cuda/barrier>
+   #include <cuda/memory>
 
    __global__ void example_kernel(void* dst, void* src, size_t size) {
      cuda::barrier<cuda::thread_scope_system> bar;

@@ -1,9 +1,9 @@
-.. _libcudacxx-extended-api-memory-access-properties-discard-memory:
+.. _libcudacxx-extended-api-memory-discard-memory:
 
 ``cuda::discard_memory``
 ========================
 
-Defined in header ``<cuda/discard_memory>``.
+Defined in header ``<cuda/memory>``, ``<cuda/discard_memory>``.
 
 .. code:: cuda
 
@@ -24,7 +24,7 @@ This kernel needs a scratch pad that does not fit in shared memory, so it uses a
 
 .. code:: cuda
 
-   #include <cuda/discard_memory>
+   #include <cuda/memory>
 
     __device__ int compute(int* scratch, size_t N);
 

@@ -1,9 +1,9 @@
-.. _libcudacxx-extended-api-functional-get-device-address:
+.. _libcudacxx-extended-api-memory-get-device-address:
 
 ``cuda::get_device_address``
 ============================
 
-Defined in the header ``<cuda/functional>``:
+Defined in the headers ``<cuda/memory>`` and ``<cuda/functional>``:
 
 ``cuda::get_device_address`` returns a valid pointer to a device object.
 It replaces uses of ``cudaGetSymbolAddress``, which requires an inout parameter.
@@ -13,7 +13,7 @@ Example
 
 .. code:: cuda
 
-  #include <cuda/functional>
+  #include <cuda/memory>
 
   __device__ int device_object[] = {42, 1337, -1, 0};
 

@@ -0,0 +1,70 @@
+.. _libcudacxx-extended-api-memory-is_address_from:
+
+``cuda::device::is_address_from``
+=================================
+
+.. code:: cuda
+
+   enum class address_space
+   {
+     global,
+     shared,
+     constant,
+     local,
+     grid_constant,
+     cluster_shared,
+   };
+
+   template <typename T>
+   [[nodiscard]] __device__ inline
+   bool is_address_from(address_space space, const void* ptr)
+
+The function checks if a pointer ``ptr`` with a generic address is from a ``space`` address state space.
+
+**Parameters**
+
+- ``space``: The address space.
+- ``ptr``: The pointer.
+
+**Return value**
+
+- ``true`` if the pointer is from the specified address space, ``false`` otherwise.
+
+**Performance considerations**
+
+- If possible, the ``__isGlobal``, ``__isShared``, ``__isConstant``, ``__isLocal``, ``__isGridConstant``, or ``__isClusterShared`` built-in functions are used to determine the address space.
+
+Example
+-------
+
+.. code:: cuda
+
+    #include <cuda/memory>
+
+    struct MutableStruct
+    {
+        mutable int v;
+    };
+
+    __device__ int global_var;
+    __constant__ int constant_var;
+
+    __global__ void kernel(const __grid_constant__ MutableStruct grid_constant_var)
+    {
+        __shared__ int shared_var;
+        int local_var{};
+
+        assert(cuda::device::is_address_from(cuda::device::address_space::global, &global_var));
+        assert(cuda::device::is_address_from(cuda::device::address_space::shared, &shared_var));
+        assert(cuda::device::is_address_from(cuda::device::address_space::constant, &constant_var));
+        assert(cuda::device::is_address_from(cuda::device::address_space::local, &local_var));
+        assert(cuda::device::is_address_from(cuda::device::address_space::grid_constant, &grid_constant_var));
+    }
+
+    int main(int, char**)
+    {
+        kernel<<<1, 1>>>(MutableStruct{42});
+        cudaDeviceSynchronize();
+    }
+
+`See it on Godbolt 🔗 <https://godbolt.org/z/r1qb31szz>`_
@@ -11,7 +11,6 @@ Memory access properties
    memory_access_properties/annotated_ptr
    memory_access_properties/apply_access_property
    memory_access_properties/associate_access_property
-   memory_access_properties/discard_memory
 
 .. list-table::
    :widths: 25 45 30 30
@@ -40,8 +39,3 @@ Memory access properties
      - Associates access property with raw pointer
      - libcu++ 1.6.0 / CCCL 2.0.0
      - CUDA 11.5
-
-   * - :ref:`cuda::discard_memory <libcudacxx-extended-api-memory-access-properties-discard-memory>`
-     - Writes indeterminate values to memory
-     - libcu++ 1.6.0 / CCCL 2.0.0
-     - CUDA 11.5
@@ -19,7 +19,7 @@ Prefetch memory in the L2 cache starting at ``ptr`` applying a residence control
 
 **Constraints**
 
-- :ref:`ShapeT <libcudacxx-extended-api-memory-access-shapes>` is either ``size_t`` or :ref:`cuda::aligned_size_t <libcudacxx-extended-api-memory-access-shapes-aligned-size>`.
+- ``ShapeT`` is either ``size_t`` or :ref:`cuda::aligned_size_t <libcudacxx-extended-api-memory-aligned-size>`.
 - Two properties are supported:
 
     -  :ref:`cuda::access_property::persisting <libcudacxx-extended-api-memory-access-properties-access-property-persisting>`

@@ -21,7 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/__barrier/aligned_size.h>
+#include <cuda/__memory/aligned_size.h>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__cstddef/types.h>
 #include <cuda/std/cstdint>

@@ -24,13 +24,13 @@
 
 #if _CCCL_HAS_CUDA_COMPILER()
 
-#  include <cuda/__barrier/aligned_size.h>
 #  include <cuda/__barrier/async_contract_fulfillment.h>
 #  include <cuda/__barrier/barrier.h>
 #  include <cuda/__barrier/barrier_block_scope.h>
 #  include <cuda/__barrier/barrier_thread_scope.h>
 #  include <cuda/__memcpy_async/check_preconditions.h>
 #  include <cuda/__memcpy_async/memcpy_async_barrier.h>
+#  include <cuda/__memory/aligned_size.h>
 #  include <cuda/std/__atomic/scopes.h>
 #  include <cuda/std/__type_traits/void_t.h>
 #  include <cuda/std/cstddef>

@@ -22,7 +22,6 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/__barrier/aligned_size.h>
 #include <cuda/__barrier/barrier.h>
 #include <cuda/__barrier/barrier_block_scope.h>
 #include <cuda/__barrier/barrier_thread_scope.h>
@@ -31,6 +30,7 @@
 #include <cuda/__memcpy_async/is_local_smem_barrier.h>
 #include <cuda/__memcpy_async/memcpy_completion.h>
 #include <cuda/__memcpy_async/try_get_barrier_handle.h>
+#include <cuda/__memory/aligned_size.h>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__atomic/scopes.h>
 #include <cuda/std/__type_traits/is_trivially_copyable.h>

@@ -25,11 +25,11 @@
 #if _CCCL_CUDA_COMPILATION()
 #  if __cccl_ptx_isa >= 800
 
-#    include <cuda/__barrier/aligned_size.h>
 #    include <cuda/__barrier/async_contract_fulfillment.h>
 #    include <cuda/__barrier/barrier_block_scope.h>
 #    include <cuda/__barrier/barrier_native_handle.h>
 #    include <cuda/__memcpy_async/check_preconditions.h>
+#    include <cuda/__memory/aligned_size.h>
 #    include <cuda/__ptx/instructions/cp_async_bulk.h>
 #    include <cuda/__ptx/ptx_dot_variants.h>
 #    include <cuda/__ptx/ptx_helper_functions.h>

@@ -8,8 +8,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _CUDA___BARRIER_ALIGNED_SIZE_H
-#define _CUDA___BARRIER_ALIGNED_SIZE_H
+#ifndef _CUDA___MEMORY_ALIGNED_SIZE_H
+#define _CUDA___MEMORY_ALIGNED_SIZE_H
 
 #include <cuda/std/detail/__config>
 
@@ -58,4 +58,4 @@ _LIBCUDACXX_END_NAMESPACE_CUDA
 
 #include <cuda/std/__cccl/epilogue.h>
 
-#endif // _CUDA___BARRIER_ALIGNED_SIZE_H
+#endif // _CUDA___MEMORY_ALIGNED_SIZE_H