2 changes: 2 additions & 0 deletions .github/workflows/ci_linux.yml
@@ -293,6 +293,8 @@ jobs:
run: cargo --version
- name: Rustfmt compiletests
shell: bash
# Uses rustfmt directly, rather than via `cargo fmt`, because the compiletests .rs files
# are not within a package.
run: shopt -s globstar && rustfmt --check tests/compiletests/ui/**/*.rs
- name: Compiletest
run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_75,compute_90
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions crates/cuda_std/CHANGELOG.md
@@ -7,6 +7,7 @@ Notable changes to this project will be documented in this file.
- Added warp shuffles, matches, reductions, and votes in the `warp` module.
- Added `activemask` in the `warp` module to query a mask of the active threads.
- Fixed `lane_id` generating invalid ptx.
- Removed `shared_array!` due to unsoundness.

## 0.2.2 - 2/7/22

80 changes: 3 additions & 77 deletions crates/cuda_std/src/shared.rs
@@ -1,83 +1,9 @@
//! Static and Dynamic shared memory handling.
//! Dynamic shared memory handling.
//!
//! Static shared memory is done via `#[address_space(shared)] static mut ...;`.

use crate::gpu_only;

/// Statically allocates a buffer large enough for `len` elements of `array_type`,
/// yielding a `*mut array_type` that points to uninitialized shared memory. `len` must
/// be a constant expression.
///
/// Note that this allocates the memory __statically__, it expands to a static in the
/// `shared` address space. Therefore, calling this macro multiple times in a loop will
/// always yield the same data. However, separate invocations of the macro will yield
/// different buffers.
///
/// The data is uninitialized by default, therefore, you must be careful to not read the
/// data before it is written to. The semantics of what "uninitialized" actually means
/// on the GPU (i.e. if it yields unknown data or if it is UB to read it whatsoever) are
/// not well known, so even if the type is valid for any backing memory, make sure to
/// not read uninitialized data.
///
/// # Safety
///
/// Shared memory usage is fundamentally extremely unsafe and impossible to statically
/// prove, therefore the burden of correctness is on the user. Some of the things you
/// must ensure in your usage of shared memory are:
///
/// - Shared memory is only shared across __thread blocks__, not the entire device,
/// therefore it is unsound to try and rely on sharing data across more than one
/// block.
/// - You must write to the shared buffer before reading from it as the data is
/// uninitialized by default.
/// - [`thread::sync_threads`](crate::thread::sync_threads) must be called before
/// relying on the results of other threads, this ensures every thread has reached
/// that point before going on. For example, reading another thread's data after
/// writing to the buffer.
/// - No access may be out of bounds, this usually means making sure the amount of
/// threads and their dimensions are correct.
///
/// It is suggested to run your executable in `cuda-memcheck` to make sure usages of
/// shared memory are right.
///
/// # Examples
///
/// ```no_run
/// # use cuda_std::kernel;
/// # use cuda_std::shared_array;
/// # use cuda_std::thread;
/// ##[kernel]
/// pub unsafe fn reverse_array(d: *mut i32, n: usize) {
/// let s = shared_array![i32; 64];
/// let t = thread::thread_idx_x() as usize;
/// let tr = n - t - 1;
/// *s.add(t) = *d.add(t);
/// thread::sync_threads();
/// *d.add(t) = *s.add(tr);
/// }
/// ```
#[macro_export]
macro_rules! shared_array {
($array_type:ty; $len:expr) => {{
#[$crate::gpu_only]
#[inline(always)]
fn shared_array() -> *mut $array_type {
use ::core::{cell::UnsafeCell, mem::MaybeUninit};
struct SyncWrapper(UnsafeCell<MaybeUninit<[$array_type; $len]>>);
// SAFETY: it is up to the user to verify sound shared memory usage, we cannot
// fundamentally check it for soundness.
unsafe impl Send for SyncWrapper {}
// SAFETY: see above
unsafe impl Sync for SyncWrapper {}

// the initializer is discarded when declaring shared globals, so it is unimportant.
#[$crate::address_space(shared)]
static SHARED: SyncWrapper = SyncWrapper(UnsafeCell::new(MaybeUninit::uninit()));

SHARED.0.get() as *mut $array_type
}
shared_array()
}};
}

/// Gets a pointer to the dynamic shared memory that was allocated by the caller of the kernel. The
/// data is left uninitialized.
///
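The part of `shared.rs` that survives this diff documents the dynamic shared memory getter, but the hunk is cut off before any usage appears. As a rough illustration only (not part of the PR), the sketch below assumes the getter is `shared::dynamic_shared_mem::<T>() -> *mut T`, that `cuda_std::thread` exposes `block_dim_x()`/`block_idx_x()`, and that the host launches the kernel with one block-sized `f32` buffer of dynamic shared memory.

```rust
use cuda_std::{kernel, shared, thread};

#[kernel]
pub unsafe fn block_sum_dynamic(input: *const f32, out: *mut f32) {
    // Getter name assumed from the doc comment above; the diff hunk is
    // truncated, so check `cuda_std::shared` for the exact item.
    let buf: *mut f32 = shared::dynamic_shared_mem();

    let tid = thread::thread_idx_x() as usize;
    let dim = thread::block_dim_x() as usize;
    let bid = thread::block_idx_x() as usize;

    // Dynamic shared memory is uninitialized: write before reading.
    *buf.add(tid) = *input.add(bid * dim + tid);
    thread::sync_threads();

    // One thread per block folds the shared buffer into a partial sum.
    if tid == 0 {
        let mut sum = 0.0f32;
        for i in 0..dim {
            sum += *buf.add(i);
        }
        *out.add(bid) = sum;
    }
}
```

The write-before-read and `sync_threads` rules from the removed `shared_array!` docs still apply here, since the caller-provided buffer also arrives uninitialized.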
55 changes: 52 additions & 3 deletions crates/cuda_std_macros/src/lib.rs
@@ -205,12 +205,61 @@ pub fn externally_visible(
}

/// Notifies the codegen to put a `static`/`static mut` inside of a specific memory address space.
/// This is mostly for internal use and/or advanced users, as the codegen and `cuda_std` handle address space placement
/// implicitly. **Improper use of this macro could yield weird or undefined behavior**.
/// This is mostly for internal use and/or advanced users, as the codegen and `cuda_std` handle
/// address space placement implicitly. **Improper use of this macro could yield weird or undefined
/// behavior**.
///
/// This macro takes a single argument which can either be `global`, `shared`, `constant`, or `local`.
/// This macro takes a single argument which can either be `global`, `shared`, `constant`, or
/// `local`.
///
/// This macro does nothing on the CPU.
///
/// # Shared memory
///
/// The item `#[address_space(shared)] static mut FOO: [MaybeUninit<T>; N];` statically allocates a
/// buffer large enough for `N` elements of type `T`, yielding an uninitialized array in shared
/// memory.
///
/// Note that this allocates the memory __statically__: the item is a `static` in the `shared`
/// address space. Reaching the declaration repeatedly, e.g. inside a loop, therefore always yields
/// the same buffer, while separate `static` declarations yield different buffers.
///
/// Because the data is uninitialized by default, the type within the array must be `MaybeUninit`,
/// and uses must follow the usual rules of `MaybeUninit`, such as using `write`/`assume_init`.
/// Using a non-`MaybeUninit` type is undefined behaviour.
///
/// # Safety
///
/// Shared memory usage is fundamentally unsafe and much of the burden of correctness is on the
/// user. For example:
/// - Shared memory is only shared across __thread blocks__, not the entire device, therefore it is
/// unsound to rely on sharing data across more than one block.
/// - You must write to the shared buffer before reading from it as the data is uninitialized by
/// default.
/// - `cuda_std::thread::sync_threads` must be called before relying on the results of other
/// threads. This ensures every thread has reached that point before going on. For example, when
/// reading another thread's data after writing to the buffer.
///
/// It is suggested to run your executable under `cuda-memcheck` to check that your shared memory
/// usage is correct.
///
/// # Examples
///
/// ```ignore
/// use core::mem::MaybeUninit;
/// use cuda_std::*;
///
/// ##[kernel]
/// pub unsafe fn reverse_array(d: *mut u32, n: usize) {
/// ##[address_space(shared)]
/// static mut S: [MaybeUninit<u32>; 64] = [const { MaybeUninit::uninit() }; 64];
/// let i = thread::thread_idx_x() as usize;
/// let ir = n - i - 1;
/// unsafe { S[i].write(*d.add(i)); };
/// thread::sync_threads();
/// unsafe { *d.add(i) = S[ir].assume_init(); }
/// }
/// ```
#[proc_macro_attribute]
pub fn address_space(attr: proc_macro::TokenStream, item: proc_macro::TokenStream) -> TokenStream {
let mut global = syn::parse_macro_input!(item as syn::ItemStatic);
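To make the write-then-`sync_threads` rules in the new `address_space` docs concrete beyond the single-pass `reverse_array` example, here is a minimal block-wide tree reduction sketch (not part of this PR). The kernel name, the fixed block size of 128, and `thread::block_idx_x()` are assumptions; `#[address_space(shared)]`, `thread::thread_idx_x()`, and `thread::sync_threads()` are taken from the diff.

```rust
use core::mem::MaybeUninit;
use cuda_std::{address_space, kernel, thread};

const BLOCK: usize = 128;

#[kernel]
pub unsafe fn block_sum(input: *const f32, partial: *mut f32) {
    // One statically allocated shared buffer per thread block.
    #[address_space(shared)]
    static mut SCRATCH: [MaybeUninit<f32>; BLOCK] = [MaybeUninit::uninit(); BLOCK];

    let tid = thread::thread_idx_x() as usize;
    let bid = thread::block_idx_x() as usize;

    // Write before reading: the buffer starts uninitialized.
    SCRATCH[tid].write(*input.add(bid * BLOCK + tid));
    // Sync before relying on other threads' writes.
    thread::sync_threads();

    // Tree reduction: halve the active threads each step, syncing after
    // every step so no slot is read before its producer has written it.
    let mut stride = BLOCK / 2;
    while stride > 0 {
        if tid < stride {
            let sum = SCRATCH[tid].assume_init() + SCRATCH[tid + stride].assume_init();
            SCRATCH[tid].write(sum);
        }
        thread::sync_threads();
        stride /= 2;
    }

    // Shared memory is per block, so each block publishes its own result.
    if tid == 0 {
        *partial.add(bid) = SCRATCH[0].assume_init();
    }
}
```

The `sync_threads` inside the loop is the point of the sketch: every round reads slots written by a different thread in the previous round, so the barrier is what makes the read sound.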
22 changes: 13 additions & 9 deletions tests/compiletests/ui/shared/shared_memory.rs
@@ -1,33 +1,37 @@
// Test CUDA shared memory allocations compile correctly
// build-pass
//
// FIXME: The default of `-Cdebuginfo=2` causes a seg fault, for unclear reasons
// compile-flags: -Cdebuginfo=1

use cuda_std::kernel;
use cuda_std::{shared_array, thread};
use core::mem::MaybeUninit;
use cuda_std::{address_space, kernel, thread};

#[kernel]
pub unsafe fn test_static_shared_memory() {
// Allocate static shared memory for 256 i32 values
let shared_data = shared_array![i32; 256];
#[address_space(shared)]
static mut SHARED_DATA: [MaybeUninit<i32>; 256] = [MaybeUninit::uninit(); 256];

let tid = thread::thread_idx_x() as usize;

// Write to shared memory
*shared_data.add(tid) = tid as i32;
SHARED_DATA[tid].write(tid as i32);

// Synchronize threads before reading
thread::sync_threads();

// Read from shared memory
let _value = *shared_data.add(tid);
let _value = SHARED_DATA[tid].assume_init();
}

#[kernel]
pub unsafe fn test_different_types() {
// Test different array types
let _shared_u32 = shared_array![u32; 128];
let _shared_f32 = shared_array![f32; 64];
let _shared_u8 = shared_array![u8; 512];
static mut _SHARED_U32: [MaybeUninit<u32>; 128] = [MaybeUninit::uninit(); 128];
static mut _SHARED_F32: [MaybeUninit<f32>; 64] = [MaybeUninit::uninit(); 64];
static mut _SHARED_U8: [MaybeUninit<u8>; 512] = [MaybeUninit::uninit(); 512];

// Test arrays of arrays
let _shared_vec3 = shared_array![[f32; 3]; 32];
static mut _SHARED_VEC3: [MaybeUninit<[f32; 3]>; 32] = [MaybeUninit::uninit(); 32];
}