2 changes: 2 additions & 0 deletions .github/workflows/ci_linux.yml
@@ -293,6 +293,8 @@ jobs:
run: cargo --version
- name: Rustfmt compiletests
shell: bash
# Uses rustfmt directly, rather than via `cargo fmt`, because the compiletests .rs files
# are not within a package.
run: shopt -s globstar && rustfmt --check tests/compiletests/ui/**/*.rs
- name: Compiletest
run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_75,compute_90
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions crates/cuda_std/CHANGELOG.md
@@ -7,6 +7,7 @@ Notable changes to this project will be documented in this file.
- Added warp shuffles, matches, reductions, and votes in the `warp` module.
- Added `activemask` in the `warp` module to query a mask of the active threads.
- Fixed `lane_id` generating invalid ptx.
- Removed `shared_array!` due to unsoundness.

## 0.2.2 - 2/7/22

80 changes: 3 additions & 77 deletions crates/cuda_std/src/shared.rs
@@ -1,83 +1,9 @@
//! Static and Dynamic shared memory handling.
//! Dynamic shared memory handling.
//!
//! Static shared memory is done via `#[address_space(shared)] static mut ...;`.

use crate::gpu_only;

/// Statically allocates a buffer large enough for `len` elements of `array_type`,
/// yielding a `*mut array_type` that points to uninitialized shared memory. `len` must
/// be a constant expression.
///
/// Note that this allocates the memory __statically__, it expands to a static in the
/// `shared` address space. Therefore, calling this macro multiple times in a loop will
/// always yield the same data. However, separate invocations of the macro will yield
/// different buffers.
///
/// The data is uninitialized by default, therefore, you must be careful to not read the
/// data before it is written to. The semantics of what "uninitialized" actually means
/// on the GPU (i.e. if it yields unknown data or if it is UB to read it whatsoever) are
/// not well known, so even if the type is valid for any backing memory, make sure to
/// not read uninitialized data.
///
/// # Safety
///
/// Shared memory usage is fundamentally extremely unsafe and impossible to statically
/// prove, therefore the burden of correctness is on the user. Some of the things you
/// must ensure in your usage of shared memory are:
///
/// - Shared memory is only shared across __thread blocks__, not the entire device,
/// therefore it is unsound to try and rely on sharing data across more than one
/// block.
/// - You must write to the shared buffer before reading from it as the data is
/// uninitialized by default.
/// - [`thread::sync_threads`](crate::thread::sync_threads) must be called before
/// relying on the results of other threads, this ensures every thread has reached
/// that point before going on. For example, reading another thread's data after
/// writing to the buffer.
/// - No access may be out of bounds, this usually means making sure the amount of
/// threads and their dimensions are correct.
///
/// It is suggested to run your executable in `cuda-memcheck` to make sure usages of
/// shared memory are right.
///
/// # Examples
///
/// ```no_run
/// # use cuda_std::kernel;
/// # use cuda_std::shared_array;
/// # use cuda_std::thread;
/// ##[kernel]
/// pub unsafe fn reverse_array(d: *mut i32, n: usize) {
/// let s = shared_array![i32; 64];
/// let t = thread::thread_idx_x() as usize;
/// let tr = n - t - 1;
/// *s.add(t) = *d.add(t);
/// thread::sync_threads();
/// *d.add(t) = *s.add(tr);
/// }
/// ```
#[macro_export]
macro_rules! shared_array {
($array_type:ty; $len:expr) => {{
#[$crate::gpu_only]
#[inline(always)]
fn shared_array() -> *mut $array_type {
use ::core::{cell::UnsafeCell, mem::MaybeUninit};
struct SyncWrapper(UnsafeCell<MaybeUninit<[$array_type; $len]>>);
// SAFETY: it is up to the user to verify sound shared memory usage, we cannot
// fundamentally check it for soundness.
unsafe impl Send for SyncWrapper {}
// SAFETY: see above
unsafe impl Sync for SyncWrapper {}

// the initializer is discarded when declaring shared globals, so it is unimportant.
#[$crate::address_space(shared)]
static SHARED: SyncWrapper = SyncWrapper(UnsafeCell::new(MaybeUninit::uninit()));

SHARED.0.get() as *mut $array_type
}
shared_array()
}};
}

/// Gets a pointer to the dynamic shared memory that was allocated by the caller of the kernel. The
/// data is left uninitialized.
///
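The part of `shared.rs` that survives this diff documents the dynamic shared memory getter, but the hunk is cut off before any usage appears. As a rough illustration only (not part of the PR), the sketch below assumes the getter is `shared::dynamic_shared_mem::<T>() -> *mut T`, that `cuda_std::thread` exposes `block_dim_x()`/`block_idx_x()`, and that the host launches the kernel with one block-sized `f32` buffer of dynamic shared memory.

```rust
use cuda_std::{kernel, shared, thread};

#[kernel]
pub unsafe fn block_sum_dynamic(input: *const f32, out: *mut f32) {
    // Getter name assumed from the doc comment above; the diff hunk is
    // truncated, so check `cuda_std::shared` for the exact item.
    let buf: *mut f32 = shared::dynamic_shared_mem();

    let tid = thread::thread_idx_x() as usize;
    let dim = thread::block_dim_x() as usize;
    let bid = thread::block_idx_x() as usize;

    // Dynamic shared memory is uninitialized: write before reading.
    *buf.add(tid) = *input.add(bid * dim + tid);
    thread::sync_threads();

    // One thread per block folds the shared buffer into a partial sum.
    if tid == 0 {
        let mut sum = 0.0f32;
        for i in 0..dim {
            sum += *buf.add(i);
        }
        *out.add(bid) = sum;
    }
}
```

The write-before-read and `sync_threads` rules from the removed `shared_array!` docs still apply here, since the caller-provided buffer also arrives uninitialized.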
55 changes: 52 additions & 3 deletions crates/cuda_std_macros/src/lib.rs
@@ -205,12 +205,61 @@ pub fn externally_visible(
}

/// Notifies the codegen to put a `static`/`static mut` inside of a specific memory address space.
/// This is mostly for internal use and/or advanced users, as the codegen and `cuda_std` handle address space placement
/// implicitly. **Improper use of this macro could yield weird or undefined behavior**.
/// This is mostly for internal use and/or advanced users, as the codegen and `cuda_std` handle
/// address space placement implicitly. **Improper use of this macro could yield weird or undefined
/// behavior**.
///
/// This macro takes a single argument which can either be `global`, `shared`, `constant`, or `local`.
/// This macro takes a single argument which can either be `global`, `shared`, `constant`, or
/// `local`.
///
/// This macro does nothing on the CPU.
///
/// # Shared memory
///
/// The item `#[address_space(shared)] static mut FOO: [MaybeUninit<T>; N];` statically allocates a
/// buffer large enough for `N` elements of type `T`, yielding an uninitialized array in shared
/// memory.
///
/// Note that this allocates the memory __statically__: the item is a `static` in the `shared`
/// address space. Reaching the declaration repeatedly, e.g. inside a loop, therefore always yields
/// the same buffer, while separate `static` declarations yield different buffers.
///
/// Because the data is uninitialized by default, the type within the array must be `MaybeUninit`,
/// and uses must follow the usual rules of `MaybeUninit`, such as using `write`/`assume_init`.
/// Using a non-`MaybeUninit` type is undefined behaviour.
///
/// # Safety
///
/// Shared memory usage is fundamentally unsafe and much of the burden of correctness is on the
/// user. For example:
/// - Shared memory is only shared across __thread blocks__, not the entire device, therefore it is
/// unsound to rely on sharing data across more than one block.
/// - You must write to the shared buffer before reading from it as the data is uninitialized by
/// default.
/// - `cuda_std::thread::sync_threads` must be called before relying on the results of other
/// threads. This ensures every thread has reached that point before going on. For example, when
/// reading another thread's data after writing to the buffer.
///
/// It is suggested to run your executable under `cuda-memcheck` to check that your shared memory
/// usage is correct.
///
/// # Examples
///
/// ```ignore
/// use core::mem::MaybeUninit;
/// use cuda_std::*;
///
/// ##[kernel]
/// pub unsafe fn reverse_array(d: *mut u32, n: usize) {
/// ##[address_space(shared)]
/// static mut S: [MaybeUninit<u32>; 64] = [const { MaybeUninit::uninit() }; 64];
/// let i = thread::thread_idx_x() as usize;
/// let ir = n - i - 1;
/// unsafe { S[i].write(*d.add(i)); };
/// thread::sync_threads();
/// unsafe { *d.add(i) = S[ir].assume_init(); }
/// }
/// ```
#[proc_macro_attribute]
pub fn address_space(attr: proc_macro::TokenStream, item: proc_macro::TokenStream) -> TokenStream {
let mut global = syn::parse_macro_input!(item as syn::ItemStatic);
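To make the write-then-`sync_threads` rules in the new `address_space` docs concrete beyond the single-pass `reverse_array` example, here is a minimal block-wide tree reduction sketch (not part of this PR). The kernel name, the fixed block size of 128, and `thread::block_idx_x()` are assumptions; `#[address_space(shared)]`, `thread::thread_idx_x()`, and `thread::sync_threads()` are taken from the diff.

```rust
use core::mem::MaybeUninit;
use cuda_std::{address_space, kernel, thread};

const BLOCK: usize = 128;

#[kernel]
pub unsafe fn block_sum(input: *const f32, partial: *mut f32) {
    // One statically allocated shared buffer per thread block.
    #[address_space(shared)]
    static mut SCRATCH: [MaybeUninit<f32>; BLOCK] = [MaybeUninit::uninit(); BLOCK];

    let tid = thread::thread_idx_x() as usize;
    let bid = thread::block_idx_x() as usize;

    // Write before reading: the buffer starts uninitialized.
    SCRATCH[tid].write(*input.add(bid * BLOCK + tid));
    // Sync before relying on other threads' writes.
    thread::sync_threads();

    // Tree reduction: halve the active threads each step, syncing after
    // every step so no slot is read before its producer has written it.
    let mut stride = BLOCK / 2;
    while stride > 0 {
        if tid < stride {
            let sum = SCRATCH[tid].assume_init() + SCRATCH[tid + stride].assume_init();
            SCRATCH[tid].write(sum);
        }
        thread::sync_threads();
        stride /= 2;
    }

    // Shared memory is per block, so each block publishes its own result.
    if tid == 0 {
        *partial.add(bid) = SCRATCH[0].assume_init();
    }
}
```

The `sync_threads` inside the loop is the point of the sketch: every round reads slots written by a different thread in the previous round, so the barrier is what makes the read sound.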
22 changes: 13 additions & 9 deletions tests/compiletests/ui/shared/shared_memory.rs
@@ -1,33 +1,37 @@
// Test CUDA shared memory allocations compile correctly
// build-pass
//
// FIXME: The default of `-Cdebuginfo=2` causes a seg fault, for unclear reasons
// compile-flags: -Cdebuginfo=1

use cuda_std::kernel;
use cuda_std::{shared_array, thread};
use core::mem::MaybeUninit;
use cuda_std::{address_space, kernel, thread};

#[kernel]
pub unsafe fn test_static_shared_memory() {
// Allocate static shared memory for 256 i32 values
let shared_data = shared_array![i32; 256];
#[address_space(shared)]
static mut SHARED_DATA: [MaybeUninit<i32>; 256] = [MaybeUninit::uninit(); 256];

let tid = thread::thread_idx_x() as usize;

// Write to shared memory
*shared_data.add(tid) = tid as i32;
SHARED_DATA[tid].write(tid as i32);

// Synchronize threads before reading
thread::sync_threads();

// Read from shared memory
let _value = *shared_data.add(tid);
let _value = SHARED_DATA[tid].assume_init();
}

#[kernel]
pub unsafe fn test_different_types() {
// Test different array types
let _shared_u32 = shared_array![u32; 128];
let _shared_f32 = shared_array![f32; 64];
let _shared_u8 = shared_array![u8; 512];
static mut _SHARED_U32: [MaybeUninit<u32>; 128] = [MaybeUninit::uninit(); 128];
static mut _SHARED_F32: [MaybeUninit<f32>; 64] = [MaybeUninit::uninit(); 64];
static mut _SHARED_U8: [MaybeUninit<u8>; 512] = [MaybeUninit::uninit(); 512];

// Test arrays of arrays
let _shared_vec3 = shared_array![[f32; 3]; 32];
static mut _SHARED_VEC3: [MaybeUninit<[f32; 3]>; 32] = [MaybeUninit::uninit(); 32];
}