diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 57b6377f7..fcf0fd63c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,6 +23,16 @@ jobs:
         rust: [nightly]
 
     steps:
+      - name: Install CUDA
+        run: |
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
+          sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
+          curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb
+          sudo dpkg -i cuda-keyring_1.0-1_all.deb
+          sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
+          sudo apt-get update -q
+          sudo apt-get install cuda -y --no-install-recommends
+
       - name: Checkout the Repository
         uses: actions/checkout@v2
 
@@ -40,53 +50,26 @@ jobs:
           sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+")
           rm llvm.sh
           cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force
+
+      - name: Install cargo-hack
+        uses: taiki-e/install-action@cargo-hack
 
-      - name: Check without features on CPU
-        run: |
-          cargo check
-
-      - name: Check with alloc feature on CPU
-        run: |
-          cargo check \
-            --features alloc
-
-      - name: Check with derive feature on CPU
-        run: |
-          cargo check \
-            --features derive
-
-      - name: Check with host feature on CPU
-        run: |
-          cargo check \
-            --features host
-
-      - name: Check with host,derive,alloc features on CPU
+      - name: Check feature powerset on the CPU
         run: |
-          cargo check \
-            --features host,derive,alloc
+          cargo hack check --feature-powerset --optional-deps \
+            --skip device \
+            --keep-going
 
-      - name: Check without features on CUDA
+      - name: Check feature powerset on CUDA
         run: |
-          cargo check \
+          cargo hack check --feature-powerset --optional-deps \
+            --skip host \
+            --keep-going \
             --target nvptx64-nvidia-cuda
 
-      - name: Check with alloc feature on CUDA
-        run: |
-          cargo check \
-            --target nvptx64-nvidia-cuda \
-            --features alloc
-
-      - name: Check with derive feature on CUDA
-        run: |
-          cargo check \
-            --target nvptx64-nvidia-cuda \
-            --features derive
-
       - name: Check all workspace targets
         run: |
-          cargo check \
-            --workspace \
-            --all-targets
+          cargo check --workspace --all-targets
 
   test:
     name: Test Suite
@@ -157,6 +140,16 @@ jobs:
         rust: [nightly]
 
     steps:
+      - name: Install CUDA
+        run: |
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
+          sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
+          curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb
+          sudo dpkg -i cuda-keyring_1.0-1_all.deb
+          sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
+          sudo apt-get update -q
+          sudo apt-get install cuda -y --no-install-recommends
+
       - name: Checkout the Repository
         uses: actions/checkout@v2
 
@@ -176,58 +169,24 @@ jobs:
           sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+")
           rm llvm.sh
           cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force
 
-      - name: Check the code style without features on CPU
-        run: |
-          cargo clippy \
-            -- -D warnings
-
-      - name: Check the code style with alloc feature on CPU
-        run: |
-          cargo clippy \
-            --features alloc \
-            -- -D warnings
-
-      - name: Check the code style with derive feature on CPU
-        run: |
-          cargo clippy \
-            --features derive \
-            -- -D warnings
-
-      - name: Check the code style with host feature on CPU
-        run: |
-          cargo clippy \
-            --features host \
-            -- -D warnings
-
-      - name: Check the code style with host,derive,alloc features on CPU
-        run: |
-          cargo clippy \
-            --features host,derive,alloc \
-            -- -D warnings
-
-      - name: Check the code style without features on CUDA
-        run: |
-          cargo clippy \
-            --target nvptx64-nvidia-cuda \
-            -- -D warnings
+      - name: Install cargo-hack
+        uses: taiki-e/install-action@cargo-hack
 
-      - name: Check the code style with alloc feature on CUDA
+      - name: Check feature powerset on the CPU
         run: |
-          cargo clippy \
-            --target nvptx64-nvidia-cuda \
-            --features alloc \
+          cargo hack clippy --feature-powerset --optional-deps \
+            --skip device \
+            --keep-going \
             -- -D warnings
-
-      - name: Check the code style with derive feature on CUDA
+
+      - name: Check feature powerset on CUDA
         run: |
-          cargo clippy \
+          cargo hack clippy --feature-powerset --optional-deps \
+            --skip host \
+            --keep-going \
             --target nvptx64-nvidia-cuda \
-            --features derive \
             -- -D warnings
 
-      - name: Check the code style for all workspace targets
+      - name: Check all workspace targets
         run: |
-          cargo clippy \
-            --workspace \
-            --all-targets \
-            -- -D warnings
+          cargo clippy --workspace --all-targets -- -D warnings
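A note on the new check matrix: `cargo hack --feature-powerset` enumerates every feature combination, and the `--skip device` / `--skip host` flags mirror the fact that the two features belong to disjoint builds (CPU vs. the `nvptx64-nvidia-cuda` target). As a hedged sketch of how such an invariant can be made explicit in code (a hypothetical guard, not necessarily present in rust-cuda):

```rust
// Hypothetical compile-time guard: if a feature-powerset run (or a user)
// ever enables both sides at once, fail the build with a clear message.
#[cfg(all(feature = "host", feature = "device"))]
compile_error!("the `host` and `device` features are mutually exclusive");
```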
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 904e1a65c..c54f606d5 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -59,8 +59,8 @@ jobs:
           ./grcov . -s . --binary-path ./target/debug/deps \
             -t lcov -o coverage.lcov --branch \
             --keep-only "src/*" \
-            --keep-only "rust-cuda-ptx-jit/*" \
             --keep-only "rust-cuda-derive/*" \
+            --keep-only "rust-cuda-kernel/*" \
             --ignore-not-existing \
             --excl-line GRCOV_EXCL_LINE \
             --excl-start GRCOV_EXCL_START \
diff --git a/.github/workflows/rustdoc.yml b/.github/workflows/rustdoc.yml
index 285fc57c2..5c756572c 100644
--- a/.github/workflows/rustdoc.yml
+++ b/.github/workflows/rustdoc.yml
@@ -28,6 +28,8 @@ jobs:
         run: |
           RUSTDOCFLAGS="\
             --enable-index-page \
+            --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.1/ \
+            --extern-html-root-url final=https://docs.rs/final/0.1.1/ \
             --extern-html-root-url rustacuda=https://docs.rs/rustacuda/0.1.3/ \
             --extern-html-root-url rustacuda_core=https://docs.rs/rustacuda_core/0.1.2/ \
             --extern-html-root-url rustacuda_derive=https://docs.rs/rustacuda_derive/0.1.2/ \
diff --git a/.gitignore b/.gitignore
index 767dae236..218ca8786 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ Cargo.lock
 
 # These are backup files generated by rustfmt
 **/*.rs.bk
+
+# cargo expand dev output files
+**/expanded.rs
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 93f713cad..d12ff8221 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -4,5 +4,11 @@
     "rust-analyzer.updates.askBeforeDownload": false,
     "rust-analyzer.checkOnSave.command": "reap-clippy",
     "rust-analyzer.cargo.allFeatures": false,
-    "rust-analyzer.cargo.features": ["alloc", "derive", "host"],
+    "rust-analyzer.cargo.features": [
+        "derive",
+        "final",
+        "host",
+        "kernel"
+    ],
+    "rust-analyzer.showUnlinkedFileNotification": false,
 }
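The Cargo.toml rework below switches every optional dependency to the explicit `dep:` syntax and splits the old `host`-centric feature set into `derive`, `device`, `final`, `host`, and `kernel`. On the Rust side this usually pairs with feature-gated modules so that host-only code never reaches a device build; a minimal sketch with assumed names (not the crate's actual module layout):

```rust
// Sketch only: the `host` feature pulls in rustacuda, regex, oneshot, and
// safer_owning_ref, so the host API surface compiles only when it is enabled.
#[cfg(feature = "host")]
mod host_api {
    pub fn launch_kernels() { /* rustacuda-backed launch code would live here */ }
}

// Device-side helpers are compiled only for `device` builds on nvptx64.
#[cfg(feature = "device")]
mod device_api {
    pub fn kernel_entry() { /* PTX-side helpers would live here */ }
}
```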
diff --git a/Cargo.toml b/Cargo.toml
index e8c86665b..acb60681a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,10 +1,10 @@
 [workspace]
 members = [
-    ".", "rust-cuda-derive", "rust-cuda-ptx-jit",
-    "examples/single-source", "examples/derive",
+    ".", "rust-cuda-derive", "rust-cuda-kernel",
+    "examples/derive", "examples/print", "examples/single-source",
 ]
 default-members = [
-    ".", "rust-cuda-derive", "rust-cuda-ptx-jit"
+    ".", "rust-cuda-derive", "rust-cuda-kernel",
 ]
 
 [package]
@@ -13,29 +13,32 @@
 version = "0.1.0"
 authors = ["Juniper Tyree "]
 license = "MIT OR Apache-2.0"
 edition = "2021"
-rust-version = "1.75" # nightly
+rust-version = "1.77" # nightly
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [features]
 default = []
-alloc = ["hashbrown"]
-host = ["rustacuda", "rust-cuda-ptx-jit/host"]
-derive = ["rustacuda_derive", "rust-cuda-derive"]
+derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"]
+device = []
+final = ["dep:final"]
+host = ["dep:rustacuda", "dep:regex", "dep:oneshot", "dep:safer_owning_ref"]
+kernel = ["dep:rust-cuda-kernel"]
 
 [dependencies]
-rustacuda_core = "0.1.2"
+rustacuda_core = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc" }
 
-rustacuda = { version = "0.1.3", optional = true }
-rustacuda_derive = { version = "0.1.2", optional = true }
+rustacuda = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true }
+rustacuda_derive = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true }
 
-const-type-layout = { version = "0.2.0", features = ["derive"] }
+regex = { version = "1.10", optional = true }
 
-final = "0.1.1"
-hashbrown = { version = "0.14", default-features = false, features = ["inline-more"], optional = true }
+const-type-layout = { git = "https://github.com/juntyr/const-type-layout", branch = "compress", features = ["derive"] }
 
-rust-cuda-derive = { path = "rust-cuda-derive", optional = true }
-rust-cuda-ptx-jit = { path = "rust-cuda-ptx-jit" }
+safer_owning_ref = { version = "0.5", optional = true }
+oneshot = { version = "0.1", optional = true, features = ["std", "async"] }
+
+final = { version = "0.1.1", optional = true }
 
-[dev-dependencies]
-hashbrown = { version = "0.14", default-features = false, features = ["inline-more"] }
+rust-cuda-derive = { path = "rust-cuda-derive", optional = true }
+rust-cuda-kernel = { path = "rust-cuda-kernel", optional = true }
diff --git a/README.md b/README.md
index e9b24ddbb..5080b7033 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,11 @@
-# rust-cuda   [![CI Status]][workflow] [![Rust Doc]][docs] [![License Status]][fossa] [![Code Coverage]][codecov] [![Gitpod Ready-to-Code]][gitpod]
+# rust-cuda   [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License Status]][fossa] [![Code Coverage]][codecov] [![Gitpod Ready-to-Code]][gitpod]
 
 [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main
 [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain
 
+[MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange
+[repo]: https://github.com/juntyr/rust-cuda
+
 [Rust Doc]: https://img.shields.io/badge/docs-main-blue
 [docs]: https://juntyr.github.io/rust-cuda/
diff --git a/examples/derive/Cargo.toml b/examples/derive/Cargo.toml
index e59a344af..1b000fe8c 100644
--- a/examples/derive/Cargo.toml
+++ b/examples/derive/Cargo.toml
@@ -1,12 +1,11 @@
 [package]
 name = "derive"
 version = "0.1.0"
-authors = ["Juniper Tyree "]
+authors = ["Juniper Tyree "]
 license = "MIT OR Apache-2.0"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-const-type-layout = { version = "0.2.0" }
-rust-cuda = { path = "../../", features = ["derive", "host"] }
+rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] }
diff --git a/examples/derive/src/lib.rs b/examples/derive/src/lib.rs
index 814e30f61..6960eadeb 100644
--- a/examples/derive/src/lib.rs
+++ b/examples/derive/src/lib.rs
@@ -1,14 +1,15 @@
 #![deny(clippy::pedantic)]
 #![feature(const_type_name)]
-#![feature(offset_of)]
 
-#[derive(rust_cuda::common::LendRustToCuda)]
+#[derive(rc::lend::LendRustToCuda)]
+#[cuda(crate = "rc")]
 struct Inner<T> {
     #[cuda(embed)]
     inner: T,
 }
 
-#[derive(rust_cuda::common::LendRustToCuda)]
+#[derive(rc::lend::LendRustToCuda)]
+#[cuda(crate = "rc")]
 struct Outer<T> {
     #[cuda(embed)]
     inner: Inner<T>,
 }
diff --git a/examples/print/.cargo/config.toml b/examples/print/.cargo/config.toml
new file mode 100644
index 000000000..4a98afe58
--- /dev/null
+++ b/examples/print/.cargo/config.toml
@@ -0,0 +1,2 @@
+[target.nvptx64-nvidia-cuda]
+rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"]
diff --git a/examples/print/Cargo.toml b/examples/print/Cargo.toml
new file mode 100644
index 000000000..b7f864b58
--- /dev/null
+++ b/examples/print/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "print"
+version = "0.1.0"
+authors = ["Juniper Tyree "]
+license = "MIT OR Apache-2.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[target.'cfg(target_os = "cuda")'.dependencies]
+rust-cuda = { path = "../../", features = ["kernel", "device"] }
+
+[target.'cfg(not(target_os = "cuda"))'.dependencies]
+rust-cuda = { path = "../../", features = ["kernel", "host"] }
diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs
new file mode 100644
index 000000000..c99ae0df9
--- /dev/null
+++ b/examples/print/src/main.rs
@@ -0,0 +1,109 @@
+#![deny(clippy::pedantic)]
+#![cfg_attr(target_os = "cuda", no_std)]
+#![cfg_attr(target_os = "cuda", no_main)]
+#![cfg_attr(target_os = "cuda", feature(abi_ptx))]
+#![cfg_attr(target_os = "cuda", feature(alloc_error_handler))]
+#![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))]
+#![feature(const_type_name)]
+#![feature(cfg_version)]
+#![feature(type_alias_impl_trait)]
+#![feature(decl_macro)]
+
+extern crate alloc;
+
+#[derive(Copy, Clone, rust_cuda::deps::const_type_layout::TypeLayout)]
+#[layout(crate = "rust_cuda::deps::const_type_layout")]
+#[repr(C)]
+pub enum Action {
+    Print,
+    Panic,
+    AllocError,
+}
+
+#[rust_cuda::kernel::kernel(use link! for impl)]
+#[kernel(allow(ptx::local_memory_use))]
+pub fn kernel(action: rust_cuda::kernel::param::PerThreadShallowCopy<Action>) {
+    match action {
+        Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"),
+        Action::Panic => panic!("panic! from CUDA kernel"),
+        Action::AllocError => {
+            ::alloc::alloc::handle_alloc_error(::core::alloc::Layout::new::())
+        },
+    }
+}
+
+#[cfg(not(target_os = "cuda"))]
+fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> {
+    // Link the non-generic CUDA kernel
+    struct KernelPtx;
+    link! { impl kernel for KernelPtx }
+
+    // Initialize the CUDA API
+    rust_cuda::deps::rustacuda::init(rust_cuda::deps::rustacuda::CudaFlags::empty())?;
+
+    // Get the first CUDA GPU device
+    let device = rust_cuda::deps::rustacuda::device::Device::get_device(0)?;
+
+    // Create a CUDA context associated with this device
+    let _context = rust_cuda::host::CudaDropWrapper::from(
+        rust_cuda::deps::rustacuda::context::Context::create_and_push(
+            rust_cuda::deps::rustacuda::context::ContextFlags::MAP_HOST
+                | rust_cuda::deps::rustacuda::context::ContextFlags::SCHED_AUTO,
+            device,
+        )?,
+    );
+
+    // Create a new CUDA stream to submit kernels to
+    let mut stream =
+        rust_cuda::host::CudaDropWrapper::from(rust_cuda::deps::rustacuda::stream::Stream::new(
+            rust_cuda::deps::rustacuda::stream::StreamFlags::NON_BLOCKING,
+            None,
+        )?);
+
+    // Create a new instance of the CUDA kernel and prepare the launch config
+    let mut kernel = rust_cuda::kernel::TypedPtxKernel::<kernel>::new::<KernelPtx>(None);
+    let config = rust_cuda::kernel::LaunchConfig {
+        grid: rust_cuda::deps::rustacuda::function::GridSize::x(1),
+        block: rust_cuda::deps::rustacuda::function::BlockSize::x(4),
+        ptx_jit: false,
+    };
+
+    // Launch the CUDA kernel on the stream and synchronise until it completes
+    rust_cuda::host::Stream::with(&mut stream, |stream| {
+        println!("Launching print kernel ...");
+        kernel.launch1(stream, &config, Action::Print)?;
+        println!("Launching panic kernel ...");
+        kernel.launch1(stream, &config, Action::Panic)?;
+        println!("Launching alloc error kernel ...");
+        kernel.launch1(stream, &config, Action::AllocError)
+    })?;
+
+    Ok(())
+}
+
+#[cfg(target_os = "cuda")]
+mod cuda_prelude {
+    use rust_cuda::device::alloc::PTXAllocator;
+
+    #[global_allocator]
+    static _GLOBAL_ALLOCATOR: PTXAllocator = PTXAllocator;
+
+    #[panic_handler]
+    fn panic(info: &::core::panic::PanicInfo) -> ! {
+        // Pretty-format and print the panic message,
+        // but don't allow dynamic formatting or panic payload downcasting
+        rust_cuda::device::utils::pretty_print_panic_info(info, false, false);
+
+        // Safety: no mutable data is shared with the kernel
+        unsafe { rust_cuda::device::utils::exit() }
+    }
+
+    #[alloc_error_handler]
+    #[track_caller]
+    fn alloc_error_handler(layout: ::core::alloc::Layout) -> ! {
+        rust_cuda::device::utils::pretty_print_alloc_error(layout);
+
+        // Safety: no mutable data is shared with the kernel
+        unsafe { rust_cuda::device::utils::exit() }
+    }
+}
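The print example above launches one grid block of four threads, so each `launch1` call runs the kernel body once per thread. The single-source example below distinguishes threads with `rc::device::thread::Thread::this().index()`; a hedged sketch of that partitioning pattern (simplified, using only the APIs visible in this diff):

```rust
// Sketch: each thread claims one slot of a dynamically sized shared slice,
// guarding against launches with more threads than slots.
fn fill_slice(dynamic: &mut rc::utils::shared::ThreadBlockSharedSlice<u32>) {
    let index = rc::device::thread::Thread::this().index();
    if index < dynamic.len() {
        // Safety: every thread writes to a distinct index.
        unsafe {
            *dynamic.index_mut_unchecked(index) = index as u32;
        }
    }
}
```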
diff --git a/examples/single-source/.cargo/config.toml b/examples/single-source/.cargo/config.toml
index 48db9d693..4a98afe58 100644
--- a/examples/single-source/.cargo/config.toml
+++ b/examples/single-source/.cargo/config.toml
@@ -1,5 +1,2 @@
 [target.nvptx64-nvidia-cuda]
-rustflags = ["-Clink-args=--arch sm_35", "-Clink-arg=-O3", "-Clink-arg=--lto"]
-
-[unstable]
-features = ["all"]
+rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"]
diff --git a/examples/single-source/Cargo.toml b/examples/single-source/Cargo.toml
index 128da7cef..1a27dd30e 100644
--- a/examples/single-source/Cargo.toml
+++ b/examples/single-source/Cargo.toml
@@ -1,17 +1,14 @@
 [package]
 name = "single-source"
 version = "0.1.0"
-authors = ["Juniper Tyree "]
+authors = ["Juniper Tyree "]
 license = "MIT OR Apache-2.0"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
-[dependencies]
-const-type-layout = { version = "0.2.0" }
-
 [target.'cfg(target_os = "cuda")'.dependencies]
-rust-cuda = { path = "../../", features = ["derive"] }
+rc = { package = "rust-cuda", path = "../../", features = ["derive", "kernel", "device"] }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
-rust-cuda = { path = "../../", features = ["derive", "host"] }
+rc = { package = "rust-cuda", path = "../../", features = ["derive", "kernel", "host"] }
diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs
index 36c0736c6..3861190d2 100644
--- a/examples/single-source/src/main.rs
+++ b/examples/single-source/src/main.rs
@@ -3,86 +3,117 @@
 #![cfg_attr(target_os = "cuda", no_main)]
 #![cfg_attr(target_os = "cuda", feature(abi_ptx))]
 #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))]
-#![cfg_attr(target_os = "cuda", feature(stdsimd))]
 #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))]
 #![feature(const_type_name)]
-#![feature(offset_of)]
+#![feature(cfg_version)]
+#![feature(type_alias_impl_trait)]
+#![feature(associated_type_bounds)]
+#![feature(decl_macro)]
+#![recursion_limit = "1024"]
 
 extern crate alloc;
 
-#[macro_use]
-extern crate const_type_layout;
-
 #[cfg(not(target_os = "cuda"))]
 fn main() {}
 
 #[repr(C)]
-#[derive(TypeLayout)]
+#[derive(rc::deps::const_type_layout::TypeLayout)]
+#[layout(crate = "rc::deps::const_type_layout")]
 pub struct Dummy(i32);
 
-#[derive(rust_cuda::common::LendRustToCuda)]
+#[derive(Clone, rc::lend::LendRustToCuda)]
+#[cuda(crate = "rc")]
 #[allow(dead_code)]
 pub struct Wrapper<T> {
     #[cuda(embed)]
     inner: T,
 }
 
-#[derive(rust_cuda::common::LendRustToCuda)]
+#[derive(Clone, rc::lend::LendRustToCuda)]
+#[cuda(crate = "rc")]
 pub struct Empty([u8; 0]);
 
 #[repr(C)]
-#[derive(TypeLayout)]
+#[derive(rc::deps::const_type_layout::TypeLayout)]
+#[layout(crate = "rc::deps::const_type_layout")]
 pub struct Tuple(u32, i32);
 
-#[rust_cuda::common::kernel(use link_kernel! as impl Kernel<KernelArgs> for Launcher)]
-pub fn kernel<'a, T: rust_cuda::common::RustToCuda>(
-    #[kernel(pass = SafeDeviceCopy)] _x: &Dummy,
-    #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy<Wrapper<T>>,
-    #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy<Wrapper<T>>,
-    #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64,
-    #[kernel(pass = LendRustToCuda)] _: Wrapper<T>,
-    #[kernel(pass = SafeDeviceCopy)] Tuple(_s, mut __t): Tuple,
-) where
-    <T as rust_cuda::common::RustToCuda>::CudaRepresentation: rust_cuda::safety::StackOnly,
-{
-}
-
-#[cfg(not(target_os = "cuda"))]
-mod host {
-    use super::{Kernel, KernelArgs};
-
-    #[allow(dead_code)]
-    struct Launcher(core::marker::PhantomData);
-
-    link_kernel!(crate::Empty);
-    link_kernel!(rust_cuda::utils::device_copy::SafeDeviceCopyWrapper);
+#[repr(C)]
+#[derive(Copy, Clone, rc::deps::const_type_layout::TypeLayout)]
+#[layout(crate = "rc::deps::const_type_layout")]
+pub struct Triple(i32, i32, i32);
+
+#[rc::kernel::kernel(pub use link! for impl)]
+#[kernel(crate = "rc")]
+#[kernel(
+    allow(ptx::double_precision_use),
+    forbid(ptx::local_memory_use, ptx::register_spills)
+)]
+pub fn kernel<
+    'a,
+    T: 'static
+        + Send
+        + Sync
+        + Clone
+        + rc::lend::RustToCuda<
+            CudaRepresentation: rc::safety::StackOnly,
+            CudaAllocation: rc::alloc::EmptyCudaAlloc,
+        > + rc::safety::StackOnly,
+>(
+    _x: &rc::kernel::param::PerThreadShallowCopy<Dummy>,
+    _z: &rc::kernel::param::DeepPerThreadBorrow<Wrapper<T>>,
+    _v @ _w: &'a rc::kernel::param::ShallowInteriorMutable<core::sync::atomic::AtomicU64>,
+    _: rc::kernel::param::DeepPerThreadBorrow<Wrapper<T>>,
+    q @ Triple(s, mut __t, _u): rc::kernel::param::PerThreadShallowCopy<Triple>,
+    shared3: &mut rc::utils::shared::ThreadBlockShared<u32>,
+    dynamic: &mut rc::utils::shared::ThreadBlockSharedSlice<Dummy>,
+) {
+    let shared = rc::utils::shared::ThreadBlockShared::<[Tuple; 3]>::new_uninit();
+    let shared2 = rc::utils::shared::ThreadBlockShared::<[Tuple; 3]>::new_uninit();
+
+    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
+    unsafe {
+        (*shared.index_mut_unchecked(1)).0 = (f64::from(s) * 2.0) as u32;
+    }
+    unsafe {
+        (*shared2.index_mut_unchecked(2)).1 = q.0 + q.1 + q.2;
+    }
 
-    impl rust_cuda::host::Launcher for Launcher {
-        type CompilationWatcher = ();
-        type KernelTraitObject = dyn Kernel;
+    unsafe {
+        *shared3.as_mut_ptr() = 12;
+    }
 
-        fn get_launch_package(&mut self) -> rust_cuda::host::LaunchPackage {
-            unimplemented!()
+    let index = rc::device::thread::Thread::this().index();
+    if index < dynamic.len() {
+        unsafe {
+            *dynamic.index_mut_unchecked(index) = Dummy(42);
         }
     }
 }
 
+#[cfg(not(target_os = "cuda"))]
+mod host {
+    // Link several instances of the generic CUDA kernel
+    struct KernelPtx<'a, T>(std::marker::PhantomData<&'a T>);
+    crate::link! { impl kernel<'a, crate::Empty> for KernelPtx }
+    crate::link! { impl kernel<'a, rc::utils::adapter::RustToCudaWithPortableBitCopySemantics> for KernelPtx }
+}
+
 #[cfg(target_os = "cuda")]
 mod cuda_prelude {
-    use core::arch::nvptx;
-
-    use rust_cuda::device::utils;
+    use rc::device::alloc::PTXAllocator;
 
     #[global_allocator]
-    static _GLOBAL_ALLOCATOR: utils::PTXAllocator = utils::PTXAllocator;
+    static _GLOBAL_ALLOCATOR: PTXAllocator = PTXAllocator;
 
     #[panic_handler]
     fn panic(_: &::core::panic::PanicInfo) -> ! {
-        unsafe { nvptx::trap() }
+        rc::device::utils::abort()
     }
 
     #[alloc_error_handler]
     fn alloc_error_handler(_: core::alloc::Layout) -> !
{ - unsafe { nvptx::trap() } + rc::device::utils::abort() } } diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 4b8677df4..fc214dea7 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -1,9 +1,10 @@ [package] name = "rust-cuda-derive" version = "0.1.0" -authors = ["Juniper Tyree "] +authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" +rust-version = "1.77" # nightly # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -11,16 +12,7 @@ edition = "2021" proc-macro = true [dependencies] -syn = { version = "1.0", features = ["full"] } +syn = { version = "1.0", features = ["full", "fold"] } quote = "1.0" proc-macro2 = "1.0" proc-macro-error = "1.0" -regex = "1.5" -lazy_static = "1.4" -serde_json = "1.0" -cargo_metadata = { version = "0.18", features = ["builder"] } -strip-ansi-escapes = "0.2" -colored = "2.0" - -seahash = "4.1" -ptx-builder = { git = "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs deleted file mode 100644 index 506d8ea03..000000000 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ /dev/null @@ -1,403 +0,0 @@ -use std::{ - env, fs, - io::{Read, Write}, - path::{Path, PathBuf}, - sync::atomic::{AtomicBool, Ordering}, -}; - -use colored::Colorize; -use proc_macro::TokenStream; -use ptx_builder::{ - builder::{BuildStatus, Builder, MessageFormat, Profile}, - error::{BuildErrorKind, Error, Result}, -}; - -use super::utils::skip_kernel_compilation; - -mod config; -mod error; - -use config::{CheckKernelConfig, LinkKernelConfig}; -use error::emit_ptx_build_error; - -pub fn check_kernel(tokens: TokenStream) -> TokenStream { - proc_macro_error::set_dummy(quote! { - "ERROR in this PTX compilation" - }); - - let CheckKernelConfig { - args, - crate_name, - crate_path, - } = match syn::parse_macro_input::parse(tokens) { - Ok(config) => config, - Err(err) => { - abort_call_site!( - "check_kernel!(ARGS NAME PATH) expects ARGS identifier, NAME and PATH string \ - literals: {:?}", - err - ) - }, - }; - - let kernel_ptx = compile_kernel(&args, &crate_name, &crate_path, Specialisation::Check); - - match kernel_ptx { - Some(kernel_ptx) => quote!(#kernel_ptx).into(), - None => quote!("ERROR in this PTX compilation").into(), - } -} - -#[allow(clippy::module_name_repetitions, clippy::too_many_lines)] -pub fn link_kernel(tokens: TokenStream) -> TokenStream { - proc_macro_error::set_dummy(quote! { - const PTX_STR: &'static str = "ERROR in this PTX compilation"; - }); - - let LinkKernelConfig { - kernel, - args, - crate_name, - crate_path, - specialisation, - } = match syn::parse_macro_input::parse(tokens) { - Ok(config) => config, - Err(err) => { - abort_call_site!( - "link_kernel!(KERNEL ARGS NAME PATH SPECIALISATION) expects KERNEL and ARGS \ - identifiers, NAME and PATH string literals, and SPECIALISATION tokens: {:?}", - err - ) - }, - }; - - if skip_kernel_compilation() { - return quote! { - const PTX_STR: &'static str = "CLIPPY skips specialised PTX compilation"; - } - .into(); - } - - let Some(mut kernel_ptx) = compile_kernel( - &args, - &crate_name, - &crate_path, - Specialisation::Link(&specialisation), - ) else { - return (quote! 
{ - const PTX_STR: &'static str = "ERROR in this PTX compilation"; - }) - .into(); - }; - - let kernel_layout_name = if specialisation.is_empty() { - format!("{kernel}_type_layout_kernel") - } else { - format!( - "{kernel}_type_layout_kernel_{:016x}", - seahash::hash(specialisation.as_bytes()) - ) - }; - - let mut type_layouts = Vec::new(); - - let type_layout_start_pattern = format!("\n\t// .globl\t{kernel_layout_name}"); - - if let Some(type_layout_start) = kernel_ptx.find(&type_layout_start_pattern) { - const BEFORE_PARAM_PATTERN: &str = ".global .align 1 .b8 "; - const PARAM_LEN_PATTERN: &str = "["; - const LEN_BYTES_PATTERN: &str = "] = {"; - const AFTER_BYTES_PATTERN: &str = "};"; - - let after_type_layout_start = type_layout_start + type_layout_start_pattern.len(); - - let Some(type_layout_middle) = kernel_ptx[after_type_layout_start..] - .find(&format!(".visible .entry {kernel_layout_name}")) - .map(|i| after_type_layout_start + i) - else { - abort_call_site!( - "Kernel compilation generated invalid PTX: incomplete type layout information" - ) - }; - - let mut next_type_layout = after_type_layout_start; - - while let Some(param_start_offset) = - kernel_ptx[next_type_layout..type_layout_middle].find(BEFORE_PARAM_PATTERN) - { - let param_start = next_type_layout + param_start_offset + BEFORE_PARAM_PATTERN.len(); - - if let Some(len_start_offset) = - kernel_ptx[param_start..type_layout_middle].find(PARAM_LEN_PATTERN) - { - let len_start = param_start + len_start_offset + PARAM_LEN_PATTERN.len(); - - if let Some(bytes_start_offset) = - kernel_ptx[len_start..type_layout_middle].find(LEN_BYTES_PATTERN) - { - let bytes_start = len_start + bytes_start_offset + LEN_BYTES_PATTERN.len(); - - if let Some(bytes_end_offset) = - kernel_ptx[bytes_start..type_layout_middle].find(AFTER_BYTES_PATTERN) - { - let param = &kernel_ptx[param_start..(param_start + len_start_offset)]; - let len = &kernel_ptx[len_start..(len_start + bytes_start_offset)]; - let bytes = &kernel_ptx[bytes_start..(bytes_start + bytes_end_offset)]; - - let param = quote::format_ident!("{}", param); - - let Ok(len) = len.parse::() else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout \ - length" - ) - }; - let Ok(bytes) = bytes - .split(", ") - .map(std::str::FromStr::from_str) - .collect::, _>>() - else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout \ - byte" - ) - }; - - if bytes.len() != len { - abort_call_site!( - "Kernel compilation generated invalid PTX: type layout length \ - mismatch" - ); - } - - let byte_str = syn::LitByteStr::new(&bytes, proc_macro2::Span::call_site()); - - type_layouts.push(quote! { - const #param: &[u8; #len] = #byte_str; - }); - - next_type_layout = - bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); - } else { - next_type_layout = bytes_start; - } - } else { - next_type_layout = len_start; - } - } else { - next_type_layout = param_start; - } - } - - let Some(type_layout_end) = kernel_ptx[type_layout_middle..] - .find('}') - .map(|i| type_layout_middle + i + '}'.len_utf8()) - else { - abort_call_site!("Kernel compilation generated invalid PTX") - }; - - kernel_ptx.replace_range(type_layout_start..type_layout_end, ""); - } - - (quote! 
{ const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() -} - -fn compile_kernel( - args: &syn::Ident, - crate_name: &str, - crate_path: &Path, - specialisation: Specialisation, -) -> Option { - if let Ok(rust_flags) = proc_macro::tracked_env::var("RUSTFLAGS") { - env::set_var( - "RUSTFLAGS", - rust_flags - .replace("-Zinstrument-coverage", "") - .replace("-Cinstrument-coverage", ""), - ); - } - - let specialisation_var = format!( - "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", - crate_name, - args.to_string().to_uppercase() - ); - - match build_kernel_with_specialisation(crate_path, &specialisation_var, specialisation) { - Ok(kernel_path) => { - let mut file = fs::File::open(&kernel_path) - .unwrap_or_else(|_| panic!("Failed to open kernel file at {:?}.", &kernel_path)); - - let mut kernel_ptx = String::new(); - - file.read_to_string(&mut kernel_ptx) - .unwrap_or_else(|_| panic!("Failed to read kernel file at {:?}.", &kernel_path)); - - colored::control::set_override(true); - eprintln!( - "{} {} compiling a PTX crate.", - "[PTX]".bright_black().bold(), - "Finished".green().bold() - ); - colored::control::unset_override(); - - Some(kernel_ptx) - }, - Err(err) => { - eprintln!("{err:?}"); - emit_ptx_build_error(); - None - }, - } -} - -#[allow(clippy::too_many_lines)] -fn build_kernel_with_specialisation( - kernel_path: &Path, - env_var: &str, - specialisation: Specialisation, -) -> Result { - match specialisation { - Specialisation::Check => env::set_var(env_var, "chECK"), - Specialisation::Link(specialisation) => env::set_var(env_var, specialisation), - }; - - let result = (|| { - let mut builder = Builder::new(kernel_path)?; - - builder = match specialisation { - Specialisation::Check => builder.set_profile(Profile::Debug), - Specialisation::Link(_) => builder.set_profile(Profile::Release), - }; - - builder = builder.set_message_format(MessageFormat::Json { - render_diagnostics: false, - short: false, - ansi: true, - }); - - let specialisation_prefix = match specialisation { - Specialisation::Check => String::from("chECK"), - Specialisation::Link(specialisation) => { - format!("{:016x}", seahash::hash(specialisation.as_bytes())) - }, - }; - builder = builder.set_prefix(specialisation_prefix.clone()); - - let any_output = AtomicBool::new(false); - let crate_name = String::from(builder.get_crate_name()); - - match builder.build_live( - |stdout_line| { - if let Ok(cargo_metadata::Message::CompilerMessage(mut message)) = - serde_json::from_str(stdout_line) - { - if any_output - .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed) - .is_ok() - { - colored::control::set_override(true); - eprintln!( - "{} of {} ({})", - "[PTX]".bright_black().bold(), - crate_name.bold(), - specialisation_prefix.to_ascii_lowercase(), - ); - colored::control::unset_override(); - } - - if let Some(rendered) = &mut message.message.rendered { - colored::control::set_override(true); - let prefix = " | ".bright_black().bold().to_string(); - colored::control::unset_override(); - - let glue = String::from('\n') + &prefix; - - let mut lines = rendered - .split('\n') - .rev() - .skip_while(|l| l.trim().is_empty()) - .collect::>(); - lines.reverse(); - - let mut prefixed = prefix + &lines.join(&glue); - - std::mem::swap(rendered, &mut prefixed); - } - - eprintln!("{}", serde_json::to_string(&message.message).unwrap()); - } - }, - |stderr_line| { - if stderr_line.trim().is_empty() { - return; - } - - if any_output - .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed) - .is_ok() - { - 
colored::control::set_override(true); - eprintln!( - "{} of {} ({})", - "[PTX]".bright_black().bold(), - crate_name.bold(), - specialisation_prefix.to_ascii_lowercase(), - ); - colored::control::unset_override(); - } - - colored::control::set_override(true); - eprintln!( - " {} {}", - "|".bright_black().bold(), - stderr_line.replace(" ", "") - ); - colored::control::unset_override(); - }, - )? { - BuildStatus::Success(output) => { - let ptx_path = output.get_assembly_path(); - - let mut specialised_ptx_path = ptx_path.clone(); - - specialised_ptx_path.set_extension(format!("{specialisation_prefix}.ptx")); - - fs::copy(&ptx_path, &specialised_ptx_path).map_err(|err| { - Error::from(BuildErrorKind::BuildFailed(vec![format!( - "Failed to copy kernel from {ptx_path:?} to {specialised_ptx_path:?}: \ - {err}" - )])) - })?; - - if let Specialisation::Link(specialisation) = specialisation { - fs::OpenOptions::new() - .append(true) - .open(&specialised_ptx_path) - .and_then(|mut file| writeln!(file, "\n// {specialisation}")) - .map_err(|err| { - Error::from(BuildErrorKind::BuildFailed(vec![format!( - "Failed to write specialisation to {specialised_ptx_path:?}: {err}" - )])) - })?; - } - - Ok(specialised_ptx_path) - }, - BuildStatus::NotNeeded => Err(Error::from(BuildErrorKind::BuildFailed(vec![format!( - "Kernel build for specialisation {:?} was not needed.", - &specialisation - )]))), - } - })(); - - env::remove_var(env_var); - - result -} - -#[derive(Copy, Clone, Debug)] -enum Specialisation<'a> { - Check, - Link(&'a str), -} diff --git a/rust-cuda-derive/src/kernel/mod.rs b/rust-cuda-derive/src/kernel/mod.rs deleted file mode 100644 index c44f1dd2f..000000000 --- a/rust-cuda-derive/src/kernel/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -pub mod link; -pub mod specialise; -pub mod wrapper; - -mod utils; diff --git a/rust-cuda-derive/src/kernel/specialise/mod.rs b/rust-cuda-derive/src/kernel/specialise/mod.rs deleted file mode 100644 index 337508b5b..000000000 --- a/rust-cuda-derive/src/kernel/specialise/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -pub mod call; -pub mod entry; -pub mod ty; diff --git a/rust-cuda-derive/src/kernel/specialise/ty.rs b/rust-cuda-derive/src/kernel/specialise/ty.rs deleted file mode 100644 index 9b5a06955..000000000 --- a/rust-cuda-derive/src/kernel/specialise/ty.rs +++ /dev/null @@ -1,54 +0,0 @@ -use proc_macro::TokenStream; - -pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { - let SpecialiseTypeConfig { kernel, typedef } = match syn::parse_macro_input::parse(tokens) { - Ok(config) => config, - Err(err) => { - abort_call_site!( - "specialise_kernel_type!(KERNEL::TYPEDEF) expects KERNEL and TYPEDEF identifiers: \ - {:?}", - err - ) - }, - }; - - let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { - Ok(crate_name) => crate_name.to_uppercase(), - Err(err) => abort_call_site!("Failed to read crate name: {:?}", err), - }; - - let specialisation_var = format!( - "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", - crate_name, - kernel.to_string().to_uppercase() - ); - - match proc_macro::tracked_env::var(&specialisation_var) { - Ok(specialisation) => { - match format!("<() as {kernel}{specialisation}>::{typedef}").parse() { - Ok(parsed_specialisation) => parsed_specialisation, - Err(err) => abort_call_site!("Failed to parse specialisation: {:?}", err), - } - }, - Err(err) => abort_call_site!( - "Failed to read specialisation from {:?}: {:?}", - &specialisation_var, - err - ), - } -} - -struct SpecialiseTypeConfig { - kernel: syn::Ident, - typedef: syn::Ident, 
-} - -impl syn::parse::Parse for SpecialiseTypeConfig { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - let kernel: syn::Ident = input.parse()?; - let _dc: syn::token::Colon2 = input.parse()?; - let typedef: syn::Ident = input.parse()?; - - Ok(Self { kernel, typedef }) - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs deleted file mode 100644 index c07486c2b..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ /dev/null @@ -1,32 +0,0 @@ -pub(super) struct KernelConfig { - pub(super) visibility: Option, - pub(super) linker: syn::Ident, - pub(super) kernel: syn::Ident, - pub(super) args: syn::Ident, - pub(super) launcher: syn::Ident, -} - -impl syn::parse::Parse for KernelConfig { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - let visibility: Option = input.parse()?; - let _use: syn::token::Use = input.parse()?; - let linker: syn::Ident = input.parse()?; - let _bang: syn::token::Bang = input.parse()?; - let _as: syn::token::As = input.parse()?; - let _impl: syn::token::Impl = input.parse()?; - let kernel: syn::Ident = input.parse()?; - let _lt_token: syn::token::Lt = input.parse()?; - let args: syn::Ident = input.parse()?; - let _gt_token: syn::token::Gt = input.parse()?; - let _for: syn::token::For = input.parse()?; - let launcher: syn::Ident = input.parse()?; - - Ok(Self { - visibility, - linker, - kernel, - args, - launcher, - }) - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs b/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs deleted file mode 100644 index 4c725601b..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs +++ /dev/null @@ -1,77 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::{DeclGenerics, FunctionInputs, ImplGenerics, KernelConfig}; - -pub(in super::super) fn quote_args_trait( - KernelConfig { - visibility, args, .. - }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_trait_params: generic_params, - generic_close_token, - generic_trait_where_clause: generic_where_clause, - .. - }: &DeclGenerics, - ImplGenerics { - impl_generics, - ty_generics, - where_clause, - }: &ImplGenerics, - FunctionInputs { func_inputs, .. }: &FunctionInputs, -) -> TokenStream { - let func_input_typedefs = (0..func_inputs.len()) - .map(|i| { - let type_ident = quote::format_ident!("__T_{}", i); - - quote! { - type #type_ident; - } - }) - .collect::>(); - - let func_input_types = func_inputs - .iter() - .enumerate() - .map(|(i, arg)| { - let pat_type = match arg { - syn::FnArg::Typed(pat_type) => pat_type, - syn::FnArg::Receiver(_) => unreachable!(), - }; - - let type_ident = quote::format_ident!("__T_{}", i); - let arg_type = match &*pat_type.ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - - quote! { - type #type_ident = #arg_type; - } - }) - .collect::>(); - - quote! 
{ - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #args #generic_start_token #generic_params #generic_close_token - #generic_where_clause - { - #(#func_input_typedefs)* - } - - // #args must always be pub in CUDA kernel as it is used to define the - // public kernel entry point signature - #[cfg(target_os = "cuda")] - #[allow(clippy::missing_safety_doc)] - pub unsafe trait #args #generic_start_token #generic_params #generic_close_token - #generic_where_clause - { - #(#func_input_typedefs)* - } - - unsafe impl #impl_generics #args #ty_generics for () #where_clause { - #(#func_input_types)* - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs deleted file mode 100644 index dadda41ec..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ /dev/null @@ -1,92 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use crate::kernel::utils::skip_kernel_compilation; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; - -pub(super) fn quote_get_ptx_str( - FuncIdent { - func_ident, - func_ident_hash, - .. - }: &FuncIdent, - config @ KernelConfig { args, .. }: &KernelConfig, - generics @ DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, - inputs: &FunctionInputs, - func_params: &[syn::Ident], - macro_type_ids: &[syn::Ident], -) -> TokenStream { - let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { - Ok(crate_name) => crate_name.to_uppercase(), - Err(err) => abort_call_site!("Failed to read crate name: {:?}.", err), - }; - - let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") - .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); - - let cpu_func_lifetime_erased_types = - super::kernel_func_raw::generate_launch_types(config, generics, inputs, macro_type_ids).1; - - let matching_kernel_assert = if skip_kernel_compilation() { - quote!() - } else { - quote::quote_spanned! { func_ident.span()=> - const _: ::rust_cuda::safety::kernel_signature::Assert<{ - ::rust_cuda::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = ::rust_cuda::safety::kernel_signature::Assert::<{ - ::rust_cuda::safety::kernel_signature::check( - PTX_STR.as_bytes(), - concat!(".visible .entry ", rust_cuda::host::specialise_kernel_call!( - #func_ident_hash #generic_start_token - #($#macro_type_ids),* - #generic_close_token - )).as_bytes() - ) - }>; - } - }; - - let type_layout_asserts = if skip_kernel_compilation() { - Vec::new() - } else { - cpu_func_lifetime_erased_types - .iter() - .zip(func_params.iter()) - .map(|(ty, param)| { - let layout_param = syn::Ident::new( - &format!("__{func_ident_hash}_{param}_layout").to_uppercase(), - param.span(), - ); - - quote::quote_spanned! { ty.span()=> - const _: ::rust_cuda::safety::type_layout::Assert<{ - ::rust_cuda::safety::type_layout::CpuAndGpuTypeLayouts::Match - }> = ::rust_cuda::safety::type_layout::Assert::<{ - ::rust_cuda::safety::type_layout::check::<#ty>(#layout_param) - }>; - } - }) - .collect::>() - }; - - quote! 
{ - fn get_ptx_str() -> &'static str { - rust_cuda::host::link_kernel!{ - #func_ident #args #crate_name #crate_manifest_dir #generic_start_token - #($#macro_type_ids),* - #generic_close_token - } - - #matching_kernel_assert - - #(#type_layout_asserts)* - - PTX_STR - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs deleted file mode 100644 index 7cad78e05..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ /dev/null @@ -1,169 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; - -pub(super) fn quote_kernel_func( - KernelConfig { args, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_wrapper_params, - generic_close_token, - generic_wrapper_where_clause, - .. - }: &DeclGenerics, - inputs @ FunctionInputs { func_inputs, .. }: &FunctionInputs, - fn_ident @ FuncIdent { func_ident, .. }: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], - macro_type_ids: &[syn::Ident], -) -> TokenStream { - let new_func_inputs = func_inputs - .iter() - .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote! { - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident - }; - - if let syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - .. - }) = &**ty - { - quote! { - #(#attrs)* #pat #colon_token #and_token #lifetime #mutability #syn_type - } - } else { - quote! { #(#attrs)* #pat #colon_token #syn_type } - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect::>(); - - let raw_func_input_wrap = generate_raw_func_input_wrap(inputs, fn_ident, func_params); - - quote! { - #(#func_attrs)* - #[allow(clippy::needless_lifetimes)] - fn #func_ident #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause - { - // impls check adapted from Nikolai Vazquez's `impls` crate: - // https://docs.rs/impls/1.0.3/src/impls/lib.rs.html#584-602 - const fn __check_is_sync(_x: &T) -> bool { - trait IsSyncMarker { - const SYNC: bool = false; - } - impl IsSyncMarker for T {} - struct CheckIs(::core::marker::PhantomData); - #[allow(dead_code)] - impl CheckIs { - const SYNC: bool = true; - } - - >::SYNC - } - - #raw_func_input_wrap - } - } -} - -#[allow(clippy::too_many_lines)] -fn generate_raw_func_input_wrap( - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - FuncIdent { func_ident_raw, .. }: &FuncIdent, - func_params: &[syn::Ident], -) -> TokenStream { - func_inputs - .iter() - .zip(func_params) - .zip(func_input_cuda_types.iter()) - .rev() - .fold( - quote! { - self.#func_ident_raw(#(#func_params),*) - }, - |inner, ((arg, param), (cuda_mode, _ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => match cuda_mode { - InputCudaType::SafeDeviceCopy => { - if let syn::Type::Reference(..) = &**ty { - let pat_box = quote::format_ident!("__{}_box", param); - - // DeviceCopy mode only supports immutable references - quote! 
{ - let mut #pat_box = rust_cuda::host::HostDeviceBox::from( - rust_cuda::rustacuda::memory::DeviceBox::new( - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) - )? - ); - #[allow(clippy::redundant_closure_call)] - // Safety: `#pat_box` contains exactly the device copy of `#pat` - let __result = (|#pat| { #inner })(unsafe { - rust_cuda::host::HostAndDeviceConstRef::new( - &#pat_box, rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) - ) - }); - - #[allow(invalid_reference_casting)] - if !__check_is_sync(#pat) { - // Safety: - // * Since `#ty` is `!Sync`, it contains interior mutability - // * Therefore, part of the 'immutable' device copy may have - // been mutated - // * If all mutation was confined to interior mutability, - // then passing these changes on is safe (and expected) - // * If any mutations occured outside interior mutability, - // then UB occurred, in the kernel (we're not the cause) - #pat_box.copy_to(unsafe { &mut *(#pat as *const _ as *mut _) })?; - } - - ::core::mem::drop(#pat_box); - __result - } - } else { - quote! { { - let #pat = rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from(#pat); - #inner - } } - } - }, - InputCudaType::LendRustToCuda => { - if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { - if mutability.is_some() { - quote! { rust_cuda::host::LendToCuda::lend_to_cuda_mut( - #pat, |#pat| { #inner } - ) } - } else { - quote! { rust_cuda::host::LendToCuda::lend_to_cuda( - #pat, |#pat| { #inner } - ) } - } - } else { - quote! { rust_cuda::host::LendToCuda::move_to_cuda( - #pat, |#pat| { #inner } - ) } - } - }, - }, - syn::FnArg::Receiver(_) => unreachable!(), - }, - ) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/launch_types.rs deleted file mode 100644 index 0fed7282f..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/launch_types.rs +++ /dev/null @@ -1,106 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use crate::kernel::utils::r2c_move_lifetime; - -use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; - -pub(in super::super) fn generate_launch_types( - KernelConfig { args, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - macro_type_ids: &[syn::Ident], -) -> (Vec, Vec, Vec) { - let mut cpu_func_types_launch = Vec::with_capacity(func_inputs.len()); - let mut cpu_func_lifetime_erased_types = Vec::with_capacity(func_inputs.len()); - let mut cpu_func_unboxed_types = Vec::with_capacity(func_inputs.len()); - - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .enumerate() - .for_each(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. }) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote::quote_spanned! { ty.span()=> - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident - }; - - cpu_func_unboxed_types.push(syn_type.clone()); - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! 
{ ty.span()=> - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation - > - }, - }; - - cpu_func_types_launch.push( - if let syn::Type::Reference(syn::TypeReference { - mutability, - lifetime, - .. - }) = &**ty - { - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<#lifetime, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - quote! { #cuda_type } - }, - ); - - cpu_func_lifetime_erased_types.push( - if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<'static, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<'static, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<'static, #cuda_type> - } - } else { - cuda_type - }, - ); - }, - syn::FnArg::Receiver(_) => unreachable!(), - }); - - ( - cpu_func_types_launch, - cpu_func_lifetime_erased_types, - cpu_func_unboxed_types, - ) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs deleted file mode 100644 index ab352b4c8..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs +++ /dev/null @@ -1,111 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; - -mod launch_types; -mod raw_func_types; -mod type_wrap; - -pub(super) use launch_types::generate_launch_types; -use raw_func_types::generate_raw_func_types; -use type_wrap::generate_func_input_and_ptx_jit_wraps; - -#[allow(clippy::too_many_arguments)] -pub(super) fn quote_kernel_func_raw( - config @ KernelConfig { args, .. }: &KernelConfig, - decl_generics @ DeclGenerics { - generic_start_token, - generic_wrapper_params, - generic_close_token, - generic_wrapper_where_clause, - .. - }: &DeclGenerics, - func_inputs: &FunctionInputs, - FuncIdent { func_ident_raw, .. }: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], - macro_type_ids: &[syn::Ident], -) -> TokenStream { - let new_func_inputs_raw = - generate_raw_func_types(config, decl_generics, func_inputs, macro_type_ids); - let (func_input_wrap, func_cpu_ptx_jit_wrap) = - generate_func_input_and_ptx_jit_wraps(func_inputs); - let (cpu_func_types_launch, cpu_func_lifetime_erased_types, cpu_func_unboxed_types) = - generate_launch_types(config, decl_generics, func_inputs, macro_type_ids); - - quote! { - #(#func_attrs)* - #[allow(clippy::extra_unused_type_parameters)] - fn #func_ident_raw #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_raw),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause - { - let rust_cuda::host::LaunchPackage { - kernel, watcher, config, stream - } = rust_cuda::host::Launcher::get_launch_package(self); - - let kernel_jit_result = if config.ptx_jit { - rust_cuda::ptx_jit::compilePtxJITwithArguments! 
{ - kernel.compile_with_ptx_jit_args(#(#func_cpu_ptx_jit_wrap),*) - }? - } else { - kernel.compile_with_ptx_jit_args(None)? - }; - - let function = match kernel_jit_result { - rust_cuda::host::KernelJITResult::Recompiled(function) => { - // Call launcher hook on kernel compilation - ::on_compile(function, watcher)?; - - function - }, - rust_cuda::host::KernelJITResult::Cached(function) => function, - }; - - #[allow(clippy::redundant_closure_call)] - (|#(#func_params: #cpu_func_types_launch),*| { - #[deny(improper_ctypes)] - mod __rust_cuda_ffi_safe_assert { - use super::#args; - - extern "C" { #( - #[allow(dead_code)] - static #func_params: #cpu_func_lifetime_erased_types; - )* } - } - - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - - #[allow(dead_code)] - fn assert_impl_no_aliasing() {} - - #[allow(dead_code)] - fn assert_impl_fits_into_device_register< - T: rust_cuda::safety::FitsIntoDeviceRegister, - >(_val: &T) {} - - #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_aliasing::<#cpu_func_unboxed_types>();)* - #(assert_impl_fits_into_device_register(&#func_params);)* - } - - let rust_cuda::host::LaunchConfig { - grid, block, shared_memory_size, ptx_jit: _, - } = config; - - unsafe { stream.launch(function, grid, block, shared_memory_size, - &[ - #( - &#func_params as *const _ as *mut ::std::ffi::c_void - ),* - ] - ) }?; - - stream.synchronize() - })(#(#func_input_wrap),*) - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs deleted file mode 100644 index 380048ec5..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs +++ /dev/null @@ -1,93 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use crate::kernel::utils::r2c_move_lifetime; - -use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; - -pub(super) fn generate_raw_func_types( - KernelConfig { args, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - macro_type_ids: &[syn::Ident], -) -> Vec { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote! { - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident - }; - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote! { - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote! { - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation - > - }, - }; - - if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. - }) = &**ty - { - let wrapped_type = if mutability.is_some() { - if matches!(cuda_mode, InputCudaType::SafeDeviceCopy) { - abort!( - mutability.span(), - "Cannot mutably alias a `SafeDeviceCopy` kernel parameter." - ); - } - - quote!( - rust_cuda::host::HostAndDeviceMutRef<#lifetime, #cuda_type> - ) - } else { - quote!( - rust_cuda::host::HostAndDeviceConstRef<#lifetime, #cuda_type> - ) - }; - - quote! 
{ - #(#attrs)* #mutability #pat #colon_token #wrapped_type - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - - let wrapped_type = quote! { - rust_cuda::host::HostAndDeviceOwned<#lifetime, #cuda_type> - }; - - quote! { - #(#attrs)* #pat #colon_token #wrapped_type - } - } else { - quote! { #(#attrs)* #pat #colon_token #cuda_type } - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs deleted file mode 100644 index 432930731..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs +++ /dev/null @@ -1,38 +0,0 @@ -use proc_macro2::TokenStream; - -use crate::kernel::wrapper::InputCudaType; - -use super::super::super::super::FunctionInputs; - -pub(super) fn generate_func_input_and_ptx_jit_wraps( - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, Vec) { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .map(|(arg, (cuda_mode, ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { - #[allow(clippy::if_same_then_else)] - let func_input = if let syn::Type::Reference(_) = &**ty { - quote! { #pat.for_device() } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote! { #pat.for_device() } - } else { - quote! { #pat } - }; - - let ptx_load = if ptx_jit.0 { - quote! { ConstLoad[#pat.for_host()] } - } else { - quote! { Ignore[#pat] } - }; - - (func_input, ptx_load) - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs deleted file mode 100644 index 7ab891e7e..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ /dev/null @@ -1,105 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; - -mod get_ptx_str; -mod kernel_func; -mod kernel_func_raw; -mod new_kernel; - -use get_ptx_str::quote_get_ptx_str; -use kernel_func::quote_kernel_func; -use kernel_func_raw::quote_kernel_func_raw; -use new_kernel::quote_new_kernel; - -pub(in super::super) fn quote_cpu_linker_macro( - config @ KernelConfig { - visibility, - kernel, - linker, - launcher, - .. - }: &KernelConfig, - decl_generics @ DeclGenerics { - generic_start_token, - generic_trait_params: generic_params, - generic_close_token, - .. - }: &DeclGenerics, - func_inputs: &FunctionInputs, - func_ident: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], -) -> TokenStream { - let macro_types = generic_params - .iter() - .enumerate() - .map(|(i, generic)| { - let generic_ident = quote::format_ident!("__g_{}", i); - - match generic { - syn::GenericParam::Type(_) => quote!($#generic_ident:ty), - syn::GenericParam::Const(_) => quote!($#generic_ident:expr), - syn::GenericParam::Lifetime(_) => unreachable!(), - } - }) - .collect::>(); - - let macro_type_ids = (0..generic_params.len()) - .map(|i| quote::format_ident!("__g_{}", i)) - .collect::>(); - - let cpu_linker_macro_visibility = if visibility.is_some() { - quote! { #[macro_export] } - } else { - quote! 
{} - }; - - let get_ptx_str = quote_get_ptx_str( - func_ident, - config, - decl_generics, - func_inputs, - func_params, - ¯o_type_ids, - ); - let new_kernel = quote_new_kernel(config, decl_generics, func_ident, ¯o_type_ids); - let kernel_func = quote_kernel_func( - config, - decl_generics, - func_inputs, - func_ident, - func_params, - func_attrs, - ¯o_type_ids, - ); - let kernel_func_raw = quote_kernel_func_raw( - config, - decl_generics, - func_inputs, - func_ident, - func_params, - func_attrs, - ¯o_type_ids, - ); - - quote! { - #[cfg(not(target_os = "cuda"))] - #cpu_linker_macro_visibility - macro_rules! #linker { - (#(#macro_types),* $(,)?) => { - unsafe impl #kernel #generic_start_token #($#macro_type_ids),* #generic_close_token - for #launcher #generic_start_token #($#macro_type_ids),* #generic_close_token - { - #get_ptx_str - - #new_kernel - - #kernel_func - - #kernel_func_raw - } - }; - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs deleted file mode 100644 index fa32591db..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs +++ /dev/null @@ -1,33 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::super::{DeclGenerics, FuncIdent, KernelConfig}; - -pub(super) fn quote_new_kernel( - KernelConfig { kernel, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, - FuncIdent { - func_ident_hash, .. - }: &FuncIdent, - macro_type_ids: &[syn::Ident], -) -> TokenStream { - quote! { - fn new_kernel() -> rust_cuda::rustacuda::error::CudaResult< - rust_cuda::host::TypedKernel - > { - let ptx = Self::get_ptx_str(); - let entry_point = rust_cuda::host::specialise_kernel_call!( - #func_ident_hash #generic_start_token - #($#macro_type_ids),* - #generic_close_token - ); - - rust_cuda::host::TypedKernel::new(ptx, entry_point) - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs deleted file mode 100644 index cad3cdc6a..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ /dev/null @@ -1,186 +0,0 @@ -use proc_macro2::TokenStream; - -use crate::kernel::utils::r2c_move_lifetime; - -use super::super::{ - DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, -}; - -pub(in super::super) fn quote_cpu_wrapper( - config @ KernelConfig { - visibility, kernel, .. - }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_trait_params, - generic_close_token, - generic_trait_where_clause, - generic_wrapper_params, - generic_wrapper_where_clause, - .. - }: &DeclGenerics, - impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, - func_inputs: &FunctionInputs, - FuncIdent { - func_ident, - func_ident_raw, - .. - }: &FuncIdent, - func_attrs: &[syn::Attribute], -) -> TokenStream { - let launcher_predicate = quote! { - Self: Sized + rust_cuda::host::Launcher< - KernelTraitObject = dyn #kernel #ty_generics - > - }; - - let generic_wrapper_where_clause = match generic_wrapper_where_clause { - Some(syn::WhereClause { - where_token, - predicates, - }) if !predicates.is_empty() => { - let comma = if predicates.empty_or_trailing() { - quote!() - } else { - quote!(,) - }; - - quote! { - #where_token #predicates #comma #launcher_predicate - } - }, - _ => quote! 
{ - where #launcher_predicate - }, - }; - - let (new_func_inputs_decl, new_func_inputs_raw_decl) = - generate_new_func_inputs_decl(config, impl_generics, func_inputs); - - quote! { - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #kernel #generic_start_token #generic_trait_params #generic_close_token - #generic_trait_where_clause - { - fn get_ptx_str() -> &'static str where #launcher_predicate; - - fn new_kernel() -> rust_cuda::rustacuda::error::CudaResult< - rust_cuda::host::TypedKernel - > where #launcher_predicate; - - #(#func_attrs)* - fn #func_ident #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_decl),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause; - - #(#func_attrs)* - fn #func_ident_raw #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_raw_decl),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause; - } - } -} - -fn generate_new_func_inputs_decl( - KernelConfig { args, .. }: &KernelConfig, - ImplGenerics { ty_generics, .. }: &ImplGenerics, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, Vec) { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => ( - syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: pat.clone(), - colon_token: *colon_token, - ty: { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = syn::parse_quote!(<() as #args #ty_generics>::#type_ident); - - if let syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - .. - }) = &**ty - { - Box::new(syn::Type::Reference(syn::TypeReference { - and_token: *and_token, - lifetime: lifetime.clone(), - mutability: *mutability, - elem: syn_type, - })) - } else { - syn_type - } - }, - }), - syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: pat.clone(), - colon_token: *colon_token, - ty: { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type: Box = - syn::parse_quote!(<() as #args #ty_generics>::#type_ident); - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => syn::parse_quote!( - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - ), - InputCudaType::LendRustToCuda => syn::parse_quote!( - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation - > - ), - }; - - if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. 
- }) = &**ty - { - let wrapped_type = if mutability.is_some() { - syn::parse_quote!( - rust_cuda::host::HostAndDeviceMutRef<#lifetime, #cuda_type> - ) - } else { - syn::parse_quote!( - rust_cuda::host::HostAndDeviceConstRef<#lifetime, #cuda_type> - ) - }; - - Box::new(wrapped_type) - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - - let wrapped_type = syn::parse_quote!( - rust_cuda::host::HostAndDeviceOwned<#lifetime, #cuda_type> - ); - - Box::new(wrapped_type) - } else { - cuda_type - } - }, - }), - ), - syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs deleted file mode 100644 index 628642fc0..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs +++ /dev/null @@ -1,25 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::{DeclGenerics, FuncIdent}; - -pub(in super::super) fn quote_cuda_generic_function( - DeclGenerics { - generic_start_token, - generic_kernel_params: generic_params, - generic_close_token, - generic_kernel_where_clause: generic_where_clause, - .. - }: &DeclGenerics, - func_inputs: &syn::punctuated::Punctuated, - FuncIdent { func_ident, .. }: &FuncIdent, - func_attrs: &[syn::Attribute], - func_block: &syn::Block, -) -> TokenStream { - quote! { - #[cfg(target_os = "cuda")] - #(#func_attrs)* - fn #func_ident #generic_start_token #generic_params #generic_close_token (#func_inputs) - #generic_where_clause - #func_block - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs deleted file mode 100644 index d017efae1..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ /dev/null @@ -1,249 +0,0 @@ -use proc_macro2::TokenStream; -use quote::quote_spanned; -use syn::spanned::Spanned; - -use super::super::{FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; - -#[allow(clippy::too_many_lines)] -pub(in super::super) fn quote_cuda_wrapper( - config @ KernelConfig { args, .. }: &KernelConfig, - inputs @ FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - FuncIdent { - func_ident, - func_ident_hash, - .. - }: &FuncIdent, - func_attrs: &[syn::Attribute], - func_params: &[syn::Ident], -) -> TokenStream { - let (ptx_func_inputs, ptx_func_types) = specialise_ptx_func_inputs(config, inputs); - let ptx_func_unboxed_types = specialise_ptx_unboxed_types(config, inputs); - - let func_layout_params = func_params - .iter() - .map(|ident| { - syn::Ident::new( - &format!("__{func_ident_hash}_{ident}_layout").to_uppercase(), - ident.span(), - ) - }) - .collect::>(); - - let ptx_func_input_unwrap = func_inputs - .iter().zip(func_input_cuda_types.iter()).enumerate() - .rev() - .fold(quote! { - #func_ident(#(#func_params),*) - }, |inner, (i, (arg, (cuda_mode, ptx_jit)))| match arg { - syn::FnArg::Typed(syn::PatType { - pat, - ty, - .. - }) => { - // Emit PTX JIT load markers - let ptx_jit_load = if ptx_jit.0 { - quote! { - rust_cuda::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) - } - } else { quote! {} }; - - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote::quote_spanned! 
{ ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) - }; - - match cuda_mode { - InputCudaType::SafeDeviceCopy => if let syn::Type::Reference( - syn::TypeReference { and_token, .. } - ) = &**ty { - // DeviceCopy mode only supports immutable references - quote! { #ptx_jit_load; { let #pat: #and_token #syn_type = #pat.as_ref().into_ref(); #inner } } - } else { - quote! { { let #pat: #syn_type = #pat.into_inner(); #inner } } - }, - InputCudaType::LendRustToCuda => if let syn::Type::Reference( - syn::TypeReference { and_token, mutability, ..} - ) = &**ty { - if mutability.is_some() { - quote! { - #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_borrow_from_rust_mut( - #pat, |#pat: #and_token #mutability rust_cuda::device::ShallowCopy<#syn_type>| { #inner }, - ) - } - } else { - quote! { - #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_borrow_from_rust( - #pat, |#pat: #and_token rust_cuda::device::ShallowCopy<#syn_type>| { #inner }, - ) - } - } - } else { - quote! { - #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_moved_from_rust( - #pat, |#pat: #syn_type| { #inner }, - ) - } - } - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }); - - let func_type_layout_ident = quote::format_ident!("{}_type_layout", func_ident); - - quote! { - #[cfg(target_os = "cuda")] - #[rust_cuda::device::specialise_kernel_entry(#args)] - #[no_mangle] - #(#func_attrs)* - pub unsafe extern "ptx-kernel" fn #func_type_layout_ident(#(#func_params: &mut &[u8]),*) { - #( - #[no_mangle] - static #func_layout_params: [ - u8; rust_cuda::const_type_layout::serialised_type_graph_len::<#ptx_func_types>() - ] = rust_cuda::const_type_layout::serialise_type_graph::<#ptx_func_types>(); - - *#func_params = &#func_layout_params; - )* - } - - #[cfg(target_os = "cuda")] - #[rust_cuda::device::specialise_kernel_entry(#args)] - #[no_mangle] - #(#func_attrs)* - pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { - #[deny(improper_ctypes)] - mod __rust_cuda_ffi_safe_assert { - use super::#args; - - extern "C" { #( - #[allow(dead_code)] - static #func_params: #ptx_func_types; - )* } - } - - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - - #[allow(dead_code)] - fn assert_impl_no_aliasing() {} - - #[allow(dead_code)] - fn assert_impl_fits_into_device_register< - T: rust_cuda::safety::FitsIntoDeviceRegister, - >(_val: &T) {} - - #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_aliasing::<#ptx_func_unboxed_types>();)* - #(assert_impl_fits_into_device_register(&#func_params);)* - } - - #ptx_func_input_unwrap - } - } -} - -fn specialise_ptx_func_inputs( - KernelConfig { args, .. }: &KernelConfig, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, Vec) { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { - syn::FnArg::Typed( - fn_arg @ syn::PatType { - attrs, - pat, - colon_token, - ty, - }, - ) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote::quote_spanned! { ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) - }; - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! 
{ ty.span()=> - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation - > - }, - }; - - let ty = if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. - }) = &**ty - { - let lifetime = quote_spanned! { lifetime.span()=> - 'static - }; - - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<#lifetime, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = quote_spanned! { ty.span()=> - 'static - }; - - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - cuda_type - }; - - let fn_arg = quote::quote_spanned! { fn_arg.span()=> - #(#attrs)* #pat #colon_token #ty - }; - - (fn_arg, ty) - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip() -} - -fn specialise_ptx_unboxed_types( - KernelConfig { args, .. }: &KernelConfig, - FunctionInputs { func_inputs, .. }: &FunctionInputs, -) -> Vec { - func_inputs - .iter() - .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. }) => { - let type_ident = quote::format_ident!("__T_{}", i); - - quote::quote_spanned! { ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs deleted file mode 100644 index 4dd9b4096..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -pub mod args_trait; -pub mod cpu_linker_macro; -pub mod cpu_wrapper; -pub mod cuda_generic_function; -pub mod cuda_wrapper; diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs deleted file mode 100644 index ceeee1e3e..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs +++ /dev/null @@ -1,93 +0,0 @@ -use syn::spanned::Spanned; - -use super::InputCudaType; - -pub(super) enum KernelInputAttribute { - PassType(proc_macro2::Span, InputCudaType), - PtxJit(proc_macro2::Span, bool), -} - -impl syn::parse::Parse for KernelInputAttribute { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - let ident: syn::Ident = input.parse()?; - - match &*ident.to_string() { - "pass" => { - let eq: syn::token::Eq = input.parse()?; - let mode: syn::Ident = input.parse()?; - - let cuda_type = match &*mode.to_string() { - "SafeDeviceCopy" => InputCudaType::SafeDeviceCopy, - "LendRustToCuda" => InputCudaType::LendRustToCuda, - _ => abort!( - mode.span(), - "Unexpected CUDA transfer mode `{:?}`: Expected `SafeDeviceCopy` or \ - `LendRustToCuda`.", - mode - ), - }; - - Ok(KernelInputAttribute::PassType( - ident - .span() - .join(eq.span()) - .unwrap() - .join(mode.span()) - .unwrap(), - cuda_type, - )) - }, - "jit" => { - let eq: Option = input.parse()?; - - let (ptx_jit, span) = if eq.is_some() { - let value: syn::LitBool = input.parse()?; - - ( - value.value(), - ident - .span() - .join(eq.span()) - .unwrap() - .span() - .join(value.span()) - .unwrap(), - ) - } else { - (true, ident.span()) - }; - - Ok(KernelInputAttribute::PtxJit(span, ptx_jit)) - }, - _ => abort!( - ident.span(), - "Unexpected kernel attribute `{:?}`: Expected `pass` or `jit`.", - ident - ), - } - } -} - -pub(super) struct 
KernelInputAttributes(Vec<KernelInputAttribute>); - -impl syn::parse::Parse for KernelInputAttributes { - fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> { - let content; - let _parens = syn::parenthesized!(content in input); - - syn::punctuated::Punctuated::< - KernelInputAttribute, syn::token::Comma - >::parse_separated_nonempty(&content).map(|punctuated| { - Self(punctuated.into_iter().collect()) - }) - } -} - -impl IntoIterator for KernelInputAttributes { - type IntoIter = std::vec::IntoIter<KernelInputAttribute>; - type Item = KernelInputAttribute; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs deleted file mode 100644 index f3cc1a4d8..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ /dev/null @@ -1,229 +0,0 @@ -use syn::spanned::Spanned; - -use crate::kernel::utils::r2c_move_lifetime; - -use super::{InputCudaType, InputPtxJit}; - -mod attribute; -use attribute::{KernelInputAttribute, KernelInputAttributes}; - -pub(super) struct FunctionInputs { - pub(super) func_inputs: syn::punctuated::Punctuated<syn::FnArg, syn::token::Comma>, - pub(super) func_input_cuda_types: Vec<(InputCudaType, InputPtxJit)>, -} - -pub(super) fn parse_function_inputs( - func: &syn::ItemFn, - generic_params: &mut syn::punctuated::Punctuated<syn::GenericParam, syn::token::Comma>, -) -> FunctionInputs { - let mut implicit_lifetime_id: usize = 0; - - let (func_inputs, func_input_cuda_types): ( - syn::punctuated::Punctuated<syn::FnArg, syn::token::Comma>, - Vec<(InputCudaType, InputPtxJit)>, - ) = func - .sig - .inputs - .iter() - .enumerate() - .map(|(i, arg)| match arg { - receiver @ syn::FnArg::Receiver(_) => { - abort!(receiver.span(), "Kernel function must not have a receiver.") - }, - syn::FnArg::Typed( - input @ syn::PatType { - attrs, - pat, - colon_token, - ty, - }, - ) => { - let mut cuda_type: Option<InputCudaType> = None; - let mut ptx_jit: Option<InputPtxJit> = None; - - let attrs = attrs - .iter() - .filter(|attr| match attr.path.get_ident() { - Some(ident) if ident == "kernel" => { - let attrs: KernelInputAttributes = - match syn::parse_macro_input::parse(attr.tokens.clone().into()) { - Ok(data) => data, - Err(err) => abort!(attr.span(), err), - }; - - for attr in attrs { - match attr { - KernelInputAttribute::PassType(_span, pass_type) - if cuda_type.is_none() => - { - cuda_type = Some(pass_type); - }, - KernelInputAttribute::PassType(span, _pass_type) => { - abort!(span, "Duplicate CUDA transfer mode declaration."); - }, - KernelInputAttribute::PtxJit(span, jit) - if ptx_jit.is_none() => - { - if !matches!(&**ty, syn::Type::Reference(_)) && jit { - abort!( - span, - "Only reference types can be PTX JIT loaded." - ); - } - - ptx_jit = Some(InputPtxJit(jit)); - }, - KernelInputAttribute::PtxJit(span, _jit) => { - abort!(span, "Duplicate PTX JIT declaration."); - }, - } - } - - false - }, - _ => true, - }) - .cloned() - .collect(); - - let cuda_type = cuda_type.unwrap_or_else(|| { - abort!( - input.span(), - "Kernel function input must specify its CUDA transfer mode using \ - #[kernel(pass = ...)]."
- ); - }); - - let ty = ensure_reference_type_lifetime( - i, - ty, - &cuda_type, - &mut implicit_lifetime_id, - generic_params, - ); - - ( - syn::FnArg::Typed(syn::PatType { - attrs, - pat: pat.clone(), - colon_token: *colon_token, - ty, - }), - (cuda_type, ptx_jit.unwrap_or(InputPtxJit(false))), - ) - }, - }) - .unzip(); - - FunctionInputs { - func_inputs, - func_input_cuda_types, - } -} - -#[allow(clippy::unnecessary_box_returns)] -fn ensure_reference_type_lifetime( - i: usize, - ty: &syn::Type, - cuda_type: &InputCudaType, - implicit_lifetime_id: &mut usize, - generic_params: &mut syn::punctuated::Punctuated, -) -> Box { - match ty { - syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - elem, - }) => { - let lifetime = lifetime.clone().unwrap_or_else(|| { - let lifetime = syn::Lifetime::new( - &format!("'__r2c_lt_{implicit_lifetime_id}"), - lifetime.span(), - ); - - generic_params.insert( - *implicit_lifetime_id, - syn::GenericParam::Lifetime(syn::LifetimeDef { - attrs: Vec::new(), - lifetime: lifetime.clone(), - colon_token: None, - bounds: syn::punctuated::Punctuated::new(), - }), - ); - - *implicit_lifetime_id += 1; - - lifetime - }); - - let elem = if matches!(cuda_type, InputCudaType::LendRustToCuda) { - (|| { - if let syn::Type::Path(syn::TypePath { - path: syn::Path { segments, .. }, - qself: None, - }) = &**elem - { - if let Some(syn::PathSegment { - ident, - arguments: - syn::PathArguments::AngleBracketed( - syn::AngleBracketedGenericArguments { args, .. }, - ), - }) = segments.last() - { - if ident == "ShallowCopy" && segments.len() == 1 { - match args.last() { - Some(syn::GenericArgument::Type(elem)) if args.len() == 1 => { - return Box::new(elem.clone()); - }, - _ => { - abort!( - args.span(), - "`ShallowCopy` takes exactly one generic type \ - argument." - ); - }, - } - } - } - } - - emit_warning!( - elem.span(), - "RustToCuda kernel parameters should be explicitly wrapped with the \ - `ShallowCopy` marker to communicate their aliasing behaviour." 
- ); - - elem.clone() - })() - } else { - elem.clone() - }; - - Box::new(syn::Type::Reference(syn::TypeReference { - and_token: *and_token, - lifetime: Some(lifetime), - mutability: *mutability, - elem, - })) - }, - ty => { - if matches!(cuda_type, InputCudaType::LendRustToCuda) { - generic_params.insert( - *implicit_lifetime_id, - syn::GenericParam::Lifetime(syn::LifetimeDef { - attrs: Vec::new(), - lifetime: r2c_move_lifetime(i, ty), - colon_token: None, - bounds: syn::punctuated::Punctuated::new(), - }), - ); - - *implicit_lifetime_id += 1; - } - - Box::new(ty.clone()) - }, - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs deleted file mode 100644 index 6f63af892..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ /dev/null @@ -1,325 +0,0 @@ -use std::hash::{Hash, Hasher}; - -use proc_macro::TokenStream; - -mod config; -mod generate; -mod inputs; -mod parse; - -use config::KernelConfig; -use generate::{ - args_trait::quote_args_trait, cpu_linker_macro::quote_cpu_linker_macro, - cpu_wrapper::quote_cpu_wrapper, cuda_generic_function::quote_cuda_generic_function, - cuda_wrapper::quote_cuda_wrapper, -}; -use inputs::{parse_function_inputs, FunctionInputs}; -use parse::parse_kernel_fn; -use proc_macro2::Span; -use syn::spanned::Spanned; - -#[allow(clippy::too_many_lines)] -pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { - let mut hasher = seahash::SeaHasher::new(); - - attr.to_string().hash(&mut hasher); - func.to_string().hash(&mut hasher); - - let kernel_hash = hasher.finish(); - - let config: KernelConfig = match syn::parse_macro_input::parse(attr) { - Ok(config) => config, - Err(err) => { - abort_call_site!( - "#[kernel(pub? use LINKER! as impl KERNEL for LAUNCHER)] expects LINKER, \ - KERNEL, ARGS and LAUNCHER identifiers: {:?}", - err - ) - }, - }; - - let func = parse_kernel_fn(func); - - let mut generic_kernel_params = func.sig.generics.params.clone(); - let mut func_inputs = parse_function_inputs(&func, &mut generic_kernel_params); - - let (generic_start_token, generic_close_token) = if generic_kernel_params.is_empty() { - (None, None) - } else if let (Some(start), Some(close)) = - (func.sig.generics.lt_token, func.sig.generics.gt_token) - { - (Some(start), Some(close)) - } else { - (Some(syn::parse_quote!(<)), Some(syn::parse_quote!(>))) - }; - - let generic_trait_params = generic_kernel_params - .iter() - .filter(|generic_param| !matches!(generic_param, syn::GenericParam::Lifetime(_))) - .cloned() - .collect(); - let generic_wrapper_params = generic_kernel_params - .iter() - .filter(|generic_param| matches!(generic_param, syn::GenericParam::Lifetime(_))) - .cloned() - .collect(); - - let generic_kernel_where_clause = &func.sig.generics.where_clause; - let generic_trait_where_clause = generic_kernel_where_clause.as_ref().map( - |syn::WhereClause { - where_token, - predicates, - }: &syn::WhereClause| { - let predicates = predicates - .iter() - .filter(|predicate| !matches!(predicate, syn::WherePredicate::Lifetime(_))) - .cloned() - .collect(); - - syn::WhereClause { - where_token: *where_token, - predicates, - } - }, - ); - let generic_wrapper_where_clause = generic_kernel_where_clause.as_ref().map( - |syn::WhereClause { - where_token, - predicates, - }: &syn::WhereClause| { - let predicates = predicates - .iter() - .filter(|predicate| matches!(predicate, syn::WherePredicate::Lifetime(_))) - .cloned() - .collect(); - - syn::WhereClause { - where_token: *where_token, - predicates, - } - }, - ); 
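// The deleted `kernel` entry point above derives a crate-unique entry-point
// symbol by hashing the macro's raw input with seahash and appending the
// digest to the function name (see `func_ident_hash` below). A standalone
// sketch of that mangling scheme; `hashed_kernel_ident` is an invented helper,
// and seahash, quote, and proc-macro2 are assumed dependencies:
use std::hash::{Hash, Hasher};

fn hashed_kernel_ident(attr: &str, func: &str, ident: &str) -> proc_macro2::Ident {
    // Hash the attribute and function token text, as the macro does:
    let mut hasher = seahash::SeaHasher::new();
    attr.hash(&mut hasher);
    func.hash(&mut hasher);
    let kernel_hash = hasher.finish();

    // `<name>_` plus 16 hex digits gives a stable, collision-resistant symbol:
    quote::format_ident!("{}_{:016x}", ident, kernel_hash)
}

fn main() {
    // Identical macro input always produces the same mangled name:
    let a = hashed_kernel_ident("", "fn kernel(x: &u32) {}", "kernel");
    let b = hashed_kernel_ident("", "fn kernel(x: &u32) {}", "kernel");
    assert_eq!(a, b);
    println!("{a}");
}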
- - let decl_generics = DeclGenerics { - generic_start_token: &generic_start_token, - generic_trait_params: &generic_trait_params, - generic_close_token: &generic_close_token, - generic_trait_where_clause: &generic_trait_where_clause, - generic_wrapper_params: &generic_wrapper_params, - generic_wrapper_where_clause: &generic_wrapper_where_clause, - generic_kernel_params: &generic_kernel_params, - generic_kernel_where_clause, - }; - let trait_generics = syn::Generics { - lt_token: generic_start_token, - params: generic_trait_params.clone(), - gt_token: generic_close_token, - where_clause: generic_trait_where_clause.clone(), - }; - let impl_generics = { - let (impl_generics, ty_generics, where_clause) = trait_generics.split_for_impl(); - - ImplGenerics { - impl_generics, - ty_generics, - where_clause, - } - }; - - let func_ident = FuncIdent { - func_ident: &func.sig.ident, - func_ident_raw: quote::format_ident!("{}_raw", &func.sig.ident), - func_ident_hash: quote::format_ident!("{}_{:016x}", &func.sig.ident, kernel_hash), - }; - - let func_params = func_inputs - .func_inputs - .iter() - .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { pat, .. }) => match ident_from_pat(pat) { - Some(ident) => ident, - None => syn::Ident::new(&format!("{}_arg_{i}", func_ident.func_ident), pat.span()), - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect::>(); - - let pat_func_inputs = func_inputs - .func_inputs - .iter_mut() - .zip(&func_params) - .map(|(arg, ident)| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - colon_token, - ty, - .. - }) => { - let ident_fn_arg = syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: Box::new(syn::Pat::Ident(syn::PatIdent { - attrs: Vec::new(), - by_ref: None, - mutability: None, - ident: ident.clone(), - subpat: None, - })), - colon_token: *colon_token, - ty: ty.clone(), - }); - - std::mem::replace(arg, ident_fn_arg) - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect(); - - let args_trait = quote_args_trait(&config, &decl_generics, &impl_generics, &func_inputs); - let cpu_wrapper = quote_cpu_wrapper( - &config, - &decl_generics, - &impl_generics, - &func_inputs, - &func_ident, - &func.attrs, - ); - let cpu_cuda_check = quote_generic_check(&func_ident, &config); - let cpu_linker_macro = quote_cpu_linker_macro( - &config, - &decl_generics, - &func_inputs, - &func_ident, - &func_params, - &func.attrs, - ); - let cuda_wrapper = quote_cuda_wrapper( - &config, - &func_inputs, - &func_ident, - &func.attrs, - &func_params, - ); - let cuda_generic_function = quote_cuda_generic_function( - &decl_generics, - &pat_func_inputs, - &func_ident, - &func.attrs, - &func.block, - ); - - (quote! 
{ - #args_trait - #cpu_wrapper - - #cpu_cuda_check - - #cpu_linker_macro - - #cuda_wrapper - #cuda_generic_function - }) - .into() -} - -enum InputCudaType { - SafeDeviceCopy, - LendRustToCuda, -} - -struct InputPtxJit(bool); - -#[allow(clippy::struct_field_names)] -struct DeclGenerics<'f> { - generic_start_token: &'f Option, - generic_trait_params: &'f syn::punctuated::Punctuated, - generic_close_token: &'f Option, - generic_trait_where_clause: &'f Option, - generic_wrapper_params: &'f syn::punctuated::Punctuated, - generic_wrapper_where_clause: &'f Option, - generic_kernel_params: &'f syn::punctuated::Punctuated, - generic_kernel_where_clause: &'f Option, -} - -struct ImplGenerics<'f> { - #[allow(clippy::struct_field_names)] - impl_generics: syn::ImplGenerics<'f>, - ty_generics: syn::TypeGenerics<'f>, - where_clause: Option<&'f syn::WhereClause>, -} - -#[allow(clippy::struct_field_names)] -struct FuncIdent<'f> { - func_ident: &'f syn::Ident, - func_ident_raw: syn::Ident, - func_ident_hash: syn::Ident, -} - -fn ident_from_pat(pat: &syn::Pat) -> Option { - match pat { - syn::Pat::Lit(_) - | syn::Pat::Macro(_) - | syn::Pat::Path(_) - | syn::Pat::Range(_) - | syn::Pat::Rest(_) - | syn::Pat::Verbatim(_) - | syn::Pat::Wild(_) => None, - syn::Pat::Ident(syn::PatIdent { ident, .. }) => Some(ident.clone()), - syn::Pat::Box(syn::PatBox { pat, .. }) - | syn::Pat::Reference(syn::PatReference { pat, .. }) - | syn::Pat::Type(syn::PatType { pat, .. }) => ident_from_pat(pat), - syn::Pat::Or(syn::PatOr { cases, .. }) => ident_from_pat_iter(cases.iter()), - syn::Pat::Slice(syn::PatSlice { elems, .. }) - | syn::Pat::TupleStruct(syn::PatTupleStruct { - pat: syn::PatTuple { elems, .. }, - .. - }) - | syn::Pat::Tuple(syn::PatTuple { elems, .. }) => ident_from_pat_iter(elems.iter()), - syn::Pat::Struct(syn::PatStruct { fields, .. }) => { - ident_from_pat_iter(fields.iter().map(|field| &*field.pat)) - }, - _ => Err(()).ok(), - } -} - -fn ident_from_pat_iter<'p, I: Iterator>(iter: I) -> Option { - iter.filter_map(ident_from_pat) - .fold(None, |acc: Option<(String, Span)>, ident| { - if let Some((mut str_acc, span_acc)) = acc { - str_acc.push('_'); - str_acc.push_str(ident.to_string().trim_matches('_')); - - Some((str_acc, span_acc.join(ident.span()).unwrap())) - } else { - Some((ident.to_string(), ident.span())) - } - }) - .map(|(string, span)| syn::Ident::new(&string, span)) -} - -fn quote_generic_check( - FuncIdent { - func_ident_hash, .. - }: &FuncIdent, - KernelConfig { args, .. }: &KernelConfig, -) -> proc_macro2::TokenStream { - let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { - Ok(crate_name) => crate_name.to_uppercase(), - Err(err) => abort_call_site!("Failed to read crate name: {:?}.", err), - }; - - let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") - .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); - - quote::quote_spanned! 
{ func_ident_hash.span()=> - #[cfg(not(target_os = "cuda"))] - const _: ::rust_cuda::safety::kernel_signature::Assert<{ - ::rust_cuda::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = ::rust_cuda::safety::kernel_signature::Assert::<{ - ::rust_cuda::safety::kernel_signature::check( - rust_cuda::host::check_kernel!(#args #crate_name #crate_manifest_dir).as_bytes(), - concat!(".visible .entry ", stringify!(#func_ident_hash)).as_bytes() - ) - }>; - } -} diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index d5d8f3018..514bbf66e 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -1,7 +1,46 @@ -#![deny(clippy::pedantic)] -#![feature(box_patterns)] -#![feature(proc_macro_tracked_env)] -#![feature(proc_macro_span)] +//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License +//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod +//! Ready-to-Code]][gitpod] +//! +//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main +//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +//! +//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +//! [repo]: https://github.com/juntyr/rust-cuda +//! +//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue +//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda_derive/ +//! +//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield +//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield +//! +//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx +//! [codecov]: https://codecov.io/gh/juntyr/rust-cuda +//! +//! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod +//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda +//! +//! `rust-cuda-derive` provides the +//! [`#[derive(LendRustToCuda)]`](LendRustToCuda) derive macro for the +//! [`rust_cuda::lend::RustToCuda`] +//! utility trait, which enables the usage of the +//! [`rust_cuda::lend::LendToCuda`] +//! trait that allows Rust data structures to be shared with CUDA kernels. +//! +//! The async variants of both traits are *optionally* implemented as well. +//! +//! [`rust_cuda::lend::RustToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html +//! 
[`rust_cuda::lend::LendToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.LendToCuda.html + +#![deny(clippy::complexity)] +#![deny(clippy::correctness)] +#![warn(clippy::nursery)] +#![warn(clippy::pedantic)] +#![deny(clippy::perf)] +#![deny(clippy::style)] +#![deny(clippy::suspicious)] +#![deny(unsafe_code)] +#![deny(missing_docs)] #![feature(if_let_guard)] #![feature(let_chains)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] @@ -13,16 +52,68 @@ extern crate proc_macro_error; use proc_macro::TokenStream; -mod kernel; mod rust_to_cuda; -// cargo expand --target x86_64-unknown-linux-gnu --ugly \ -// | rustfmt --config max_width=160 > out.rs -// cargo expand --target nvptx64-nvidia-cuda --ugly \ -// | rustfmt --config max_width=160 > out.rs - #[proc_macro_error] #[proc_macro_derive(LendRustToCuda, attributes(cuda))] +/// Provides the [`#[derive(LendRustToCuda)]`](LendRustToCuda) +/// derive macro for the +/// [`rust_cuda::lend::RustToCuda`] +/// utility trait, which enables the usage of the +/// [`rust_cuda::lend::LendToCuda`] +/// trait that allows Rust data structures to be shared with CUDA kernels. +/// +/// At the moment, only +/// [`struct`](https://doc.rust-lang.org/std/keyword.struct.html)s are supported +/// by this derive macro. +/// +/// The derive also accepts a `#[cuda(...)]` attribute. You can annotate the +/// entire struct with the `#[cuda(...)]` attribute to configure the implementation as +/// follows: +/// +/// - `#[cuda(crate = "<crate-path>")]` changes the path to the [`rust-cuda`] +/// crate that the derive uses, which by default is `rust_cuda`. +/// - `#[cuda(bound = "<where-predicate>")]` adds the provided predicate to the +/// where clause of the trait implementation. +/// - `#[cuda(free = "<type>")]` removes the auto-added trait bounds for the +/// type parameter `<type>` from the trait implementation, e.g. when +/// implementing a wrapper around [`std::marker::PhantomData`] which should +/// implement the trait for any `T`. +/// - `#[cuda(async = <bool>)]` explicitly enables or disables the async +/// implementation of the trait, [`rust_cuda::lend::RustToCudaAsync`]. By +/// default, `#[cuda(async = true)]` is set. +/// - `#[cuda(layout::ATTR = "VALUE")]` adds the `#[layout(ATTR = "VALUE")]` +/// attribute to the [`#[derive(const_type_layout::TypeLayout)]`] derive for +/// this struct's [`rust_cuda::lend::RustToCuda::CudaRepresentation`]. +/// - `#[cuda(ignore)]` removes all subsequent attributes from the generated +/// [`rust_cuda::lend::RustToCuda::CudaRepresentation`] struct. +/// +/// Additionally, the `#[cuda(...)]` attribute can also be applied individually +/// to the fields of the struct to customise the implementation as follows: +/// +/// - `#[cuda(embed)]` signals that this field has a non-identity CUDA +/// representation and should be embedded by using the +/// [`rust_cuda::lend::RustToCuda`] implementation of this field's type. When +/// this attribute is not specified, the field must instead implement +/// [`Copy`], [`rust_cuda::safety::PortableBitSemantics`], and +/// [`const_type_layout::TypeGraphLayout`]. +/// - `#[cuda(embed = "<proxy-type>")]` works like `#[cuda(embed)]` but can be +/// used when the field's type does not implement +/// [`rust_cuda::lend::RustToCuda`] itself, but some `<proxy-type>` exists, +/// which implements [`rust_cuda::lend::RustToCudaProxy`] for the field's +/// type. +/// - `#[cuda(ignore)]` removes all subsequent attributes from this field in the +/// generated [`rust_cuda::lend::RustToCuda::CudaRepresentation`] struct.
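// An illustrative combination of the attributes documented above; the type
// and field names are invented, and this assumes that the derive and traits
// are re-exported at `rust_cuda::lend` as in the links below:
#[derive(rust_cuda::lend::LendRustToCuda)]
#[cuda(crate = "rust_cuda", async = false)]
struct Simulation<T: rust_cuda::lend::RustToCuda> {
    // Embedded through T's own RustToCuda implementation:
    #[cuda(embed)]
    habitat: T,
    // No attribute: must be Copy + PortableBitSemantics + TypeGraphLayout:
    step_count: u64,
}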
+/// +/// [`rust_cuda::lend::RustToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html +/// [`rust_cuda::lend::LendToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.LendToCuda.html +/// [`rust-cuda`]: https://juntyr.github.io/rust-cuda/rust_cuda +/// [`rust_cuda::lend::RustToCudaAsync`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCudaAsync.html +/// [`#[derive(const_type_layout::TypeLayout)]`]: https://docs.rs/const-type-layout/0.2.1/const_type_layout/derive.TypeLayout.html +/// [`rust_cuda::lend::RustToCuda::CudaRepresentation`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html#associatedtype.CudaRepresentation +/// [`rust_cuda::safety::PortableBitSemantics`]: https://juntyr.github.io/rust-cuda/rust_cuda/safety/trait.PortableBitSemantics.html +/// [`const_type_layout::TypeGraphLayout`]: https://docs.rs/const-type-layout/0.2.1/const_type_layout/trait.TypeGraphLayout.html +/// [`rust_cuda::lend::RustToCudaProxy`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCudaProxy.html pub fn rust_to_cuda_derive(input: TokenStream) -> TokenStream { // Note: We cannot report a more precise span yet let ast = match syn::parse(input) { @@ -33,44 +124,3 @@ pub fn rust_to_cuda_derive(input: TokenStream) -> TokenStream { // Build the implementation of the `RustToCuda` and `CudaAsRust` traits rust_to_cuda::impl_rust_to_cuda(&ast) } - -#[proc_macro_error] -#[proc_macro_attribute] -pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { - kernel::wrapper::kernel(attr, func) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { - kernel::specialise::ty::specialise_kernel_type(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { - kernel::specialise::call::specialise_kernel_call(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro_attribute] -pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStream { - kernel::specialise::entry::specialise_kernel_entry(attr, func) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn check_kernel(tokens: TokenStream) -> TokenStream { - kernel::link::check_kernel(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn link_kernel(tokens: TokenStream) -> TokenStream { - kernel::link::link_kernel(tokens) -} diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 0ddca9b28..18fd867c1 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -1,37 +1,55 @@ use proc_macro2::TokenStream; use quote::{format_ident, quote, ToTokens}; -use super::field_ty::CudaReprFieldTy; +use crate::rust_to_cuda::field_ty::CudaReprFieldTy; #[allow(clippy::too_many_arguments, clippy::too_many_lines)] pub fn impl_field_copy_init_and_expand_alloc_type( + crate_path: &syn::Path, field: &syn::Field, field_index: usize, cuda_repr_field_ty: &CudaReprFieldTy, mut combined_cuda_alloc_type: TokenStream, + mut combined_cuda_alloc_async_type: TokenStream, r2c_field_declarations: &mut Vec<TokenStream>, + r2c_field_async_declarations: &mut Vec<TokenStream>, + r2c_field_async_completions: &mut Vec<syn::Ident>, r2c_field_initialisations: &mut Vec<TokenStream>, r2c_field_destructors: &mut Vec<TokenStream>, + r2c_field_async_destructors: &mut Vec<TokenStream>, + r2c_field_async_completion_calls: &mut Vec<TokenStream>, c2r_field_initialisations: &mut Vec<TokenStream>, -) ->
TokenStream { +) -> (TokenStream, TokenStream) { + #[allow(clippy::option_if_let_else)] let field_accessor = match &field.ident { Some(ident) => quote! { #ident }, None => proc_macro2::Literal::usize_unsuffixed(field_index).to_token_stream(), }; + #[allow(clippy::option_if_let_else)] let field_repr_ident = match &field.ident { Some(ident) => format_ident!("field_{}_repr", ident), None => format_ident!("field_{}_repr", field_index), }; + #[allow(clippy::option_if_let_else)] + let field_completion_ident = match &field.ident { + Some(ident) => format_ident!("field_{}_completion", ident), + None => format_ident!("field_{}_completion", field_index), + }; let optional_field_ident = field.ident.as_ref().map(|ident| quote! { #ident: }); match cuda_repr_field_ty { CudaReprFieldTy::SafeDeviceCopy => { r2c_field_declarations.push(quote! { - let #field_repr_ident = rust_cuda::common::DeviceAccessible::from( + let #field_repr_ident = #crate_path::utils::ffi::DeviceAccessible::from( + &self.#field_accessor, + ); + }); + r2c_field_async_declarations.push(quote! { + let #field_repr_ident = #crate_path::utils::ffi::DeviceAccessible::from( &self.#field_accessor, ); }); @@ -42,23 +60,37 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! { #optional_field_ident { - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor).into_inner() + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor).into_inner() }, }); }, CudaReprFieldTy::RustToCuda { field_ty } => { combined_cuda_alloc_type = quote! { - rust_cuda::host::CombinedCudaAlloc< - <#field_ty as rust_cuda::common::RustToCuda>::CudaAllocation, + #crate_path::alloc::CombinedCudaAlloc< + <#field_ty as #crate_path::lend::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; + combined_cuda_alloc_async_type = quote! { + #crate_path::alloc::CombinedCudaAlloc< + <#field_ty as #crate_path::lend::RustToCudaAsync>::CudaAllocationAsync, + #combined_cuda_alloc_async_type + > + }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( + &self.#field_accessor, + alloc_front, + )?; + }); + r2c_field_async_declarations.push(quote! { + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCudaAsync::borrow_async( &self.#field_accessor, alloc_front, + stream, )?; + let (#field_repr_ident, #field_completion_ident) = #field_repr_ident.unwrap_unchecked()?; }); r2c_field_initialisations.push(quote! { @@ -66,33 +98,72 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCuda::restore( + let alloc_front = #crate_path::lend::RustToCuda::restore( &mut self.#field_accessor, alloc_front, )?; }); + r2c_field_async_destructors.push(quote! { + let this_backup = unsafe { + ::core::mem::ManuallyDrop::new(::core::ptr::read(&this)) + }; + let (r#async, alloc_front) = #crate_path::lend::RustToCudaAsync::restore_async( + this.map_mut(|this| &mut this.#field_accessor), + alloc_front, + stream, + )?; + let (value, #field_completion_ident) = r#async.unwrap_unchecked()?; + ::core::mem::forget(value); + let this = ::core::mem::ManuallyDrop::into_inner(this_backup); + }); + + r2c_field_async_completion_calls.push(quote! 
{ + #crate_path::utils::r#async::Completion::< + #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, _> + >::complete( + #field_completion_ident, &mut this.#field_accessor, + )?; + }); + + r2c_field_async_completions.push(field_completion_ident); c2r_field_initialisations.push(quote! { #optional_field_ident { - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor) }, }); }, CudaReprFieldTy::RustToCudaProxy { proxy_ty, field_ty } => { combined_cuda_alloc_type = quote! { - rust_cuda::host::CombinedCudaAlloc< - <#proxy_ty as rust_cuda::common::RustToCuda>::CudaAllocation, + #crate_path::alloc::CombinedCudaAlloc< + <#proxy_ty as #crate_path::lend::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; + combined_cuda_alloc_async_type = quote! { + #crate_path::alloc::CombinedCudaAlloc< + <#proxy_ty as #crate_path::lend::RustToCudaAsync>::CudaAllocationAsync, + #combined_cuda_alloc_async_type + > + }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( + < + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> + >::from_ref(&self.#field_accessor), + alloc_front, + )?; + }); + r2c_field_async_declarations.push(quote! { + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCudaAsync::borrow_async( < - #proxy_ty as rust_cuda::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, + stream, )?; + let (#field_repr_ident, #field_completion_ident) = #field_repr_ident.unwrap_unchecked()?; }); r2c_field_initialisations.push(quote! { @@ -100,23 +171,50 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCuda::restore( + let alloc_front = #crate_path::lend::RustToCuda::restore( < - #proxy_ty as rust_cuda::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, )?; }); + r2c_field_async_destructors.push(quote! { + let this_backup = unsafe { + ::core::mem::ManuallyDrop::new(::core::ptr::read(&this)) + }; + let (r#async, alloc_front) = #crate_path::lend::RustToCudaAsync::restore_async( + this.map_mut(|this| < + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> + >::from_mut(&mut this.#field_accessor)), + alloc_front, + stream, + )?; + let (value, #field_completion_ident) = r#async.unwrap_unchecked()?; + ::core::mem::forget(value); + let this = ::core::mem::ManuallyDrop::into_inner(this_backup); + }); + + r2c_field_async_completion_calls.push(quote! { + #crate_path::utils::r#async::Completion::< + #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, _> + >::complete( + #field_completion_ident, < + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> + >::from_mut(&mut this.#field_accessor), + )?; + }); + + r2c_field_async_completions.push(field_completion_ident); c2r_field_initialisations.push(quote! 
{ #optional_field_ident { - rust_cuda::common::RustToCudaProxy::<#field_ty>::into( - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::lend::RustToCudaProxy::<#field_ty>::into( + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor) ) }, }); }, } - combined_cuda_alloc_type + (combined_cuda_alloc_type, combined_cuda_alloc_async_type) } diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index 8416d3c17..c9fe48b77 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -12,7 +12,10 @@ pub enum CudaReprFieldTy { }, } -pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprFieldTy { +pub fn swap_field_type_and_filter_attrs( + crate_path: &syn::Path, + field: &mut syn::Field, +) -> CudaReprFieldTy { let mut cuda_repr_field_ty: Option = None; let mut field_ty = field.ty.clone(); @@ -33,8 +36,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField field_ty: Box::new(field_ty.clone()), }); field_ty = parse_quote! { - rust_cuda::common::DeviceAccessible< - <#field_ty as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::utils::ffi::DeviceAccessible< + <#field_ty as #crate_path::lend::RustToCuda>::CudaRepresentation > }; } else { @@ -54,8 +57,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField Ok(proxy_ty) => { let old_field_ty = Box::new(field_ty.clone()); field_ty = parse_quote! { - rust_cuda::common::DeviceAccessible< - <#proxy_ty as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::utils::ffi::DeviceAccessible< + <#proxy_ty as #crate_path::lend::RustToCuda>::CudaRepresentation > }; cuda_repr_field_ty = Some(CudaReprFieldTy::RustToCudaProxy { @@ -66,7 +69,7 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField Err(err) => emit_error!( s.span(), "[rust-cuda]: Invalid #[cuda(embed = \ - \"\")] field attribute: {}.", + \"\")] field attribute: {}.", err ), } @@ -80,8 +83,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cdua(embed)] / \ - #[cuda(embed = \"\")] field attribute" + "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ + #[cuda(embed = \"\")] field attribute" ); } } @@ -89,8 +92,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cdua(embed)] / \ - #[cuda(embed = \"\")] field attribute." + "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ + #[cuda(embed = \"\")] field attribute." ); } @@ -100,12 +103,13 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField } }); + #[allow(clippy::option_if_let_else)] let cuda_repr_field_ty = if let Some(cuda_repr_field_ty) = cuda_repr_field_ty { cuda_repr_field_ty } else { field_ty = parse_quote! 
{ - rust_cuda::common::DeviceAccessible< - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#field_ty> + #crate_path::utils::ffi::DeviceAccessible< + #crate_path::utils::adapter::RustToCudaWithPortableBitCopySemantics<#field_ty> > }; diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index 8b21246d2..f090f5c70 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -4,7 +4,14 @@ use syn::spanned::Spanned; #[allow(clippy::too_many_lines)] pub fn expand_cuda_struct_generics_where_requested_in_attrs( ast: &syn::DeriveInput, -) -> (Vec, syn::Generics, Vec) { +) -> ( + Vec, + syn::Generics, + syn::Generics, + Vec, + bool, + syn::Path, +) { let mut type_params = ast .generics .type_params() @@ -13,6 +20,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( let mut struct_attrs_cuda = ast.attrs.clone(); let mut struct_generics_cuda = ast.generics.clone(); + let mut struct_generics_cuda_async = ast.generics.clone(); let mut struct_layout_attrs = Vec::new(); for ty in &type_params { @@ -23,6 +31,8 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } let mut r2c_ignore = false; + let mut r2c_async_impl = None; + let mut crate_path = None; struct_attrs_cuda.retain(|attr| { if attr.path.is_ident("cuda") { @@ -36,11 +46,17 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( path, lit: syn::Lit::Str(s), .. - })) if path.is_ident("bound") => match syn::parse_str(&s.value()) { - Ok(bound) => struct_generics_cuda - .make_where_clause() - .predicates - .push(bound), + })) if path.is_ident("bound") => match syn::parse_str::(&s.value()) { + Ok(bound) => { + struct_generics_cuda + .make_where_clause() + .predicates + .push(bound.clone()); + struct_generics_cuda_async + .make_where_clause() + .predicates + .push(bound); + }, Err(err) => emit_error!( s.span(), "[rust-cuda]: Invalid #[cuda(bound = \"\")] \ @@ -78,11 +94,46 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } }, syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { - path: - syn::Path { - leading_colon: None, - segments, - }, + path, + lit: syn::Lit::Bool(b), + .. + })) if path.is_ident("async") => if r2c_async_impl.is_none() { + r2c_async_impl = Some(b.value()); + } else { + emit_error!( + b.span(), + "[rust-cuda]: Duplicate #[cuda(async)] attribute.", + ); + }, + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path, + lit: syn::Lit::Str(s), + .. + })) if path.is_ident("crate") => match syn::parse_str::(&s.value()) { + Ok(new_crate_path) => { + if crate_path.is_none() { + crate_path = Some( + syn::parse_quote_spanned! { s.span() => #new_crate_path }, + ); + } else { + emit_error!( + s.span(), + "[rust-cuda]: Duplicate #[cuda(crate)] attribute.", + ); + } + }, + Err(err) => emit_error!( + s.span(), + "[rust-cuda]: Invalid #[cuda(crate = \ + \"\")] attribute: {}.", + err + ), + }, + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path: syn::Path { + leading_colon: None, + segments, + }, lit: syn::Lit::Str(s), .. })) if segments.len() == 2 @@ -108,9 +159,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(bound = \ - \"\")] / #[cuda(layout::ATTR = \"VALUE\")] \ - struct attribute." 
+ "[rust-cuda]: Expected #[cuda(crate = \"\")] / #[cuda(bound = \"\")] / #[cuda(free = \"\")] / #[cuda(async = )] / #[cuda(layout::ATTR = \"VALUE\")] / #[cuda(ignore)] struct attribute." ); }, } @@ -118,8 +167,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(bound = \ - \"\")] / #[cuda(layout::ATTR = \"VALUE\")] struct attribute." + "[rust-cuda]: Expected #[cuda(crate = \"\")] / #[cuda(bound = \"\")] / #[cuda(free = \"\")] / #[cuda(async = )] / #[cuda(layout::ATTR = \"VALUE\")] / #[cuda(ignore)] struct attribute." ); } @@ -129,14 +177,29 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } }); + let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda)); + for ty in &type_params { struct_generics_cuda .make_where_clause() .predicates .push(syn::parse_quote! { - #ty: ::rust_cuda::common::RustToCuda + #ty: #crate_path::lend::RustToCuda + }); + struct_generics_cuda_async + .make_where_clause() + .predicates + .push(syn::parse_quote! { + #ty: #crate_path::lend::RustToCudaAsync }); } - (struct_attrs_cuda, struct_generics_cuda, struct_layout_attrs) + ( + struct_attrs_cuda, + struct_generics_cuda, + struct_generics_cuda_async, + struct_layout_attrs, + r2c_async_impl.unwrap_or(true), + crate_path, + ) } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 8b99e4f73..e0a67b7e3 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -1,7 +1,9 @@ use proc_macro2::TokenStream; use quote::quote; +#[allow(clippy::too_many_arguments)] pub fn cuda_struct_declaration( + crate_path: &syn::Path, struct_attrs_cuda: &[syn::Attribute], struct_layout_attrs: &[syn::Attribute], struct_vis_cuda: &syn::Visibility, @@ -10,7 +12,7 @@ pub fn cuda_struct_declaration( struct_fields_cuda: &syn::Fields, struct_semi_cuda: Option, ) -> TokenStream { - let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); + let (_impl_generics, _ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); let struct_repr = if struct_attrs_cuda .iter() @@ -21,29 +23,30 @@ pub fn cuda_struct_declaration( quote! { #[repr(C)] } }; + #[allow(clippy::option_if_let_else)] let struct_fields_where_clause = if let Some(struct_semi_cuda) = struct_semi_cuda { quote!(#struct_fields_cuda #where_clause #struct_semi_cuda) } else { quote!(#where_clause #struct_fields_cuda) }; + let const_type_layout_crate_path = quote! { #crate_path::deps::const_type_layout }.to_string(); + quote! { #[allow(dead_code)] #[doc(hidden)] #(#struct_attrs_cuda)* - #[derive(rust_cuda::const_type_layout::TypeLayout)] + #[derive(#crate_path::deps::const_type_layout::TypeLayout)] #struct_repr #(#struct_layout_attrs)* + #[layout(crate = #const_type_layout_crate_path)] #struct_vis_cuda struct #struct_name_cuda #struct_generics_cuda #struct_fields_where_clause - - // #[derive(DeviceCopy)] can interfer with type parameters - unsafe impl #impl_generics rust_cuda::rustacuda_core::DeviceCopy - for #struct_name_cuda #ty_generics #where_clause {} } } #[allow(clippy::too_many_arguments)] pub fn rust_to_cuda_trait( + crate_path: &syn::Path, struct_name: &syn::Ident, struct_name_cuda: &syn::Ident, struct_generics_cuda: &syn::Generics, @@ -70,22 +73,22 @@ pub fn rust_to_cuda_trait( let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); quote! 
 #[allow(clippy::too_many_arguments)]
 pub fn rust_to_cuda_trait(
+    crate_path: &syn::Path,
     struct_name: &syn::Ident,
     struct_name_cuda: &syn::Ident,
     struct_generics_cuda: &syn::Generics,
@@ -70,22 +73,22 @@ pub fn rust_to_cuda_trait(
     let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl();
 
     quote!
{
-    unsafe impl #impl_generics rust_cuda::common::RustToCuda for #struct_name #ty_generics
+    unsafe impl #impl_generics #crate_path::lend::RustToCuda for #struct_name #ty_generics
         #where_clause
     {
         type CudaRepresentation = #struct_name_cuda #ty_generics;
 
-        #[cfg(not(target_os = "cuda"))]
         type CudaAllocation = #combined_cuda_alloc_type;
 
         #[cfg(not(target_os = "cuda"))]
-        unsafe fn borrow<CudaAllocType: rust_cuda::host::CudaAlloc>(
-            &self, alloc: CudaAllocType
-        ) -> rust_cuda::rustacuda::error::CudaResult<(
-            rust_cuda::common::DeviceAccessible<Self::CudaRepresentation>,
-            rust_cuda::host::CombinedCudaAlloc<Self::CudaAllocation, CudaAllocType>
+        unsafe fn borrow<CudaAllocType: #crate_path::alloc::CudaAlloc>(
+            &self,
+            alloc: CudaAllocType,
+        ) -> #crate_path::deps::rustacuda::error::CudaResult<(
+            #crate_path::utils::ffi::DeviceAccessible<Self::CudaRepresentation>,
+            #crate_path::alloc::CombinedCudaAlloc<Self::CudaAllocation, CudaAllocType>
         )> {
-            let alloc_front = rust_cuda::host::NullCudaAlloc;
+            let alloc_front = #crate_path::alloc::NoCudaAlloc;
             let alloc_tail = alloc;
 
             #(#r2c_field_declarations)*
@@ -93,18 +96,18 @@ pub fn rust_to_cuda_trait(
             let borrow = #rust_to_cuda_struct_construction;
 
             Ok((
-                rust_cuda::common::DeviceAccessible::from(borrow),
-                rust_cuda::host::CombinedCudaAlloc::new(alloc_front, alloc_tail)
+                #crate_path::utils::ffi::DeviceAccessible::from(borrow),
+                #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail)
             ))
         }
 
         #[cfg(not(target_os = "cuda"))]
-        unsafe fn restore<CudaAllocType: rust_cuda::host::CudaAlloc>(
+        unsafe fn restore<CudaAllocType: #crate_path::alloc::CudaAlloc>(
             &mut self,
-            alloc: rust_cuda::host::CombinedCudaAlloc<
+            alloc: #crate_path::alloc::CombinedCudaAlloc<
                 Self::CudaAllocation, CudaAllocType
             >,
-        ) -> rust_cuda::rustacuda::error::CudaResult<CudaAllocType> {
+        ) -> #crate_path::deps::rustacuda::error::CudaResult<CudaAllocType> {
             let (alloc_front, alloc_tail) = alloc.split();
 
             #(#r2c_field_destructors)*
@@ -115,7 +118,130 @@ pub fn rust_to_cuda_trait(
     }
 }
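// Example (sketch): the impl emitted by `rust_to_cuda_trait` for the
// hypothetical `Point` above has this outline; `A` abbreviates the
// generated `CudaAllocType` parameter and the elided bodies stand in
// for the per-field declarations and destructors:
//
//     unsafe impl RustToCuda for Point {
//         type CudaRepresentation = PointCudaRepresentation;
//         type CudaAllocation = /* #combined_cuda_alloc_type */;
//
//         unsafe fn borrow<A: CudaAlloc>(&self, alloc: A)
//             -> CudaResult<(DeviceAccessible<Self::CudaRepresentation>,
//                            CombinedCudaAlloc<Self::CudaAllocation, A>)>
//         { /* per-field declarations, then struct construction */ }
//
//         unsafe fn restore<A: CudaAlloc>(&mut self, alloc: /* .. */)
//             -> CudaResult<A>
//         { /* per-field destructors, in reverse order */ }
//     }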
+#[allow(clippy::too_many_arguments)]
+pub fn rust_to_cuda_async_trait(
+    crate_path: &syn::Path,
+    struct_name: &syn::Ident,
+    struct_name_cuda: &syn::Ident,
+    struct_generics_cuda_async: &syn::Generics,
+    struct_fields_cuda: &syn::Fields,
+    combined_cuda_alloc_async_type: &TokenStream,
+    r2c_field_async_declarations: &[TokenStream],
+    r2c_field_async_completions: &[syn::Ident],
+    r2c_field_initialisations: &[TokenStream],
+    r2c_field_async_destructors: &[TokenStream],
+    r2c_field_async_completion_calls: &[TokenStream],
+) -> TokenStream {
+    let rust_to_cuda_struct_construction = match struct_fields_cuda {
+        syn::Fields::Named(_) => quote! {
+            #struct_name_cuda {
+                #(#r2c_field_initialisations)*
+            }
+        },
+        syn::Fields::Unnamed(_) => quote! {
+            #struct_name_cuda (
+                #(#r2c_field_initialisations)*
+            )
+        },
+        syn::Fields::Unit => quote! { #struct_name_cuda },
+    };
+
+    let async_borrow_completion = if r2c_field_async_completions.is_empty() {
+        quote! { #crate_path::utils::r#async::Async::ready(borrow, stream) }
+    } else {
+        quote! {
+            if #(#r2c_field_async_completions.is_none())&&* {
+                #crate_path::utils::r#async::Async::ready(borrow, stream)
+            } else {
+                #crate_path::utils::r#async::Async::pending(
+                    borrow, stream, #crate_path::utils::r#async::NoCompletion,
+                )?
+            }
+        }
+    };
+
+    let async_restore_completion = if r2c_field_async_completions.is_empty() {
+        quote! { #crate_path::utils::r#async::Async::ready(this, stream) }
+    } else {
+        quote! {
+            if #(#r2c_field_async_completions.is_none())&&* {
+                #crate_path::utils::r#async::Async::ready(this, stream)
+            } else {
+                #crate_path::utils::r#async::Async::<
+                    _, #crate_path::utils::r#async::CompletionFnMut<Self>,
+                >::pending(
+                    this, stream, #crate_path::deps::alloc::boxed::Box::new(|this| {
+                        #(#r2c_field_async_completion_calls)*
+                        Ok(())
+                    }),
+                )?
+            }
+        }
+    };
+
+    let (impl_generics, ty_generics, where_clause) = struct_generics_cuda_async.split_for_impl();
+
+    quote! {
+        unsafe impl #impl_generics #crate_path::lend::RustToCudaAsync for #struct_name #ty_generics
+            #where_clause
+        {
+            type CudaAllocationAsync = #combined_cuda_alloc_async_type;
+
+            #[cfg(not(target_os = "cuda"))]
+            unsafe fn borrow_async<'stream, CudaAllocType: #crate_path::alloc::CudaAlloc>(
+                &self,
+                alloc: CudaAllocType,
+                stream: #crate_path::host::Stream<'stream>,
+            ) -> #crate_path::deps::rustacuda::error::CudaResult<(
+                #crate_path::utils::r#async::Async<
+                    '_, 'stream,
+                    #crate_path::utils::ffi::DeviceAccessible<Self::CudaRepresentation>,
+                >,
+                #crate_path::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, CudaAllocType>,
+            )> {
+                let alloc_front = #crate_path::alloc::NoCudaAlloc;
+                let alloc_tail = alloc;
+
+                #(#r2c_field_async_declarations)*
+
+                let borrow = #rust_to_cuda_struct_construction;
+                let borrow = #crate_path::utils::ffi::DeviceAccessible::from(borrow);
+
+                let r#async = #async_borrow_completion;
+                let alloc = #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail);
+
+                Ok((r#async, alloc))
+            }
+
+            #[cfg(not(target_os = "cuda"))]
+            unsafe fn restore_async<'a, 'stream, CudaAllocType: #crate_path::alloc::CudaAlloc, CudaRestoreOwner>(
+                this: #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, Self>,
+                alloc: #crate_path::alloc::CombinedCudaAlloc<
+                    Self::CudaAllocationAsync, CudaAllocType
+                >,
+                stream: #crate_path::host::Stream<'stream>,
+            ) -> #crate_path::deps::rustacuda::error::CudaResult<(
+                #crate_path::utils::r#async::Async<
+                    'a, 'stream,
+                    #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, Self>,
+                    #crate_path::utils::r#async::CompletionFnMut<'a, Self>,
+                >,
+                CudaAllocType,
+            )> {
+                let (alloc_front, alloc_tail) = alloc.split();
+
+                #(#r2c_field_async_destructors)*
+
+                let r#async = #async_restore_completion;
+
+                Ok((r#async, alloc_tail))
+            }
+        }
+    }
+}
 
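// Completion semantics (sketch): a field only contributes an entry to
// `r2c_field_async_completions` if its transfer can still be in flight.
// The generated check therefore degenerates to an immediate
// `Async::ready` when every completion slot is `None`; for a single
// hypothetical field `a` the borrow side reads roughly:
//
//     if a_completion.is_none() {
//         Async::ready(borrow, stream)
//     } else {
//         Async::pending(borrow, stream, NoCompletion)?
//     }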
 pub fn cuda_as_rust_trait(
+    crate_path: &syn::Path,
     struct_name: &syn::Ident,
     struct_name_cuda: &syn::Ident,
     struct_generics_cuda: &syn::Generics,
@@ -139,14 +265,14 @@ pub fn cuda_as_rust_trait(
     let (impl_generics, ty_generics, where_clause) = &struct_generics_cuda.split_for_impl();
 
     quote!
{
-    unsafe impl #impl_generics rust_cuda::common::CudaAsRust
+    unsafe impl #impl_generics #crate_path::lend::CudaAsRust
         for #struct_name_cuda #ty_generics #where_clause
     {
         type RustRepresentation = #struct_name #ty_generics;
 
         #[cfg(target_os = "cuda")]
         unsafe fn as_rust(
-            this: &rust_cuda::common::DeviceAccessible<Self>,
+            this: &#crate_path::utils::ffi::DeviceAccessible<Self>,
         ) -> #struct_name #ty_generics {
             #cuda_as_rust_struct_construction
         }
diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs
index 18589b78a..615c81edf 100644
--- a/rust-cuda-derive/src/rust_to_cuda/mod.rs
+++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs
@@ -10,7 +10,7 @@ fn get_cuda_repr_ident(rust_repr_ident: &proc_macro2::Ident) -> proc_macro2::Ide
     format_ident!("{}CudaRepresentation", rust_repr_ident)
 }
 
-#[allow(clippy::module_name_repetitions)]
+#[allow(clippy::module_name_repetitions, clippy::too_many_lines)]
 pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
     let (mut struct_fields_cuda, struct_semi_cuda) = if let syn::Data::Struct(s) = &ast.data {
         (s.fields.clone(), s.semi_token)
@@ -21,12 +21,28 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
     let struct_name = &ast.ident;
     let struct_name_cuda = get_cuda_repr_ident(struct_name);
 
+    let (
+        struct_attrs_cuda,
+        struct_generics_cuda,
+        struct_generics_cuda_async,
+        struct_layout_attrs,
+        r2c_async_impl,
+        crate_path,
+    ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast);
+
     let mut combined_cuda_alloc_type: TokenStream = quote! {
-        rust_cuda::host::NullCudaAlloc
+        #crate_path::alloc::NoCudaAlloc
+    };
+    let mut combined_cuda_alloc_async_type: TokenStream = quote! {
+        #crate_path::alloc::NoCudaAlloc
     };
 
     let mut r2c_field_declarations: Vec<TokenStream> = Vec::new();
+    let mut r2c_field_async_declarations: Vec<TokenStream> = Vec::new();
+    let mut r2c_field_async_completions: Vec<syn::Ident> = Vec::new();
     let mut r2c_field_initialisations: Vec<TokenStream> = Vec::new();
     let mut r2c_field_destructors: Vec<TokenStream> = Vec::new();
+    let mut r2c_field_async_destructors: Vec<TokenStream> = Vec::new();
+    let mut r2c_field_async_completion_calls: Vec<TokenStream> = Vec::new();
 
     let mut c2r_field_initialisations: Vec<TokenStream> = Vec::new();
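// Allocation threading (sketch, under the assumption that each field
// wraps the running allocation type): for hypothetical fields `a` and
// `b` the combined allocation type grows like
//
//     NoCudaAlloc
//     CombinedCudaAlloc<AllocA, NoCudaAlloc>
//     CombinedCudaAlloc<AllocB, CombinedCudaAlloc<AllocA, NoCudaAlloc>>
//
// which is why the destructors collected below must run in reverse
// declaration order: each `split()` peels one layer off the front.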
@@ -40,32 +56,41 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
             ..
         }) => {
             let mut r2c_field_destructors_reverse: Vec<TokenStream> = Vec::new();
+            let mut r2c_field_async_destructors_reverse: Vec<TokenStream> = Vec::new();
 
             for (field_index, field) in fields.iter_mut().enumerate() {
-                let cuda_repr_field_ty = field_ty::swap_field_type_and_filter_attrs(field);
-
-                combined_cuda_alloc_type = field_copy::impl_field_copy_init_and_expand_alloc_type(
-                    field,
-                    field_index,
-                    &cuda_repr_field_ty,
-                    combined_cuda_alloc_type,
-                    &mut r2c_field_declarations,
-                    &mut r2c_field_initialisations,
-                    &mut r2c_field_destructors_reverse,
-                    &mut c2r_field_initialisations,
-                );
+                let cuda_repr_field_ty =
+                    field_ty::swap_field_type_and_filter_attrs(&crate_path, field);
+
+                (combined_cuda_alloc_type, combined_cuda_alloc_async_type) =
+                    field_copy::impl_field_copy_init_and_expand_alloc_type(
+                        &crate_path,
+                        field,
+                        field_index,
+                        &cuda_repr_field_ty,
+                        combined_cuda_alloc_type,
+                        combined_cuda_alloc_async_type,
+                        &mut r2c_field_declarations,
+                        &mut r2c_field_async_declarations,
+                        &mut r2c_field_async_completions,
+                        &mut r2c_field_initialisations,
+                        &mut r2c_field_destructors_reverse,
+                        &mut r2c_field_async_destructors_reverse,
+                        &mut r2c_field_async_completion_calls,
+                        &mut c2r_field_initialisations,
+                    );
             }
 
             // The fields must be deallocated in the reverse order of their allocation
             r2c_field_destructors.extend(r2c_field_destructors_reverse.into_iter().rev());
+            r2c_field_async_destructors
+                .extend(r2c_field_async_destructors_reverse.into_iter().rev());
         },
         syn::Fields::Unit => (),
     }
 
-    let (struct_attrs_cuda, struct_generics_cuda, struct_layout_attrs) =
-        generics::expand_cuda_struct_generics_where_requested_in_attrs(ast);
-
     let cuda_struct_declaration = r#impl::cuda_struct_declaration(
+        &crate_path,
         &struct_attrs_cuda,
         &struct_layout_attrs,
         &ast.vis,
@@ -76,6 +101,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
     );
 
     let rust_to_cuda_trait_impl = r#impl::rust_to_cuda_trait(
+        &crate_path,
        struct_name,
         &struct_name_cuda,
         &struct_generics_cuda,
@@ -86,7 +112,26 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
         &r2c_field_destructors,
     );
 
+    let rust_to_cuda_async_trait_impl = if r2c_async_impl {
+        r#impl::rust_to_cuda_async_trait(
+            &crate_path,
+            struct_name,
+            &struct_name_cuda,
+            &struct_generics_cuda_async,
+            &struct_fields_cuda,
+            &combined_cuda_alloc_async_type,
+            &r2c_field_async_declarations,
+            &r2c_field_async_completions,
+            &r2c_field_initialisations,
+            &r2c_field_async_destructors,
+            &r2c_field_async_completion_calls,
+        )
+    } else {
+        TokenStream::new()
+    };
+
     let cuda_as_rust_trait_impl = r#impl::cuda_as_rust_trait(
+        &crate_path,
         struct_name,
         &struct_name_cuda,
         &struct_generics_cuda,
@@ -99,6 +144,8 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
 
         #rust_to_cuda_trait_impl
 
+        #rust_to_cuda_async_trait_impl
+
         #cuda_as_rust_trait_impl
     })
     .into()
diff --git a/rust-cuda-kernel/Cargo.toml b/rust-cuda-kernel/Cargo.toml
new file mode 100644
index 000000000..b944bf875
--- /dev/null
+++ b/rust-cuda-kernel/Cargo.toml
@@ -0,0 +1,31 @@
+[package]
+name = "rust-cuda-kernel"
+version = "0.1.0"
+authors = ["Juniper Tyree <juniper.tyree@helsinki.fi>"]
+license = "MIT OR Apache-2.0"
+edition = "2021"
+rust-version = "1.77" # nightly
+links = "libnvptxcompiler_static"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[lib]
+proc-macro = true
+
+[dependencies]
+syn = { version = "1.0", features = ["full", "fold"] }
+quote = "1.0"
+proc-macro2 = "1.0"
+proc-macro-error = "1.0"
+regex = "1.5"
+lazy_static = "1.4" +serde_json = "1.0" +cargo_metadata = { version = "0.18", features = ["builder"] } +strip-ansi-escapes = "0.2" +colored = "2.0" +thiserror = "1.0" +seahash = "4.1" +ptx-builder = { git = "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } + +[build-dependencies] +find_cuda_helper = "0.2" diff --git a/rust-cuda-kernel/build.rs b/rust-cuda-kernel/build.rs new file mode 100644 index 000000000..f7aa5b1a9 --- /dev/null +++ b/rust-cuda-kernel/build.rs @@ -0,0 +1,5 @@ +fn main() { + find_cuda_helper::include_cuda(); + + println!("cargo:rustc-link-lib=nvptxcompiler_static"); +} diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-kernel/src/kernel/link/config.rs similarity index 65% rename from rust-cuda-derive/src/kernel/link/config.rs rename to rust-cuda-kernel/src/kernel/link/config.rs index cdfd0b575..02297ba7d 100644 --- a/rust-cuda-derive/src/kernel/link/config.rs +++ b/rust-cuda-kernel/src/kernel/link/config.rs @@ -1,18 +1,23 @@ -use std::path::PathBuf; +use std::{collections::HashMap, path::PathBuf}; + +use quote::quote; + +use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; #[allow(clippy::module_name_repetitions)] pub(super) struct LinkKernelConfig { pub(super) kernel: syn::Ident, - pub(super) args: syn::Ident, + pub(super) kernel_hash: syn::Ident, pub(super) crate_name: String, pub(super) crate_path: PathBuf, pub(super) specialisation: String, + pub(super) ptx_lint_levels: HashMap, } impl syn::parse::Parse for LinkKernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { let kernel: syn::Ident = input.parse()?; - let args: syn::Ident = input.parse()?; + let kernel_hash: syn::Ident = input.parse()?; let name: syn::LitStr = input.parse()?; let path: syn::LitStr = input.parse()?; @@ -35,31 +40,48 @@ impl syn::parse::Parse for LinkKernelConfig { String::new() }; + let attrs = syn::punctuated::Punctuated::< + syn::MetaList, + syn::token::Comma, + >::parse_separated_nonempty(input)?; + + let mut ptx_lint_levels = HashMap::new(); + + for syn::MetaList { path, nested, .. 
} in attrs {
+            parse_ptx_lint_level(&path, &nested, &mut ptx_lint_levels);
+        }
+
+        proc_macro_error::abort_if_dirty();
+
         Ok(Self {
             kernel,
-            args,
+            kernel_hash,
             crate_name: name.value(),
             crate_path: PathBuf::from(path.value()),
             specialisation,
+            ptx_lint_levels,
         })
     }
 }
 
 #[allow(clippy::module_name_repetitions)]
 pub(super) struct CheckKernelConfig {
-    pub(super) args: syn::Ident,
+    pub(super) kernel: syn::Ident,
+    pub(super) kernel_hash: syn::Ident,
     pub(super) crate_name: String,
     pub(super) crate_path: PathBuf,
 }
 
 impl syn::parse::Parse for CheckKernelConfig {
     fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> {
-        let args: syn::Ident = input.parse()?;
+        let kernel: syn::Ident = input.parse()?;
+        let kernel_hash: syn::Ident = input.parse()?;
         let name: syn::LitStr = input.parse()?;
         let path: syn::LitStr = input.parse()?;
 
         Ok(Self {
-            args,
+            kernel,
+            kernel_hash,
             crate_name: name.value(),
             crate_path: PathBuf::from(path.value()),
         })
diff --git a/rust-cuda-derive/src/kernel/link/error.rs b/rust-cuda-kernel/src/kernel/link/error.rs
similarity index 91%
rename from rust-cuda-derive/src/kernel/link/error.rs
rename to rust-cuda-kernel/src/kernel/link/error.rs
index 0c83e19a5..811269ccc 100644
--- a/rust-cuda-derive/src/kernel/link/error.rs
+++ b/rust-cuda-kernel/src/kernel/link/error.rs
@@ -22,15 +22,14 @@ pub fn emit_ptx_build_error() {
 
     let call_site = proc_macro::Span::call_site();
 
-    let (byte_start, byte_end) =
-        if let Some(captures) = PROC_MACRO_SPAN_REGEX.captures(&format!("{call_site:?}")) {
+    let (byte_start, byte_end) = PROC_MACRO_SPAN_REGEX
+        .captures(&format!("{call_site:?}"))
+        .map_or((0_u32, 0_u32), |captures| {
             (
                 captures["start"].parse().unwrap_or(0_u32),
                 captures["end"].parse().unwrap_or(0_u32),
             )
-        } else {
-            (0_u32, 0_u32)
-        };
+        });
 
     let span = DiagnosticSpanBuilder::default()
         .file_name(
diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs
new file mode 100644
index 000000000..bbe243c9f
--- /dev/null
+++ b/rust-cuda-kernel/src/kernel/link/mod.rs
@@ -0,0 +1,842 @@
+use std::{
+    collections::HashMap,
+    env,
+    ffi::CString,
+    fmt::Write as FmtWrite,
+    fs,
+    io::{Read, Write},
+    os::raw::c_int,
+    path::{Path, PathBuf},
+    ptr::addr_of_mut,
+    sync::atomic::{AtomicBool, Ordering},
+};
+
+use colored::Colorize;
+use proc_macro::TokenStream;
+use proc_macro2::Span;
+use ptx_builder::{
+    builder::{BuildStatus, Builder, MessageFormat, Profile},
+    error::{BuildErrorKind, Error, Result},
+};
+use quote::quote;
+
+use crate::kernel::{
+    lints::{LintLevel, PtxLint},
+    utils::skip_kernel_compilation,
+    KERNEL_TYPE_LAYOUT_IDENT, KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY,
+    PTX_CSTR_IDENT,
+};
+
+mod config;
+mod error;
+mod ptx_compiler_sys;
+
+use config::{CheckKernelConfig, LinkKernelConfig};
+use error::emit_ptx_build_error;
+use ptx_compiler_sys::NvptxError;
+
+pub fn check_kernel(tokens: TokenStream) -> TokenStream {
+    proc_macro_error::set_dummy(
+        quote!
{::core::compile_error!("rust-cuda PTX kernel check failed");},
+    );
+
+    let CheckKernelConfig {
+        kernel,
+        kernel_hash,
+        crate_name,
+        crate_path,
+    } = match syn::parse_macro_input::parse(tokens) {
+        Ok(config) => config,
+        Err(err) => {
+            abort_call_site!(
+                "check_kernel!(KERNEL HASH NAME PATH) expects KERNEL and HASH identifiers, and \
+                 NAME and PATH string literals: {:?}",
+                err
+            )
+        },
+    };
+
+    let kernel_ptx = compile_kernel_ptx(&kernel, &crate_name, &crate_path, Specialisation::Check);
+
+    let Some(kernel_ptx) = kernel_ptx else {
+        return quote!(::core::compile_error!("rust-cuda PTX kernel check failed");).into();
+    };
+
+    check_kernel_ptx_and_report(
+        &kernel_ptx,
+        Specialisation::Check,
+        &kernel_hash,
+        &HashMap::new(),
+    );
+
+    quote!().into()
+}
+
+#[allow(clippy::module_name_repetitions)]
+pub fn compile_kernel(tokens: TokenStream) -> TokenStream {
+    let ptx_cstr_ident = syn::Ident::new(PTX_CSTR_IDENT, Span::call_site());
+    let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, Span::call_site());
+
+    proc_macro_error::set_dummy(quote! {
+        const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation";
+
+        const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout {
+            HostAndDeviceKernelSignatureTypeLayout::Match
+        }
+
+        ::core::compile_error!("rust-cuda PTX kernel compilation failed");
+    });
+
+    let LinkKernelConfig {
+        kernel,
+        kernel_hash,
+        crate_name,
+        crate_path,
+        specialisation,
+        ptx_lint_levels,
+    } = match syn::parse_macro_input::parse(tokens) {
+        Ok(config) => config,
+        Err(err) => {
+            abort_call_site!(
+                "compile_kernel!(KERNEL HASH NAME PATH SPECIALISATION LINTS,*) expects KERNEL and \
+                 HASH identifiers, NAME and PATH string literals, and SPECIALISATION and LINTS \
+                 tokens: {:?}",
+                err
+            )
+        },
+    };
+
+    if skip_kernel_compilation() {
+        return quote! {
+            const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"CLIPPY skips specialised PTX compilation";
+        }
+        .into();
+    }
+
+    let Some(mut kernel_ptx) = compile_kernel_ptx(
+        &kernel,
+        &crate_name,
+        &crate_path,
+        Specialisation::Link(&specialisation),
+    ) else {
+        return (quote! {
+            const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation";
+
+            const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout {
+                HostAndDeviceKernelSignatureTypeLayout::Match
+            }
+
+            ::core::compile_error!("rust-cuda PTX kernel compilation failed");
+        })
+        .into();
+    };
+
+    let type_layouts = extract_ptx_kernel_layout(&mut kernel_ptx);
+    remove_kernel_type_use_from_ptx(&mut kernel_ptx);
+
+    check_kernel_ptx_and_report(
+        &kernel_ptx,
+        Specialisation::Link(&specialisation),
+        &kernel_hash,
+        &ptx_lint_levels,
+    );
+
+    let kernel_ptx = match CString::new(kernel_ptx) {
+        Ok(kernel_ptx) => kernel_ptx,
+        Err(err) => abort_call_site!(
+            "Kernel compilation generated invalid PTX: internal nul byte: {:?}",
+            err
+        ),
+    };
+
+    let kernel_ptx = proc_macro::Literal::c_string(&kernel_ptx);
+    let kernel_ptx = proc_macro2::TokenStream::from(proc_macro::TokenStream::from(
+        proc_macro::TokenTree::Literal(kernel_ptx),
+    ));
+
+    (quote!
{ const #ptx_cstr_ident: &'static ::core::ffi::CStr = #kernel_ptx; #(#type_layouts)* })
+    .into()
+}
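// Example (sketch): on success, the `compile_kernel` expansion above
// boils down to a PTX C-string constant plus one layout-check function
// per kernel parameter, roughly as follows (identifiers illustrative):
//
//     const PTX_CSTR: &'static ::core::ffi::CStr = c".version 7.5\n...";
//     const fn param_0() -> HostAndDeviceKernelSignatureTypeLayout { /* .. */ }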
+
+fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec<proc_macro2::TokenStream> {
+    const BEFORE_PARAM_PATTERN: &str = ".visible .global .align 1 .b8 ";
+    const PARAM_LEN_PATTERN: &str = "[";
+    const LEN_BYTES_PATTERN: &str = "] = {";
+    const AFTER_BYTES_PATTERN: &str = "};";
+
+    let mut type_layouts = Vec::new();
+
+    while let Some(type_layout_start) = kernel_ptx.find(BEFORE_PARAM_PATTERN) {
+        let param_start = type_layout_start + BEFORE_PARAM_PATTERN.len();
+
+        let Some(len_start_offset) = kernel_ptx[param_start..].find(PARAM_LEN_PATTERN) else {
+            abort_call_site!("Kernel compilation generated invalid PTX: missing type layout data")
+        };
+        let len_start = param_start + len_start_offset + PARAM_LEN_PATTERN.len();
+
+        let Some(bytes_start_offset) = kernel_ptx[len_start..].find(LEN_BYTES_PATTERN) else {
+            abort_call_site!("Kernel compilation generated invalid PTX: missing type layout length")
+        };
+        let bytes_start = len_start + bytes_start_offset + LEN_BYTES_PATTERN.len();
+
+        let Some(bytes_end_offset) = kernel_ptx[bytes_start..].find(AFTER_BYTES_PATTERN) else {
+            abort_call_site!("Kernel compilation generated invalid PTX: invalid type layout data")
+        };
+        let param = &kernel_ptx[param_start..(param_start + len_start_offset)];
+        let len = &kernel_ptx[len_start..(len_start + bytes_start_offset)];
+        let bytes = &kernel_ptx[bytes_start..(bytes_start + bytes_end_offset)];
+
+        let param = quote::format_ident!("{}", param);
+
+        let Ok(len) = len.parse::<usize>() else {
+            abort_call_site!("Kernel compilation generated invalid PTX: invalid type layout length")
+        };
+        let Ok(bytes) = bytes
+            .split(", ")
+            .map(std::str::FromStr::from_str)
+            .collect::<std::result::Result<Vec<u8>, _>>()
+        else {
+            abort_call_site!("Kernel compilation generated invalid PTX: invalid type layout byte")
+        };
+
+        if bytes.len() != len {
+            abort_call_site!(
+                "Kernel compilation generated invalid PTX: type layout length mismatch"
+            );
+        }
+
+        // let mut ascii_escaped_bytes = Vec::new();
+        // for b in &bytes {
+        //     ascii_escaped_bytes.extend(std::ascii::escape_default(*b));
+        // }
+        // emit_call_site_warning!("{}", std::str::from_utf8(&ascii_escaped_bytes).unwrap());
+
+        let mut zeros = 0;
+        for b in &bytes {
+            if *b == 0 {
+                zeros += 1;
+            } else {
+                zeros = 0;
+            }
+        }
+
+        #[allow(clippy::cast_precision_loss)] // FIXME
+        {
+            emit_call_site_warning!("type layout: {}B (can do {:.02} compression)", bytes.len(), (bytes.len() as f64) / ((bytes.len() - zeros) as f64));
+        }
+
+        let byte_str = syn::LitByteStr::new(&bytes[..bytes.len()-zeros], proc_macro2::Span::call_site());
+
+        type_layouts.push(quote!
{
+            const fn #param() -> HostAndDeviceKernelSignatureTypeLayout {
+                if check_serialised_type_graph::(#byte_str) {
+                    HostAndDeviceKernelSignatureTypeLayout::Match
+                } else {
+                    HostAndDeviceKernelSignatureTypeLayout::Mismatch
+                }
+            }
+        });
+
+        let type_layout_end = bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len();
+
+        kernel_ptx.replace_range(type_layout_start..type_layout_end, "");
+    }
+
+    type_layouts
+}
+
+fn remove_kernel_type_use_from_ptx(kernel_ptx: &mut String) {
+    while let Some(kernel_type_layout_start) = kernel_ptx.find(KERNEL_TYPE_USE_START_CANARY) {
+        let kernel_type_layout_start = kernel_ptx[..kernel_type_layout_start]
+            .rfind('\n')
+            .unwrap_or(kernel_type_layout_start);
+
+        let Some(kernel_type_layout_end_offset) =
+            kernel_ptx[kernel_type_layout_start..].find(KERNEL_TYPE_USE_END_CANARY)
+        else {
+            abort_call_site!(
+                "Kernel compilation generated invalid PTX: incomplete type layout use section"
+            );
+        };
+
+        let kernel_type_layout_end_offset = kernel_type_layout_end_offset
+            + kernel_ptx[kernel_type_layout_start + kernel_type_layout_end_offset..]
+                .find('\n')
+                .unwrap_or(KERNEL_TYPE_USE_END_CANARY.len());
+
+        let kernel_type_layout_end = kernel_type_layout_start + kernel_type_layout_end_offset;
+
+        kernel_ptx.replace_range(kernel_type_layout_start..kernel_type_layout_end, "");
+    }
+}
+
+#[allow(clippy::too_many_lines)]
+fn check_kernel_ptx_and_report(
+    kernel_ptx: &str,
+    specialisation: Specialisation,
+    kernel_hash: &proc_macro2::Ident,
+    ptx_lint_levels: &HashMap<PtxLint, LintLevel>,
+) {
+    let (result, error_log, info_log, binary, version, drop) =
+        check_kernel_ptx(kernel_ptx, specialisation, kernel_hash, ptx_lint_levels);
+
+    let ptx_compiler = match &version {
+        Ok((major, minor)) => format!("PTX compiler v{major}.{minor}"),
+        Err(_) => String::from("PTX compiler"),
+    };
+
+    let mut errors = String::new();
+
+    if let Err(err) = drop {
+        let _ = errors.write_fmt(format_args!("Error dropping the {ptx_compiler}: {err}\n"));
+    }
+
+    if let Err(err) = version {
+        let _ = errors.write_fmt(format_args!(
+            "Error fetching the version of the {ptx_compiler}: {err}\n"
+        ));
+    }
+
+    let ptx_source_code = {
+        let mut max_lines = kernel_ptx.chars().filter(|c| *c == '\n').count() + 1;
+        let mut indent = 0;
+        while max_lines > 0 {
+            max_lines /= 10;
+            indent += 1;
+        }
+
+        format!(
+            "PTX source code:\n{}",
+            kernel_ptx
+                .lines()
+                .enumerate()
+                .map(|(i, l)| format!("{:indent$}| {l}", i + 1))
+                .collect::<Vec<_>>()
+                .join("\n")
+        )
+    };
+
+    match binary {
+        Ok(None) => (),
+        Ok(Some(binary)) => {
+            if ptx_lint_levels
+                .get(&PtxLint::DumpAssembly)
+                .map_or(false, |level| *level > LintLevel::Allow)
+            {
+                const HEX: [char; 16] = [
+                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+                ];
+
+                let mut binary_hex = String::with_capacity(binary.len() * 2);
+                for byte in binary {
+                    binary_hex.push(HEX[usize::from(byte >> 4)]);
+                    binary_hex.push(HEX[usize::from(byte & 0x0F)]);
+                }
+
+                if ptx_lint_levels
+                    .get(&PtxLint::DumpAssembly)
+                    .map_or(false, |level| *level > LintLevel::Warn)
+                {
+                    emit_call_site_error!(
+                        "{} compiled binary:\n{}\n\n{}",
+                        ptx_compiler,
+                        binary_hex,
+                        ptx_source_code
+                    );
+                } else {
+                    emit_call_site_warning!(
+                        "{} compiled binary:\n{}\n\n{}",
+                        ptx_compiler,
+                        binary_hex,
+                        ptx_source_code
+                    );
+                }
+            }
+        },
+        Err(err) => {
+            let _ = errors.write_fmt(format_args!(
+                "Error fetching the compiled binary from {ptx_compiler}: {err}\n"
+            ));
+        },
+    }
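// Example (sketch): the `DumpAssembly` gate above is driven by the
// user-facing lint attributes; a (hypothetical) kernel annotated with
//
//     #[kernel(warn(ptx::dump_assembly))]
//
// has its compiled binary hex-dumped as a warning, while
// `deny(ptx::dump_assembly)` turns the same dump into a compile error.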
+
+    match info_log {
+        Ok(None) => (),
+        Ok(Some(info_log)) => emit_call_site_warning!(
+            "{} info log:\n{}\n{}",
+            ptx_compiler,
+            info_log,
+            ptx_source_code
+        ),
+        Err(err) => {
+            let _ = errors.write_fmt(format_args!(
+                "Error fetching the info log of the {ptx_compiler}: {err}\n"
+            ));
+        },
+    };
+
+    let error_log = match error_log {
+        Ok(None) => String::new(),
+        Ok(Some(error_log)) => {
+            format!("{ptx_compiler} error log:\n{error_log}\n{ptx_source_code}")
+        },
+        Err(err) => {
+            let _ = errors.write_fmt(format_args!(
+                "Error fetching the error log of the {ptx_compiler}: {err}\n"
+            ));
+            String::new()
+        },
+    };
+
+    if let Err(err) = result {
+        let _ = errors.write_fmt(format_args!("Error compiling the PTX source code: {err}\n"));
+    }
+
+    if !error_log.is_empty() || !errors.is_empty() {
+        abort_call_site!(
+            "{error_log}{}{errors}",
+            if !error_log.is_empty() && !errors.is_empty() {
+                "\n\n"
+            } else {
+                ""
+            }
+        );
+    }
+}
+
+#[allow(clippy::type_complexity)]
+#[allow(clippy::too_many_lines)]
+fn check_kernel_ptx(
+    kernel_ptx: &str,
+    specialisation: Specialisation,
+    kernel_hash: &proc_macro2::Ident,
+    ptx_lint_levels: &HashMap<PtxLint, LintLevel>,
+) -> (
+    Result<(), NvptxError>,
+    Result<Option<String>, NvptxError>,
+    Result<Option<String>, NvptxError>,
+    Result<Option<Vec<u8>>, NvptxError>,
+    Result<(u32, u32), NvptxError>,
+    Result<(), NvptxError>,
+) {
+    let compiler = {
+        let mut compiler = std::ptr::null_mut();
+        #[allow(unsafe_code)] // FFI
+        if let Err(err) = NvptxError::try_err_from(unsafe {
+            ptx_compiler_sys::nvPTXCompilerCreate(
+                addr_of_mut!(compiler),
+                kernel_ptx.len() as ptx_compiler_sys::size_t,
+                kernel_ptx.as_ptr().cast(),
+            )
+        }) {
+            abort_call_site!("PTX compiler creation failed: {}", err);
+        }
+        compiler
+    };
+
+    let result = (|| {
+        let kernel_name = match specialisation {
+            Specialisation::Check => format!("{kernel_hash}_chECK"),
+            Specialisation::Link("") => format!("{kernel_hash}_kernel"),
+            Specialisation::Link(specialisation) => format!(
+                "{kernel_hash}_kernel_{:016x}",
+                seahash::hash(specialisation.as_bytes())
+            ),
+        };
+        let kernel_name = CString::new(kernel_name).unwrap();
+
+        let mut options = vec![c"--entry", kernel_name.as_c_str()];
+
+        if ptx_lint_levels
+            .values()
+            .any(|level| *level > LintLevel::Warn)
+        {
+            let mut options = options.clone();
+
+            if ptx_lint_levels
+                .get(&PtxLint::Verbose)
+                .map_or(false, |level| *level > LintLevel::Warn)
+            {
+                options.push(c"--verbose");
+            }
+            if ptx_lint_levels
+                .get(&PtxLint::DoublePrecisionUse)
+                .map_or(false, |level| *level > LintLevel::Warn)
+            {
options.push(c"--warn-on-double-precision-use"); + } + if ptx_lint_levels + .get(&PtxLint::LocalMemoryUse) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(c"--warn-on-local-memory-usage"); + } + if ptx_lint_levels + .get(&PtxLint::RegisterSpills) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(c"--warn-on-spills"); + } + if ptx_lint_levels + .get(&PtxLint::DynamicStackSize) + .map_or(true, |level| *level < LintLevel::Warn) + { + options.push(c"--suppress-stack-size-warning"); + } + + let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerCompile( + compiler, + c_int::try_from(options_ptrs.len()).unwrap(), + options_ptrs.as_ptr().cast(), + ) + }) + })(); + + let error_log = (|| { + let mut error_log_size = 0; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetErrorLogSize(compiler, addr_of_mut!(error_log_size)) + })?; + + if error_log_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut error_log: Vec = vec![0; error_log_size as usize]; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetErrorLog(compiler, error_log.as_mut_ptr().cast()) + })?; + + Ok(Some(String::from_utf8_lossy(&error_log).into_owned())) + })(); + + let info_log = (|| { + let mut info_log_size = 0; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetInfoLogSize(compiler, addr_of_mut!(info_log_size)) + })?; + + if info_log_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut info_log: Vec = vec![0; info_log_size as usize]; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetInfoLog(compiler, info_log.as_mut_ptr().cast()) + })?; + + Ok(Some(String::from_utf8_lossy(&info_log).into_owned())) + })(); + + let binary = (|| { + if result.is_err() { + return Ok(None); + } + + let mut binary_size = 0; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetCompiledProgramSize( + compiler, + addr_of_mut!(binary_size), + ) + })?; + + if binary_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut binary: Vec = vec![0; binary_size as usize]; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetCompiledProgram(compiler, binary.as_mut_ptr().cast()) + })?; + + Ok(Some(binary)) + })(); + + let version = (|| { + let mut major = 0; + let mut minor = 0; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetVersion(addr_of_mut!(major), addr_of_mut!(minor)) + })?; + + Ok((major, minor)) + })(); + + let drop = { + let mut compiler = compiler; + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerDestroy(addr_of_mut!(compiler)) + }) + }; + + (result, error_log, info_log, binary, version, drop) +} + +fn compile_kernel_ptx( + kernel: &syn::Ident, + crate_name: &str, + crate_path: &Path, + specialisation: Specialisation, +) -> Option { + if let Ok(rust_flags) = proc_macro::tracked_env::var("RUSTFLAGS") { + env::set_var( + "RUSTFLAGS", + rust_flags + .replace("-Zinstrument-coverage", "") + .replace("-Cinstrument-coverage", ""), + ); + } + + let specialisation_var = format!( + 
"RUST_CUDA_DERIVE_SPECIALISE_{}_{}", + crate_name, + kernel.to_string().to_uppercase() + ); + + match build_kernel_with_specialisation(crate_path, &specialisation_var, specialisation) { + Ok(kernel_path) => { + let mut file = fs::File::open(&kernel_path) + .unwrap_or_else(|_| panic!("Failed to open kernel file at {:?}.", &kernel_path)); + + let mut kernel_ptx = String::new(); + + file.read_to_string(&mut kernel_ptx) + .unwrap_or_else(|_| panic!("Failed to read kernel file at {:?}.", &kernel_path)); + + colored::control::set_override(true); + eprintln!( + "{} {} compiling a PTX crate.", + "[PTX]".bright_black().bold(), + "Finished".green().bold() + ); + colored::control::unset_override(); + + Some(kernel_ptx) + }, + Err(err) => { + eprintln!("{err}"); + emit_ptx_build_error(); + None + }, + } +} + +#[allow(clippy::too_many_lines)] +fn build_kernel_with_specialisation( + kernel_path: &Path, + env_var: &str, + specialisation: Specialisation, +) -> Result { + match specialisation { + Specialisation::Check => env::set_var(env_var, "chECK"), + Specialisation::Link(specialisation) => env::set_var(env_var, specialisation), + }; + + let result = (|| { + let mut builder = Builder::new(kernel_path)?; + + builder = match specialisation { + Specialisation::Check => builder.set_profile(Profile::Debug), + Specialisation::Link(_) => builder.set_profile(Profile::Release), + }; + + builder = builder.set_message_format(MessageFormat::Json { + render_diagnostics: false, + short: false, + ansi: true, + }); + + let specialisation_prefix = match specialisation { + Specialisation::Check => String::from("chECK"), + Specialisation::Link(specialisation) => { + format!("{:016x}", seahash::hash(specialisation.as_bytes())) + }, + }; + builder = builder.set_prefix(specialisation_prefix.clone()); + + let any_output = AtomicBool::new(false); + let crate_name = String::from(builder.get_crate_name()); + + let build = builder.build_live( + |stdout_line| { + if let Ok(cargo_metadata::Message::CompilerMessage(mut message)) = + serde_json::from_str(stdout_line) + { + if any_output + .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + colored::control::set_override(true); + eprintln!( + "{} of {} ({})", + "[PTX]".bright_black().bold(), + crate_name.bold(), + specialisation_prefix.to_ascii_lowercase(), + ); + colored::control::unset_override(); + } + + if let Some(rendered) = &mut message.message.rendered { + colored::control::set_override(true); + let prefix = " | ".bright_black().bold().to_string(); + colored::control::unset_override(); + + let glue = String::from('\n') + &prefix; + + let mut lines = rendered + .split('\n') + .rev() + .skip_while(|l| l.trim().is_empty()) + .collect::>(); + lines.reverse(); + + let mut prefixed = prefix + &lines.join(&glue); + + std::mem::swap(rendered, &mut prefixed); + } + + eprintln!("{}", serde_json::to_string(&message.message).unwrap()); + } + }, + |stderr_line| { + if stderr_line.trim().is_empty() { + return; + } + + if any_output + .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + colored::control::set_override(true); + eprintln!( + "{} of {} ({})", + "[PTX]".bright_black().bold(), + crate_name.bold(), + specialisation_prefix.to_ascii_lowercase(), + ); + colored::control::unset_override(); + } + + colored::control::set_override(true); + eprintln!( + " {} {}", + "|".bright_black().bold(), + stderr_line.replace(" ", "") + ); + colored::control::unset_override(); + }, + )?; + + match build { + BuildStatus::Success(output) 
=> { + let ptx_path = output.get_assembly_path(); + + let mut specialised_ptx_path = ptx_path.clone(); + + specialised_ptx_path.set_extension(format!("{specialisation_prefix}.ptx")); + + fs::copy(&ptx_path, &specialised_ptx_path).map_err(|err| { + Error::from(BuildErrorKind::BuildFailed(vec![format!( + "Failed to copy kernel from {ptx_path:?} to {specialised_ptx_path:?}: \ + {err}" + )])) + })?; + + if let Specialisation::Link(specialisation) = specialisation { + fs::OpenOptions::new() + .append(true) + .open(&specialised_ptx_path) + .and_then(|mut file| writeln!(file, "\n// {specialisation}")) + .map_err(|err| { + Error::from(BuildErrorKind::BuildFailed(vec![format!( + "Failed to write specialisation to {specialised_ptx_path:?}: {err}" + )])) + })?; + } + + Ok(specialised_ptx_path) + }, + BuildStatus::NotNeeded => Err(Error::from(BuildErrorKind::BuildFailed(vec![format!( + "Kernel build for specialisation {:?} was not needed.", + &specialisation + )]))), + } + })(); + + env::remove_var(env_var); + + result +} + +#[derive(Copy, Clone, Debug)] +enum Specialisation<'a> { + Check, + Link(&'a str), +} diff --git a/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs new file mode 100644 index 000000000..7fffc7b4c --- /dev/null +++ b/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs @@ -0,0 +1,275 @@ +use thiserror::Error; + +#[allow(non_camel_case_types)] +pub type size_t = ::std::os::raw::c_ulonglong; + +#[repr(C)] +pub struct NvptxCompiler { + _private: [u8; 0], +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Error)] +#[non_exhaustive] +pub enum NvptxError { + #[error("Invalid compiler handle")] + InvalidCompilerHandle, + #[error("Invalid PTX input")] + InvalidInput, + #[error("Compilation failure")] + CompilationFailure, + #[error("Internal error")] + Internal, + #[error("Out of memory")] + OutOfMemory, + #[error("Incomplete compiler invocation")] + CompilerInvocationIncomplete, + #[error("Unsupported PTX version")] + UnsupportedPtxVersion, + #[error("Unsupported dev-side sync")] + UnsupportedDevSideSync, + #[error("Unknown error code")] + UnknownError, +} + +impl NvptxError { + const NVPTXCOMPILE_ERROR_COMPILATION_FAILURE: NvptxCompileResult = 3; + const NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE: NvptxCompileResult = 6; + const NVPTXCOMPILE_ERROR_INTERNAL: NvptxCompileResult = 4; + const NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE: NvptxCompileResult = 1; + const NVPTXCOMPILE_ERROR_INVALID_INPUT: NvptxCompileResult = 2; + const NVPTXCOMPILE_ERROR_OUT_OF_MEMORY: NvptxCompileResult = 5; + const NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC: NvptxCompileResult = 8; + const NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION: NvptxCompileResult = 7; + const NVPTXCOMPILE_SUCCESS: NvptxCompileResult = 0; + + pub const fn try_err_from(result: NvptxCompileResult) -> Result<(), Self> { + match result { + Self::NVPTXCOMPILE_SUCCESS => Ok(()), + Self::NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE => Err(Self::InvalidCompilerHandle), + Self::NVPTXCOMPILE_ERROR_INVALID_INPUT => Err(Self::InvalidInput), + Self::NVPTXCOMPILE_ERROR_COMPILATION_FAILURE => Err(Self::CompilationFailure), + Self::NVPTXCOMPILE_ERROR_INTERNAL => Err(Self::Internal), + Self::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY => Err(Self::OutOfMemory), + Self::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE => { + Err(Self::CompilerInvocationIncomplete) + }, + Self::NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION => Err(Self::UnsupportedPtxVersion), + 
Self::NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC => Err(Self::UnsupportedDevSideSync),
+            _ => Err(Self::UnknownError),
+        }
+    }
+}
+
+/// [`NvptxCompilerHandle`] represents a handle to the PTX Compiler.
+///
+/// To compile a PTX program string, an instance of [`NvptxCompiler`]
+/// must be created and the handle to it must be obtained using the
+/// API [`nvPTXCompilerCreate`]. Then the compilation can be done
+/// using the API [`nvPTXCompilerCompile`].
+pub type NvptxCompilerHandle = *mut NvptxCompiler;
+
+/// The [`NvptxCompiler`] APIs return the [`NvptxCompileResult`] codes to
+/// indicate the call result.
+pub type NvptxCompileResult = ::std::os::raw::c_int;
+
+extern "C" {
+    /// Queries the current major and minor version of PTX Compiler APIs being
+    /// used.
+    ///
+    /// # Parameters
+    /// - [out] `major`: Major version of the PTX Compiler APIs
+    /// - [out] `minor`: Minor version of the PTX Compiler APIs
+    ///
+    /// # Returns
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`]
+    ///
+    /// # Note
+    /// The version of PTX Compiler APIs follows the CUDA Toolkit versioning.
+    /// The PTX ISA version supported by a PTX Compiler API version is listed
+    /// [here](https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes).
+    pub fn nvPTXCompilerGetVersion(
+        major: *mut ::std::os::raw::c_uint,
+        minor: *mut ::std::os::raw::c_uint,
+    ) -> NvptxCompileResult;
+
+    /// Obtains the handle to an instance of the PTX compiler
+    /// initialized with the given PTX program `ptxCode`.
+    ///
+    /// # Parameters
+    /// - [out] `compiler`: Returns a handle to PTX compiler initialized with
+    ///   the PTX program `ptxCode`
+    /// - [in] `ptxCodeLen`: Size of the PTX program `ptxCode` passed as a
+    ///   string
+    /// - [in] `ptxCode`: The PTX program which is to be compiled passed as a
+    ///   string
+    ///
+    /// # Returns
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`]
+    pub fn nvPTXCompilerCreate(
+        compiler: *mut NvptxCompilerHandle,
+        ptxCodeLen: size_t,
+        ptxCode: *const ::std::os::raw::c_char,
+    ) -> NvptxCompileResult;
+
+    /// Destroys and cleans up the already created PTX compiler.
+    ///
+    /// # Parameters
+    /// - [in] `compiler`: A handle to the PTX compiler which is to be
+    ///   destroyed.
+    ///
+    /// # Returns
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`]
+    pub fn nvPTXCompilerDestroy(compiler: *mut NvptxCompilerHandle) -> NvptxCompileResult;
+
+    /// Compile a PTX program with the given compiler options.
+    ///
+    /// # Parameters
+    /// - [in, out] `compiler`: A handle to PTX compiler initialized with the
+    ///   PTX program which is to be compiled. The compiled program can be
+    ///   accessed using the handle.
+    /// - [in] `numCompileOptions`: Length of the array `compileOptions`
+    /// - [in] `compileOptions`: Compiler options with which compilation should
+    ///   be done. The compiler options string is a null terminated character
+    ///   array. A valid list of compiler options is available at
+    ///   [link](http://docs.nvidia.com/cuda/ptx-compiler-api/index.html#compile-options).
+    ///
+    /// # Note
+    /// `--gpu-name` (`-arch`) is a mandatory option.
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILATION_FAILURE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION`] + pub fn nvPTXCompilerCompile( + compiler: NvptxCompilerHandle, + numCompileOptions: ::std::os::raw::c_int, + compileOptions: *const *const ::std::os::raw::c_char, + ) -> NvptxCompileResult; + + /// Obtains the size of the image of the compiled program. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `binaryImageSize`: The size of the image of the compiled program + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// + /// # Note + /// The [`nvPTXCompilerCompile`] function should be invoked for the handle + /// before calling this API. Otherwise, + /// [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// is returned. + pub fn nvPTXCompilerGetCompiledProgramSize( + compiler: NvptxCompilerHandle, + binaryImageSize: *mut size_t, + ) -> NvptxCompileResult; + + /// Obtains the image of the compiled program. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `binaryImage`: The image of the compiled program. The caller + /// should allocate memory for `binaryImage`. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// + /// # Note + /// The [`nvPTXCompilerCompile`] function should be invoked for the handle + /// before calling this API. Otherwise, + /// [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// is returned. + pub fn nvPTXCompilerGetCompiledProgram( + compiler: NvptxCompilerHandle, + binaryImage: *mut ::std::os::raw::c_void, + ) -> NvptxCompileResult; + + /// Query the size of the error message that was seen previously for the + /// handle. + /// + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `errorLogSize`: The size of the error log in bytes which was + /// produced in previous call to [`nvPTXCompilerCompile`]. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + pub fn nvPTXCompilerGetErrorLogSize( + compiler: NvptxCompilerHandle, + errorLogSize: *mut size_t, + ) -> NvptxCompileResult; + + /// Query the error message that was seen previously for the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `errorLog`: The error log which was produced in previous call to + /// [`nvPTXCompilerCompile`]. 
The caller should allocate memory for + /// `errorLog`. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + pub fn nvPTXCompilerGetErrorLog( + compiler: NvptxCompilerHandle, + errorLog: *mut ::std::os::raw::c_char, + ) -> NvptxCompileResult; + + /// Query the size of the information message that was seen previously for + /// the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `infoLogSize`: The size of the information log in bytes which + /// was produced in previous call to [`nvPTXCompilerCompile`]. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + pub fn nvPTXCompilerGetInfoLogSize( + compiler: NvptxCompilerHandle, + infoLogSize: *mut size_t, + ) -> NvptxCompileResult; + + /// Query the information message that was seen previously for the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `infoLog`: The information log which was produced in previous + /// call to [`nvPTXCompilerCompile`]. The caller should allocate memory + /// for `infoLog`. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + pub fn nvPTXCompilerGetInfoLog( + compiler: NvptxCompilerHandle, + infoLog: *mut ::std::os::raw::c_char, + ) -> NvptxCompileResult; +} diff --git a/rust-cuda-kernel/src/kernel/lints.rs b/rust-cuda-kernel/src/kernel/lints.rs new file mode 100644 index 000000000..5fbe415b2 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/lints.rs @@ -0,0 +1,171 @@ +use std::{collections::HashMap, fmt}; + +use syn::spanned::Spanned; + +#[allow(clippy::too_many_lines)] +pub fn parse_ptx_lint_level( + path: &syn::Path, + nested: &syn::punctuated::Punctuated, + ptx_lint_levels: &mut HashMap, +) { + let level = match path.get_ident() { + Some(ident) if ident == "allow" => LintLevel::Allow, + Some(ident) if ident == "warn" => LintLevel::Warn, + Some(ident) if ident == "deny" => LintLevel::Deny, + Some(ident) if ident == "forbid" => LintLevel::Forbid, + _ => { + emit_error!( + path.span(), + "[rust-cuda]: Invalid lint #[kernel(())] attribute: unknown lint \ + level, must be one of `allow`, `warn`, `deny`, `forbid`.", + ); + + return; + }, + }; + + for meta in nested { + let syn::NestedMeta::Meta(syn::Meta::Path(path)) = meta else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute.", + level, + ); + continue; + }; + + if path.leading_colon.is_some() + || path.segments.empty_or_trailing() + || path.segments.len() != 2 + { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { + ident: namespace, + arguments: syn::PathArguments::None, + }) = path.segments.first() + else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + }; + + if namespace != "ptx" { + emit_error!( + meta.span(), + "[rust-cuda]: 
Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { + ident: lint, + arguments: syn::PathArguments::None, + }) = path.segments.last() + else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + }; + + let lint = match lint { + l if l == "verbose" => PtxLint::Verbose, + l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, + l if l == "local_memory_use" => PtxLint::LocalMemoryUse, + l if l == "register_spills" => PtxLint::RegisterSpills, + l if l == "dump_assembly" => PtxLint::DumpAssembly, + l if l == "dynamic_stack_size" => PtxLint::DynamicStackSize, + _ => { + emit_error!( + meta.span(), + "[rust-cuda]: Unknown PTX kernel lint `ptx::{}`.", + lint, + ); + continue; + }, + }; + + match ptx_lint_levels.get(&lint) { + None => (), + Some(LintLevel::Forbid) if level < LintLevel::Forbid => { + emit_error!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) incompatible with previous forbid.", + level, + lint, + ); + continue; + }, + Some(previous) => { + emit_warning!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) overwrites previous {}.", + level, + lint, + previous, + ); + }, + } + + ptx_lint_levels.insert(lint, level); + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub enum LintLevel { + Allow, + Warn, + Deny, + Forbid, +} + +impl fmt::Display for LintLevel { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Allow => fmt.write_str("allow"), + Self::Warn => fmt.write_str("warn"), + Self::Deny => fmt.write_str("deny"), + Self::Forbid => fmt.write_str("forbid"), + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub enum PtxLint { + Verbose, + DoublePrecisionUse, + LocalMemoryUse, + RegisterSpills, + DumpAssembly, + DynamicStackSize, +} + +impl fmt::Display for PtxLint { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Verbose => fmt.write_str("verbose"), + Self::DoublePrecisionUse => fmt.write_str("double_precision_use"), + Self::LocalMemoryUse => fmt.write_str("local_memory_use"), + Self::RegisterSpills => fmt.write_str("register_spills"), + Self::DumpAssembly => fmt.write_str("dump_assembly"), + Self::DynamicStackSize => fmt.write_str("dynamic_stack_size"), + } + } +} diff --git a/rust-cuda-kernel/src/kernel/mod.rs b/rust-cuda-kernel/src/kernel/mod.rs new file mode 100644 index 000000000..86ffbd8fd --- /dev/null +++ b/rust-cuda-kernel/src/kernel/mod.rs @@ -0,0 +1,11 @@ +pub mod link; +pub mod specialise; +pub mod wrapper; + +mod lints; +mod utils; + +const KERNEL_TYPE_USE_START_CANARY: &str = "// //"; +const KERNEL_TYPE_USE_END_CANARY: &str = "// //"; +const KERNEL_TYPE_LAYOUT_IDENT: &str = "KERNEL_SIGNATURE_LAYOUT"; +const PTX_CSTR_IDENT: &str = "PTX_CSTR"; diff --git a/rust-cuda-derive/src/kernel/specialise/call.rs b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs similarity index 68% rename from rust-cuda-derive/src/kernel/specialise/call.rs rename to rust-cuda-kernel/src/kernel/specialise/entry_point.rs index 34eb0dc35..b429a9297 100644 --- a/rust-cuda-derive/src/kernel/specialise/call.rs +++ b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs @@ -1,7 +1,10 @@ +use std::ffi::CString; + use proc_macro::TokenStream; +use quote::quote; #[allow(clippy::module_name_repetitions)] -pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { +pub fn 
specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { let SpecialiseMangleConfig { kernel, specialisation, @@ -9,13 +12,14 @@ pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "specialise_kernel_call!(KERNEL SPECIALISATION) expects KERNEL identifier and \ - SPECIALISATION tokens: {:?}", + "specialise_kernel_entry_point!(KERNEL SPECIALISATION) expects KERNEL identifier \ + and SPECIALISATION tokens: {:?}", err ) }, }; + #[allow(clippy::option_if_let_else)] let mangled_kernel_ident = if let Some(specialisation) = specialisation { format!( "{kernel}_kernel_{:016x}", @@ -25,7 +29,16 @@ pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { format!("{kernel}_kernel") }; - (quote! { #mangled_kernel_ident }).into() + let mangled_kernel_ident = match CString::new(mangled_kernel_ident) { + Ok(mangled_kernel_ident) => mangled_kernel_ident, + Err(err) => abort_call_site!( + "Kernel compilation generated invalid kernel entry point: internal nul byte: {:?}", + err + ), + }; + + let mangled_kernel_ident = proc_macro::Literal::c_string(&mangled_kernel_ident); + proc_macro::TokenTree::Literal(mangled_kernel_ident).into() } struct SpecialiseMangleConfig { diff --git a/rust-cuda-derive/src/kernel/specialise/entry.rs b/rust-cuda-kernel/src/kernel/specialise/function.rs similarity index 82% rename from rust-cuda-derive/src/kernel/specialise/entry.rs rename to rust-cuda-kernel/src/kernel/specialise/function.rs index e8bce23b9..44d8b8a81 100644 --- a/rust-cuda-derive/src/kernel/specialise/entry.rs +++ b/rust-cuda-kernel/src/kernel/specialise/function.rs @@ -1,12 +1,13 @@ use std::env::VarError; use proc_macro::TokenStream; +use quote::quote; #[allow(clippy::module_name_repetitions)] -pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStream { +pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { let mut func: syn::ItemFn = syn::parse(func).unwrap_or_else(|err| { abort_call_site!( - "#[specialise_kernel_entry(...)] must be wrapped around a function: {:?}", + "#[specialise_kernel_function(...)] must be wrapped around a function: {:?}", err ) }); @@ -14,7 +15,7 @@ pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStr let kernel: syn::Ident = match syn::parse_macro_input::parse(attr) { Ok(kernel) => kernel, Err(err) => abort_call_site!( - "#[specialise_kernel_entry(KERNEL)] expects KERNEL identifier: {:?}", + "#[specialise_kernel_function(KERNEL)] expects KERNEL identifier: {:?}", err ), }; @@ -33,7 +34,7 @@ pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStr func.sig.ident = match proc_macro::tracked_env::var(&specialisation_var).as_deref() { Ok("") => quote::format_ident!("{}_kernel", func.sig.ident), Ok("chECK") => { - let func_ident = func.sig.ident; + let func_ident = quote::format_ident!("{}_chECK", func.sig.ident); return (quote! 
{ #[cfg(target_os = "cuda")] diff --git a/rust-cuda-kernel/src/kernel/specialise/mod.rs b/rust-cuda-kernel/src/kernel/specialise/mod.rs new file mode 100644 index 000000000..e5dcd518e --- /dev/null +++ b/rust-cuda-kernel/src/kernel/specialise/mod.rs @@ -0,0 +1,3 @@ +pub mod entry_point; +pub mod function; +pub mod param_type; diff --git a/rust-cuda-kernel/src/kernel/specialise/param_type.rs b/rust-cuda-kernel/src/kernel/specialise/param_type.rs new file mode 100644 index 000000000..a398e5eac --- /dev/null +++ b/rust-cuda-kernel/src/kernel/specialise/param_type.rs @@ -0,0 +1,292 @@ +use proc_macro::TokenStream; +use quote::ToTokens; + +#[allow(clippy::module_name_repetitions)] +pub fn specialise_kernel_param_type(tokens: TokenStream) -> TokenStream { + let SpecialiseTypeConfig { + mut ty, + generics, + kernel, + } = match syn::parse_macro_input::parse(tokens) { + Ok(config) => config, + Err(err) => { + abort_call_site!( + "specialise_kernel_param_type!(TY for GENERICS in KERNEL) expects TY type, \ + GENERICS generics, and KERNEL identifier: {:?}", + err + ) + }, + }; + + let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { + Ok(crate_name) => crate_name.to_uppercase(), + Err(err) => abort_call_site!("Failed to read crate name: {:?}", err), + }; + + let specialisation_var = format!( + "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", + crate_name, + kernel.to_string().to_uppercase() + ); + + let specialisation = match proc_macro::tracked_env::var(&specialisation_var) { + Ok(specialisation) => specialisation, + Err(err) => abort_call_site!( + "Failed to read specialisation from {:?}: {:?}", + &specialisation_var, + err + ), + }; + let specialisation = match syn::parse_str(&specialisation) { + _ if specialisation.is_empty() => syn::PathArguments::None, + Ok(specialisation) => syn::PathArguments::AngleBracketed(specialisation), + Err(err) => abort_call_site!("Failed to parse specialisation: {:?}", err), + }; + + if let syn::PathArguments::AngleBracketed(syn::AngleBracketedGenericArguments { + args, .. + }) = specialisation + { + if generics.params.len() != args.len() { + abort_call_site!( + "Mismatch specialising {} with {}", + generics.split_for_impl().1.to_token_stream(), + args.to_token_stream() + ); + } + + // replace all lifetimes with 'static + ty = syn::fold::Fold::fold_type( + &mut FoldLifetimeAllStatic { + r#static: syn::parse_quote!('static), + }, + ty, + ); + + for (generic, arg) in generics.params.into_iter().zip(args.into_iter()) { + match (generic, arg) { + ( + syn::GenericParam::Lifetime(syn::LifetimeDef { + lifetime: _generic, .. + }), + syn::GenericArgument::Lifetime(_arg), + ) => { + // all lifetimes are already replaced with 'static above + }, + ( + syn::GenericParam::Const(syn::ConstParam { ident: generic, .. }), + syn::GenericArgument::Const(arg), + ) => { + ty = syn::fold::Fold::fold_type(&mut FoldConstGeneric { generic, arg }, ty); + }, + ( + syn::GenericParam::Type(syn::TypeParam { ident: generic, .. 
}), + syn::GenericArgument::Type(arg), + ) => { + ty = syn::fold::Fold::fold_type(&mut FoldTypeGeneric { generic, arg }, ty); + }, + (generic, arg) => abort_call_site!( + "Mismatch specialising {} with {}", + generic.to_token_stream(), + arg.to_token_stream() + ), + } + } + } else if !generics.params.is_empty() { + abort_call_site!( + "Missing specialisation for {}", + generics.split_for_impl().1.to_token_stream() + ); + } + + ty.into_token_stream().into() +} + +struct SpecialiseTypeConfig { + ty: syn::Type, + generics: syn::Generics, + kernel: syn::Ident, +} + +impl syn::parse::Parse for SpecialiseTypeConfig { + fn parse(input: syn::parse::ParseStream) -> syn::Result { + let ty: syn::Type = input.parse()?; + let _for: syn::token::For = input.parse()?; + let generics: syn::Generics = input.parse()?; + let _in: syn::token::In = input.parse()?; + let kernel: syn::Ident = input.parse()?; + + Ok(Self { + ty, + generics, + kernel, + }) + } +} + +struct FoldLifetimeAllStatic { + r#static: syn::Lifetime, +} + +impl syn::fold::Fold for FoldLifetimeAllStatic { + fn fold_type_reference(&mut self, r#ref: syn::TypeReference) -> syn::TypeReference { + let syn::TypeReference { + and_token, + lifetime: _, + mutability, + elem, + } = r#ref; + + syn::fold::fold_type_reference( + self, + syn::TypeReference { + and_token, + lifetime: Some(self.r#static.clone()), + mutability, + elem, + }, + ) + } + + fn fold_lifetime(&mut self, lt: syn::Lifetime) -> syn::Lifetime { + let mut r#static = self.r#static.clone(); + r#static.set_span(lt.span()); + r#static + } +} + +struct FoldConstGeneric { + generic: syn::Ident, + arg: syn::Expr, +} + +impl syn::fold::Fold for FoldConstGeneric { + fn fold_generic_argument(&mut self, arg: syn::GenericArgument) -> syn::GenericArgument { + let syn::GenericArgument::Type(syn::Type::Path(syn::TypePath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + })) = arg + else { + return syn::fold::fold_generic_argument(self, arg); + }; + + if let Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && segments.len() == 1 + && ident == &self.generic + { + return syn::GenericArgument::Const(self.arg.clone()); + } + + syn::fold::fold_generic_argument( + self, + syn::GenericArgument::Type(syn::Type::Path(syn::TypePath { + qself: None, + path: syn::Path { + leading_colon: None, + segments, + }, + })), + ) + } + + fn fold_expr(&mut self, expr: syn::Expr) -> syn::Expr { + let syn::Expr::Path(syn::ExprPath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + attrs, + }) = expr + else { + return syn::fold::fold_expr(self, expr); + }; + + if let Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && segments.len() == 1 + && ident == &self.generic + { + return self.arg.clone(); + } + + syn::fold::fold_expr( + self, + syn::Expr::Path(syn::ExprPath { + qself: None, + path: syn::Path { + leading_colon: None, + segments, + }, + attrs, + }), + ) + } +} + +struct FoldTypeGeneric { + generic: syn::Ident, + arg: syn::Type, +} + +impl syn::fold::Fold for FoldTypeGeneric { + fn fold_type(&mut self, ty: syn::Type) -> syn::Type { + let syn::Type::Path(syn::TypePath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + }) = ty + else { + return syn::fold::fold_type(self, ty); + }; + + if let Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && ident == &self.generic + { + return if 
segments.len() > 1 { + syn::Type::Path(syn::TypePath { + qself: Some(syn::QSelf { + lt_token: syn::parse_quote!(<), + ty: Box::new(self.arg.clone()), + position: 0, + as_token: None, + gt_token: syn::parse_quote!(>), + }), + path: syn::Path { + leading_colon: syn::parse_quote!(::), + segments: segments.into_iter().skip(1).collect(), + }, + }) + } else { + self.arg.clone() + }; + } + + syn::fold::fold_type( + self, + syn::Type::Path(syn::TypePath { + qself: None, + path: syn::Path { + leading_colon: None, + segments, + }, + }), + ) + } +} diff --git a/rust-cuda-derive/src/kernel/utils.rs b/rust-cuda-kernel/src/kernel/utils.rs similarity index 69% rename from rust-cuda-derive/src/kernel/utils.rs rename to rust-cuda-kernel/src/kernel/utils.rs index 5afd05858..c73876f09 100644 --- a/rust-cuda-derive/src/kernel/utils.rs +++ b/rust-cuda-kernel/src/kernel/utils.rs @@ -1,5 +1,3 @@ -use syn::spanned::Spanned; - pub fn skip_kernel_compilation() -> bool { let mut skip_compilation = false; @@ -13,7 +11,3 @@ pub fn skip_kernel_compilation() -> bool { skip_compilation } - -pub fn r2c_move_lifetime(arg: usize, ty: &syn::Type) -> syn::Lifetime { - syn::Lifetime::new(&format!("'__r2c_move_lt_{arg}"), ty.span()) -} diff --git a/rust-cuda-kernel/src/kernel/wrapper/config.rs b/rust-cuda-kernel/src/kernel/wrapper/config.rs new file mode 100644 index 000000000..66807f2d1 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/config.rs @@ -0,0 +1,17 @@ +pub(super) struct KernelConfig { + pub(super) visibility: Option, + pub(super) link: syn::Ident, +} + +impl syn::parse::Parse for KernelConfig { + fn parse(input: syn::parse::ParseStream) -> syn::Result { + let visibility: Option = input.parse()?; + let _use: syn::token::Use = input.parse()?; + let link: syn::Ident = input.parse()?; + let _bang: syn::token::Bang = input.parse()?; + let _for: syn::token::For = input.parse()?; + let _impl: syn::token::Impl = input.parse()?; + + Ok(Self { visibility, link }) + } +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs new file mode 100644 index 000000000..0799f4cc7 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -0,0 +1,95 @@ +use proc_macro2::TokenStream; +use syn::spanned::Spanned; +use quote::quote; + +use crate::kernel::wrapper::{DeclGenerics, FuncIdent}; + +pub(in super::super) fn quote_cuda_generic_function( + crate_path: &syn::Path, + DeclGenerics { + generic_start_token, + generic_kernel_params: generic_params, + generic_close_token, + .. + }: &DeclGenerics, + func_inputs: &syn::punctuated::Punctuated, + FuncIdent { func_ident, .. 
}: &FuncIdent, + func_attrs: &[syn::Attribute], + func_block: &syn::Block, +) -> TokenStream { + let mut generic_params = (*generic_params).clone(); + + let kernel_func_inputs = func_inputs + .iter() + .enumerate() + .map( + |( + i, + syn::PatType { + attrs, + ty, + pat, + colon_token, + }, + )| { + let (ty, lt) = if let syn::Type::Reference(syn::TypeReference { + and_token, + lifetime, + mutability, + elem, + }) = &**ty + { + let lifetime = lifetime.clone().unwrap_or_else(|| { + let lifetime = + syn::Lifetime::new(&format!("'__rust_cuda_lt_{i}"), ty.span()); + generic_params.insert( + 0, + syn::GenericParam::Lifetime(syn::LifetimeDef { + attrs: Vec::new(), + colon_token: None, + lifetime: lifetime.clone(), + bounds: syn::punctuated::Punctuated::new(), + }), + ); + lifetime + }); + let lt = quote!(#lifetime); + ( + syn::Type::Reference(syn::TypeReference { + and_token: *and_token, + lifetime: Some(lifetime), + mutability: *mutability, + elem: elem.clone(), + }), + lt, + ) + } else { + (syn::Type::clone(ty), quote!('_)) + }; + + let ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> + <#ty as #crate_path::kernel::CudaKernelParameter>::DeviceType<#lt> + }; + + syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ty), + pat: pat.clone(), + colon_token: *colon_token, + }) + }, + ) + .collect::>(); + + let generic_start_token = generic_start_token.unwrap_or_default(); + let generic_close_token = generic_close_token.unwrap_or_default(); + + quote! { + #[cfg(target_os = "cuda")] + #(#func_attrs)* + fn #func_ident #generic_start_token #generic_params #generic_close_token ( + #(#kernel_func_inputs),* + ) + #func_block + } +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs new file mode 100644 index 000000000..ff7e2ee48 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -0,0 +1,131 @@ +use proc_macro2::TokenStream; +use syn::spanned::Spanned; +use quote::quote; + +use crate::kernel::{ + wrapper::{FuncIdent, FunctionInputs, ImplGenerics}, + KERNEL_TYPE_LAYOUT_IDENT, KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY, +}; + +#[allow(clippy::too_many_lines)] +pub(in super::super) fn quote_cuda_wrapper( + crate_path: &syn::Path, + inputs @ FunctionInputs { func_inputs }: &FunctionInputs, + func @ FuncIdent { + func_ident, + func_ident_hash, + .. + }: &FuncIdent, + impl_generics @ ImplGenerics { + impl_generics: generics, + .. + }: &ImplGenerics, + func_attrs: &[syn::Attribute], + func_params: &[syn::Ident], +) -> TokenStream { + let (ffi_inputs, ffi_types) = + specialise_ffi_input_types(crate_path, inputs, func, impl_generics); + + let ffi_param_ptx_jit_wrap = func_inputs.iter().enumerate().rev().fold( + quote! { + #func_ident(#(#func_params),*) + }, + |inner, (i, syn::PatType { pat, ty, .. })| { + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_param_type!(#ty for #generics in #func_ident) + }; + + // Load the device param from its FFI representation + // To allow some parameters to also inject PTX JIT load markers here, + // we pass them the param index i + quote::quote_spanned! 
{ ty.span()=> + unsafe { + < + #specialised_ty as #crate_path::kernel::CudaKernelParameter + >::with_ffi_as_device::<_, #i>( + #pat, |#pat: < + #specialised_ty as #crate_path::kernel::CudaKernelParameter + >::DeviceType::<'_>| { #inner } + ) + } + } + }, + ); + + let private_func_params = func_params + .iter() + .map(|param| { + let mut private = syn::Ident::clone(param); + private.set_span(proc_macro::Span::def_site().into()); + private + }) + .collect::>(); + + let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, func_ident.span()); + let ffi_signature_ty = quote! { extern "C" fn(#(#ffi_types),*) }; + + quote! { + #[cfg(target_os = "cuda")] + #[#crate_path::device::specialise_kernel_function(#func_ident)] + #[no_mangle] + #[allow(unused_unsafe)] + #(#func_attrs)* + pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ffi_inputs),*) { + extern "C" { #( + #[allow(dead_code)] + #[deny(improper_ctypes)] + static #private_func_params: #ffi_types; + )* } + + unsafe { + // Initialise the dynamically-sized thread-block shared memory + // and the thread-local offset pointer that points to it + #crate_path::utils::shared::init(); + } + + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } + #[no_mangle] + static #ffi_signature_ident: [ + u8; #crate_path::deps::const_type_layout::serialised_type_graph_len::<#ffi_signature_ty>() + ] = #crate_path::deps::const_type_layout::serialise_type_graph::<#ffi_signature_ty>(); + unsafe { ::core::ptr::read_volatile(&#ffi_signature_ident) }; + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_END_CANARY); } + + #ffi_param_ptx_jit_wrap + } + } +} + +fn specialise_ffi_input_types( + crate_path: &syn::Path, + FunctionInputs { func_inputs }: &FunctionInputs, + FuncIdent { func_ident, .. }: &FuncIdent, + ImplGenerics { impl_generics, .. }: &ImplGenerics, +) -> (Vec, Vec) { + func_inputs + .iter() + .map(|syn::PatType { + attrs, + pat, + colon_token, + ty, + }| { + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_param_type!(#ty for #impl_generics in #func_ident) + }; + + let ffi_ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> + <#specialised_ty as #crate_path::kernel::CudaKernelParameter>::FfiType<'static, 'static> + }; + + let ffi_param = syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ffi_ty.clone()), + pat: pat.clone(), + colon_token: *colon_token, + }); + + (ffi_param, ffi_ty) + }) + .unzip() +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs new file mode 100644 index 000000000..757f22470 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs @@ -0,0 +1,72 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; + +pub(in super::super) fn quote_host_kernel_ty( + crate_path: &syn::Path, + DeclGenerics { + generic_kernel_params, + generic_start_token, + generic_close_token, + .. + }: &DeclGenerics, + ImplGenerics { ty_generics, .. }: &ImplGenerics, + FunctionInputs { func_inputs }: &FunctionInputs, + FuncIdent { func_ident, .. }: &FuncIdent, + func_params: &[syn::Ident], + func_attrs: &[syn::Attribute], +) -> TokenStream { + let cuda_kernel_param_tys = func_inputs + .iter() + .map(|syn::PatType { ty, .. 
}| &**ty) + .collect::>(); + + let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); + + let full_generics = generic_kernel_params + .iter() + .map(|param| match param { + syn::GenericParam::Type(syn::TypeParam { ident, .. }) + | syn::GenericParam::Const(syn::ConstParam { ident, .. }) => quote!(#ident), + syn::GenericParam::Lifetime(syn::LifetimeDef { lifetime, .. }) => quote!(#lifetime), + }) + .collect::>(); + + let mut private_func_ident = syn::Ident::clone(func_ident); + private_func_ident.set_span(proc_macro::Span::def_site().into()); + + let ty_turbofish = ty_generics.as_turbofish(); + + quote! { + #[cfg(not(target_os = "cuda"))] + #[allow(non_camel_case_types)] + pub type #func_ident #generic_start_token + #generic_kernel_params + #generic_close_token = impl Fn( + &mut #crate_path::kernel::Launcher<#func_ident #generic_start_token + #(#full_generics),* + #generic_close_token>, + #(#cuda_kernel_param_tys),* + ); + + #[cfg(not(target_os = "cuda"))] + #(#func_attrs)* + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + fn #private_func_ident #generic_start_token + #generic_kernel_params + #generic_close_token ( + #launcher: &mut #crate_path::kernel::Launcher<#func_ident #generic_start_token + #(#full_generics),* + #generic_close_token>, + #func_inputs + ) { + let _: #func_ident <#(#full_generics),*> = #private_func_ident #ty_turbofish; + + #( + let _ = #func_params; + )* + } + } +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs new file mode 100644 index 000000000..1813942d8 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs @@ -0,0 +1,50 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::kernel::wrapper::{FunctionInputs, ImplGenerics}; + +pub(in super::super) fn quote_args_trait( + args: &syn::Ident, + ImplGenerics { + impl_generics, + ty_generics, + }: &ImplGenerics, + FunctionInputs { func_inputs }: &FunctionInputs, +) -> TokenStream { + let func_input_typedefs = (0..func_inputs.len()) + .map(|i| { + let type_ident = quote::format_ident!("__T_{}", i); + + quote! { + type #type_ident; + } + }) + .collect::>(); + + let func_input_types = func_inputs + .iter() + .enumerate() + .map(|(i, pat_type)| { + let type_ident = quote::format_ident!("__T_{}", i); + let arg_type = match &*pat_type.ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, + }; + + quote! { + type #type_ident = #arg_type; + } + }) + .collect::>(); + + quote! { + #[allow(non_camel_case_types)] + pub trait #args #impl_generics { + #(#func_input_typedefs)* + } + + impl #impl_generics #args #ty_generics for () { + #(#func_input_types)* + } + } +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs new file mode 100644 index 000000000..ef65d5596 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -0,0 +1,151 @@ +use proc_macro2::TokenStream; +use syn::spanned::Spanned; +use quote::quote; + +use crate::kernel::{ + utils::skip_kernel_compilation, + wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}, + KERNEL_TYPE_LAYOUT_IDENT, PTX_CSTR_IDENT, +}; + +#[allow(clippy::too_many_arguments)] +pub(super) fn quote_get_ptx( + crate_path: &syn::Path, + FuncIdent { + func_ident, + func_ident_hash, + .. 
+ }: &FuncIdent, + generics @ DeclGenerics { + generic_start_token, + generic_close_token, + .. + }: &DeclGenerics, + impl_generics: &ImplGenerics, + inputs: &FunctionInputs, + func_params: &[syn::Ident], + macro_type_ids: &[syn::Ident], + ptx_lint_levels: &TokenStream, +) -> TokenStream { + let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { + Ok(crate_name) => crate_name.to_uppercase(), + Err(err) => abort_call_site!("Failed to read crate name: {:?}.", err), + }; + + let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") + .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); + + let args = syn::Ident::new("KernelArgs", proc_macro::Span::def_site().into()); + let args_trait = super::args_trait::quote_args_trait(&args, impl_generics, inputs); + + let cpu_func_lifetime_erased_types = + generate_lifetime_erased_types(crate_path, &args, generics, inputs, macro_type_ids); + + let ptx_cstr_ident = syn::Ident::new(PTX_CSTR_IDENT, func_ident.span()); + + let matching_kernel_assert = if skip_kernel_compilation() { + quote!() + } else { + quote::quote_spanned! { func_ident.span()=> + const _: #crate_path::safety::ptx_entry_point::Assert<{ + #crate_path::safety::ptx_entry_point::HostAndDeviceKernelEntryPoint::Match + }> = #crate_path::safety::ptx_entry_point::Assert::<{ + #crate_path::safety::ptx_entry_point::check( + #ptx_cstr_ident.to_bytes(), + #crate_path::kernel::specialise_kernel_entry_point!( + #func_ident_hash #generic_start_token + #($#macro_type_ids),* + #generic_close_token + ).to_bytes(), + ) + }>; + } + }; + + let signature_layout_assert = if skip_kernel_compilation() { + quote!() + } else { + let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, func_ident.span()); + let ffi_signature_ty = quote! { extern "C" fn(#(#cpu_func_lifetime_erased_types),*) }; + + quote::quote_spanned! { func_ident.span()=> + const _: #crate_path::safety::ptx_kernel_signature::Assert<{ + #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match + }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ + #ffi_signature_ident::<#ffi_signature_ty>() + }>; + } + }; + + let private_func_params = func_params + .iter() + .map(|param| { + let mut private = syn::Ident::clone(param); + private.set_span(proc_macro::Span::def_site().into()); + private + }) + .collect::>(); + + quote! { + fn get_ptx() -> &'static ::core::ffi::CStr { + // FIXME: don't use imports here + use #crate_path::deps::const_type_layout::{TypeGraphLayout, check_serialised_type_graph}; + use #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout; + + #args_trait + + extern "C" { #( + #[allow(dead_code)] + #[deny(improper_ctypes)] + static #private_func_params: #cpu_func_lifetime_erased_types; + )* } + + #crate_path::kernel::compile_kernel!{ + #func_ident #func_ident_hash #crate_name #crate_manifest_dir #generic_start_token + #($#macro_type_ids),* + #generic_close_token #ptx_lint_levels + } + + #matching_kernel_assert + + #signature_layout_assert + + #ptx_cstr_ident + } + } +} + +fn generate_lifetime_erased_types( + crate_path: &syn::Path, + args: &syn::Ident, + DeclGenerics { + generic_start_token, + generic_close_token, + .. + }: &DeclGenerics, + FunctionInputs { func_inputs }: &FunctionInputs, + macro_type_ids: &[syn::Ident], +) -> Vec { + func_inputs + .iter() + .enumerate() + .map(|(i, syn::PatType { ty, .. 
})| { + let type_ident = quote::format_ident!("__T_{}", i); + + let mut specialised_ty = quote::quote_spanned! { ty.span()=> + <() as #args #generic_start_token + #($#macro_type_ids),* + #generic_close_token>::#type_ident + }; + // the args trait has to unbox outer lifetimes, so we need to add them back in here + if let syn::Type::Reference(syn::TypeReference { and_token, lifetime, mutability, .. }) = &**ty { + let lifetime = quote::quote_spanned! { lifetime.span()=> 'static }; + + specialised_ty = quote! { #and_token #lifetime #mutability #specialised_ty }; + } + + quote::quote_spanned! { ty.span()=> + <#specialised_ty as #crate_path::kernel::CudaKernelParameter>::FfiType<'static, 'static> + } + }).collect() +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs new file mode 100644 index 000000000..353e6c5dc --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs @@ -0,0 +1,111 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; + +mod args_trait; +mod get_ptx; + +use get_ptx::quote_get_ptx; + +#[allow(clippy::too_many_arguments)] // FIXME +pub(in super::super) fn quote_host_link_macro( + crate_path: &syn::Path, + KernelConfig { + visibility, link, .. + }: &KernelConfig, + decl_generics @ DeclGenerics { + generic_start_token, + generic_close_token, + generic_kernel_params, + .. + }: &DeclGenerics, + impl_generics: &ImplGenerics, + func_inputs: &FunctionInputs, + func_ident @ FuncIdent { + func_ident: func_ident_name, + func_ident_hash, + .. + }: &FuncIdent, + func_params: &[syn::Ident], + ptx_lint_levels: &TokenStream, +) -> TokenStream { + let macro_generics = generic_kernel_params + .iter() + .enumerate() + .map(|(i, generic)| { + let generic_ident = quote::format_ident!("__g_{}", i); + + match generic { + syn::GenericParam::Type(_) => quote!($#generic_ident:ty), + syn::GenericParam::Const(_) => quote!($#generic_ident:expr), + syn::GenericParam::Lifetime(_) => quote!($#generic_ident:lifetime), + } + }) + .collect::>(); + + let macro_generic_ids = (0..generic_kernel_params.len()) + .map(|i| quote::format_ident!("__g_{}", i)) + .collect::>(); + + let macro_only_lt_generic_ids = generic_kernel_params + .iter() + .enumerate() + .filter_map(|(i, generic)| { + let generic_ident = quote::format_ident!("__g_{}", i); + + match generic { + syn::GenericParam::Type(_) | syn::GenericParam::Const(_) => None, + syn::GenericParam::Lifetime(_) => Some(generic_ident), + } + }) + .collect::>(); + + let macro_non_lt_generic_ids = generic_kernel_params + .iter() + .enumerate() + .filter_map(|(i, generic)| { + let generic_ident = quote::format_ident!("__g_{}", i); + + match generic { + syn::GenericParam::Type(_) | syn::GenericParam::Const(_) => Some(generic_ident), + syn::GenericParam::Lifetime(_) => None, + } + }) + .collect::>(); + + let get_ptx = quote_get_ptx( + crate_path, + func_ident, + decl_generics, + impl_generics, + func_inputs, + func_params, + ¯o_non_lt_generic_ids, + ptx_lint_levels, + ); + + quote! { + #[cfg(not(target_os = "cuda"))] + #visibility macro #link( + impl #func_ident_name #generic_start_token + #(#macro_generics),* $(,)? 
+ #generic_close_token for $ptx:ident + ) { + unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::kernel::CompiledKernelPtx< + #func_ident_name #generic_start_token #($#macro_generic_ids),* #generic_close_token + > for $ptx #generic_start_token #($#macro_generic_ids),* #generic_close_token + { + #get_ptx + + fn get_entry_point() -> &'static ::core::ffi::CStr { + #crate_path::kernel::specialise_kernel_entry_point!( + #func_ident_hash #generic_start_token + #($#macro_non_lt_generic_ids),* + #generic_close_token + ) + } + } + } + } +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs new file mode 100644 index 000000000..829cb0433 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs @@ -0,0 +1,4 @@ +pub mod cuda_generic_function; +pub mod cuda_wrapper; +pub mod host_kernel_ty; +pub mod host_link_macro; diff --git a/rust-cuda-kernel/src/kernel/wrapper/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/mod.rs new file mode 100644 index 000000000..9dffacc51 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/mod.rs @@ -0,0 +1,354 @@ +use std::{ + collections::HashMap, + hash::{Hash, Hasher}, +}; + +use proc_macro::TokenStream; + +mod config; +mod generate; +mod parse; + +use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; + +use config::KernelConfig; +use generate::{ + cuda_generic_function::quote_cuda_generic_function, cuda_wrapper::quote_cuda_wrapper, + host_kernel_ty::quote_host_kernel_ty, host_link_macro::quote_host_link_macro, +}; +use parse::parse_kernel_fn; +use proc_macro2::{Ident, Span}; +use syn::spanned::Spanned; +use quote::quote; + +#[allow(clippy::too_many_lines)] +pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { + let mut hasher = seahash::SeaHasher::new(); + + attr.to_string().hash(&mut hasher); + func.to_string().hash(&mut hasher); + + let kernel_hash = hasher.finish(); + + let config: KernelConfig = match syn::parse_macro_input::parse(attr) { + Ok(config) => config, + Err(err) => { + abort_call_site!( + "#[kernel(pub? use LINK! for impl)] expects LINK macro identifier: {:?}", + err + ) + }, + }; + + let mut func = parse_kernel_fn(func); + + let mut crate_path = None; + let mut ptx_lint_levels = HashMap::new(); + + func.attrs.retain(|attr| { + if attr.path.is_ident("kernel") { + if let Ok(syn::Meta::List(list)) = attr.parse_meta() { + for meta in &list.nested { + match meta { + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path, + lit: syn::Lit::Str(s), + .. + })) if path.is_ident("crate") => match syn::parse_str::(&s.value()) { + Ok(new_crate_path) => { + if crate_path.is_none() { + crate_path = Some( + syn::parse_quote_spanned! { s.span() => #new_crate_path }, + ); + + continue; + } + + emit_error!( + s.span(), + "[rust-cuda]: Duplicate #[kernel(crate)] attribute.", + ); + }, + Err(err) => emit_error!( + s.span(), + "[rust-cuda]: Invalid #[kernel(crate = \ + \"\")] attribute: {}.", + err + ), + }, + syn::NestedMeta::Meta(syn::Meta::List(syn::MetaList { + path, + nested, + .. + })) if path.is_ident("allow") || path.is_ident("warn") || path.is_ident("deny") || path.is_ident("forbid") => { + parse_ptx_lint_level(path, nested, &mut ptx_lint_levels); + }, + _ => { + emit_error!( + meta.span(), + "[rust-cuda]: Expected #[kernel(crate = \"\")] or #[kernel(allow/warn/deny/forbid())] function attribute." 
+ );
+ }
+ }
+ }
+ } else {
+ emit_error!(
+ attr.span(),
+ "[rust-cuda]: Expected #[kernel(crate = \"\")] or #[kernel(allow/warn/deny/forbid())] function attribute."
+ );
+ }
+
+ false
+ } else {
+ true
+ }
+ });
+
+ let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda));
+
+ let _ = ptx_lint_levels.try_insert(PtxLint::Verbose, LintLevel::Allow);
+ let _ = ptx_lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn);
+ let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUse, LintLevel::Warn);
+ let _ = ptx_lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn);
+ let _ = ptx_lint_levels.try_insert(PtxLint::DumpAssembly, LintLevel::Allow);
+ let _ = ptx_lint_levels.try_insert(PtxLint::DynamicStackSize, LintLevel::Warn);
+
+ let ptx_lint_levels = {
+ let (lints, levels): (Vec<_>, Vec<_>) = ptx_lint_levels
+ .into_iter()
+ .map(|(lint, level)| {
+ (
+ Ident::new(&lint.to_string(), Span::call_site()),
+ Ident::new(&level.to_string(), Span::call_site()),
+ )
+ })
+ .unzip();
+
+ quote! {
+ #(#levels(ptx::#lints)),*
+ }
+ };
+
+ let mut func_inputs = FunctionInputs {
+ func_inputs: func
+ .sig
+ .inputs
+ .into_iter()
+ .map(|arg| match arg {
+ syn::FnArg::Typed(arg) => arg,
+ syn::FnArg::Receiver(_) => {
+ unreachable!("already checked that no receiver arg exists")
+ },
+ })
+ .collect(),
+ };
+
+ let generic_kernel_params = func.sig.generics.params.clone();
+ let (generic_start_token, generic_close_token) =
+ (func.sig.generics.lt_token, func.sig.generics.gt_token);
+
+ let generic_trait_params = generic_kernel_params
+ .iter()
+ .filter(|generic_param| !matches!(generic_param, syn::GenericParam::Lifetime(_)))
+ .cloned()
+ .collect();
+
+ let decl_generics = DeclGenerics {
+ generic_start_token: &generic_start_token,
+ generic_close_token: &generic_close_token,
+ generic_kernel_params: &generic_kernel_params,
+ };
+ let trait_generics = syn::Generics {
+ lt_token: generic_start_token,
+ params: generic_trait_params,
+ gt_token: generic_close_token,
+ where_clause: None,
+ };
+ let (impl_generics, ty_generics, _where_clause) = trait_generics.split_for_impl();
+ let impl_generics = ImplGenerics {
+ impl_generics,
+ ty_generics,
+ };
+
+ let func_ident = FuncIdent {
+ func_ident: &func.sig.ident,
+ func_ident_hash: quote::format_ident!("{}_{:016x}", &func.sig.ident, kernel_hash),
+ };
+
+ let func_params = func_inputs
+ .func_inputs
+ .iter()
+ .enumerate()
+ .map(|(i, syn::PatType { pat, .. })| match ident_from_pat(pat) {
+ Some(ident) => ident,
+ None => syn::Ident::new(&format!("{}_arg_{i}", func_ident.func_ident), pat.span()),
+ })
+ .collect::<Vec<_>>();
+
+ let pat_func_inputs = func_inputs
+ .func_inputs
+ .iter_mut()
+ .zip(&func_params)
+ .map(|(arg, ident)| {
+ let syn::PatType {
+ attrs,
+ colon_token,
+ ty,
+ ..
+ } = arg; + + let ident_fn_arg = syn::PatType { + attrs: attrs.clone(), + pat: Box::new(syn::Pat::Ident(syn::PatIdent { + attrs: Vec::new(), + by_ref: None, + mutability: None, + ident: ident.clone(), + subpat: None, + })), + colon_token: *colon_token, + ty: ty.clone(), + }; + + std::mem::replace(arg, ident_fn_arg) + }) + .collect(); + + let host_kernel_ty = quote_host_kernel_ty( + &crate_path, + &decl_generics, + &impl_generics, + &func_inputs, + &func_ident, + &func_params, + &func.attrs, + ); + let host_generic_kernel_check = quote_generic_check(&crate_path, &func_ident); + let host_link_macro = quote_host_link_macro( + &crate_path, + &config, + &decl_generics, + &impl_generics, + &func_inputs, + &func_ident, + &func_params, + &ptx_lint_levels, + ); + let cuda_wrapper = quote_cuda_wrapper( + &crate_path, + &func_inputs, + &func_ident, + &impl_generics, + &func.attrs, + &func_params, + ); + let cuda_generic_function = quote_cuda_generic_function( + &crate_path, + &decl_generics, + &pat_func_inputs, + &func_ident, + &func.attrs, + &func.block, + ); + + (quote! { + #host_kernel_ty + + #host_generic_kernel_check + + #host_link_macro + + #cuda_wrapper + #cuda_generic_function + }) + .into() +} + +struct FunctionInputs { + func_inputs: syn::punctuated::Punctuated, +} + +#[allow(clippy::struct_field_names)] +struct DeclGenerics<'f> { + generic_start_token: &'f Option, + generic_close_token: &'f Option, + generic_kernel_params: &'f syn::punctuated::Punctuated, +} + +struct ImplGenerics<'f> { + #[allow(clippy::struct_field_names)] + impl_generics: syn::ImplGenerics<'f>, + ty_generics: syn::TypeGenerics<'f>, +} + +#[allow(clippy::struct_field_names)] +struct FuncIdent<'f> { + func_ident: &'f syn::Ident, + func_ident_hash: syn::Ident, +} + +fn ident_from_pat(pat: &syn::Pat) -> Option { + match pat { + syn::Pat::Lit(_) + | syn::Pat::Macro(_) + | syn::Pat::Path(_) + | syn::Pat::Range(_) + | syn::Pat::Rest(_) + | syn::Pat::Verbatim(_) + | syn::Pat::Wild(_) => None, + syn::Pat::Ident(syn::PatIdent { ident, .. }) => Some(ident.clone()), + syn::Pat::Box(syn::PatBox { pat, .. }) + | syn::Pat::Reference(syn::PatReference { pat, .. }) + | syn::Pat::Type(syn::PatType { pat, .. }) => ident_from_pat(pat), + syn::Pat::Or(syn::PatOr { cases, .. }) => ident_from_pat_iter(cases.iter()), + syn::Pat::Slice(syn::PatSlice { elems, .. }) + | syn::Pat::TupleStruct(syn::PatTupleStruct { + pat: syn::PatTuple { elems, .. }, + .. + }) + | syn::Pat::Tuple(syn::PatTuple { elems, .. }) => ident_from_pat_iter(elems.iter()), + syn::Pat::Struct(syn::PatStruct { fields, .. }) => { + ident_from_pat_iter(fields.iter().map(|field| &*field.pat)) + }, + _ => Err(()).ok(), + } +} + +fn ident_from_pat_iter<'p, I: Iterator>(iter: I) -> Option { + iter.filter_map(ident_from_pat) + .fold(None, |acc: Option<(String, Span)>, ident| { + if let Some((mut str_acc, span_acc)) = acc { + str_acc.push('_'); + str_acc.push_str(ident.to_string().trim_matches('_')); + + Some((str_acc, span_acc.join(ident.span()).unwrap())) + } else { + Some((ident.to_string(), ident.span())) + } + }) + .map(|(string, span)| syn::Ident::new(&string, span)) +} + +fn quote_generic_check( + crate_path: &syn::Path, + FuncIdent { + func_ident, + func_ident_hash, + .. 
+ }: &FuncIdent, +) -> proc_macro2::TokenStream { + let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { + Ok(crate_name) => crate_name.to_uppercase(), + Err(err) => abort_call_site!("Failed to read crate name: {:?}.", err), + }; + + let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") + .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); + + quote::quote_spanned! { func_ident_hash.span()=> + #[cfg(not(target_os = "cuda"))] + #crate_path::kernel::check_kernel! { + #func_ident #func_ident_hash #crate_name #crate_manifest_dir + } + } +} diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-kernel/src/kernel/wrapper/parse.rs similarity index 66% rename from rust-cuda-derive/src/kernel/wrapper/parse.rs rename to rust-cuda-kernel/src/kernel/wrapper/parse.rs index 936143cf2..8d1662772 100644 --- a/rust-cuda-derive/src/kernel/wrapper/parse.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/parse.rs @@ -23,7 +23,7 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { if func.sig.asyncness.is_some() { abort!( func.sig.asyncness.span(), - "Kernel function must not (yet) be async." + "Kernel function must not be async." ); } @@ -41,6 +41,20 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { ); } + for param in &func.sig.inputs { + if let syn::FnArg::Receiver(receiver) = param { + abort!(receiver.span(), "Kernel function must not have a receiver."); + } + } + + if func.sig.inputs.len() > 12 { + emit_warning!( + func.sig.inputs.span(), + "Kernel function has too many arguments, {} were found but at most 12 are supported.", + func.sig.inputs.len() + ); + } + match &func.sig.output { syn::ReturnType::Default => (), syn::ReturnType::Type(_, box syn::Type::Tuple(tuple)) if tuple.elems.is_empty() => (), @@ -50,5 +64,12 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { ), }; + if let Some(r#where) = &func.sig.generics.where_clause { + abort!( + r#where.span(), + "Kernel function must not have a where clause, use type generic bounds instead." + ); + } + func } diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs new file mode 100644 index 000000000..e6d5cf3ac --- /dev/null +++ b/rust-cuda-kernel/src/lib.rs @@ -0,0 +1,207 @@ +//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License +//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod +//! Ready-to-Code]][gitpod] +//! +//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main +//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +//! +//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +//! [repo]: https://github.com/juntyr/rust-cuda +//! +//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue +//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda_kernel/ +//! +//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield +//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield +//! +//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx +//! [codecov]: https://codecov.io/gh/juntyr/rust-cuda +//! +//! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod +//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda +//! 
+//! `rust-cuda-kernel` provides the [`#[kernel]`](macro@kernel) attribute +//! macro. When applied to a function, it compiles it as a CUDA kernel that +//! can be *safely* called from Rust code on the host. + +#![deny(clippy::complexity)] +#![deny(clippy::correctness)] +#![warn(clippy::nursery)] +#![warn(clippy::pedantic)] +#![deny(clippy::perf)] +#![deny(clippy::style)] +#![deny(clippy::suspicious)] +#![deny(unsafe_code)] +#![warn(missing_docs)] +#![feature(box_patterns)] +#![feature(proc_macro_tracked_env)] +#![feature(proc_macro_span)] +#![feature(let_chains)] +#![feature(map_try_insert)] +#![feature(proc_macro_def_site)] +#![feature(proc_macro_c_str_literals)] +#![feature(cfg_version)] +#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] + +extern crate proc_macro; + +#[macro_use] +extern crate proc_macro_error; + +use proc_macro::TokenStream; + +mod kernel; + +#[proc_macro_error] +#[proc_macro_attribute] +/// Provides the [`#[kernel]`](macro@kernel) attribute macro. When applied to a +/// function, it compiles it as a CUDA kernel that can be *safely* called from +/// Rust code on the host. +/// +/// The annotated function must be public, not const, not async, not have an +/// explicit ABI, not be variadic, not have a receiver (e.g. `&self`), and +/// return the unit type `()`. At the moment, the kernel function must also +/// not use a where clause – use type generic bounds instead. +/// +/// While the [`#[kernel]`](macro@kernel) attribute supports functions with any +/// number of arguments, [`rust_cuda::kernel::TypedPtxKernel`] only supports +/// launching kernels with up to 12 parameters at the moment. +/// +/// The [`#[kernel]`](macro@kernel) attribute uses the following syntax: +/// +/// ```rust,ignore +/// #[kernel(pub? use link! for impl)] +/// fn my_kernel(/* parameters */) { +/// /* kernel code */ +/// } +/// ``` +/// +/// where `link` is the name of a macro that will be generated to manually link +/// specific monomorphised instantiations of the (optionally generic) kernel +/// function, and the optional `pub` controls whether this macro is public or +/// private. +/// +/// Note that all kernel parameters must implement the sealed +/// [`rust_cuda::kernel::CudaKernelParameter`] trait. +/// +/// To use a specific monomorphised instantiation of the kernel, the generated +/// `link!` macro must be invoked with the following syntax: +/// +/// ```rust,ignore +/// struct KernelPtx; +/// link! { impl my_kernel for KernelPtx } +/// ``` +/// for the non-generic kernel function `my_kernel` and a non-generic marker +/// type `KernelPtx`, which can be used as the generic `Kernel` type parameter +/// for [`rust_cuda::kernel::TypedPtxKernel`] to instantiate and launch the +/// kernel. Specifically, the [`rust_cuda::kernel::CompiledKernelPtx`] trait is +/// implemented for the `KernelPtx` type. +/// +/// If the kernel function is generic, the following syntax is used instead: +/// ```rust,ignore +/// #[kernel(pub? use link! for impl)] +/// fn my_kernel<'a, A, B: Bounded, const N: usize>(/* parameters */) { +/// /* kernel code */ +/// } +/// +/// struct KernelPtx<'a, A, B: Bounded, const N: usize>(/* ... */); +/// link! { impl my_kernel<'a, u32, MyStruct, 42> for KernelPtx } +/// link! { impl my_kernel<'a, bool, MyOtherStruct, 24> for KernelPtx } +/// ``` +/// +/// If the kernel generic space is closed, the `link!` macro can be made +/// private and all instantiations must be requested in the same crate that +/// defines the kernel function. 
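+///
+/// For example (an illustrative sketch, not taken from this crate; the names
+/// `histogram` and `HistogramPtx` are placeholders), a closed set of
+/// instantiations can be linked privately right next to the kernel:
+///
+/// ```rust,ignore
+/// #[kernel(use link! for impl)]
+/// fn histogram<const BINS: usize>(/* parameters */) {
+///     /* kernel code */
+/// }
+///
+/// struct HistogramPtx<const BINS: usize>;
+/// link! { impl histogram<64> for HistogramPtx }
+/// link! { impl histogram<256> for HistogramPtx }
+/// ```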
+/// If downstream code should be allowed to use and compile new specific
+/// monomorphised instantiations of the kernel, the `link!` macro should be
+/// publicly exported. Then, downstream code can define its own `MyKernelPtx`
+/// marker types for which the kernel is linked and which can be passed to
+/// [`rust_cuda::kernel::CompiledKernelPtx`]-generic code in the
+/// kernel-defining crate to construct the requested
+/// [`rust_cuda::kernel::TypedPtxKernel`].
+///
+/// Inside the scope of the [`#[kernel]`](macro@kernel) attribute, a helper
+/// `#[kernel(...)]` attribute can be applied to the kernel function:
+///
+/// - `#[kernel(crate = "<crate-path>")]` changes the path to the [`rust-cuda`]
+///   crate that the kernel compilation uses, which by default is `rust_cuda`.
+/// - `#[kernel(allow/warn/deny/forbid(<lint>))]` checks the specified
+///   CUDA-specific lint for each kernel compilation, using default Rust
+///   semantics for allowing, warning on, denying, or forbidding a lint. The
+///   following lints are supported:
+///   - `ptx::double_precision_use`: check for any uses of [`f64`] operations
+///     inside the compiled PTX binary, as they are often significantly less
+///     performant on NVIDIA GPUs than [`f32`] operations. By default,
+///     `#[kernel(warn(ptx::double_precision_use))]` is set.
+///   - `ptx::local_memory_use`: check for any usage of local memory, which
+///     may slow down kernel execution. By default,
+///     `#[kernel(warn(ptx::local_memory_use))]` is set.
+///   - `ptx::register_spills`: check for any spills of registers to local
+///     memory. While using fewer registers can allow more kernels to be run
+///     in parallel, register spills may also point to missed optimisations.
+///     By default, `#[kernel(warn(ptx::register_spills))]` is set.
+///   - `ptx::dynamic_stack_size`: check if the PTX compiler is unable to
+///     statically determine the size of the required kernel function stack.
+///     When the static stack size is known, the compiler may be able to keep
+///     it entirely within the fast register file. However, when the stack
+///     size is dynamic, more costly memory load and store operations are
+///     needed. By default, `#[kernel(warn(ptx::dynamic_stack_size))]` is set.
+///   - `ptx::verbose`: utility lint to output verbose PTX compiler messages
+///     as warnings (`warn`) or errors (`deny` or `forbid`) or to not output
+///     them (`allow`). By default, `#[kernel(allow(ptx::verbose))]` is set.
+///   - `ptx::dump_assembly`: utility lint to output the compiled PTX assembly
+///     code as a warning (`warn`) or an error (`deny` or `forbid`) or to not
+///     output it (`allow`). By default, `#[kernel(allow(ptx::dump_assembly))]`
+///     is set.
+///
+/// [`rust_cuda::kernel::TypedPtxKernel`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/struct.TypedPtxKernel.html
+/// [`rust_cuda::kernel::CudaKernelParameter`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/trait.CudaKernelParameter.html
+/// [`rust_cuda::kernel::CompiledKernelPtx`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/trait.CompiledKernelPtx.html
+/// [`rust-cuda`]: https://juntyr.github.io/rust-cuda/rust_cuda
+pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream {
+ kernel::wrapper::kernel(attr, func)
+}
+
+#[doc(hidden)]
+#[proc_macro_error]
+#[proc_macro]
+/// Helper macro to specialise the generic kernel param types when compiling
+/// the specialised kernel for CUDA.
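+///
+/// As an illustrative sketch only (the kernel name `my_kernel` and the
+/// parameter type are hypothetical): if the specialisation environment
+/// variable for `my_kernel` holds `<'a, u32, 4>`, then an invocation like
+///
+/// ```rust,ignore
+/// specialise_kernel_param_type!(&'a [T; N] for <'a, T, const N: usize> in my_kernel)
+/// ```
+///
+/// resolves to `&'static [u32; 4]`: all lifetimes are folded to `'static`,
+/// and the type and const generic parameters are substituted with the
+/// matching specialisation arguments.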
+pub fn specialise_kernel_param_type(tokens: TokenStream) -> TokenStream { + kernel::specialise::param_type::specialise_kernel_param_type(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +/// Helper macro to specialise the CUDA kernel entry point name, used on the +/// host for linking to it. +pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { + kernel::specialise::entry_point::specialise_kernel_entry_point(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro_attribute] +/// Helper macro to specialise the name of the CUDA kernel function item, used +/// to give each specialised version a unique ident when compiling for CUDA. +pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { + kernel::specialise::function::specialise_kernel_function(attr, func) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +/// Helper macro to cheaply check the generic CUDA kernel, used on the host to +/// provide code error feedback even when no specialised kernel is linked. +pub fn check_kernel(tokens: TokenStream) -> TokenStream { + kernel::link::check_kernel(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +/// Helper macro to compile a specialised CUDA kernel and produce its PTX +/// assembly code, which is used on the host when linking specialised kernels. +pub fn compile_kernel(tokens: TokenStream) -> TokenStream { + kernel::link::compile_kernel(tokens) +} diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml deleted file mode 100644 index f2a4cd09a..000000000 --- a/rust-cuda-ptx-jit/Cargo.toml +++ /dev/null @@ -1,17 +0,0 @@ -[package] -name = "rust-cuda-ptx-jit" -version = "0.1.0" -authors = ["Juniper Tyree "] -license = "MIT OR Apache-2.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[features] -default = [] -host = ["regex", "rustacuda", "lazy_static"] - -[dependencies] -rustacuda = { version = "0.1.3", optional = true } -regex = { version = "1.5", optional = true } -lazy_static = { version = "1.4", optional = true } diff --git a/rust-cuda-ptx-jit/src/device.rs b/rust-cuda-ptx-jit/src/device.rs deleted file mode 100644 index 533021b90..000000000 --- a/rust-cuda-ptx-jit/src/device.rs +++ /dev/null @@ -1,13 +0,0 @@ -#[macro_export] -#[doc(hidden)] -#[doc(cfg(not(feature = "host")))] -macro_rules! PtxJITConstLoad { - ([$index:literal] => $reference:expr) => { - unsafe { - ::core::arch::asm!( - concat!("// //"), - in(reg32) *($reference as *const _ as *const u32), - ) - } - }; -} diff --git a/rust-cuda-ptx-jit/src/host/arguments.rs b/rust-cuda-ptx-jit/src/host/arguments.rs deleted file mode 100644 index 0a67d42ea..000000000 --- a/rust-cuda-ptx-jit/src/host/arguments.rs +++ /dev/null @@ -1,48 +0,0 @@ -#[macro_export] -#[doc(hidden)] -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] -macro_rules! compilePtxJITwithArguments { - // Invocation without arguments fast track - ($compiler:ident ()) => { - $crate::compilePtxJITwithArguments!($compiler.with_arguments ()) - }; - // Invocation without arguments fast track - ($compiler:ident $(. $path:ident)+ ()) => { - $compiler$(.$path)+(None) - }; - // Invocation with arguments is forwarded to incremental muncher - ($compiler:ident ( $($args:tt)* )) => { - $crate::compilePtxJITwithArguments!($compiler.with_arguments ( $($args)* )) - }; - // Invocation with arguments is forwarded to incremental muncher - ($compiler:ident $(. 
$path:ident)+ ( $($args:tt)* )) => { - $crate::compilePtxJITwithArguments!(@munch None $compiler$(.$path)+ => [, $($args)*] =>) - }; - // Muncher base case: no `ConstLoad[$expr]` arguments - (@munch None $compiler:ident $(. $path:ident)+ => [] => $($rubbish:expr),*) => { - $compiler$(.$path)+(None) - }; - // Muncher base case: at least one `ConstLoad[$expr]` argument - (@munch Some $compiler:ident $(. $path:ident)+ => [] => $($exprs:expr),*) => { - $compiler$(.$path)+(Some(&[$($exprs),*])) - }; - // Muncher helper case: first `ConstLoad[$expr]` argument is recognised (redirect) - (@munch None $compiler:ident $(. $path:ident)+ => [, ConstLoad [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [, ConstLoad [ $head ] $($tail)*] => $($exprs),*) - }; - // Muncher recursive case: much one `Ignore[$expr]` argument (no `ConstLoad[$expr]`s so far) - (@munch None $compiler:ident $(. $path:ident)+ => [, Ignore [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch None $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* None) - }; - // Muncher recursive case: much one `Ignore[$expr]` argument (some `ConstLoad[$expr]`s already) - (@munch Some $compiler:ident $(. $path:ident)+ => [, Ignore [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* None) - }; - // Muncher recursive case: much one `ConstLoad[$expr]` (some `ConstLoad[$expr]`s already) - (@munch Some $compiler:ident $(. $path:ident)+ => [, ConstLoad [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* Some(unsafe { - ::std::slice::from_raw_parts($head as *const _ as *const u8, ::std::mem::size_of_val($head)) - })) - }; -} diff --git a/rust-cuda-ptx-jit/src/host/compiler/regex.rs b/rust-cuda-ptx-jit/src/host/compiler/regex.rs deleted file mode 100644 index 5cff3bdc9..000000000 --- a/rust-cuda-ptx-jit/src/host/compiler/regex.rs +++ /dev/null @@ -1,46 +0,0 @@ -#[allow(unused_imports)] -use regex::bytes::Regex; - -lazy_static::lazy_static! { - pub static ref CONST_MARKER_REGEX: Regex = { - Regex::new( - r"(?-u)// %r\d+)-(?P\d+)> //" - ).unwrap() - }; - - pub static ref CONST_BASE_REGISTER_REGEX: Regex = { - Regex::new( - r"(?-u)ld\.global\.u32\s*(?P%r\d+)\s*,\s*\[(?P%r[ds]?\d+)]\s*;", - ).unwrap() - }; - - pub static ref CONST_LOAD_INSTRUCTION_REGEX: Regex = { - Regex::new( - r"(?x-u)(?P - ld\.global - (?:\.(?Pv[24]))? - \. - (?P[suf]) - (?P8|16|32|64) - \s* - (?P - (?:%[rf][sd]?\d+) | - (?:\{(?:\s*%[rf][sd]?\d+,)*\s*%[rf][sd]?\d+\s*\}) - ) - ,\s* - \[ - (?P%r[ds]?\d+) - (?: - \+ - (?P\d+) - )? 
- \] - \s*; - )", - ).unwrap() - }; - - pub static ref REGISTER_REGEX: Regex = { - Regex::new(r"(?-u)(?P%[rf][sd]?\d+)").unwrap() - }; -} diff --git a/rust-cuda-ptx-jit/src/host/kernel.rs b/rust-cuda-ptx-jit/src/host/kernel.rs deleted file mode 100644 index 02baabfcf..000000000 --- a/rust-cuda-ptx-jit/src/host/kernel.rs +++ /dev/null @@ -1,58 +0,0 @@ -use std::{ffi::CStr, mem::ManuallyDrop}; - -use rustacuda::{error::CudaResult, function::Function, module::Module}; - -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] -pub struct CudaKernel { - module: ManuallyDrop>, - function: ManuallyDrop>, -} - -impl CudaKernel { - /// # Errors - /// - /// Returns a `CudaError` if `ptx` is not a valid PTX source, or it does - /// not contain an entry point named `entry_point`. - pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { - let module = Box::new(Module::load_from_string(ptx)?); - - let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); - - let function = match function { - Ok(function) => function, - Err(err) => { - if let Err((_err, module)) = Module::drop(*module) { - std::mem::forget(module); - } - - return Err(err); - }, - }; - - Ok(Self { - function: ManuallyDrop::new(function), - module: ManuallyDrop::new(module), - }) - } - - #[must_use] - pub fn get_function(&self) -> &Function { - &self.function - } -} - -impl Drop for CudaKernel { - fn drop(&mut self) { - { - // Ensure that self.function is dropped before self.module as - // it borrows data from the module and must not outlive it - let _function = unsafe { ManuallyDrop::take(&mut self.function) }; - } - - if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) - { - std::mem::forget(module); - } - } -} diff --git a/rust-cuda-ptx-jit/src/host/mod.rs b/rust-cuda-ptx-jit/src/host/mod.rs deleted file mode 100644 index d0d9ffb53..000000000 --- a/rust-cuda-ptx-jit/src/host/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -pub mod compiler; -pub mod kernel; - -mod arguments; diff --git a/rust-cuda-ptx-jit/src/lib.rs b/rust-cuda-ptx-jit/src/lib.rs deleted file mode 100644 index ae6080a3e..000000000 --- a/rust-cuda-ptx-jit/src/lib.rs +++ /dev/null @@ -1,14 +0,0 @@ -#![deny(clippy::pedantic)] -#![cfg_attr(not(feature = "host"), no_std)] -#![feature(doc_cfg)] -#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] - -#[cfg(feature = "host")] -mod host; - -#[cfg(feature = "host")] -pub use host::{compiler::PtxJITCompiler, compiler::PtxJITResult, kernel::CudaKernel}; - -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] -mod device; diff --git a/rust-toolchain b/rust-toolchain index 512b40786..7734bcf14 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1,5 +1,4 @@ [toolchain] -# Pin version pin until const traits are back -channel = "nightly-2023-11-10" +channel = "nightly" components = [ "cargo", "rustfmt", "clippy" ] targets = [ "x86_64-unknown-linux-gnu", "nvptx64-nvidia-cuda" ] diff --git a/src/alloc.rs b/src/alloc.rs new file mode 100644 index 000000000..80d0ee840 --- /dev/null +++ b/src/alloc.rs @@ -0,0 +1,67 @@ +#![allow(clippy::module_name_repetitions)] + +pub trait EmptyCudaAlloc: From + Into + sealed::empty::Sealed {} + +pub trait CudaAlloc: sealed::alloc::Sealed {} + +impl CudaAlloc for Option {} +impl sealed::alloc::Sealed for Option {} + +pub struct NoCudaAlloc; +impl CudaAlloc for NoCudaAlloc {} +impl sealed::alloc::Sealed for NoCudaAlloc {} +impl EmptyCudaAlloc for NoCudaAlloc {} +impl sealed::empty::Sealed for 
NoCudaAlloc {} + +pub struct SomeCudaAlloc(()); +impl CudaAlloc for SomeCudaAlloc {} +impl sealed::alloc::Sealed for SomeCudaAlloc {} +impl !EmptyCudaAlloc for SomeCudaAlloc {} +impl !sealed::empty::Sealed for SomeCudaAlloc {} + +pub struct CombinedCudaAlloc(A, B); +impl CudaAlloc for CombinedCudaAlloc {} +impl sealed::alloc::Sealed for CombinedCudaAlloc {} +impl EmptyCudaAlloc + for CombinedCudaAlloc +{ +} +impl sealed::empty::Sealed + for CombinedCudaAlloc +{ +} +impl From + for CombinedCudaAlloc +{ + fn from(_: NoCudaAlloc) -> Self { + Self(A::from(NoCudaAlloc), B::from(NoCudaAlloc)) + } +} +impl From> + for NoCudaAlloc +{ + fn from(val: CombinedCudaAlloc) -> Self { + let _: (Self, Self) = (val.0.into(), val.1.into()); + Self + } +} +impl CombinedCudaAlloc { + #[must_use] + pub const fn new(front: A, tail: B) -> Self { + Self(front, tail) + } + + pub fn split(self) -> (A, B) { + (self.0, self.1) + } +} + +pub(crate) mod sealed { + pub(super) mod empty { + pub trait Sealed {} + } + + pub mod alloc { + pub trait Sealed {} + } +} diff --git a/src/common.rs b/src/common.rs deleted file mode 100644 index b2d398e09..000000000 --- a/src/common.rs +++ /dev/null @@ -1,186 +0,0 @@ -#[cfg(any(not(feature = "host"), doc))] -use core::convert::{AsMut, AsRef}; -use core::marker::PhantomData; - -#[cfg(feature = "host")] -use alloc::fmt; -#[cfg(not(feature = "host"))] -use core::ops::{Deref, DerefMut}; -#[cfg(feature = "host")] -use core::{mem::MaybeUninit, ptr::copy_nonoverlapping}; - -use const_type_layout::TypeGraphLayout; -use rustacuda_core::DeviceCopy; - -#[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::LendRustToCuda; - -#[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::kernel; - -#[cfg(feature = "host")] -use crate::{safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; - -#[repr(transparent)] -#[cfg_attr(not(feature = "host"), derive(Debug))] -#[derive(TypeLayout)] -pub struct DeviceAccessible(T); - -unsafe impl DeviceCopy for DeviceAccessible {} - -#[cfg(feature = "host")] -impl From for DeviceAccessible { - fn from(value: T) -> Self { - Self(value) - } -} - -#[cfg(feature = "host")] -impl From<&T> for DeviceAccessible> { - fn from(value: &T) -> Self { - let value = unsafe { - let mut uninit = MaybeUninit::uninit(); - copy_nonoverlapping(value, uninit.as_mut_ptr(), 1); - uninit.assume_init() - }; - - Self(SafeDeviceCopyWrapper::from(value)) - } -} - -#[cfg(feature = "host")] -impl fmt::Debug for DeviceAccessible { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - fmt.debug_struct(stringify!(DeviceAccessible)) - .finish_non_exhaustive() - } -} - -#[cfg(not(feature = "host"))] -impl Deref for DeviceAccessible { - type Target = T; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -#[cfg(not(feature = "host"))] -impl DerefMut for DeviceAccessible { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -/// # Safety -/// -/// This is an internal trait and should ONLY be derived automatically using -/// `#[derive(LendRustToCuda)]` -pub unsafe trait RustToCuda { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation: crate::host::CudaAlloc; - type CudaRepresentation: CudaAsRust + TypeGraphLayout; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually 
- /// The returned `Self::CudaRepresentation` must NEVER be accessed on the - /// CPU as it contains a GPU-resident copy of `self`. - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::host::CombinedCudaAlloc, - )>; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - #[allow(clippy::type_complexity)] - unsafe fn restore( - &mut self, - alloc: crate::host::CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult; -} - -/// # Safety -/// -/// This is an internal trait and should NEVER be implemented manually -pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { - type RustRepresentation: RustToCuda; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation; -} - -pub trait RustToCudaProxy: RustToCuda { - fn from_ref(val: &T) -> &Self; - fn from_mut(val: &mut T) -> &mut Self; - - fn into(self) -> T; -} - -#[repr(transparent)] -#[derive(Clone, Copy, TypeLayout)] -pub struct DeviceConstRef<'r, T: DeviceCopy + 'r> { - #[cfg_attr(feature = "host", allow(dead_code))] - pub(super) pointer: *const T, - pub(super) reference: PhantomData<&'r T>, -} - -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceConstRef<'r, T> {} - -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] -impl<'r, T: DeviceCopy> AsRef for DeviceConstRef<'r, T> { - fn as_ref(&self) -> &T { - unsafe { &*self.pointer } - } -} - -#[repr(transparent)] -#[derive(TypeLayout)] -pub struct DeviceMutRef<'r, T: DeviceCopy + 'r> { - #[cfg_attr(feature = "host", allow(dead_code))] - pub(super) pointer: *mut T, - pub(super) reference: PhantomData<&'r mut T>, -} - -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceMutRef<'r, T> {} - -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] -impl<'r, T: DeviceCopy> AsRef for DeviceMutRef<'r, T> { - fn as_ref(&self) -> &T { - unsafe { &*self.pointer } - } -} - -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] -impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { - fn as_mut(&mut self) -> &mut T { - unsafe { &mut *self.pointer } - } -} diff --git a/src/deps.rs b/src/deps.rs new file mode 100644 index 000000000..50fd38f3f --- /dev/null +++ b/src/deps.rs @@ -0,0 +1,12 @@ +#[doc(hidden)] +pub extern crate alloc; + +pub extern crate const_type_layout; + +#[cfg(feature = "host")] +pub extern crate owning_ref; + +#[cfg(feature = "host")] +pub extern crate rustacuda; + +pub extern crate rustacuda_core; diff --git a/src/device/alloc.rs b/src/device/alloc.rs new file mode 100644 index 000000000..bca59a1eb --- /dev/null +++ b/src/device/alloc.rs @@ -0,0 +1,21 @@ +#[cfg(all(feature = "device", not(doc)))] +use core::arch::nvptx; + +use crate::deps::alloc::alloc::{GlobalAlloc, Layout}; + +/// Memory allocator using CUDA malloc/free +pub struct PTXAllocator; + +unsafe impl GlobalAlloc for PTXAllocator { + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + nvptx::malloc(layout.size()).cast() + } + + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn dealloc(&self, ptr: *mut u8, _layout: 
Layout) { + nvptx::free(ptr.cast()); + } +} diff --git a/src/device/mod.rs b/src/device/mod.rs index 225bc8252..df20ae5a8 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,115 +1,7 @@ -use core::{ - mem::ManuallyDrop, - ops::{Deref, DerefMut}, -}; - -#[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::{specialise_kernel_entry, specialise_kernel_type}; - -use crate::{ - common::{CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceMutRef, RustToCuda}, - safety::SafeDeviceCopy, -}; +#[doc(hidden)] +#[cfg(feature = "kernel")] +pub use rust_cuda_kernel::{specialise_kernel_function, specialise_kernel_param_type}; +pub mod alloc; +pub mod thread; pub mod utils; - -pub trait BorrowFromRust: RustToCuda { - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr` is the - /// `DeviceConstRef` borrowed on the CPU using the corresponding - /// `LendToCuda::lend_to_cuda`. - unsafe fn with_borrow_from_rust) -> O>( - cuda_repr: DeviceConstRef::CudaRepresentation>>, - inner: F, - ) -> O; - - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr_mut` is the - /// `DeviceMutRef` borrowed on the CPU using the corresponding - /// `LendToCuda::lend_to_cuda_mut`. - /// Furthermore, since different GPU threads can access heap storage - /// mutably inside the safe `inner` scope, there must not be any - /// aliasing between concurrently running threads. - unsafe fn with_borrow_from_rust_mut) -> O>( - cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O; - - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr` is the - /// `DeviceMutRef` borrowed on the CPU using the corresponding - /// `LendToCuda::move_to_cuda`. - unsafe fn with_moved_from_rust O>( - cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O - where - Self: Sized + SafeDeviceCopy, - ::CudaRepresentation: SafeDeviceCopy; -} - -impl BorrowFromRust for T { - #[inline] - unsafe fn with_borrow_from_rust) -> O>( - cuda_repr: DeviceConstRef::CudaRepresentation>>, - inner: F, - ) -> O { - // rust_repr must never be dropped as we do NOT own any of the - // heap memory it might reference - let rust_repr = ShallowCopy::new(CudaAsRust::as_rust(cuda_repr.as_ref())); - - inner(&rust_repr) - } - - #[inline] - unsafe fn with_borrow_from_rust_mut) -> O>( - mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O { - // rust_repr must never be dropped as we do NOT own any of the - // heap memory it might reference - let mut rust_repr_mut = ShallowCopy::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut())); - - inner(&mut rust_repr_mut) - } - - #[inline] - unsafe fn with_moved_from_rust O>( - mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O - where - Self: Sized + SafeDeviceCopy, - ::CudaRepresentation: SafeDeviceCopy, - { - inner(CudaAsRust::as_rust(cuda_repr_mut.as_mut())) - } -} - -#[repr(transparent)] -#[derive(Debug)] -pub struct ShallowCopy(ManuallyDrop); - -impl ShallowCopy { - fn new(value: T) -> Self { - Self(ManuallyDrop::new(value)) - } -} - -impl Deref for ShallowCopy { - type Target = T; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl DerefMut for ShallowCopy { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} diff --git a/src/device/thread.rs b/src/device/thread.rs new file mode 100644 index 000000000..bb5599cda --- /dev/null +++ b/src/device/thread.rs @@ -0,0 +1,155 @@ +#[cfg(all(feature = "device", not(doc)))] +use core::arch::nvptx; + 
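The new `src/device/alloc.rs` above wraps CUDA's device-side `malloc`/`free` in a `GlobalAlloc`; a minimal sketch of how a `no_std` kernel crate targeting `nvptx64-nvidia-cuda` might register it (the surrounding crate setup is assumed, not part of this diff):

```rust
#![no_std]

extern crate alloc;

use rust_cuda::device::alloc::PTXAllocator;

// Route every heap allocation made inside the kernel through CUDA's
// device-side malloc/free.
#[global_allocator]
static ALLOCATOR: PTXAllocator = PTXAllocator;

// With the allocator registered, `alloc` types become usable in kernels.
fn scratch(len: usize) -> alloc::vec::Vec<u8> {
    alloc::vec![0u8; len]
}
```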
+#[allow(clippy::module_name_repetitions)] +pub struct Thread { + _private: (), +} + +#[allow(clippy::module_name_repetitions)] +pub struct ThreadBlock { + _private: (), +} + +#[allow(clippy::module_name_repetitions)] +pub struct ThreadBlockGrid { + _private: (), +} + +impl Thread { + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub const fn this() -> Self { + Self { _private: () } + } + + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn index(&self) -> usize { + let block = self.block(); + let grid = block.grid(); + + let block_id = block.idx().as_id(&grid.dim()); + let thread_id = self.idx().as_id(&block.dim()); + + block_id * block.dim().size() + thread_id + } + + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn idx(&self) -> Idx3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Idx3 { + x: nvptx::_thread_idx_x() as u32, + y: nvptx::_thread_idx_y() as u32, + z: nvptx::_thread_idx_z() as u32, + } + } + } + + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub const fn block(&self) -> ThreadBlock { + ThreadBlock { _private: () } + } +} + +impl ThreadBlock { + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn dim(&self) -> Dim3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Dim3 { + x: nvptx::_block_dim_x() as u32, + y: nvptx::_block_dim_y() as u32, + z: nvptx::_block_dim_z() as u32, + } + } + } + + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn idx(&self) -> Idx3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Idx3 { + x: nvptx::_block_idx_x() as u32, + y: nvptx::_block_idx_y() as u32, + z: nvptx::_block_idx_z() as u32, + } + } + } + + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub const fn grid(&self) -> ThreadBlockGrid { + ThreadBlockGrid { _private: () } + } + + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn synchronize(&self) { + unsafe { nvptx::_syncthreads() } + } +} + +impl ThreadBlockGrid { + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn dim(&self) -> Dim3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Dim3 { + x: nvptx::_grid_dim_x() as u32, + y: nvptx::_grid_dim_y() as u32, + z: nvptx::_grid_dim_z() as u32, + } + } + } +} + +/// Dimension specified in kernel launching +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct Dim3 { + pub x: u32, + pub y: u32, + pub z: u32, +} + +/// Indices that the kernel code is running on +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct Idx3 { + pub x: u32, + pub y: u32, + pub z: u32, +} + +impl Dim3 { + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub const fn size(&self) -> usize { + (self.x as usize) * (self.y as usize) * (self.z as usize) + } +} + +impl Idx3 { + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub const fn as_id(&self, dim: &Dim3) -> usize { + (self.x as usize) + + (self.y as usize) * (dim.x as usize) + + (self.z as usize) * (dim.x as usize) * (dim.y as usize) + } +} diff --git a/src/device/utils.rs b/src/device/utils.rs index a45ff9c71..8447c5235 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -1,220 +1,214 @@ -use alloc::alloc::{GlobalAlloc, Layout}; -#[cfg(target_os = "cuda")] -use core::arch::nvptx; - -/// Memory allocator using CUDA malloc/free -pub struct PTXAllocator; - -unsafe impl GlobalAlloc for PTXAllocator { - unsafe fn alloc(&self, layout: Layout) -> *mut u8 { - nvptx::malloc(layout.size()).cast() - } - - unsafe 
fn dealloc(&self, ptr: *mut u8, _layout: Layout) { - nvptx::free(ptr.cast()); - } -} - -// Based on https://github.com/popzxc/stdext-rs/blob/master/src/macros.rs -#[macro_export] -#[doc(hidden)] -macro_rules! function { - () => {{ - // Hack to get the name of the enclosing function - fn f() {} - fn type_name_of(_: T) -> &'static str { - core::any::type_name::() - } - let name = type_name_of(f); - - // Remove the `::f` suffix - &name[..name.len() - 3] - }}; -} - -/// Alternative of [`std::print!`](https://doc.rust-lang.org/std/macro.print.html) using CUDA `vprintf` system-call -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! print { +use crate::deps::alloc::{fmt, string::String}; + +/// Abort the CUDA kernel using the `trap` system call. +/// +/// [`abort`] poisons the CUDA context and no more work can be performed in it. +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn abort() -> ! { + unsafe { ::core::arch::nvptx::trap() } +} + +/// Exit the CUDA kernel using the `exit` instruction. +/// +/// # Safety +/// +/// [`exit`] quits the kernel early and any mutable data accessible outside this +/// kernel launch (by the host or a subsequent kernel launch) may be in an +/// inconsistent state. Therefore, kernel failure must be communicated back to +/// host and handled in some other manner. +/// +/// Safely return from the main kernel function instead. +#[allow(clippy::inline_always)] +#[inline(always)] +pub unsafe fn exit() -> ! { + unsafe { ::core::arch::asm!("exit;", options(noreturn)) } +} + +/// Prints to the CUDA kernel's standard output using the `vprintf` system call. +/// +/// Replacement for the [`std::print!`] macro, which now forwards to the +/// [`print()`] function. +pub macro print($($arg:tt)*) { + self::print(::core::format_args!($($arg)*)) +} + +/// Prints to the CUDA kernel's standard output using the `vprintf` system call. +/// +/// Replacement for the [`std::println!`] macro, which now forwards to the +/// [`print()`] function. +pub macro println { + () => { + self::print(::core::format_args!("\n")) + }, ($($arg:tt)*) => { - let msg = $crate::alloc::format!($($arg)*); - - #[allow(unused_unsafe)] - unsafe { - ::core::arch::nvptx::vprintf(msg.as_ptr(), ::core::ptr::null_mut()); - } + self::print(::core::format_args!("{}\n", ::core::format_args!($($arg)*))) + }, +} + +/// The [`print()`] function takes an [`Arguments`](core::fmt::Arguments) struct +/// and formats and prints it to the CUDA kernel's standard output using the +/// `vprintf` system call. +/// +/// The [`Arguments`](core::fmt::Arguments) instance can be created with the +/// [`format_args!`](core::format_args) macro. +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn print(args: ::core::fmt::Arguments) { + #[repr(C)] + struct FormatArgs { + msg_len: u32, + msg_ptr: *const u8, } -} - -/// Alternative of [`std::println!`](https://doc.rust-lang.org/std/macro.println.html) using CUDA `vprintf` system-call -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! println { - () => ($crate::print!("\n")); - ($fmt:expr) => ($crate::print!(concat!($fmt, "\n"))); - ($fmt:expr, $($arg:tt)*) => ($crate::print!(concat!($fmt, "\n"), $($arg)*)); -} -/// Assertion in GPU kernel for one expression is true. -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! 
assert { - ($e:expr) => { - if !$e { - let msg = $crate::alloc::format!( - "\nassertion failed: {}\nexpression: {:?}", - stringify!($e), - $e, - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } + let msg; // place to store the dynamically expanded format string + #[allow(clippy::option_if_let_else)] + let msg = if let Some(msg) = args.as_str() { + msg + } else { + msg = fmt::format(args); + msg.as_str() }; -} -/// Assertion in GPU kernel for two expressions are equal. -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! assert_eq { - ($a:expr, $b:expr) => { - if $a != $b { - let msg = $crate::alloc::format!( - "\nassertion failed: ({} == {})\nleft : {:?}\nright: {:?}", - stringify!($a), - stringify!($b), - $a, - $b - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } - }; -} - -/// Assertion in GPU kernel for two expressions are not equal. -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! assert_ne { - ($a:expr, $b:expr) => { - if $a == $b { - let msg = $crate::alloc::format!( - "\nassertion failed: ({} != {})\nleft : {:?}\nright: {:?}", - stringify!($a), - stringify!($b), - $a, - $b - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } + let args = FormatArgs { + msg_len: u32::try_from(msg.len()).unwrap_or(u32::MAX), + msg_ptr: msg.as_ptr(), }; -} - -/// Dimension specified in kernel launching -#[derive(Debug)] -pub struct Dim3 { - pub x: u32, - pub y: u32, - pub z: u32, -} -/// Indices that the kernel code is running on -#[derive(Debug)] -pub struct Idx3 { - pub x: u32, - pub y: u32, - pub z: u32, -} - -#[must_use] -pub fn block_dim() -> Dim3 { - #[allow(clippy::cast_sign_loss)] unsafe { - Dim3 { - x: nvptx::_block_dim_x() as u32, - y: nvptx::_block_dim_y() as u32, - z: nvptx::_block_dim_z() as u32, - } + ::core::arch::nvptx::vprintf(c"%*s".as_ptr().cast(), ::core::ptr::from_ref(&args).cast()); } } -#[must_use] -pub fn block_idx() -> Idx3 { - #[allow(clippy::cast_sign_loss)] - unsafe { - Idx3 { - x: nvptx::_block_idx_x() as u32, - y: nvptx::_block_idx_y() as u32, - z: nvptx::_block_idx_z() as u32, - } +/// Helper function to efficiently pretty-print a [`core::panic::PanicInfo`] +/// using the `vprintf` system call. +/// +/// If `allow_dynamic_message` is set, +/// [`alloc::fmt::format`](crate::deps::alloc::fmt::format) is used to print +/// [`core::panic::PanicInfo::message`] message when +/// [`core::fmt::Arguments::as_str`] returns [`None`]. Note that this may pull +/// in a large amount of string formatting and dynamic allocation code. +/// If unset, a default placeholder panic message is printed instead. +/// +/// If `allow_dynamic_payload` is set, [`core::panic::PanicInfo::payload`] is +/// checked for [`&str`] and [`String`] to get a message to print if +/// [`core::panic::PanicInfo::message`] returns [`None`]. Note that this may +/// pull in some dynamic dispatch code. If unset, a default placeholder panic +/// message is printed instead. 
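A usage sketch for the printing API above, before the panic helper that the preceding comment introduces: kernel code uses the `print!`/`println!` replacements exactly like their `std` counterparts (the reporting function is hypothetical):

```rust
use rust_cuda::device::utils::{print, println};

fn report(thread_index: usize, value: f64) {
    // Expands to `print(format_args!(..))`, i.e. one `vprintf` system
    // call per invocation.
    println!("thread {thread_index}: value = {value}");

    // A string literal takes the `Arguments::as_str` fast path inside
    // `print`, so no dynamic formatting or allocation is involved.
    print!("done\n");
}
```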
+#[allow(clippy::inline_always)]
+#[inline(always)]
+pub fn pretty_print_panic_info(
+    info: &::core::panic::PanicInfo,
+    allow_dynamic_message: bool,
+    allow_dynamic_payload: bool,
+) {
+    #[repr(C)]
+    struct FormatArgs {
+        file_len: u32,
+        file_ptr: *const u8,
+        line: u32,
+        column: u32,
+        thread_idx_x: u32,
+        thread_idx_y: u32,
+        thread_idx_z: u32,
+        msg_len: u32,
+        msg_ptr: *const u8,
     }
-#[must_use]
-pub fn grid_dim() -> Dim3 {
-    #[allow(clippy::cast_sign_loss)]
-    unsafe {
-        Dim3 {
-            x: nvptx::_grid_dim_x() as u32,
-            y: nvptx::_grid_dim_y() as u32,
-            z: nvptx::_grid_dim_z() as u32,
+    let msg; // place to store the dynamically expanded format string
+    #[allow(clippy::option_if_let_else)]
+    let msg = if let Some(message) = info.message() {
+        if let Some(msg) = message.as_str() {
+            msg
+        } else if allow_dynamic_message {
+            msg = fmt::format(*message);
+            msg.as_str()
+        } else {
+            ""
         }
-    }
-}
+    } else if let Some(msg) = info.payload().downcast_ref::<&'static str>()
+        && allow_dynamic_payload
+    {
+        msg
+    } else if let Some(msg) = info.payload().downcast_ref::<String>()
+        && allow_dynamic_payload
+    {
+        msg.as_str()
+    } else {
+        ""
+    };
-#[must_use]
-pub fn thread_idx() -> Idx3 {
-    #[allow(clippy::cast_sign_loss)]
-    unsafe {
-        Idx3 {
-            x: nvptx::_thread_idx_x() as u32,
-            y: nvptx::_thread_idx_y() as u32,
-            z: nvptx::_thread_idx_z() as u32,
-        }
-    }
-}
+    let location_line = info.location().map_or(0, ::core::panic::Location::line);
+    let location_column = info.location().map_or(0, ::core::panic::Location::column);
+    let location_file = info
+        .location()
+        .map_or("", ::core::panic::Location::file);
+
+    let thread_idx = crate::device::thread::Thread::this().idx();
+
+    let args = FormatArgs {
+        file_len: u32::try_from(location_file.len()).unwrap_or(u32::MAX),
+        file_ptr: location_file.as_ptr(),
+        line: location_line,
+        column: location_column,
+        thread_idx_x: thread_idx.x,
+        thread_idx_y: thread_idx.y,
+        thread_idx_z: thread_idx.z,
+        msg_len: u32::try_from(msg.len()).unwrap_or(u32::MAX),
+        msg_ptr: msg.as_ptr(),
+    };
-impl Dim3 {
-    #[must_use]
-    pub fn size(&self) -> usize {
-        (self.x as usize) * (self.y as usize) * (self.z as usize)
+    unsafe {
+        ::core::arch::nvptx::vprintf(
+            c"panicked at %*s:%u:%u on thread (x=%u, y=%u, z=%u):\n%*s\n"
+                .as_ptr()
+                .cast(),
+            ::core::ptr::from_ref(&args).cast(),
+        );
     }
 }
-impl Idx3 {
-    #[must_use]
-    pub fn as_id(&self, dim: &Dim3) -> usize {
-        (self.x as usize)
-            + (self.y as usize) * (dim.x as usize)
-            + (self.z as usize) * (dim.x as usize) * (dim.y as usize)
+/// Helper function to efficiently pretty-print an error message (inside an
+/// allocation error handler) using the `vprintf` system call.
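A sketch of wiring the panic helper just defined into a device-side `#[panic_handler]`, before the allocation-error helper that the preceding comment introduces; the handler itself is assumed boilerplate for a `no_std` kernel crate:

```rust
use rust_cuda::device::utils::{abort, pretty_print_panic_info};

#[panic_handler]
fn panic(info: &core::panic::PanicInfo) -> ! {
    // Report file, line, column, and thread index via `vprintf`, with
    // both dynamic-message and dynamic-payload handling opted out so no
    // string formatting or dynamic dispatch is linked into the kernel.
    pretty_print_panic_info(info, false, false);

    // Poison the CUDA context; no further work can run in it.
    abort()
}
```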
+#[track_caller] +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn pretty_print_alloc_error(layout: ::core::alloc::Layout) { + #[repr(C)] + struct FormatArgs { + size: usize, + align: usize, + file_len: u32, + file_ptr: *const u8, + line: u32, + column: u32, + thread_idx_x: u32, + thread_idx_y: u32, + thread_idx_z: u32, } -} -#[must_use] -pub fn index() -> usize { - let block_id = block_idx().as_id(&grid_dim()); - let thread_id = thread_idx().as_id(&block_dim()); + let location = ::core::panic::Location::caller(); + let thread_idx = crate::device::thread::Thread::this().idx(); + + let args = FormatArgs { + size: layout.size(), + align: layout.align(), + file_len: u32::try_from(location.file().len()).unwrap_or(u32::MAX), + file_ptr: location.file().as_ptr(), + line: location.line(), + column: location.column(), + thread_idx_x: thread_idx.x, + thread_idx_y: thread_idx.y, + thread_idx_z: thread_idx.z, + }; - block_id * block_dim().size() + thread_id + unsafe { + ::core::arch::nvptx::vprintf( + c"memory allocation of %llu bytes with alignment %llu failed at \ + %*s:%u:%u on thread (x=%u, y=%u, z=%u)\n" + .as_ptr() + .cast(), + ::core::ptr::from_ref(&args).cast(), + ); + } } diff --git a/src/host.rs b/src/host.rs deleted file mode 100644 index 6c91a26bc..000000000 --- a/src/host.rs +++ /dev/null @@ -1,612 +0,0 @@ -use core::{ - marker::PhantomData, - mem::ManuallyDrop, - ops::{Deref, DerefMut}, -}; - -use rustacuda::{ - context::Context, - error::{CudaError, CudaResult}, - function::Function, - memory::{DeviceBox, DeviceBuffer, LockedBuffer}, - module::Module, - stream::Stream, -}; -use rustacuda_core::{DeviceCopy, DevicePointer}; - -#[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_call}; - -use crate::{ - common::{DeviceAccessible, DeviceConstRef, DeviceMutRef, RustToCuda}, - ptx_jit::{CudaKernel, PtxJITCompiler, PtxJITResult}, - safety::SafeDeviceCopy, -}; - -pub trait Launcher { - type KernelTraitObject: ?Sized; - type CompilationWatcher; - - fn get_launch_package(&mut self) -> LaunchPackage; - - /// # Errors - /// - /// Should only return a `CudaError` if some implementation-defined - /// critical kernel function configuration failed. - #[allow(unused_variables)] - fn on_compile(kernel: &Function, watcher: &mut Self::CompilationWatcher) -> CudaResult<()> { - Ok(()) - } -} - -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct LaunchConfig { - pub grid: rustacuda::function::GridSize, - pub block: rustacuda::function::BlockSize, - pub shared_memory_size: u32, - pub ptx_jit: bool, -} - -pub struct LaunchPackage<'l, L: ?Sized + Launcher> { - pub config: LaunchConfig, - - pub kernel: &'l mut TypedKernel, - pub stream: &'l mut Stream, - - pub watcher: &'l mut L::CompilationWatcher, -} - -pub enum KernelJITResult<'k> { - Cached(&'k Function<'k>), - Recompiled(&'k Function<'k>), -} - -pub struct TypedKernel { - compiler: PtxJITCompiler, - kernel: Option, - entry_point: alloc::boxed::Box, - marker: PhantomData, -} - -impl TypedKernel { - /// # Errors - /// - /// Returns a `CudaError` if `ptx` or `entry_point` contain nul bytes. 
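Before continuing with the removed `src/host.rs`: the allocation-error helper above is meant for the nightly `alloc_error_handler` hook. A minimal sketch, with the crate-level feature gate assumed:

```rust
#![feature(alloc_error_handler)]

use rust_cuda::device::utils::{abort, pretty_print_alloc_error};

#[alloc_error_handler]
fn alloc_error(layout: core::alloc::Layout) -> ! {
    // Report the failed layout (size and alignment), the caller
    // location, and the thread index, then trap.
    pretty_print_alloc_error(layout);
    abort()
}
```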
- pub fn new(ptx: &str, entry_point: &str) -> CudaResult { - let ptx_cstring = std::ffi::CString::new(ptx).map_err(|_| CudaError::InvalidPtx)?; - - let compiler = crate::ptx_jit::PtxJITCompiler::new(&ptx_cstring); - - let entry_point_cstring = - std::ffi::CString::new(entry_point).map_err(|_| CudaError::InvalidValue)?; - let entry_point = entry_point_cstring.into_boxed_c_str(); - - Ok(Self { - compiler, - kernel: None, - entry_point, - marker: PhantomData, - }) - } - - /// # Errors - /// - /// Returns a `CudaError` if `ptx` (from [`Self::new`]) is not a valid - /// PTX source, or it does not contain an entry point named `entry_point` - /// (from [`Self::new`]). - pub fn compile_with_ptx_jit_args( - &mut self, - arguments: Option<&[Option<&[u8]>]>, - ) -> CudaResult { - let ptx_jit = self.compiler.with_arguments(arguments); - - let kernel_jit = match (&mut self.kernel, ptx_jit) { - (Some(kernel), PtxJITResult::Cached(_)) => { - KernelJITResult::Cached(kernel.get_function()) - }, - (kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { - let recomputed_kernel = CudaKernel::new(ptx_cstr, &self.entry_point)?; - - // Replace the existing compiled kernel, drop the old one - let kernel = kernel.insert(recomputed_kernel); - - KernelJITResult::Recompiled(kernel.get_function()) - }, - }; - - Ok(kernel_jit) - } -} - -pub trait LendToCuda: RustToCuda { - /// Lends an immutable copy of `&self` to CUDA: - /// - code in the CUDA kernel can only access `&self` through the - /// `DeviceConstRef` inside the closure - /// - after the closure, `&self` will not have changed - /// - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA - fn lend_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceConstRef::CudaRepresentation>>, - ) -> Result, - >( - &self, - inner: F, - ) -> Result; - - /// Lends a mutable copy of `&mut self` to CUDA: - /// - code in the CUDA kernel can only access `&mut self` through the - /// `DeviceMutRef` inside the closure - /// - after the closure, `&mut self` might have changed in the following - /// ways: - /// - to avoid aliasing, each CUDA thread gets its own shallow copy of - /// `&mut self`, i.e. any shallow changes will NOT be reflected after - /// the closure - /// - each CUDA thread can access the same heap allocated storage, i.e. 
- /// any deep changes will be reflected after the closure - /// - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA - fn lend_to_cuda_mut< - O, - E: From, - F: FnOnce( - HostAndDeviceMutRef::CudaRepresentation>>, - ) -> Result, - >( - &mut self, - inner: F, - ) -> Result; - - /// Moves `self` to CUDA iff `self` is `SafeDeviceCopy` - /// - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA - fn move_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceOwned::CudaRepresentation>>, - ) -> Result, - >( - self, - inner: F, - ) -> Result - where - Self: Sized + SafeDeviceCopy, - ::CudaRepresentation: SafeDeviceCopy, - ::CudaAllocation: EmptyCudaAlloc; -} - -impl LendToCuda for T { - fn lend_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceConstRef::CudaRepresentation>>, - ) -> Result, - >( - &self, - inner: F, - ) -> Result { - let (cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; - - let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); - - core::mem::drop(cuda_repr); - core::mem::drop(alloc); - - result - } - - fn lend_to_cuda_mut< - O, - E: From, - F: FnOnce( - HostAndDeviceMutRef::CudaRepresentation>>, - ) -> Result, - >( - &mut self, - inner: F, - ) -> Result { - let (mut cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; - - let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, inner); - - core::mem::drop(cuda_repr); - - let _: NullCudaAlloc = unsafe { self.restore(alloc) }?; - - result - } - - fn move_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceOwned::CudaRepresentation>>, - ) -> Result, - >( - self, - inner: F, - ) -> Result - where - Self: Sized + SafeDeviceCopy, - ::CudaRepresentation: SafeDeviceCopy, - ::CudaAllocation: EmptyCudaAlloc, - { - let (cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; - - let result = HostAndDeviceOwned::with_new(cuda_repr, inner); - - core::mem::drop(alloc); - - result - } -} - -pub(crate) mod private { - pub mod alloc { - pub trait Sealed {} - } - - pub mod drop { - pub trait Sealed: Sized { - fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; - } - } - - pub mod empty { - pub trait Sealed {} - } -} - -pub trait EmptyCudaAlloc: private::empty::Sealed {} -impl EmptyCudaAlloc for T {} - -pub trait CudaAlloc: private::alloc::Sealed {} -impl CudaAlloc for T {} - -impl private::alloc::Sealed for Option {} - -pub struct NullCudaAlloc; -impl private::alloc::Sealed for NullCudaAlloc {} -impl private::empty::Sealed for NullCudaAlloc {} - -pub struct CombinedCudaAlloc(A, B); -impl private::alloc::Sealed for CombinedCudaAlloc {} -impl private::empty::Sealed - for CombinedCudaAlloc -{ -} -impl CombinedCudaAlloc { - pub fn new(front: A, tail: B) -> Self { - Self(front, tail) - } - - pub fn split(self) -> (A, B) { - (self.0, self.1) - } -} - -pub struct CudaDropWrapper(Option); -impl private::alloc::Sealed for CudaDropWrapper {} -impl From for CudaDropWrapper { - fn from(val: C) -> Self { - Self(Some(val)) - } -} -impl Drop for CudaDropWrapper { - fn drop(&mut self) { - if let Some(val) = self.0.take() { - if let Err((_err, val)) = C::drop(val) { - core::mem::forget(val); - } - } - } -} -impl Deref for CudaDropWrapper { - type Target = C; - - fn deref(&self) -> &Self::Target { - self.0.as_ref().unwrap() - } -} -impl DerefMut for CudaDropWrapper { - fn deref_mut(&mut self) -> &mut Self::Target { - self.0.as_mut().unwrap() - } -} - -macro_rules! 
impl_sealed_drop_collection { - ($type:ident) => { - impl private::drop::Sealed for $type { - fn drop(val: Self) -> Result<(), (CudaError, Self)> { - Self::drop(val) - } - } - }; -} - -impl_sealed_drop_collection!(DeviceBuffer); -impl_sealed_drop_collection!(DeviceBox); -impl_sealed_drop_collection!(LockedBuffer); - -macro_rules! impl_sealed_drop_value { - ($type:ident) => { - impl private::drop::Sealed for $type { - fn drop(val: Self) -> Result<(), (CudaError, Self)> { - Self::drop(val) - } - } - }; -} - -impl_sealed_drop_value!(Module); -impl_sealed_drop_value!(Stream); -impl_sealed_drop_value!(Context); - -#[repr(transparent)] -#[allow(clippy::module_name_repetitions)] -pub struct HostDeviceBox(DevicePointer); - -impl private::alloc::Sealed for HostDeviceBox {} - -impl HostDeviceBox { - /// # Errors - /// - /// Returns a `CudaError` iff copying from `value` into `self` failed. - pub fn copy_from(&mut self, value: &T) -> CudaResult<()> { - // Safety: pointer comes from `DeviceBox::into_device` - // i.e. this function completes the roundtrip - let mut device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - - rustacuda::memory::CopyDestination::copy_from(&mut *device_box, value) - } - - /// # Errors - /// - /// Returns a `CudaError` iff copying from `self` into `value` failed. - pub fn copy_to(&self, value: &mut T) -> CudaResult<()> { - // Safety: pointer comes from `DeviceBox::into_device` - // i.e. this function completes the roundtrip - let device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - - rustacuda::memory::CopyDestination::copy_to(&*device_box, value) - } -} - -impl From> for HostDeviceBox { - fn from(device_box: DeviceBox) -> Self { - Self(DeviceBox::into_device(device_box)) - } -} - -impl From> for DeviceBox { - fn from(host_device_box: HostDeviceBox) -> Self { - // Safety: pointer comes from `DeviceBox::into_device` - // i.e. this function completes the roundtrip - unsafe { DeviceBox::from_device(host_device_box.0) } - } -} - -impl Drop for HostDeviceBox { - fn drop(&mut self) { - // Safety: pointer comes from `DeviceBox::into_device` - // i.e. this function completes the roundtrip - let device_box = unsafe { DeviceBox::from_device(self.0) }; - - core::mem::drop(CudaDropWrapper::from(device_box)); - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRef<'a, T: DeviceCopy> { - device_box: &'a mut HostDeviceBox, - host_ref: &'a mut T, -} - -impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { - /// # Safety - /// - /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new(device_box: &'a mut HostDeviceBox, host_ref: &'a mut T) -> Self { - Self { - device_box, - host_ref, - } - } - - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved - /// to CUDA or an error occurs inside `inner`. 
- pub fn with_new< - O, - E: From, - F: for<'b> FnOnce(HostAndDeviceMutRef<'b, T>) -> Result, - >( - host_ref: &mut T, - inner: F, - ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(host_ref)?.into(); - - // Safety: `device_box` contains exactly the device copy of `host_ref` - let result = inner(HostAndDeviceMutRef { - device_box: &mut device_box, - host_ref, - }); - - // Copy back any changes made - device_box.copy_to(host_ref)?; - - core::mem::drop(device_box); - - result - } - - #[must_use] - pub fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> - where - 'a: 'b, - { - DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), - reference: PhantomData, - } - } - - #[must_use] - pub fn for_host<'b: 'a>(&'b self) -> &'a T { - self.host_ref - } - - #[must_use] - pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> - where - 'a: 'b, - { - // Safety: `device_box` contains EXACTLY the device copy of `host_ref` - // by construction of `HostAndDeviceMutRef` - unsafe { HostAndDeviceConstRef::new(self.device_box, self.host_ref) } - } - - #[must_use] - pub fn as_mut<'b>(&'b mut self) -> HostAndDeviceMutRef<'b, T> - where - 'a: 'b, - { - // Safety: `device_box` contains EXACTLY the device copy of `host_ref` - // by construction of `HostAndDeviceMutRef` - unsafe { HostAndDeviceMutRef::new(self.device_box, self.host_ref) } - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRef<'a, T: DeviceCopy> { - device_box: &'a HostDeviceBox, - host_ref: &'a T, -} - -impl<'a, T: DeviceCopy> Clone for HostAndDeviceConstRef<'a, T> { - fn clone(&self) -> Self { - *self - } -} - -impl<'a, T: DeviceCopy> Copy for HostAndDeviceConstRef<'a, T> {} - -impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { - /// # Safety - /// - /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new(device_box: &'a HostDeviceBox, host_ref: &'a T) -> Self { - Self { - device_box, - host_ref, - } - } - - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved - /// to CUDA or an error occurs inside `inner`. - pub fn with_new< - O, - E: From, - F: for<'b> FnOnce(HostAndDeviceConstRef<'b, T>) -> Result, - >( - host_ref: &T, - inner: F, - ) -> Result { - let device_box: HostDeviceBox<_> = DeviceBox::new(host_ref)?.into(); - - // Safety: `device_box` contains exactly the device copy of `host_ref` - let result = inner(HostAndDeviceConstRef { - device_box: &device_box, - host_ref, - }); - - core::mem::drop(device_box); - - result - } - - #[must_use] - pub fn for_device<'b>(&'b self) -> DeviceConstRef<'a, T> - where - 'a: 'b, - { - DeviceConstRef { - pointer: self.device_box.0.as_raw(), - reference: PhantomData, - } - } - - #[must_use] - pub fn for_host(&'a self) -> &'a T { - self.host_ref - } - - #[must_use] - pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> - where - 'a: 'b, - { - *self - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwned<'a, T: SafeDeviceCopy + DeviceCopy> { - device_box: &'a mut HostDeviceBox, - host_val: &'a mut T, -} - -impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved - /// to CUDA or an error occurs inside `inner`. 
- pub fn with_new< - O, - E: From, - F: for<'b> FnOnce(HostAndDeviceOwned<'b, T>) -> Result, - >( - mut value: T, - inner: F, - ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); - - // Safety: `device_box` contains exactly the device copy of `value` - let result = inner(HostAndDeviceOwned { - device_box: &mut device_box, - host_val: &mut value, - }); - - core::mem::drop(device_box); - core::mem::drop(value); - - result - } - - #[must_use] - pub fn for_device(self) -> DeviceMutRef<'a, T> { - DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), - reference: PhantomData, - } - } - - #[must_use] - pub fn for_host(&'a mut self) -> &'a T { - self.host_val - } -} diff --git a/src/host/mod.rs b/src/host/mod.rs new file mode 100644 index 000000000..589556560 --- /dev/null +++ b/src/host/mod.rs @@ -0,0 +1,412 @@ +use std::{ + marker::PhantomData, + mem::ManuallyDrop, + ops::{Deref, DerefMut}, +}; + +use const_type_layout::TypeGraphLayout; +use rustacuda::{ + context::Context, + error::CudaError, + event::Event, + memory::{CopyDestination, DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, + module::Module, +}; + +use crate::{ + safety::PortableBitSemantics, + utils::{ + adapter::DeviceCopyWithPortableBitSemantics, + ffi::{ + DeviceConstPointer, DeviceConstRef, DeviceMutPointer, DeviceMutRef, DeviceOwnedPointer, + DeviceOwnedRef, + }, + r#async::{Async, NoCompletion}, + }, +}; + +type InvariantLifetime<'brand> = PhantomData &'brand ()>; + +#[derive(Copy, Clone)] +#[repr(transparent)] +pub struct Stream<'stream> { + stream: &'stream rustacuda::stream::Stream, + _brand: InvariantLifetime<'stream>, +} + +impl<'stream> Deref for Stream<'stream> { + type Target = rustacuda::stream::Stream; + + fn deref(&self) -> &Self::Target { + self.stream + } +} + +impl<'stream> Stream<'stream> { + #[allow(clippy::needless_pass_by_ref_mut)] + /// Create a new uniquely branded [`Stream`], which can bind async + /// operations to the [`Stream`] that they are computed on. + /// + /// The uniqueness guarantees are provided by using branded types, + /// as inspired by the Ghost Cell paper by Yanovski, J., Dang, H.-H., + /// Jung, R., and Dreyer, D.: . + /// + /// # Examples + /// + /// The following example shows that two [`Stream`]'s with different + /// `'stream` lifetime brands cannot be used interchangeably. 
+ /// + /// ```rust, compile_fail + /// use rust_cuda::host::Stream; + /// + /// fn check_same<'stream>(_stream_a: Stream<'stream>, _stream_b: Stream<'stream>) {} + /// + /// fn two_streams<'stream_a, 'stream_b>(stream_a: Stream<'stream_a>, stream_b: Stream<'stream_b>) { + /// check_same(stream_a, stream_b); + /// } + /// ``` + pub fn with( + stream: &mut rustacuda::stream::Stream, + inner: impl for<'new_stream> FnOnce(Stream<'new_stream>) -> O, + ) -> O { + inner(Stream { + stream, + _brand: InvariantLifetime::default(), + }) + } +} + +pub trait CudaDroppable: Sized { + #[allow(clippy::missing_errors_doc)] + fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; +} + +#[repr(transparent)] +pub struct CudaDropWrapper(ManuallyDrop); +impl crate::alloc::CudaAlloc for CudaDropWrapper {} +impl crate::alloc::sealed::alloc::Sealed for CudaDropWrapper {} +impl From for CudaDropWrapper { + fn from(val: C) -> Self { + Self(ManuallyDrop::new(val)) + } +} +impl Drop for CudaDropWrapper { + fn drop(&mut self) { + // Safety: drop is only ever called once + let val = unsafe { ManuallyDrop::take(&mut self.0) }; + + if let Err((_err, val)) = C::drop(val) { + core::mem::forget(val); + } + } +} +impl Deref for CudaDropWrapper { + type Target = C; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} +impl DerefMut for CudaDropWrapper { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl CudaDroppable for DeviceBox { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +impl CudaDroppable for DeviceBuffer { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +impl CudaDroppable for LockedBox { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +impl CudaDroppable for LockedBuffer { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +macro_rules! impl_sealed_drop_value { + ($type:ty) => { + impl CudaDroppable for $type { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } + } + }; +} + +impl_sealed_drop_value!(Module); +impl_sealed_drop_value!(rustacuda::stream::Stream); +impl_sealed_drop_value!(Context); +impl_sealed_drop_value!(Event); + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceMutRef<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, + host_ref: &'a mut T, +} + +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { + /// # Errors + /// + /// Returns a [`CudaError`] iff `value` cannot be moved + /// to CUDA or an error occurs inside `inner`. 
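A usage sketch for the branded `Stream` introduced above, before the `with_new` constructor that the preceding comment documents: `Stream::with` hands the closure a stream tagged with a fresh invariant lifetime, so it cannot be unified with any other stream. A live CUDA context is assumed:

```rust
use rust_cuda::host::Stream;
use rustacuda::error::CudaResult;
use rustacuda::stream::StreamFlags;

fn with_branded_stream() -> CudaResult<()> {
    let mut raw = rustacuda::stream::Stream::new(StreamFlags::NON_BLOCKING, None)?;

    // The closure is generic over the fresh `'new_stream` brand; the
    // `Stream<'_>` it receives cannot escape or be mixed with another.
    Stream::with(&mut raw, |stream| {
        // launch kernels or move `Async` values on `stream` here
        let _ = stream;
        Ok(())
    })
}
```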
+ pub fn with_new< + O, + E: From, + F: for<'b> FnOnce(HostAndDeviceMutRef<'b, T>) -> Result, + >( + host_ref: &mut T, + inner: F, + ) -> Result { + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(host_ref), + )?); + + // Safety: `device_box` contains exactly the device copy of `host_ref` + let result = inner(HostAndDeviceMutRef { + device_box: &mut device_box, + host_ref, + }); + + // Copy back any changes made + device_box.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(host_ref))?; + + core::mem::drop(device_box); + + result + } + + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub(crate) unsafe fn new_unchecked( + device_box: &'a mut DeviceBox>, + host_ref: &'a mut T, + ) -> Self { + Self { + device_box, + host_ref, + } + } + + #[must_use] + pub(crate) fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> + where + 'a: 'b, + { + DeviceMutRef { + pointer: DeviceMutPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), + reference: PhantomData, + } + } + + #[must_use] + pub(crate) fn for_host<'b: 'a>(&'b self) -> &'a T { + self.host_ref + } + + #[must_use] + pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> + where + 'a: 'b, + { + HostAndDeviceConstRef { + device_box: self.device_box, + host_ref: self.host_ref, + } + } + + #[must_use] + pub(crate) unsafe fn as_mut<'b>(&'b mut self) -> HostAndDeviceMutRef<'b, T> + where + 'a: 'b, + { + HostAndDeviceMutRef { + device_box: self.device_box, + host_ref: self.host_ref, + } + } + + #[must_use] + pub fn into_mut<'b>(self) -> HostAndDeviceMutRef<'b, T> + where + 'a: 'b, + { + HostAndDeviceMutRef { + device_box: self.device_box, + host_ref: self.host_ref, + } + } + + #[must_use] + pub fn into_async<'b, 'stream>( + self, + stream: Stream<'stream>, + ) -> Async<'b, 'stream, HostAndDeviceMutRef<'b, T>, NoCompletion> + where + 'a: 'b, + { + Async::ready(self.into_mut(), stream) + } +} + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceConstRef<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a DeviceBox>, + host_ref: &'a T, +} + +impl<'a, T: PortableBitSemantics + TypeGraphLayout> Clone for HostAndDeviceConstRef<'a, T> { + fn clone(&self) -> Self { + *self + } +} + +impl<'a, T: PortableBitSemantics + TypeGraphLayout> Copy for HostAndDeviceConstRef<'a, T> {} + +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> { + /// # Errors + /// + /// Returns a [`CudaError`] iff `value` cannot be moved + /// to CUDA or an error occurs inside `inner`. 
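A quick sketch of the scoped-lending pattern above, before the immutable variant that the preceding comment introduces: `HostAndDeviceMutRef::with_new` uploads the value, runs the closure against the paired host/device reference, and copies device-side changes back on exit. A live CUDA context is assumed, and `u64` is assumed to satisfy `PortableBitSemantics + TypeGraphLayout`:

```rust
use rust_cuda::host::HostAndDeviceMutRef;
use rustacuda::error::CudaResult;

fn lend_mut(value: &mut u64) -> CudaResult<()> {
    HostAndDeviceMutRef::with_new(value, |host_and_device| {
        // A kernel launch would consume the device-side reference here;
        // any writes it makes are copied back into `value` afterwards.
        let _ = host_and_device;
        Ok(())
    })
}
```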
+ pub fn with_new< + O, + E: From, + F: for<'b> FnOnce(HostAndDeviceConstRef<'b, T>) -> Result, + >( + host_ref: &T, + inner: F, + ) -> Result { + let device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(host_ref), + )?); + + // Safety: `device_box` contains exactly the device copy of `host_ref` + let result = inner(HostAndDeviceConstRef { + device_box: &device_box, + host_ref, + }); + + core::mem::drop(device_box); + + result + } + + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub(crate) const unsafe fn new_unchecked( + device_box: &'a DeviceBox>, + host_ref: &'a T, + ) -> Self { + Self { + device_box, + host_ref, + } + } + + #[must_use] + pub(crate) fn for_device<'b>(&'b self) -> DeviceConstRef<'a, T> + where + 'a: 'b, + { + let mut hack = ManuallyDrop::new(unsafe { std::ptr::read(self.device_box) }); + + DeviceConstRef { + pointer: DeviceConstPointer(hack.as_device_ptr().as_raw().cast()), + reference: PhantomData, + } + } + + #[must_use] + pub(crate) const fn for_host(&'a self) -> &'a T { + self.host_ref + } + + #[must_use] + pub const fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> + where + 'a: 'b, + { + *self + } + + #[must_use] + pub const fn as_async<'b, 'stream>( + &'b self, + stream: Stream<'stream>, + ) -> Async<'b, 'stream, HostAndDeviceConstRef<'b, T>, NoCompletion> + where + 'a: 'b, + { + Async::ready( + HostAndDeviceConstRef { + device_box: self.device_box, + host_ref: self.host_ref, + }, + stream, + ) + } +} + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceOwned<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, + host_val: &'a mut T, +} + +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { + /// # Errors + /// + /// Returns a [`CudaError`] iff `value` cannot be moved + /// to CUDA or an error occurs inside `inner`. 
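One more sketch before the owned variant's `with_new` below: `HostAndDeviceConstRef` is deliberately `Copy`, so a single device upload can back any number of immutable borrows within the closure (same assumptions as the previous sketch):

```rust
use rust_cuda::host::HostAndDeviceConstRef;
use rustacuda::error::CudaResult;

fn lend_const(value: &u64) -> CudaResult<()> {
    HostAndDeviceConstRef::with_new(value, |const_ref| {
        // `const_ref` is `Copy`: both bindings refer to the same device
        // box, without re-uploading the value.
        let (first, second) = (const_ref, const_ref);
        let _ = (first.as_ref(), second.as_ref());
        Ok(())
    })
}
```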
+ pub fn with_new, F: FnOnce(HostAndDeviceOwned) -> Result>( + mut value: T, + inner: F, + ) -> Result { + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&value), + )?); + + // Safety: `device_box` contains exactly the device copy of `value` + inner(HostAndDeviceOwned { + device_box: &mut device_box, + host_val: &mut value, + }) + } + + #[must_use] + pub(crate) fn for_device(self) -> DeviceOwnedRef<'a, T> { + DeviceOwnedRef { + pointer: DeviceOwnedPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), + marker: PhantomData::, + reference: PhantomData::<&'a mut ()>, + } + } + + #[must_use] + pub(crate) fn for_host(&self) -> &T { + self.host_val + } + + #[must_use] + pub const fn into_async<'stream>( + self, + stream: Stream<'stream>, + ) -> Async<'a, 'stream, Self, NoCompletion> { + Async::ready(self, stream) + } +} diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs new file mode 100644 index 000000000..3fc2b2e60 --- /dev/null +++ b/src/kernel/mod.rs @@ -0,0 +1,623 @@ +#[cfg(feature = "host")] +use std::{ + ffi::{CStr, CString}, + marker::PhantomData, + mem::ManuallyDrop, + ptr::NonNull, +}; + +#[cfg(feature = "host")] +use rustacuda::{ + error::{CudaError, CudaResult}, + function::Function, + module::Module, +}; + +#[cfg(feature = "kernel")] +pub use rust_cuda_kernel::kernel; + +#[doc(hidden)] +#[cfg(all(feature = "kernel", feature = "host"))] +#[allow(clippy::module_name_repetitions)] +pub use rust_cuda_kernel::{check_kernel, compile_kernel, specialise_kernel_entry_point}; + +#[cfg(feature = "host")] +mod ptx_jit; +#[cfg(feature = "host")] +use ptx_jit::{PtxJITCompiler, PtxJITResult}; + +#[cfg(feature = "host")] +use crate::host::Stream; +use crate::safety::PortableBitSemantics; + +pub mod param; + +mod sealed { + #[doc(hidden)] + pub trait Sealed {} + + #[cfg(feature = "host")] + pub struct Token; +} + +#[cfg(all(feature = "host", not(doc)))] +#[doc(hidden)] +pub trait WithNewAsync< + 'stream, + P: ?Sized + CudaKernelParameter, + O, + E: From, +> +{ + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result + where + P: 'b; +} + +#[cfg(all(feature = "host", not(doc)))] +impl< + 'stream, + P: ?Sized + CudaKernelParameter, + O, + E: From, + F: for<'b> FnOnce(P::AsyncHostType<'stream, 'b>) -> Result, + > WithNewAsync<'stream, P, O, E> for F +{ + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result + where + P: 'b, + { + (self)(param) + } +} + +#[cfg(feature = "device")] +#[doc(hidden)] +pub trait WithFfiAsDevice { + fn with<'b>(self, param: P::DeviceType<'b>) -> O + where + P: 'b; +} + +#[cfg(feature = "device")] +impl FnOnce(P::DeviceType<'b>) -> O> + WithFfiAsDevice for F +{ + fn with<'b>(self, param: P::DeviceType<'b>) -> O + where + P: 'b, + { + (self)(param) + } +} + +pub trait CudaKernelParameter: sealed::Sealed { + #[cfg(feature = "host")] + type SyncHostType; + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> + where + Self: 'b; + #[doc(hidden)] + type FfiType<'stream, 'b>: PortableBitSemantics + where + Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> + where + Self: 'b; + + #[cfg(feature = "host")] + #[allow(clippy::missing_errors_doc)] // FIXME + fn with_new_async<'stream, 'b, O, E: From>( + param: Self::SyncHostType, + stream: crate::host::Stream<'stream>, + #[cfg(not(doc))] inner: impl WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result + where + Self: 'b; + + #[doc(hidden)] + 
#[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + param: &Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O + where + Self: 'b; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + param: &Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> std::alloc::Layout + where + Self: 'b; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> Result, E> + where + Self: 'b; + + #[doc(hidden)] + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl WithFfiAsDevice, + ) -> O + where + Self: 'short; +} + +#[cfg(feature = "host")] +pub struct Launcher<'stream, 'kernel, Kernel> { + pub stream: Stream<'stream>, + pub kernel: &'kernel mut TypedPtxKernel, + pub config: LaunchConfig, +} + +#[cfg(feature = "host")] +macro_rules! impl_launcher_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'a, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'a mut self, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'a mut Self, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + #[allow(unused_variables)] + let stream = self.stream; + + impl_launcher_launch! { impl with_new_async ($($arg: $T),*) + (stream) { + inner(self, $($arg),*) + } } + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult> + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) + } + }; + (impl $func:ident () + ($($other:expr),*) $inner:block) => { + $inner + }; + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { + $T0::$func($arg0 $(, $other)*, |$arg0: <$T0 as CudaKernelParameter>::AsyncHostType<'stream, '_>| { + impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } + }) + }; +} + +#[cfg(feature = "host")] +impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { + impl_launcher_launch! { launch0() => with0_async => launch0_async } + + impl_launcher_launch! { launch1( + arg1: A + ) => with1_async => launch1_async } + + impl_launcher_launch! { launch2( + arg1: A, arg2: B + ) => with2_async => launch2_async } + + impl_launcher_launch! { launch3( + arg1: A, arg2: B, arg3: C + ) => with3_async => launch3_async } + + impl_launcher_launch! { launch4( + arg1: A, arg2: B, arg3: C, arg4: D + ) => with4_async => launch4_async } + + impl_launcher_launch! 
{ launch5( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E + ) => with5_async => launch5_async } + + impl_launcher_launch! { launch6( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F + ) => with6_async => launch6_async } + + impl_launcher_launch! { launch7( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G + ) => with7_async => launch7_async } + + impl_launcher_launch! { launch8( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H + ) => with8_async => launch8_async } + + impl_launcher_launch! { launch9( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I + ) => with9_async => launch9_async } + + impl_launcher_launch! { launch10( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J + ) => with10_async => launch10_async } + + impl_launcher_launch! { launch11( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K + ) => with11_async => launch11_async } + + impl_launcher_launch! { launch12( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K, arg12: L + ) => with12_async => launch12_async } +} + +#[cfg(feature = "host")] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct LaunchConfig { + pub grid: rustacuda::function::GridSize, + pub block: rustacuda::function::BlockSize, + pub ptx_jit: bool, +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub struct RawPtxKernel { + module: ManuallyDrop>, + function: ManuallyDrop>, +} + +#[cfg(feature = "host")] +impl RawPtxKernel { + /// # Errors + /// + /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does + /// not contain an entry point named `entry_point`. + pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { + let module: Box = Box::new(Module::load_from_string(ptx)?); + + let function = unsafe { &*std::ptr::from_ref(module.as_ref()) }.get_function(entry_point); + + let function = match function { + Ok(function) => function, + Err(err) => { + if let Err((_err, module)) = Module::drop(*module) { + std::mem::forget(module); + } + + return Err(err); + }, + }; + + Ok(Self { + function: ManuallyDrop::new(function), + module: ManuallyDrop::new(module), + }) + } + + #[must_use] + pub fn get_function(&self) -> &Function { + &self.function + } +} + +#[cfg(feature = "host")] +impl Drop for RawPtxKernel { + fn drop(&mut self) { + { + // Ensure that self.function is dropped before self.module as + // it borrows data from the module and must not outlive it + let _function = unsafe { ManuallyDrop::take(&mut self.function) }; + } + + if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) + { + std::mem::forget(module); + } + } +} + +#[cfg(feature = "host")] +pub type PtxKernelConfigure = dyn FnMut(&Function) -> CudaResult<()>; + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub struct TypedPtxKernel { + compiler: PtxJITCompiler, + ptx_kernel: Option, + entry_point: Box, + configure: Option>, + marker: PhantomData, +} + +#[cfg(feature = "host")] +macro_rules! 
impl_typed_kernel_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<'kernel, 'stream, $($T: CudaKernelParameter),*>( + &'kernel mut self, + stream: Stream<'stream>, + config: &LaunchConfig, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + self.$with_async::<(), CudaError, $($T),*>( + stream, + config, + $($arg,)* + |kernel, stream, config, $($arg),*| { + let r#async = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*)?; + + // important: always synchronise here, this function is sync! + r#async.synchronize() + }, + ) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'kernel, + 'stream, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'kernel mut self, + stream: Stream<'stream>, + config: &LaunchConfig, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'kernel mut Self, + Stream<'stream>, + &LaunchConfig, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + impl_typed_kernel_launch! { impl with_new_async ($($arg: $T),*) + (stream) { + inner(self, stream, config, $($arg),*) + } } + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::needless_lifetimes)] // 'stream is unused for zero args + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<'kernel, 'stream, $($T: CudaKernelParameter),*>( + &'kernel mut self, + stream: Stream<'stream>, + config: &LaunchConfig, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult> + // launch_async does not need to capture its parameters until kernel completion: + // - moved parameters are moved and cannot be used again, deallocation will sync + // - immutably borrowed parameters can be shared across multiple kernel launches + // - mutably borrowed parameters are more tricky: + // - Rust's borrowing rules ensure that a single mutable reference cannot be + // passed into multiple parameters of the kernel (no mutable aliasing) + // - CUDA guarantees that kernels launched on the same stream are executed + // sequentially, so even immediate resubmissions for the same mutable data + // will not have temporally overlapping mutation on the same stream + // - however, we have to guarantee that mutable data cannot be used on several + // different streams at the same time + // - Async::move_to_stream always adds a synchronisation barrier between the + // old and the new stream to ensure that all uses on the old stream happen + // strictly before all uses on the new stream + // - async launches take AsyncProj<&mut HostAndDeviceMutRef<..>>, which either + // captures an Async, which must be moved to a different stream explicitly, + // or contains data that cannot async move to a different stream without + // - any use of a mutable borrow in an async kernel launch adds a sync barrier + // on the launch stream s.t. the borrow is only complete once the kernel has + // completed + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + let function = if config.ptx_jit { + impl_typed_kernel_launch! 
+                impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + (sealed::Token) {
+                    self.compile_with_ptx_jit_args(Some(&[$($arg),*]))
+                } }?
+            } else {
+                self.compile_with_ptx_jit_args(None)?
+            };
+
+            #[allow(unused_mut)]
+            let mut shared_memory_size = crate::utils::shared::SharedMemorySize::new();
+            $(
+                shared_memory_size.add($T::shared_layout_for_async(&$arg, sealed::Token));
+            )*
+            let Ok(shared_memory_size) = u32::try_from(shared_memory_size.total()) else {
+                // FIXME: this should really be InvalidConfiguration = 9
+                return Err(CudaError::LaunchOutOfResources)
+            };
+
+            unsafe { stream.launch(
+                function,
+                config.grid.clone(),
+                config.block.clone(),
+                shared_memory_size,
+                &[
+                    $(core::ptr::from_mut(
+                        &mut $T::async_to_ffi($arg, sealed::Token)?
+                    ).cast::<core::ffi::c_void>()),*
+                ],
+            ) }?;
+
+            crate::utils::r#async::Async::pending(
+                (), stream, crate::utils::r#async::NoCompletion,
+            )
+        }
+    };
+    (impl $func:ident () + ($($other:expr),*) $inner:block) => {
+        $inner
+    };
+    (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => {
+        // continuation-passing fold: wrap the first parameter, then recurse over the rest
+        $T0::$func($arg0 $(, $other)*, |$arg0: <$T0 as CudaKernelParameter>::AsyncHostType<'stream, '_>| {
+            impl_typed_kernel_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner }
+        })
+    };
+    (impl $func:ident ref () + ($($other:expr),*) $inner:block) => {
+        $inner
+    };
+    (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => {
+        $T0::$func(&$arg0 $(, $other)*, |$arg0| {
+            impl_typed_kernel_launch! { impl $func ref ($($arg: $T),*) + ($($other),*) $inner }
+        })
+    };
+}
+
+#[cfg(feature = "host")]
+impl<Kernel> TypedPtxKernel<Kernel> {
+    #[must_use]
+    pub fn new<T: CompiledKernelPtx<Kernel>>(configure: Option<Box<PtxKernelConfigure>>) -> Self {
+        let compiler = PtxJITCompiler::new(T::get_ptx());
+        let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str();
+
+        Self {
+            compiler,
+            ptx_kernel: None,
+            entry_point,
+            configure,
+            marker: PhantomData::<Kernel>,
+        }
+    }
+}
+
+#[cfg(feature = "host")]
+impl<Kernel> TypedPtxKernel<Kernel> {
+    impl_typed_kernel_launch! { launch0() => with0_async => launch0_async }
+
+    impl_typed_kernel_launch! { launch1(
+        arg1: A
+    ) => with1_async => launch1_async }
+
+    impl_typed_kernel_launch! { launch2(
+        arg1: A, arg2: B
+    ) => with2_async => launch2_async }
+
+    impl_typed_kernel_launch! { launch3(
+        arg1: A, arg2: B, arg3: C
+    ) => with3_async => launch3_async }
+
+    impl_typed_kernel_launch! { launch4(
+        arg1: A, arg2: B, arg3: C, arg4: D
+    ) => with4_async => launch4_async }
+
+    impl_typed_kernel_launch! { launch5(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E
+    ) => with5_async => launch5_async }
+
+    impl_typed_kernel_launch! { launch6(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F
+    ) => with6_async => launch6_async }
+
+    impl_typed_kernel_launch! { launch7(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G
+    ) => with7_async => launch7_async }
+
+    impl_typed_kernel_launch! { launch8(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H
+    ) => with8_async => launch8_async }
+
+    impl_typed_kernel_launch! { launch9(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I
+    ) => with9_async => launch9_async }
+
+    impl_typed_kernel_launch! { launch10(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J
+    ) => with10_async => launch10_async }
+
+    impl_typed_kernel_launch! { launch11(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J,
+        arg11: K
+    ) => with11_async => launch11_async }
+
+    impl_typed_kernel_launch! { launch12(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J,
+        arg11: K, arg12: L
+    ) => with12_async => launch12_async }
+
+    /// # Errors
+    ///
+    /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to
+    /// [`Self::new`] is not a valid PTX source or does not contain the
+    /// entry point it declares.
+    fn compile_with_ptx_jit_args(
+        &mut self,
+        arguments: Option<&[Option<&NonNull<[u8]>>]>,
+    ) -> CudaResult<&Function> {
+        let ptx_jit = self.compiler.with_arguments(arguments);
+
+        let kernel_jit = match (&mut self.ptx_kernel, ptx_jit) {
+            (Some(ptx_kernel), PtxJITResult::Cached(_)) => ptx_kernel.get_function(),
+            (ptx_kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => {
+                let recomputed_ptx_kernel = RawPtxKernel::new(ptx_cstr, &self.entry_point)?;
+
+                // Replace the existing compiled kernel, drop the old one
+                let ptx_kernel = ptx_kernel.insert(recomputed_ptx_kernel);
+
+                let function = ptx_kernel.get_function();
+
+                if let Some(configure) = self.configure.as_mut() {
+                    configure(function)?;
+                }
+
+                function
+            },
+        };
+
+        Ok(kernel_jit)
+    }
+}
+
+#[cfg(feature = "host")]
+/// # Safety
+///
+/// The PTX string returned by [`CompiledKernelPtx::get_ptx`] must correspond
+/// to the compiled kernel code for the `Kernel` function and contain a kernel
+/// entry point whose name is returned by
+/// [`CompiledKernelPtx::get_entry_point`].
+///
+/// This trait should not be implemented manually – use the
+/// [`kernel`] macro instead.
+pub unsafe trait CompiledKernelPtx<Kernel> {
+    fn get_ptx() -> &'static CStr;
+    fn get_entry_point() -> &'static CStr;
+}
diff --git a/src/kernel/param.rs b/src/kernel/param.rs
new file mode 100644
index 000000000..c87148c7a
--- /dev/null
+++ b/src/kernel/param.rs
@@ -0,0 +1,1224 @@
+#[cfg(feature = "device")]
+use core::convert::AsRef;
+use core::{
+    marker::PhantomData,
+    ops::{Deref, DerefMut},
+};
+
+#[cfg(feature = "host")]
+use std::{alloc::Layout, ptr::NonNull};
+
+use const_type_layout::TypeGraphLayout;
+
+use crate::{
+    alloc::EmptyCudaAlloc,
+    kernel::{sealed, CudaKernelParameter},
+    lend::RustToCuda,
+    safety::{PortableBitSemantics, SafeMutableAliasing},
+    utils::ffi::{DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef},
+};
+
+pub struct PtxJit<T> {
+    never: !,
+    _marker: PhantomData<T>,
+}
+
+impl<T> Deref for PtxJit<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.never
+    }
+}
+
+impl<T> DerefMut for PtxJit<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.never
+    }
+}
+
+pub struct PerThreadShallowCopy<
+    T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
+> {
+    never: !,
+    _marker: PhantomData<T>,
+}
+
+impl<T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout> Deref
+    for PerThreadShallowCopy<T>
+{
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.never
+    }
+}
+
+impl<T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout> DerefMut
+    for PerThreadShallowCopy<T>
+{
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.never
+    }
+}
+
+impl<
+        T: Copy
+            + Send
+            + crate::safety::StackOnly
+            + crate::safety::PortableBitSemantics
+            + TypeGraphLayout,
+    > CudaKernelParameter for PerThreadShallowCopy<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = T where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = T where Self: 'b;
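+    // `FfiType` is the FFI-safe form in which this parameter actually crosses
+    // the host/device boundary; for a by-value parameter this is just `T`
+    // wrapped in an adapter that asserts its bit-for-bit portability.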
+    type FfiType<'stream, 'b> =
+        crate::utils::adapter::RustToCudaWithPortableBitCopySemantics<T> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        _stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        inner.with(param)
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        Ok(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param))
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        let param = param.into_inner();
+
+        inner.with(param)
+    }
+}
+impl<
+        T: Copy
+            + Send
+            + crate::safety::StackOnly
+            + crate::safety::PortableBitSemantics
+            + TypeGraphLayout,
+    > sealed::Sealed for PerThreadShallowCopy<T>
+{
+}
+
+impl<
+        'a,
+        T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
+    > CudaKernelParameter for &'a PerThreadShallowCopy<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj<
+        'b,
+        'stream,
+        crate::host::HostAndDeviceConstRef<'b, T>,
+    > where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b T where Self: 'b;
+    type FfiType<'stream, 'b> = DeviceConstRef<'b, T> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = &'a T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        let _ = stream;
+        crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| {
+            inner.with(unsafe { crate::utils::r#async::AsyncProj::new(const_ref, None) })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        let param = unsafe { param.unwrap_unchecked() };
+        Ok(param.for_device())
+    }
+
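+    // On the device, the FFI reference is turned back into the `&T` that the
+    // kernel body sees; no data is copied here, the pointer already refers to
+    // memory that is accessible from the device.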
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        let param = param.as_ref();
+
+        inner.with(param)
+    }
+}
+impl<
+        'a,
+        T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
+    > sealed::Sealed for &'a PerThreadShallowCopy<T>
+{
+}
+
+impl<
+        'a,
+        T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
+    > CudaKernelParameter for &'a PtxJit<PerThreadShallowCopy<T>>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> =
+        <&'a PerThreadShallowCopy<T> as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = <&'a PerThreadShallowCopy<T> as CudaKernelParameter>::DeviceType<'b> where Self: 'b;
+    type FfiType<'stream, 'b> =
+        <&'a PerThreadShallowCopy<T> as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = <&'a PerThreadShallowCopy<T> as CudaKernelParameter>::SyncHostType;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        let _ = stream;
+        // FIXME: forward impl
+        crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| {
+            inner.with(unsafe { crate::utils::r#async::AsyncProj::new(const_ref, None) })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        let param_ref = param.proj_ref();
+        let param = unsafe { param_ref.unwrap_ref_unchecked() };
+        inner(Some(&param_as_raw_bytes(param.for_host())))
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        <&'a PerThreadShallowCopy<T> as CudaKernelParameter>::async_to_ffi(param, token)
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref());
+
+        // FIXME: forward impl
+        let param = param.as_ref();
+
+        inner.with(param)
+    }
+}
+impl<
+        'a,
+        T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
+    > sealed::Sealed for &'a PtxJit<PerThreadShallowCopy<T>>
+{
+}
+
+pub struct ShallowInteriorMutable<
+    T: Sync
+        + crate::safety::StackOnly
+        + crate::safety::PortableBitSemantics
+        + TypeGraphLayout
+        + InteriorMutableSync,
+> {
+    never: !,
+    _marker: PhantomData<T>,
+}
+
+impl<
+        T: Sync
+            + crate::safety::StackOnly
+            + crate::safety::PortableBitSemantics
+            + TypeGraphLayout
+            + InteriorMutableSync,
+    > Deref for ShallowInteriorMutable<T>
+{
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.never
+    }
+}
+
+impl<
+        'a,
+        T: Sync
+            + crate::safety::StackOnly
+            + crate::safety::PortableBitSemantics
+            + TypeGraphLayout
+            + InteriorMutableSync,
+    > CudaKernelParameter for &'a ShallowInteriorMutable<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj<
+        'b,
+        'stream,
+        crate::host::HostAndDeviceConstRef<'b, T>
+    > where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b T where Self: 'b;
+    type FfiType<'stream, 'b> = DeviceConstRef<'b, T> where Self: 'b;
+    #[cfg(feature = "host")]
+    /// The kernel takes a mutable borrow of the interior mutable data to ensure
+    /// the interior mutability is limited to just this kernel invocation.
+    type SyncHostType = &'a mut T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        let _ = stream;
+        crate::host::HostAndDeviceMutRef::with_new(param, |mut_ref| {
+            inner.with(unsafe { crate::utils::r#async::AsyncProj::new(mut_ref.as_ref(), None) })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        let param = unsafe { param.unwrap_unchecked() };
+        Ok(param.for_device())
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        let param = param.as_ref();
+
+        inner.with(param)
+    }
+}
+impl<
+        'a,
+        T: crate::safety::StackOnly
+            + Sync
+            + crate::safety::PortableBitSemantics
+            + TypeGraphLayout
+            + InteriorMutableSync,
+    > sealed::Sealed for &'a ShallowInteriorMutable<T>
+{
+}
+
+pub trait InteriorMutableSync: Sync + sealed::Sealed {}
+
+macro_rules! impl_atomic_interior_mutable {
+    ($atomic:ident($interior:ty)) => {
+        impl InteriorMutableSync for core::sync::atomic::$atomic {}
+        impl sealed::Sealed for core::sync::atomic::$atomic {}
+    };
+    ($($atomic:ident($interior:ty)),*) => {
+        $(impl_atomic_interior_mutable! { $atomic($interior) })*
+    }
+}
+
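+// The invocation below marks the `core::sync::atomic` types as usable behind
+// `ShallowInteriorMutable`: they are `Sync` and may be mutated through a
+// shared reference by every thread of the kernel. A hedged device-side
+// sketch (the `kernel` attribute and the argument name are illustrative):
+//
+//     #[kernel]
+//     fn count(counter: &ShallowInteriorMutable<AtomicU32>) {
+//         counter.fetch_add(1, core::sync::atomic::Ordering::Relaxed);
+//     }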
+impl_atomic_interior_mutable! {
+    AtomicBool(bool),
+    AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize),
+    AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize)
+}
+
+impl<T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout> InteriorMutableSync
+    for core::cell::SyncUnsafeCell<T>
+{
+}
+impl<T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout> sealed::Sealed
+    for core::cell::SyncUnsafeCell<T>
+{
+}
+
+pub struct DeepPerThreadBorrow<T: RustToCuda> {
+    never: !,
+    _marker: PhantomData<T>,
+}
+
+impl<T: RustToCuda> Deref for DeepPerThreadBorrow<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.never
+    }
+}
+
+impl<
+        T: Send
+            + Clone
+            + RustToCuda<CudaAllocation: EmptyCudaAlloc>,
+    > CudaKernelParameter for DeepPerThreadBorrow<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async<
+        'b,
+        'stream,
+        crate::host::HostAndDeviceOwned<
+            'b,
+            DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+        >,
+        crate::utils::r#async::NoCompletion,
+    > where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = T where Self: 'b;
+    type FfiType<'stream, 'b> =
+        DeviceOwnedRef<'b, DeviceAccessible<<T as RustToCuda>::CudaRepresentation>> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream)))
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        let (param, _completion): (_, Option<crate::utils::r#async::NoCompletion>) =
+            unsafe { param.unwrap_unchecked()? };
+        Ok(param.for_device())
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        unsafe {
+            crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param))
+        }
+    }
+}
+impl<
+        T: Send
+            + Clone
+            + RustToCuda<CudaAllocation: EmptyCudaAlloc>,
+    > sealed::Sealed for DeepPerThreadBorrow<T>
+{
+}
+
+impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow<T> {
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj<
+        'b,
+        'stream,
+        crate::host::HostAndDeviceConstRef<
+            'b,
+            DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+        >,
+    > where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b T where Self: 'b;
+    type FfiType<'stream, 'b> =
+        DeviceConstRef<'b, DeviceAccessible<<T as RustToCuda>::CudaRepresentation>> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = &'a T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        let _ = stream;
+        crate::lend::LendToCuda::lend_to_cuda(param, |param| {
+            inner.with(unsafe { crate::utils::r#async::AsyncProj::new(param, None) })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        let param = unsafe { param.unwrap_unchecked() };
+        Ok(param.for_device())
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        unsafe {
+            crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param))
+        }
+    }
+}
+impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow<T> {}
+
+impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
+    for &'a mut DeepPerThreadBorrow<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj<
+        'b,
+        'stream,
+        crate::host::HostAndDeviceMutRef<
+            'b,
+            DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+        >,
+    > where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b mut T where Self: 'b;
+    type FfiType<'stream, 'b> =
+        DeviceMutRef<'b, DeviceAccessible<<T as RustToCuda>::CudaRepresentation>> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = &'a mut T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
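+        // A mutable borrow is projected onto the launch stream here and is
+        // only released again once `lend_to_cuda_mut` returns, i.e. after the
+        // kernel has completed on this stream; this prevents the same mutable
+        // data from being in flight on two streams at once.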
+        crate::lend::LendToCuda::lend_to_cuda_mut(param, |param| {
+            // FIXME: express the same with param.into_async(stream).as_mut()
+            let _ = stream;
+            inner.with({
+                // Safety: this projection cannot be moved to a different stream
+                //         without first exiting lend_to_cuda_mut and synchronizing
+                unsafe { crate::utils::r#async::AsyncProj::new(param.into_mut(), None) }
+            })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        mut param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        param.record_mut_use()?;
+        let mut param = unsafe { param.unwrap_unchecked() };
+        Ok(param.for_device())
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        unsafe {
+            crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param))
+        }
+    }
+}
+impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed
+    for &'a mut DeepPerThreadBorrow<T>
+{
+}
+
+impl<
+        T: Send
+            + Clone
+            + RustToCuda<CudaAllocation: EmptyCudaAlloc>,
+    > CudaKernelParameter for PtxJit<DeepPerThreadBorrow<T>>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> =
+        <DeepPerThreadBorrow<T> as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = <DeepPerThreadBorrow<T> as CudaKernelParameter>::DeviceType<'b> where Self: 'b;
+    type FfiType<'stream, 'b> =
+        <DeepPerThreadBorrow<T> as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = <DeepPerThreadBorrow<T> as CudaKernelParameter>::SyncHostType;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        // FIXME: forward impl
+        crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream)))
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        let param = unsafe { param.as_ref().unwrap_unchecked() };
+        inner(Some(&param_as_raw_bytes(param.for_host())))
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        <DeepPerThreadBorrow<T> as CudaKernelParameter>::async_to_ffi(param, token)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
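+        // Emit the PTX JIT marker for this parameter before any real use so
+        // that the host-side compiler can later recognise the parameter's
+        // loads in the generated PTX and specialise them to constants.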
+        emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref());
+
+        // FIXME: forward impl
+        unsafe {
+            crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param))
+        }
+    }
+}
+impl<
+        T: Send
+            + Clone
+            + RustToCuda<CudaAllocation: EmptyCudaAlloc>,
+    > sealed::Sealed for PtxJit<DeepPerThreadBorrow<T>>
+{
+}
+
+impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit<DeepPerThreadBorrow<T>> {
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> =
+        <&'a DeepPerThreadBorrow<T> as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = <&'a DeepPerThreadBorrow<T> as CudaKernelParameter>::DeviceType<'b> where Self: 'b;
+    type FfiType<'stream, 'b> =
+        <&'a DeepPerThreadBorrow<T> as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = <&'a DeepPerThreadBorrow<T> as CudaKernelParameter>::SyncHostType;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        // FIXME: forward impl
+        let _ = stream;
+        crate::lend::LendToCuda::lend_to_cuda(param, |param| {
+            inner.with(unsafe { crate::utils::r#async::AsyncProj::new(param, None) })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        let param_ref = param.proj_ref();
+        let param = unsafe { param_ref.unwrap_unchecked() };
+        inner(Some(&param_as_raw_bytes(param.for_host())))
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        <&'a DeepPerThreadBorrow<T> as CudaKernelParameter>::async_to_ffi(param, token)
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref());
+
+        // FIXME: forward impl
+        unsafe {
+            crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param))
+        }
+    }
+}
+impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a PtxJit<DeepPerThreadBorrow<T>> {}
+
+impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
+    for &'a mut PtxJit<DeepPerThreadBorrow<T>>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> =
+        <&'a mut DeepPerThreadBorrow<T> as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = <&'a mut DeepPerThreadBorrow<T> as CudaKernelParameter>::DeviceType<'b> where Self: 'b;
+    type FfiType<'stream, 'b> =
+        <&'a mut DeepPerThreadBorrow<T> as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = <&'a mut DeepPerThreadBorrow<T> as CudaKernelParameter>::SyncHostType;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        // FIXME: forward impl
+        crate::lend::LendToCuda::lend_to_cuda_mut(param, |param| {
+            // FIXME: express the same with param.as_async(stream).as_mut()
+            let _ = stream;
+            inner.with({
+                // Safety: this projection cannot be moved to a different stream
+                //         without first exiting lend_to_cuda_mut and synchronizing
+                unsafe { crate::utils::r#async::AsyncProj::new(param.into_mut(), None) }
+            })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        let param_ref = param.proj_ref();
+        let param = unsafe { param_ref.unwrap_unchecked() };
+        inner(Some(&param_as_raw_bytes(param.for_host())))
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        <&'a mut DeepPerThreadBorrow<T> as CudaKernelParameter>::async_to_ffi(param, token)
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref());
+
+        // FIXME: forward impl
+        unsafe {
+            crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param))
+        }
+    }
+}
+impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed
+    for &'a mut PtxJit<DeepPerThreadBorrow<T>>
+{
+}
+
+#[cfg(feature = "host")]
+fn param_as_raw_bytes<T: ?Sized>(r: &T) -> NonNull<[u8]> {
+    NonNull::slice_from_raw_parts(NonNull::from(r).cast::<u8>(), core::mem::size_of_val(r))
+}
+
+#[cfg(feature = "device")]
+fn emit_param_ptx_jit_marker<T: ?Sized, const INDEX: usize>(param: &T) {
+    // a dummy 32-bit load of the parameter, tagged with a comment that the
+    // host-side PTX JIT compiler scrapes back out of the generated PTX
+    unsafe {
+        core::arch::asm!(
+            "// <rust-cuda-ptx-jit-const-load-{param_reg}-{param_index}> //",
+            param_reg = in(reg32) *(core::ptr::from_ref(param).cast::<u32>()),
+            param_index = const(INDEX),
+        );
+    }
+}
+
+mod private_shared {
+    use core::marker::PhantomData;
+
+    use const_type_layout::{TypeGraphLayout, TypeLayout};
+
+    use crate::safety::PortableBitSemantics;
+
+    #[doc(hidden)]
+    #[derive(TypeLayout)]
+    #[repr(C)]
+    pub struct ThreadBlockSharedFfi<T: 'static> {
+        pub(super) _dummy: [u8; 0],
+        pub(super) _marker: PhantomData<T>,
+    }
+
+    #[doc(hidden)]
+    #[derive(TypeLayout)]
+    #[repr(C)]
+    pub struct ThreadBlockSharedSliceFfi<T: 'static + PortableBitSemantics + TypeGraphLayout> {
+        pub(super) len: usize,
+        pub(super) _marker: [T; 0],
+    }
+}
+
+impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared<T> {
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared<T> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared<T> where Self: 'b;
+    type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi<T> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = Self;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        _stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        inner.with(param)
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        _param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        Ok(private_shared::ThreadBlockSharedFfi {
+            _dummy: [],
+            _marker: PhantomData::<T>,
+        })
+    }
+
+    #[cfg(feature = "device")]
+    #[allow(clippy::inline_always)]
+    #[inline(always)]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        _param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        let mut param = crate::utils::shared::ThreadBlockShared::new_uninit();
+
+        inner.with(&mut param)
+    }
+}
+impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared<T> {}
+
+impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter
+    for &'a mut crate::utils::shared::ThreadBlockSharedSlice<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice<T> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice<T> where Self: 'b;
+    type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi<T> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = Self;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        _stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        inner.with(param)
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        param.layout()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        Ok(private_shared::ThreadBlockSharedSliceFfi {
+            len: param.len(),
+            _marker: [],
+        })
+    }
+
+    #[cfg(feature = "device")]
+    #[allow(clippy::inline_always)]
+    #[inline(always)]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        unsafe {
+            crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, |param| {
+                inner.with(param)
+            })
+        }
+    }
+}
+impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed
+    for &'a mut crate::utils::shared::ThreadBlockSharedSlice<T>
+{
+}
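+
+// `ThreadBlockShared<T>` and `ThreadBlockSharedSlice<T>` map to statically
+// and dynamically sized CUDA shared memory respectively; only the slice
+// variant contributes to the dynamic shared-memory size that a launch
+// reserves via `shared_layout_for_async`. A hedged device-side sketch (the
+// `kernel` attribute and argument name are illustrative):
+//
+//     #[kernel]
+//     fn scan(scratch: &mut ThreadBlockSharedSlice<u32>) {
+//         // each thread block sees one uninitialised `scratch` slice of the
+//         // length that was configured on the host at launch time
+//     }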
diff --git a/rust-cuda-ptx-jit/src/host/compiler/mod.rs b/src/kernel/ptx_jit/mod.rs
similarity index 92%
rename from rust-cuda-ptx-jit/src/host/compiler/mod.rs
rename to src/kernel/ptx_jit/mod.rs
index 156e8223c..43c555ab2 100644
--- a/rust-cuda-ptx-jit/src/host/compiler/mod.rs
+++ b/src/kernel/ptx_jit/mod.rs
@@ -6,7 +6,6 @@ mod replace;
 
 type ByteSliceOptionalArguments = Option<Box<[Option<Box<[u8]>>]>>;
 
-#[doc(cfg(feature = "host"))]
 #[allow(clippy::module_name_repetitions)]
 pub struct PtxJITCompiler {
     ptx_slices: Box<[PtxElement]>,
@@ -14,7 +13,6 @@ pub struct PtxJITCompiler {
     last_ptx: CString,
 }
 
-#[doc(cfg(feature = "host"))]
 pub enum PtxJITResult<'s> {
     Cached(&'s CStr),
     Recomputed(&'s CStr),
diff --git a/rust-cuda-ptx-jit/src/host/compiler/preprocess.rs b/src/kernel/ptx_jit/preprocess.rs
similarity index 93%
rename from rust-cuda-ptx-jit/src/host/compiler/preprocess.rs
rename to src/kernel/ptx_jit/preprocess.rs
index 0ee17733f..c22cf63e9 100644
--- a/rust-cuda-ptx-jit/src/host/compiler/preprocess.rs
+++ b/src/kernel/ptx_jit/preprocess.rs
@@ -5,7 +5,7 @@ use std::{
 
 use super::{
     regex::{
-        CONST_BASE_REGISTER_REGEX, CONST_LOAD_INSTRUCTION_REGEX, CONST_MARKER_REGEX, REGISTER_REGEX,
+        const_base_register_regex, const_load_instruction_regex, const_marker_regex, register_regex,
     },
     PtxElement, PtxJITCompiler, PtxLoadWidth,
 };
@@ -19,7 +19,7 @@ impl PtxJITCompiler {
         let mut const_markers: HashMap<&[u8], usize> = HashMap::new();
 
         // Find injected rust-cuda-const-markers which identify dummy register rxx
-        for const_marker in CONST_MARKER_REGEX.captures_iter(ptx) {
+        for const_marker in const_marker_regex().captures_iter(ptx) {
             if let Some(tmpreg) = const_marker.name("tmpreg").map(|s| s.as_bytes()) {
                 if let Some(param) = const_marker
                     .name("param")
@@ -36,7 +36,7 @@ impl PtxJITCompiler {
         let mut const_base_registers: HashMap<&[u8], usize> = HashMap::new();
 
         // Find base register ryy which was used in `ld.global.u32 rxx, [ryy];`
-        for const_base_register in CONST_BASE_REGISTER_REGEX.captures_iter(ptx) {
+        for const_base_register in const_base_register_regex().captures_iter(ptx) {
             if let Some(tmpreg) = const_base_register.name("tmpreg").map(|s| s.as_bytes()) {
                 if let Some(param) = const_markers.get(tmpreg) {
                     if let Some(basereg) = const_base_register.name("basereg").map(|s| s.as_bytes())
@@ -54,7 +54,7 @@ impl PtxJITCompiler {
         let mut ptx_slices: Vec<PtxElement> = Vec::new();
 
        // Iterate over all load from base register with offset instructions
-        for const_load_instruction in CONST_LOAD_INSTRUCTION_REGEX.captures_iter(ptx) {
+        for const_load_instruction in const_load_instruction_regex().captures_iter(ptx) {
            // Only consider instructions where the base register is ryy
            if let Some(basereg) = const_load_instruction.name("basereg").map(|s| s.as_bytes()) {
                if let Some(param) = const_base_registers.get(basereg) {
@@ -100,7 +100,7 @@ impl PtxJITCompiler {
                             parameter_index: *param,
                             byte_offset: loadoffset,
                             load_width: loadwidth,
-                            registers: REGISTER_REGEX
+                            registers: register_regex()
                                 .captures_iter(constreg)
                                 .filter_map(|m| {
                                     m.name("register").map(|s| {
diff --git a/src/kernel/ptx_jit/regex.rs b/src/kernel/ptx_jit/regex.rs
new file mode 100644
index 000000000..58406b01e
--- /dev/null
+++ b/src/kernel/ptx_jit/regex.rs
@@ -0,0 +1,58 @@
+use std::sync::OnceLock;
+
+use regex::bytes::Regex;
+
+#[allow(clippy::module_name_repetitions)]
+pub fn const_marker_regex() -> &'static Regex {
+    static CONST_MARKER_REGEX: OnceLock<Regex> = OnceLock::new();
+    CONST_MARKER_REGEX.get_or_init(|| {
+        Regex::new(r"(?-u)// <rust-cuda-ptx-jit-const-load-(?P<tmpreg>%r\d+)-(?P<param>\d+)> //")
+            .unwrap()
+    })
+}
+
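+// The markers emitted by `emit_param_ptx_jit_marker` show up in the compiled
+// PTX as comments of the shape `// <rust-cuda-ptx-jit-const-load-%rN-M> //`,
+// next to a dummy `ld.global.u32 %rN, [%rdK];` of the parameter pointer.
+// The regexes below scrape exactly these two shapes, plus the real loads
+// through the same base register, so that the JIT can replace those loads
+// with immediate constants.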
+#[allow(clippy::module_name_repetitions)]
+pub fn const_base_register_regex() -> &'static Regex {
+    static CONST_BASE_REGISTER_REGEX: OnceLock<Regex> = OnceLock::new();
+    CONST_BASE_REGISTER_REGEX.get_or_init(|| {
+        Regex::new(r"(?-u)ld\.global\.u32\s*(?P<tmpreg>%r\d+)\s*,\s*\[(?P<basereg>%r[ds]?\d+)]\s*;")
+            .unwrap()
+    })
+}
+
+#[allow(clippy::module_name_repetitions)]
+pub fn const_load_instruction_regex() -> &'static Regex {
+    static CONST_LOAD_INSTRUCTION_REGEX: OnceLock<Regex> = OnceLock::new();
+    CONST_LOAD_INSTRUCTION_REGEX.get_or_init(|| {
+        Regex::new(
+            r"(?x-u)(?P<instruction>
+                ld\.global
+                (?:\.(?P<vector>v[24]))?
+                \.
+                (?P<loadtype>[suf])
+                (?P<loadwidth>8|16|32|64)
+                \s*
+                (?P<constreg>
+                    (?:%[rf][sd]?\d+) |
+                    (?:\{(?:\s*%[rf][sd]?\d+,)*\s*%[rf][sd]?\d+\s*\})
+                )
+                ,\s*
+                \[
+                (?P<basereg>%r[ds]?\d+)
+                (?:
+                    \+
+                    (?P<loadoffset>\d+)
+                )?
+                \]
+                \s*;
+            )",
+        )
+        .unwrap()
+    })
+}
+
+#[allow(clippy::module_name_repetitions)]
+pub fn register_regex() -> &'static Regex {
+    static REGISTER_REGEX: OnceLock<Regex> = OnceLock::new();
+    REGISTER_REGEX.get_or_init(|| Regex::new(r"(?-u)(?P<register>%[rf][sd]?\d+)").unwrap())
+}
diff --git a/rust-cuda-ptx-jit/src/host/compiler/replace.rs b/src/kernel/ptx_jit/replace.rs
similarity index 96%
rename from rust-cuda-ptx-jit/src/host/compiler/replace.rs
rename to src/kernel/ptx_jit/replace.rs
index df4d270b8..97a592da9 100644
--- a/rust-cuda-ptx-jit/src/host/compiler/replace.rs
+++ b/src/kernel/ptx_jit/replace.rs
@@ -1,10 +1,10 @@
-use std::{ffi::CString, ops::Deref};
+use std::{ffi::CString, ops::Deref, ptr::NonNull};
 
 use super::{PtxElement, PtxJITCompiler, PtxJITResult, PtxLoadWidth};
 
 impl PtxJITCompiler {
     #[allow(clippy::too_many_lines)]
-    pub fn with_arguments(&mut self, arguments: Option<&[Option<&[u8]>]>) -> PtxJITResult {
+    pub fn with_arguments(&mut self, arguments: Option<&[Option<&NonNull<[u8]>>]>) -> PtxJITResult {
         // Check if the arguments, cast as byte slices, are the same as the last cached
         // ones
         #[allow(clippy::explicit_deref_methods)]
@@ -16,7 +16,7 @@ impl PtxJITCompiler {
                     .zip(last_arguments.iter())
                     .all(|(a, b)| match (a, b) {
                         (None, None) => false,
-                        (Some(a), Some(b)) => *a != b.deref(),
+                        (Some(a), Some(b)) => (unsafe { a.as_ref() }) != b.deref(),
                         _ => true,
                     })
             },
@@ -30,7 +30,9 @@ impl PtxJITCompiler {
         self.last_arguments = arguments.map(|arguments| {
             arguments
                 .iter()
-                .map(|arg| arg.map(|bytes| bytes.to_owned().into_boxed_slice()))
+                .map(|arg| {
+                    arg.map(|bytes| unsafe { bytes.as_ref() }.to_owned().into_boxed_slice())
+                })
                 .collect::<Vec<Option<Box<[u8]>>>>()
                 .into_boxed_slice()
         });
diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs
new file mode 100644
index 000000000..b4cec19cd
--- /dev/null
+++ b/src/lend/impls/box.rs
@@ -0,0 +1,173 @@
+#[cfg(feature = "host")]
+use std::mem::ManuallyDrop;
+
+use const_type_layout::{TypeGraphLayout, TypeLayout};
+
+#[cfg(feature = "host")]
+use rustacuda::{error::CudaResult, memory::DeviceBox, memory::LockedBox};
+
+use crate::{
+    deps::alloc::boxed::Box,
+    lend::{CudaAsRust, RustToCuda, RustToCudaAsync},
+    safety::PortableBitSemantics,
+    utils::ffi::DeviceOwnedPointer,
+};
+
+#[cfg(any(feature = "host", feature = "device"))]
+use crate::utils::ffi::DeviceAccessible;
+
+#[cfg(feature = "host")]
+use crate::{
+    alloc::{CombinedCudaAlloc, CudaAlloc},
+    host::CudaDropWrapper,
+    utils::adapter::DeviceCopyWithPortableBitSemantics,
+    utils::r#async::Async,
+    utils::r#async::CompletionFnMut,
+    utils::r#async::NoCompletion,
+};
+
+#[doc(hidden)]
+#[repr(transparent)]
+#[derive(TypeLayout)]
+#[allow(clippy::module_name_repetitions)]
+pub struct BoxCudaRepresentation<T: PortableBitSemantics + TypeGraphLayout>(DeviceOwnedPointer<T>);
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Box<T> {
+    #[cfg(all(feature = "host", not(doc)))]
+    type CudaAllocation = CudaDropWrapper<DeviceBox<DeviceCopyWithPortableBitSemantics<T>>>;
+    #[cfg(any(not(feature = "host"), doc))]
+    type CudaAllocation = crate::alloc::SomeCudaAlloc;
+    type CudaRepresentation = BoxCudaRepresentation<T>;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        CombinedCudaAlloc<Self::CudaAllocation, A>,
+    )> {
+        let mut device_box = CudaDropWrapper::from(DeviceBox::new(
+            DeviceCopyWithPortableBitSemantics::from_ref(&**self),
+        )?);
+
+        Ok((
+            DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer(
+                device_box.as_device_ptr().as_raw_mut().cast(),
+            ))),
+            CombinedCudaAlloc::new(device_box, alloc),
+        ))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore<A: CudaAlloc>(
+        &mut self,
+        alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
+    ) -> CudaResult<A> {
+        use rustacuda::memory::CopyDestination;
+
+        let (alloc_front, alloc_tail) = alloc.split();
+
+        alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(&mut **self))?;
+
+        core::mem::drop(alloc_front);
+
+        Ok(alloc_tail)
+    }
+}
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Box<T> {
+    #[cfg(all(feature = "host", not(doc)))]
+    type CudaAllocationAsync = CombinedCudaAlloc<
+        CudaDropWrapper<LockedBox<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>>,
+        CudaDropWrapper<DeviceBox<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>>,
+    >;
+    #[cfg(any(not(feature = "host"), doc))]
+    type CudaAllocationAsync = crate::alloc::SomeCudaAlloc;
+
+    #[cfg(feature = "host")]
+    unsafe fn borrow_async<'stream, A: CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
+        CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+    )> {
+        use rustacuda::memory::AsyncCopyDestination;
+
+        let locked_box = unsafe {
+            let mut uninit = CudaDropWrapper::from(LockedBox::<
+                DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
+            >::uninitialized()?);
+            std::ptr::copy_nonoverlapping(
+                std::ptr::from_ref::<T>(&**self)
+                    .cast::<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>(),
+                uninit.as_mut_ptr(),
+                1,
+            );
+            uninit
+        };
+
+        let mut device_box = CudaDropWrapper::from(DeviceBox::<
+            DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
+        >::uninitialized()?);
+        device_box.async_copy_from(&*locked_box, &stream)?;
+
+        Ok((
+            Async::pending(
+                DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer(
+                    device_box.as_device_ptr().as_raw_mut().cast(),
+                ))),
+                stream,
+                NoCompletion,
+            )?,
+            CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_box, device_box), alloc),
+        ))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>(
+        this: owning_ref::BoxRefMut<'a, O, Self>,
+        alloc: CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+        stream: crate::host::Stream<'stream>,
+    ) -> CudaResult<(
+        Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
+        A,
+    )> {
+        use rustacuda::memory::AsyncCopyDestination;
+
+        let (alloc_front, alloc_tail) = alloc.split();
+        let (mut locked_box, device_box) = alloc_front.split();
+
+        device_box.async_copy_to(&mut *locked_box, &stream)?;
+
+        let r#async = crate::utils::r#async::Async::<_, CompletionFnMut<'a, Self>>::pending(
+            this,
+            stream,
+            Box::new(move |this: &mut Self| {
+                let data: &mut T = &mut *this;
+                std::mem::drop(device_box);
+                // Safety: equivalent to *data = *locked_box since
+                //         LockedBox<ManuallyDrop<T>> doesn't drop T
+                unsafe {
+                    std::ptr::copy_nonoverlapping(locked_box.as_ptr().cast::<T>(), data, 1);
+                }
+                std::mem::drop(locked_box);
+                Ok(())
+            }),
+        )?;
+
+        Ok((r#async, alloc_tail))
+    }
+}
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for BoxCudaRepresentation<T> {
+    type RustRepresentation = Box<T>;
+
+    #[cfg(feature = "device")]
+    unsafe fn as_rust(this: &DeviceAccessible<Self>) -> Self::RustRepresentation {
+        crate::deps::alloc::boxed::Box::from_raw(this.0 .0)
+    }
+}
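+
+// Design note: `borrow_async` stages the value through a page-locked
+// `LockedBox` so that the host-to-device copy can run truly asynchronously;
+// `restore_async` mirrors this and only commits the bytes back into the
+// `Box` inside its completion closure, once the device-to-host copy has
+// finished on the stream.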
diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs
new file mode 100644
index 000000000..5215d2acf
--- /dev/null
+++ b/src/lend/impls/boxed_slice.rs
@@ -0,0 +1,191 @@
+use core::marker::PhantomData;
+#[cfg(feature = "host")]
+use std::mem::ManuallyDrop;
+
+use crate::{deps::alloc::boxed::Box, lend::RustToCudaAsync, utils::ffi::DeviceOwnedPointer};
+
+use const_type_layout::{TypeGraphLayout, TypeLayout};
+
+#[cfg(feature = "host")]
+use rustacuda::{error::CudaResult, memory::DeviceBuffer, memory::LockedBuffer};
+
+use crate::{
+    lend::{CudaAsRust, RustToCuda},
+    safety::PortableBitSemantics,
+};
+
+#[cfg(any(feature = "host", feature = "device"))]
+use crate::utils::ffi::DeviceAccessible;
+
+#[cfg(feature = "host")]
+use crate::{
+    alloc::{CombinedCudaAlloc, CudaAlloc},
+    host::CudaDropWrapper,
+    utils::adapter::DeviceCopyWithPortableBitSemantics,
+    utils::r#async::{Async, CompletionFnMut, NoCompletion},
+};
+
+#[doc(hidden)]
+#[allow(clippy::module_name_repetitions)]
+#[derive(TypeLayout)]
+#[repr(C)]
+pub struct BoxedSliceCudaRepresentation<T: PortableBitSemantics + TypeGraphLayout> {
+    data: DeviceOwnedPointer<T>,
+    len: usize,
+    _marker: PhantomData<T>,
+}
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Box<[T]> {
+    #[cfg(all(feature = "host", not(doc)))]
+    type CudaAllocation =
+        crate::host::CudaDropWrapper<DeviceBuffer<DeviceCopyWithPortableBitSemantics<T>>>;
+    #[cfg(any(not(feature = "host"), doc))]
+    type CudaAllocation = crate::alloc::SomeCudaAlloc;
+    type CudaRepresentation = BoxedSliceCudaRepresentation<T>;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        CombinedCudaAlloc<Self::CudaAllocation, A>,
+    )> {
+        let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice(
+            DeviceCopyWithPortableBitSemantics::from_slice(self),
+        )?);
+
+        Ok((
+            DeviceAccessible::from(BoxedSliceCudaRepresentation {
+                data: DeviceOwnedPointer(device_buffer.as_mut_ptr().cast()),
+                len: device_buffer.len(),
+                _marker: PhantomData::<T>,
+            }),
+            CombinedCudaAlloc::new(device_buffer, alloc),
+        ))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore<A: CudaAlloc>(
+        &mut self,
+        alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
+    ) -> CudaResult<A> {
+        use rustacuda::memory::CopyDestination;
+
+        let (alloc_front, alloc_tail) = alloc.split();
+
+        alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut_slice(self))?;
+
+        core::mem::drop(alloc_front);
+
+        Ok(alloc_tail)
+    }
+}
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Box<[T]> {
+    #[cfg(all(feature = "host", not(doc)))]
+    type CudaAllocationAsync = CombinedCudaAlloc<
+        CudaDropWrapper<LockedBuffer<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>>,
+        CudaDropWrapper<DeviceBuffer<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>>,
+    >;
+    #[cfg(any(not(feature = "host"), doc))]
+    type CudaAllocationAsync = crate::alloc::SomeCudaAlloc;
+
+    #[cfg(feature = "host")]
+    unsafe fn borrow_async<'stream, A: CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
+        CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+    )> {
+        use rustacuda::memory::AsyncCopyDestination;
+
+        let locked_buffer = unsafe {
+            let mut uninit = CudaDropWrapper::from(LockedBuffer::<
+                DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
+            >::uninitialized(self.len())?);
+            std::ptr::copy_nonoverlapping(
+                self.as_ref()
+                    .as_ptr()
+                    .cast::<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>(),
+                uninit.as_mut_ptr(),
+                self.len(),
+            );
+            uninit
+        };
+
+        let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::<
+            DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
+        >::uninitialized(self.len())?);
+        device_buffer.async_copy_from(&*locked_buffer, &stream)?;
+
+        Ok((
+            Async::pending(
+                DeviceAccessible::from(BoxedSliceCudaRepresentation {
+                    data: DeviceOwnedPointer(device_buffer.as_mut_ptr().cast()),
+                    len: device_buffer.len(),
+                    _marker: PhantomData::<T>,
+                }),
+                stream,
+                NoCompletion,
+            )?,
+            CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_buffer, device_buffer), alloc),
+        ))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>(
+        this: owning_ref::BoxRefMut<'a, O, Self>,
+        alloc: CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+        stream: crate::host::Stream<'stream>,
+    ) -> CudaResult<(
+        Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
+        A,
+    )> {
+        use rustacuda::memory::AsyncCopyDestination;
+
+        let (alloc_front, alloc_tail) = alloc.split();
+        let (mut locked_buffer, device_buffer) = alloc_front.split();
+
+        device_buffer.async_copy_to(&mut *locked_buffer, &stream)?;
+
+        let r#async = crate::utils::r#async::Async::<_, CompletionFnMut<'a, Self>>::pending(
+            this,
+            stream,
+            Box::new(move |this: &mut Self| {
+                let data: &mut [T] = &mut *this;
+                std::mem::drop(device_buffer);
+                // Safety: equivalent to data.copy_from_slice(&*locked_buffer)
+                //         since LockedBuffer<ManuallyDrop<T>> doesn't drop T
+                unsafe {
+                    std::ptr::copy_nonoverlapping(
+                        locked_buffer.as_ptr().cast::<T>(),
+                        data.as_mut_ptr(),
+                        data.len(),
+                    );
+                }
+                std::mem::drop(locked_buffer);
+                Ok(())
+            }),
+        )?;
+
+        Ok((r#async, alloc_tail))
+    }
+}
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> CudaAsRust
+    for BoxedSliceCudaRepresentation<T>
+{
+    type RustRepresentation = Box<[T]>;
+
+    #[cfg(feature = "device")]
+    unsafe fn as_rust(this: &DeviceAccessible<Self>) -> Self::RustRepresentation {
+        crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(
+            this.data.0,
+            this.len,
+        ))
+    }
+}
diff --git a/src/lend/impls/final.rs b/src/lend/impls/final.rs
new file mode 100644
index 000000000..5799a77eb
--- /dev/null
+++ b/src/lend/impls/final.rs
@@ -0,0 +1,102 @@
+use r#final::Final;
+
+use crate::{
+    lend::{CudaAsRust, RustToCuda, RustToCudaAsync},
+    utils::ffi::DeviceAccessible,
+};
+
+#[doc(hidden)]
+#[allow(clippy::module_name_repetitions)]
+#[derive(const_type_layout::TypeLayout)]
+#[repr(transparent)]
+pub struct FinalCudaRepresentation<T: CudaAsRust>(DeviceAccessible<T>);
+
+unsafe impl<T: RustToCuda> RustToCuda for Final<T> {
+    type CudaAllocation = T::CudaAllocation;
+    type CudaRepresentation = FinalCudaRepresentation<T::CudaRepresentation>;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: crate::alloc::CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> rustacuda::error::CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        crate::alloc::CombinedCudaAlloc<Self::CudaAllocation, A>,
+    )> {
+        let (cuda_repr, alloc) = (**self).borrow(alloc)?;
+
+        Ok((
+            DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)),
+            alloc,
+        ))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore<A: crate::alloc::CudaAlloc>(
+        &mut self,
+        alloc: crate::alloc::CombinedCudaAlloc<Self::CudaAllocation, A>,
+    ) -> rustacuda::error::CudaResult<A> {
+        let (_alloc_front, alloc_tail) = alloc.split();
+        Ok(alloc_tail)
+    }
+}
+
+unsafe impl<T: RustToCudaAsync> RustToCudaAsync for Final<T> {
+    type CudaAllocationAsync = T::CudaAllocationAsync;
+
+    #[cfg(feature = "host")]
+    unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        crate::utils::r#async::Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
+        crate::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+    )> {
+        let (cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?;
+        let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? };
+
+        let final_cuda_repr = DeviceAccessible::from(FinalCudaRepresentation(cuda_repr));
+
+        let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) {
+            crate::utils::r#async::Async::pending(
+                final_cuda_repr,
+                stream,
+                crate::utils::r#async::NoCompletion,
+            )?
+        } else {
+            crate::utils::r#async::Async::ready(final_cuda_repr, stream)
+        };
+
+        Ok((r#async, alloc))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>(
+        this: owning_ref::BoxRefMut<'a, O, Self>,
+        alloc: crate::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        crate::utils::r#async::Async<
+            'a,
+            'stream,
+            owning_ref::BoxRefMut<'a, O, Self>,
+            crate::utils::r#async::CompletionFnMut<'a, Self>,
+        >,
+        A,
+    )> {
+        let (_alloc_front, alloc_tail) = alloc.split();
+        let r#async = crate::utils::r#async::Async::ready(this, stream);
+        Ok((r#async, alloc_tail))
+    }
+}
+
+unsafe impl<T: CudaAsRust> CudaAsRust for FinalCudaRepresentation<T> {
+    type RustRepresentation = Final<T::RustRepresentation>;
+
+    #[cfg(feature = "device")]
+    unsafe fn as_rust(this: &DeviceAccessible<Self>) -> Self::RustRepresentation {
+        Final::new(CudaAsRust::as_rust(&this.0))
+    }
+}
diff --git a/src/lend/impls/mod.rs b/src/lend/impls/mod.rs
new file mode 100644
index 000000000..e0360671c
--- /dev/null
+++ b/src/lend/impls/mod.rs
@@ -0,0 +1,9 @@
+mod r#box;
+mod boxed_slice;
+#[cfg(feature = "final")]
+mod r#final;
+mod option;
+mod r#ref;
+mod ref_mut;
+mod slice_ref;
+mod slice_ref_mut;
diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs
new file mode 100644
index 000000000..3f1d1e160
--- /dev/null
+++ b/src/lend/impls/option.rs
@@ -0,0 +1,214 @@
+use core::mem::MaybeUninit;
+
+use const_type_layout::{TypeGraphLayout, TypeLayout};
+
+#[cfg(feature = "host")]
+use rustacuda::error::CudaResult;
+
+use crate::{
+    lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaProxy},
+    safety::PortableBitSemantics,
+    utils::{adapter::RustToCudaWithPortableBitCopySemantics, ffi::DeviceAccessible},
+};
+
+#[cfg(feature = "host")]
+use crate::{
+    alloc::{CombinedCudaAlloc, CudaAlloc},
+    utils::r#async::{Async, CompletionFnMut, NoCompletion},
+};
+
+#[doc(hidden)]
+#[allow(clippy::module_name_repetitions)]
+#[derive(TypeLayout)]
+#[repr(C)]
+pub struct OptionCudaRepresentation<T: CudaAsRust> {
+    maybe: MaybeUninit<DeviceAccessible<T>>,
+    present: bool,
+}
+
+unsafe impl<T: RustToCuda> RustToCuda for Option<T> {
+    type CudaAllocation = Option<<T as RustToCuda>::CudaAllocation>;
+    type CudaRepresentation = OptionCudaRepresentation<<T as RustToCuda>::CudaRepresentation>;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        CombinedCudaAlloc<Self::CudaAllocation, A>,
+    )> {
+        let (cuda_repr, alloc) = match self {
+            None => (
+                OptionCudaRepresentation {
+                    maybe: MaybeUninit::uninit(),
+                    present: false,
+                },
+                CombinedCudaAlloc::new(None, alloc),
+            ),
+            Some(value) => {
+                let (cuda_repr, alloc) = value.borrow(alloc)?;
+
+                let (alloc_front, alloc_tail) = alloc.split();
+
+                (
+                    OptionCudaRepresentation {
+                        maybe: MaybeUninit::new(cuda_repr),
+                        present: true,
+                    },
+                    CombinedCudaAlloc::new(Some(alloc_front), alloc_tail),
+                )
+            },
+        };
+
+        Ok((DeviceAccessible::from(cuda_repr), alloc))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore<A: CudaAlloc>(
+        &mut self,
+        alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
+    ) -> CudaResult<A> {
+        let (alloc_front, alloc_tail) = alloc.split();
+
+        match (self, alloc_front) {
+            (Some(value), Some(alloc_front)) => {
+                value.restore(CombinedCudaAlloc::new(alloc_front, alloc_tail))
+            },
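+            // either no value was borrowed or no allocation was made for it,
+            // so there is nothing to copy back and the tail allocator is
+            // passed through unchanged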
Ok(alloc_tail), + } + } +} + +unsafe impl RustToCudaAsync for Option { + type CudaAllocationAsync = Option<::CudaAllocationAsync>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> CudaResult<( + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = match self { + None => ( + Async::ready( + DeviceAccessible::from(OptionCudaRepresentation { + maybe: MaybeUninit::uninit(), + present: false, + }), + stream, + ), + CombinedCudaAlloc::new(None, alloc), + ), + Some(value) => { + let (cuda_repr, alloc) = value.borrow_async(alloc, stream)?; + + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let (alloc_front, alloc_tail) = alloc.split(); + let alloc = CombinedCudaAlloc::new(Some(alloc_front), alloc_tail); + + let option_cuda_repr = DeviceAccessible::from(OptionCudaRepresentation { + maybe: MaybeUninit::new(cuda_repr), + present: true, + }); + + let r#async = if matches!(completion, Some(NoCompletion)) { + Async::pending(option_cuda_repr, stream, NoCompletion)? + } else { + Async::ready(option_cuda_repr, stream) + }; + + (r#async, alloc) + }, + }; + + Ok((cuda_repr, alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + mut this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> CudaResult<( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, + A, + )> { + let (alloc_front, alloc_tail) = alloc.split(); + + if let (Some(_), Some(alloc_front)) = (&mut *this, alloc_front) { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + #[allow(clippy::option_if_let_else)] + let (r#async, alloc_tail) = RustToCudaAsync::restore_async( + // Safety: we have already established value is Some above + this.map_mut(|value| unsafe { value.as_mut().unwrap_unchecked() }), + CombinedCudaAlloc::new(alloc_front, alloc_tail), + stream, + )?; + + let (value, on_completion) = unsafe { r#async.unwrap_unchecked()? 
}; + + std::mem::forget(value); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = Async::<_, CompletionFnMut<'a, Self>>::pending( + this, + stream, + Box::new(|this: &mut Self| { + if let Some(value) = this { + on_completion(value)?; + } + + Ok(()) + }), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } + } else { + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } + } +} + +unsafe impl CudaAsRust for OptionCudaRepresentation { + type RustRepresentation = Option<::RustRepresentation>; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + if this.present { + Some(CudaAsRust::as_rust(this.maybe.assume_init_ref())) + } else { + None + } + } +} + +impl RustToCudaProxy> + for Option> +{ + fn from_ref(val: &Option) -> &Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + unsafe { &*core::ptr::from_ref(val).cast() } + } + + fn from_mut(val: &mut Option) -> &mut Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + unsafe { &mut *core::ptr::from_mut(val).cast() } + } + + fn into(self) -> Option { + self.map(RustToCudaWithPortableBitCopySemantics::into_inner) + } +} diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs new file mode 100644 index 000000000..4233d1423 --- /dev/null +++ b/src/lend/impls/ref.rs @@ -0,0 +1,150 @@ +use core::marker::PhantomData; +#[cfg(feature = "host")] +use std::mem::ManuallyDrop; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBox, memory::LockedBox}; + +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + safety::PortableBitSemantics, + utils::ffi::DeviceConstPointer, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, +}; + +#[doc(hidden)] +#[repr(transparent)] +#[derive(TypeLayout)] +#[allow(clippy::module_name_repetitions)] +pub struct RefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceConstPointer, + _marker: PhantomData<&'a T>, +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::alloc::SomeCudaAlloc; + type CudaRepresentation = RefCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&**self), + )?); + + Ok(( + DeviceAccessible::from(RefCudaRepresentation { + data: DeviceConstPointer(device_box.as_device_ptr().as_raw().cast()), + _marker: PhantomData::<&'a T>, + }), + CombinedCudaAlloc::new(device_box, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} + +unsafe impl<'a, T: 
PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &'a T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let locked_box = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + std::ptr::copy_nonoverlapping( + std::ptr::from_ref::(&**self) + .cast::>>(), + uninit.as_mut_ptr(), + 1, + ); + uninit + }; + + let mut device_box = CudaDropWrapper::from(DeviceBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + device_box.async_copy_from(&*locked_box, &stream)?; + + Ok(( + Async::pending( + DeviceAccessible::from(RefCudaRepresentation { + data: DeviceConstPointer(device_box.as_device_ptr().as_raw().cast()), + _marker: PhantomData::<&T>, + }), + stream, + NoCompletion, + )?, + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_box, device_box), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'b, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> CudaResult<( + Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, + A, + )> { + let (_alloc_front, alloc_tail) = alloc.split(); + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust + for RefCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a T; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + &*this.data.0 + } +} diff --git a/src/lend/impls/ref_mut.rs b/src/lend/impls/ref_mut.rs new file mode 100644 index 000000000..cab1ea8df --- /dev/null +++ b/src/lend/impls/ref_mut.rs @@ -0,0 +1,92 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBox}; + +use crate::{ + lend::{CudaAsRust, RustToCuda}, + safety::PortableBitSemantics, + utils::ffi::DeviceMutPointer, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, +}; + +#[doc(hidden)] +#[repr(transparent)] +#[derive(TypeLayout)] +#[allow(clippy::module_name_repetitions)] +pub struct RefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceMutPointer, + _marker: PhantomData<&'a mut T>, +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::alloc::SomeCudaAlloc; + type CudaRepresentation = RefMutCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + 
CombinedCudaAlloc, + )> { + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&**self), + )?); + + Ok(( + DeviceAccessible::from(RefMutCudaRepresentation { + data: DeviceMutPointer(device_box.as_device_ptr().as_raw_mut().cast()), + _marker: PhantomData::<&'a mut T>, + }), + CombinedCudaAlloc::new(device_box, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + use rustacuda::memory::CopyDestination; + + let (alloc_front, alloc_tail) = alloc.split(); + + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(&mut **self))?; + + core::mem::drop(alloc_front); + + Ok(alloc_tail) + } +} + +// &mut T cannot implement RustToCudaAsync since the reference, potentially +// with garbage data, would remain accessible after failing a mutable restore + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust + for RefMutCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a mut T; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let data: *mut T = this.data.0; + &mut *data + } +} diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs new file mode 100644 index 000000000..bd74dea64 --- /dev/null +++ b/src/lend/impls/slice_ref.rs @@ -0,0 +1,155 @@ +use core::marker::PhantomData; +#[cfg(feature = "host")] +use std::mem::ManuallyDrop; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBuffer, memory::LockedBuffer}; + +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + safety::PortableBitSemantics, + utils::ffi::DeviceConstPointer, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, +}; + +#[doc(hidden)] +#[allow(clippy::module_name_repetitions)] +#[derive(TypeLayout)] +#[repr(C)] +pub struct SliceRefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceConstPointer, + len: usize, + _marker: PhantomData<&'a [T]>, +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T] { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = + crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::alloc::SomeCudaAlloc; + type CudaRepresentation = SliceRefCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( + DeviceCopyWithPortableBitSemantics::from_slice(self), + )?); + + Ok(( + DeviceAccessible::from(SliceRefCudaRepresentation { + data: DeviceConstPointer(device_buffer.as_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::<&'a [T]>, + }), + CombinedCudaAlloc::new(device_buffer, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &'a [T] { + 
#[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let locked_buffer = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + std::ptr::copy_nonoverlapping( + self.as_ref() + .as_ptr() + .cast::>>(), + uninit.as_mut_ptr(), + self.len(), + ); + uninit + }; + + let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + device_buffer.async_copy_from(&*locked_buffer, &stream)?; + + Ok(( + Async::pending( + DeviceAccessible::from(SliceRefCudaRepresentation { + data: DeviceConstPointer(device_buffer.as_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::<&'a [T]>, + }), + stream, + NoCompletion, + )?, + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_buffer, device_buffer), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'b, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> CudaResult<( + Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, + A, + )> { + let (_alloc_front, alloc_tail) = alloc.split(); + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust + for SliceRefCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a [T]; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + core::slice::from_raw_parts(this.data.0, this.len) + } +} diff --git a/src/lend/impls/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs new file mode 100644 index 000000000..5c766dd24 --- /dev/null +++ b/src/lend/impls/slice_ref_mut.rs @@ -0,0 +1,94 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBuffer}; + +use crate::{ + lend::{CudaAsRust, RustToCuda}, + safety::PortableBitSemantics, + utils::ffi::DeviceMutPointer, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, +}; + +#[doc(hidden)] +#[allow(clippy::module_name_repetitions)] +#[derive(TypeLayout)] +#[repr(C)] +pub struct SliceRefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceMutPointer, + len: usize, + _marker: PhantomData<&'a mut [T]>, +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut [T] { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = + crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::alloc::SomeCudaAlloc; + type CudaRepresentation = SliceRefMutCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + 
#[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( + DeviceCopyWithPortableBitSemantics::from_slice(self), + )?); + + Ok(( + DeviceAccessible::from(SliceRefMutCudaRepresentation { + data: DeviceMutPointer(device_buffer.as_mut_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::<&'a mut [T]>, + }), + CombinedCudaAlloc::new(device_buffer, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + use rustacuda::memory::CopyDestination; + + let (alloc_front, alloc_tail) = alloc.split(); + + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut_slice(self))?; + + core::mem::drop(alloc_front); + + Ok(alloc_tail) + } +} + +// &mut [T] cannot implement RustToCudaAsync since the slice, potentially with +// garbage data, would remain accessible after failing a mutable restore + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust + for SliceRefMutCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a mut [T]; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + core::slice::from_raw_parts_mut(this.data.0, this.len) + } +} diff --git a/src/lend/mod.rs b/src/lend/mod.rs new file mode 100644 index 000000000..e05237768 --- /dev/null +++ b/src/lend/mod.rs @@ -0,0 +1,609 @@ +use const_type_layout::TypeGraphLayout; +#[cfg(feature = "host")] +use rustacuda::error::CudaError; + +#[cfg(feature = "derive")] +#[allow(clippy::module_name_repetitions)] +pub use rust_cuda_derive::LendRustToCuda; + +#[cfg(any(feature = "host", feature = "device", doc))] +use crate::safety::{SafeMutableAliasing, StackOnly}; +#[cfg(feature = "device")] +use crate::utils::ffi::{DeviceConstRef, DeviceMutRef, DeviceOwnedRef}; +use crate::{alloc::CudaAlloc, safety::PortableBitSemantics}; +#[cfg(any(feature = "host", feature = "device"))] +use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, NoCudaAlloc}, + host::{HostAndDeviceConstRef, HostAndDeviceMutRef, HostAndDeviceOwned}, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, +}; + +mod impls; + +/// # Safety +/// +/// This is an internal trait and should ONLY be derived automatically using +/// `#[derive(LendRustToCuda)]` +pub unsafe trait RustToCuda { + type CudaAllocation: CudaAlloc; + type CudaRepresentation: CudaAsRust; + + #[doc(hidden)] + #[cfg(feature = "host")] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + /// The returned [`Self::CudaRepresentation`] must NEVER be accessed on the + /// CPU as it contains a GPU-resident copy of `self`. 
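+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch (not a doctest) of how the safe
+    /// [`LendToCuda::lend_to_cuda_mut`] wrapper below drives `borrow` and
+    /// `restore` internally; `value` stands for any hypothetical
+    /// [`RustToCuda`] value:
+    ///
+    /// ```rust,ignore
+    /// let (cuda_repr, alloc) = unsafe { value.borrow(NoCudaAlloc) }?;
+    /// // ... launch a kernel on a device reference to `cuda_repr` ...
+    /// let _: NoCudaAlloc = unsafe { value.restore(alloc) }?;
+    /// ```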
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> rustacuda::error::CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        CombinedCudaAlloc<Self::CudaAllocation, A>,
+    )>;
+
+    #[doc(hidden)]
+    #[cfg(feature = "host")]
+    /// # Errors
+    ///
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    ///
+    /// # Safety
+    ///
+    /// This is an internal function and should NEVER be called manually.
+    #[allow(clippy::type_complexity)]
+    unsafe fn restore<A: CudaAlloc>(
+        &mut self,
+        alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
+    ) -> rustacuda::error::CudaResult<A>;
+}
+
+/// # Safety
+///
+/// This is an internal trait and should ONLY be derived automatically using
+/// `#[derive(LendRustToCuda)]`
+pub unsafe trait RustToCudaAsync: RustToCuda {
+    type CudaAllocationAsync: CudaAlloc;
+
+    #[doc(hidden)]
+    #[cfg(feature = "host")]
+    /// # Errors
+    ///
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    ///
+    /// # Safety
+    ///
+    /// This is an internal function and should NEVER be called manually.
+    ///
+    /// The returned
+    /// [`Self::CudaRepresentation`](RustToCuda::CudaRepresentation) must NEVER
+    /// be accessed on the CPU as it contains a GPU-resident copy of
+    /// `self`.
+    ///
+    /// Since this method may perform asynchronous computation but returns its
+    /// result immediately, this result must only be used to construct compound
+    /// asynchronous computations before it has been synchronized on.
+    ///
+    /// Similarly, `&self` should remain borrowed until synchronisation has
+    /// been performed.
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow_async<'stream, A: CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
+        CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+    )>;
+
+    #[doc(hidden)]
+    #[cfg(feature = "host")]
+    /// # Errors
+    ///
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    ///
+    /// # Safety
+    ///
+    /// This is an internal function and should NEVER be called manually.
+    ///
+    /// Since this method may perform asynchronous computation but returns
+    /// immediately, `&mut self` must not be used until it has been
+    /// synchronized on.
+    ///
+    /// Therefore, `&mut self` should remain mutably borrowed until
+    /// synchronisation has been performed.
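+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch (not a doctest), mirroring
+    /// [`LendToCudaAsync::lend_to_cuda_mut_async`]; `this` is an assumed
+    /// `owning_ref::BoxRefMut` over a [`RustToCudaAsync`] value:
+    ///
+    /// ```rust,ignore
+    /// let (r#async, _): (_, NoCudaAlloc) =
+    ///     unsafe { Self::restore_async(this, alloc, stream) }?;
+    /// // `r#async` must be synchronised on before the mutable borrow
+    /// // that it wraps may be accessed again
+    /// ```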
+ #[allow(clippy::type_complexity)] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, + A, + )>; +} + +/// # Safety +/// +/// This is an internal trait and should NEVER be implemented manually +pub unsafe trait CudaAsRust: PortableBitSemantics + TypeGraphLayout { + type RustRepresentation: RustToCuda; + + #[doc(hidden)] + #[cfg(feature = "device")] + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation; +} + +pub trait RustToCudaProxy: RustToCuda { + fn from_ref(val: &T) -> &Self; + fn from_mut(val: &mut T) -> &mut Self; + + fn into(self) -> T; +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub trait LendToCuda: RustToCuda { + /// Lends an immutable borrow of `&self` to CUDA: + /// - code in the CUDA kernel can only access `&self` through the + /// [`DeviceConstRef`] inside the closure + /// - after the closure, `&self` will not have changed + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn lend_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceConstRef::CudaRepresentation>>, + ) -> Result, + >( + &self, + inner: F, + ) -> Result + where + Self: Sync; + + /// Lends a mutable borrow of `&mut self` to CUDA iff `Self` is + /// [`SafeMutableAliasing`]: + /// - code in the CUDA kernel can only access `&mut self` through the + /// `DeviceMutRef` inside the closure + /// - after the closure, `&mut self` will reflect the changes from the + /// kernel execution + /// + /// # Errors + /// + /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + fn lend_to_cuda_mut< + O, + E: From, + F: FnOnce( + HostAndDeviceMutRef::CudaRepresentation>>, + ) -> Result, + >( + &mut self, + inner: F, + ) -> Result + where + Self: Sync + SafeMutableAliasing; + + /// Moves `self` to CUDA iff `Self` is [`StackOnly`]. 
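+    ///
+    /// For illustration (not a doctest), assuming a hypothetical stack-only
+    /// `Data` type that implements [`RustToCuda`]:
+    ///
+    /// ```rust,ignore
+    /// data.move_to_cuda(|device_owned| {
+    ///     // launch a kernel that consumes `device_owned` here
+    ///     Ok(())
+    /// })?;
+    /// ```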
+ /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn move_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceOwned::CudaRepresentation>>, + ) -> Result, + >( + self, + inner: F, + ) -> Result + where + Self: Send + RustToCuda; +} + +#[cfg(feature = "host")] +impl LendToCuda for T { + fn lend_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceConstRef::CudaRepresentation>>, + ) -> Result, + >( + &self, + inner: F, + ) -> Result + where + Self: Sync, + { + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); + + core::mem::drop(cuda_repr); + core::mem::drop(alloc); + + result + } + + fn lend_to_cuda_mut< + O, + E: From, + F: FnOnce( + HostAndDeviceMutRef::CudaRepresentation>>, + ) -> Result, + >( + &mut self, + inner: F, + ) -> Result + where + Self: Sync + SafeMutableAliasing, + { + let (mut cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, inner); + + core::mem::drop(cuda_repr); + + let _: NoCudaAlloc = unsafe { self.restore(alloc) }?; + + result + } + + fn move_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceOwned::CudaRepresentation>>, + ) -> Result, + >( + self, + inner: F, + ) -> Result + where + Self: Send + RustToCuda, + { + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceOwned::with_new(cuda_repr, inner); + + core::mem::drop(alloc); + + result + } +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub trait LendToCudaAsync: RustToCudaAsync { + /// Lends an immutable copy of `&self` to CUDA: + /// - code in the CUDA kernel can only access `&self` through the + /// [`DeviceConstRef`] inside the closure + /// - after the closure, `&self` will not have changed, i.e. interior + /// mutability is not handled by this method + /// + /// Since the [`HostAndDeviceConstRef`] is wrapped in an [`Async`] with + /// [`NoCompletion`], this [`Async`] can be safely dropped or forgotten + /// without changing any behaviour. Therefore, this [`Async`] does *not* + /// need to be returned from the `inner` closure. + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn lend_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + '_, + 'stream, + HostAndDeviceConstRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + >( + &self, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result + where + Self: Sync; + + #[allow(clippy::type_complexity)] + /// Lends a mutable borrow of `&mut self` to CUDA iff `Self` is + /// [`SafeMutableAliasing`]: + /// - code in the CUDA kernel can only access `&mut self` through the + /// `DeviceMutRef` inside the closure + /// - after the closure, `&mut self` will reflect the changes from the + /// kernel execution + /// + /// # Errors + /// + /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + fn lend_to_cuda_mut_async< + 'a, + 'stream, + O, + E: From, + F: for<'b> FnOnce( + Async< + 'b, + 'stream, + HostAndDeviceMutRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + T: 'a, + >( + this: owning_ref::BoxRefMut<'a, T, Self>, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result< + ( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, T, Self>, CompletionFnMut<'a, Self>>, + O, + ), + E, + > + where + Self: Sync + SafeMutableAliasing; + + /// Moves `self` to CUDA iff `self` is [`StackOnly`]. 
+ /// + /// Since the [`HostAndDeviceOwned`] is wrapped in an [`Async`] with + /// [`NoCompletion`], this [`Async`] can be safely dropped or forgotten + /// without changing any behaviour. Therefore, this [`Async`] does *not* + /// need to be returned from the `inner` closure. + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn move_to_cuda_async< + 'stream, + O, + E: From, + F: for<'a> FnOnce( + Async< + 'a, + 'stream, + HostAndDeviceOwned::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + >( + self, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result + where + Self: Send + RustToCuda; +} + +#[cfg(feature = "host")] +impl LendToCudaAsync for T { + fn lend_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + '_, + 'stream, + HostAndDeviceConstRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + >( + &self, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result + where + Self: Sync, + { + let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { + let r#async = if matches!(completion, Some(NoCompletion)) { + Async::pending(const_ref, stream, NoCompletion)? + } else { + Async::ready(const_ref, stream) + }; + + inner(r#async) + }); + + core::mem::drop(cuda_repr); + core::mem::drop(alloc); + + result + } + + fn lend_to_cuda_mut_async< + 'a, + 'stream, + O, + E: From, + F: for<'b> FnOnce( + Async< + 'b, + 'stream, + HostAndDeviceMutRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + S: 'a, + >( + this: owning_ref::BoxRefMut<'a, S, Self>, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result< + ( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, S, Self>, CompletionFnMut<'a, Self>>, + O, + ), + E, + > + where + Self: Sync + SafeMutableAliasing, + { + let (cuda_repr, alloc) = unsafe { this.borrow_async(NoCudaAlloc, stream) }?; + + let (mut cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, |mut_ref| { + let r#async = if matches!(completion, Some(NoCompletion)) { + Async::pending(mut_ref, stream, NoCompletion)? + } else { + Async::ready(mut_ref, stream) + }; + + inner(r#async) + }); + + core::mem::drop(cuda_repr); + + let (r#async, _): (_, NoCudaAlloc) = unsafe { Self::restore_async(this, alloc, stream) }?; + + result.map(|ok| (r#async, ok)) + } + + fn move_to_cuda_async< + 'stream, + O, + E: From, + F: for<'a> FnOnce( + Async< + 'a, + 'stream, + HostAndDeviceOwned::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + >( + self, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result + where + Self: Send + RustToCuda, + { + let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { + if matches!(completion, Some(NoCompletion)) { + inner(Async::pending(owned_ref, stream, NoCompletion)?) + } else { + inner(Async::ready(owned_ref, stream)) + } + }); + + core::mem::drop(alloc); + + result + } +} + +#[cfg(feature = "device")] +pub trait BorrowFromRust: RustToCuda { + /// # Safety + /// + /// This function is only safe to call iff `cuda_repr` is the + /// [`DeviceConstRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda`]. 
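+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch (not a doctest) of a device-side kernel body;
+    /// the shim that receives `cuda_repr` from the host is assumed to be
+    /// generated by a kernel macro:
+    ///
+    /// ```rust,ignore
+    /// unsafe fn kernel<T: BorrowFromRust>(
+    ///     cuda_repr: DeviceConstRef<DeviceAccessible<<T as RustToCuda>::CudaRepresentation>>,
+    /// ) {
+    ///     T::with_borrow_from_rust(cuda_repr, |value: &T| {
+    ///         // use the reconstructed `&T` here; it must not escape the closure
+    ///     });
+    /// }
+    /// ```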
+    unsafe fn with_borrow_from_rust<O, F: FnOnce(&Self) -> O>(
+        cuda_repr: DeviceConstRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O;
+
+    /// # Safety
+    ///
+    /// This function is only safe to call iff `cuda_repr_mut` is the
+    /// [`DeviceMutRef`] borrowed on the CPU using the corresponding
+    /// [`LendToCuda::lend_to_cuda_mut`].
+    unsafe fn with_borrow_from_rust_mut<O, F: FnOnce(&mut Self) -> O>(
+        cuda_repr_mut: DeviceMutRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O
+    where
+        Self: SafeMutableAliasing;
+
+    /// # Safety
+    ///
+    /// This function is only safe to call iff `cuda_repr` is the
+    /// [`DeviceOwnedRef`] borrowed on the CPU using the corresponding
+    /// [`LendToCuda::move_to_cuda`].
+    unsafe fn with_moved_from_rust<O, F: FnOnce(Self) -> O>(
+        cuda_repr: DeviceOwnedRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O
+    where
+        Self: Sized + RustToCuda<CudaAllocation: EmptyCudaAlloc>;
+}
+
+#[cfg(feature = "device")]
+impl<T: RustToCuda> BorrowFromRust for T {
+    #[inline]
+    unsafe fn with_borrow_from_rust<O, F: FnOnce(&Self) -> O>(
+        cuda_repr: DeviceConstRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O {
+        // `rust_repr` must never be dropped as we do NOT own any of the
+        // heap memory it might reference
+        let rust_repr = core::mem::ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr.as_ref()));
+
+        inner(&rust_repr)
+    }
+
+    #[inline]
+    unsafe fn with_borrow_from_rust_mut<O, F: FnOnce(&mut Self) -> O>(
+        mut cuda_repr_mut: DeviceMutRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O
+    where
+        Self: SafeMutableAliasing,
+    {
+        // `rust_repr` must never be dropped as we do NOT own any of the
+        // heap memory it might reference
+        let mut rust_repr_mut =
+            core::mem::ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut()));
+
+        inner(&mut rust_repr_mut)
+    }
+
+    #[inline]
+    unsafe fn with_moved_from_rust<O, F: FnOnce(Self) -> O>(
+        mut cuda_repr: DeviceOwnedRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O
+    where
+        Self: RustToCuda<CudaAllocation: EmptyCudaAlloc>,
+    {
+        inner(CudaAsRust::as_rust(cuda_repr.as_mut()))
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 3c176e4a2..35e11ed1b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,51 +1,84 @@
-#![deny(clippy::pedantic)]
-#![allow(clippy::useless_attribute)]
-#![cfg_attr(not(feature = "host"), no_std)]
+//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License
+//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod
+//! Ready-to-Code]][gitpod]
+//!
+//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main
+//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain
+//!
+//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange
+//! [repo]: https://github.com/juntyr/rust-cuda
+//!
+//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue
+//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda/
+//!
+//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield
+//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield
+//!
+//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx
+//! [codecov]: https://codecov.io/gh/juntyr/rust-cuda
+//!
+//! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod
+//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda
+
+#![deny(clippy::complexity)]
+#![deny(clippy::correctness)]
+#![warn(clippy::nursery)]
+#![warn(clippy::pedantic)]
+#![deny(clippy::perf)]
+#![deny(clippy::style)]
+#![deny(clippy::suspicious)]
+// #![warn(clippy::multiple_unsafe_ops_per_block)] // FIXME
+// #![warn(clippy::undocumented_unsafe_blocks)] // FIXME
+#![deny(unused_unsafe)]
+// #![warn(missing_docs)] // FIXME
+#![cfg_attr(all(any(feature = "device", target_os = "cuda"), not(doc)), no_std)]
 #![feature(associated_type_bounds)]
 #![feature(auto_traits)]
 #![feature(negative_impls)]
-#![cfg_attr(
-    any(all(not(feature = "host"), target_os = "cuda"), doc),
-    feature(stdsimd)
-)]
-#![cfg_attr(any(feature = "alloc", doc), feature(allocator_api))]
+#![cfg_attr(feature = "device", feature(stdsimd))]
+#![cfg_attr(feature = "device", feature(asm_experimental_arch))]
+#![cfg_attr(feature = "device", feature(asm_const))]
 #![feature(doc_auto_cfg)]
 #![feature(doc_cfg)]
 #![feature(marker_trait_attr)]
 #![feature(const_type_name)]
-#![feature(offset_of)]
 #![feature(adt_const_params)]
+#![feature(impl_trait_in_assoc_type)]
+#![feature(ptr_metadata)]
+#![feature(decl_macro)]
+#![feature(panic_info_message)]
+#![feature(let_chains)]
+#![feature(inline_const)]
+#![feature(sync_unsafe_cell)]
+#![feature(never_type)]
+#![feature(layout_for_ptr)]
+#![feature(cfg_version)]
+#![cfg_attr(feature = "device", feature(slice_ptr_get))]
 #![allow(incomplete_features)]
 #![feature(generic_const_exprs)]
+#![allow(internal_features)]
+#![feature(core_intrinsics)]
+#![feature(const_intrinsic_compare_bytes)]
 #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")]
 
-#[doc(hidden)]
-pub extern crate alloc;
-
-pub extern crate rust_cuda_ptx_jit as ptx_jit;
-pub extern crate rustacuda_core;
+#[cfg(all(feature = "host", feature = "device", not(doc)))]
+core::compile_error!("cannot enable the `host` and `device` features at the same time");
 
-#[doc(hidden)]
-#[macro_use]
-pub extern crate const_type_layout;
+#[cfg(all(feature = "host", target_os = "cuda", not(doc)))]
+core::compile_error!("cannot enable the `host` feature on a target with `target_os=\"cuda\"`");
 
-#[cfg(feature = "derive")]
-#[doc(cfg(feature = "derive"))]
-pub extern crate rustacuda_derive;
+#[cfg(all(feature = "device", not(target_os = "cuda"), not(doc)))]
+core::compile_error!("cannot enable the `device` feature on a target without `target_os=\"cuda\"`");
 
-pub mod common;
+pub mod alloc;
+pub mod deps;
+pub mod kernel;
+pub mod lend;
+pub mod safety;
+pub mod utils;
 
 #[cfg(feature = "host")]
-#[doc(cfg(feature = "host"))]
 pub mod host;
 
-#[cfg(feature = "host")]
-#[doc(cfg(feature = "host"))]
-pub extern crate rustacuda;
-
-#[cfg(any(all(not(feature = "host"), target_os = "cuda"), doc))]
-#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))]
+#[cfg(feature = "device")]
 pub mod device;
-
-pub mod utils;
-
-pub mod safety;
diff --git a/src/safety/aliasing.rs b/src/safety/aliasing.rs
new file mode 100644
index 000000000..7add5775c
--- /dev/null
+++ b/src/safety/aliasing.rs
@@ -0,0 +1,89 @@
+#[allow(clippy::module_name_repetitions)]
+/// Types for which mutable references can be safely shared with each CUDA
+/// thread without breaking Rust's no-mutable-aliasing memory safety
+/// guarantees.
+///
+/// # Safety
+///
+/// A type may only implement [`SafeMutableAliasing`] if and only if all of
+/// the safety conditions below hold:
+///
+/// * Calling [`std::mem::replace`] on a mutable reference of the type does
+/// *not* return a value which owns memory which it must deallocate on drop.
+/// For instance, `&mut [T]` satisfies this criterion, but `Box<T>` does not.
+///
+/// * No safe aliasing mutable access is provided to the same memory locations
+/// across multiple CUDA threads. You can use the
+/// [`SplitSliceOverCudaThreadsConstStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride)
+/// and
+/// [`SplitSliceOverCudaThreadsDynamicStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride)
+/// wrapper types to ensure that each thread is only given access to its
+/// own sub-slice partition so that aliasing is avoided.
+///
+/// * A mutable reference of the type must not provide mutable access to any
+/// shallow inner state (in contrast to deep state, i.e. values behind
+/// references) that the API user expects to be mutably shared between all
+/// threads, as that would violate the second condition. For instance,
+/// `Vec<T>` violates this third condition: code with access to `&mut Vec<T>`
+/// can also mutate the length of the vector. This length is shallow state
+/// that the caller of a function sharing this vector expects to be
+/// propagated back (it is also tied to the deep contents of the vector via a
+/// safety invariant), so the caller might assume that mutations of the
+/// length are either shared across threads or shared back with the host
+/// after the kernel has completed, neither of which is possible. In
+/// contrast, `&mut [T]` satisfies this condition, as it is well known that
+/// modifying the shallow length of a slice (by assigning a sub-slice) inside
+/// a function does not alter the length of the slice that the caller passed
+/// in.
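+///
+/// # Example
+///
+/// An illustrative sketch (not a doctest); the `new` constructor shown for
+/// the wrapper is assumed:
+///
+/// ```rust,ignore
+/// use rust_cuda::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride;
+///
+/// let mut data = vec![0.0_f32; 1024];
+/// // Each CUDA thread gets a disjoint sub-slice of length 64, so no two
+/// // threads can mutably alias the same elements; the wrapper thus
+/// // implements `SafeMutableAliasing` and can be lent via
+/// // `lend_to_cuda_mut`.
+/// let split = SplitSliceOverCudaThreadsDynamicStride::new(&mut data[..], 64);
+/// ```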
+pub unsafe trait SafeMutableAliasing {} + +unsafe impl< + 'a, + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const STRIDE: usize, + > SafeMutableAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride<&'a mut [T], STRIDE> +{ +} + +unsafe impl< + 'a, + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + > SafeMutableAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride<&'a mut [T]> +{ +} + +#[cfg(any(feature = "host", feature = "device"))] +unsafe impl< + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const M2D: bool, + const M2H: bool, + const STRIDE: usize, + > SafeMutableAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride< + crate::utils::exchange::buffer::CudaExchangeBuffer, + STRIDE, + > +{ +} + +#[cfg(any(feature = "host", feature = "device"))] +unsafe impl< + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const M2D: bool, + const M2H: bool, + > SafeMutableAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride< + crate::utils::exchange::buffer::CudaExchangeBuffer, + > +{ +} diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs deleted file mode 100644 index c5de73430..000000000 --- a/src/safety/device_copy.rs +++ /dev/null @@ -1,22 +0,0 @@ -#[allow(clippy::module_name_repetitions)] -pub trait SafeDeviceCopy: sealed::SafeDeviceCopy {} - -impl SafeDeviceCopy for T {} - -mod sealed { - #[marker] - pub trait SafeDeviceCopy {} - - impl SafeDeviceCopy for T {} - #[cfg(any(feature = "alloc", doc))] - impl SafeDeviceCopy for T {} - - impl SafeDeviceCopy - for crate::common::DeviceAccessible - { - } - impl SafeDeviceCopy - for crate::utils::device_copy::SafeDeviceCopyWrapper - { - } -} diff --git a/src/safety/kernel_signature.rs b/src/safety/kernel_signature.rs deleted file mode 100644 index 4a82ec1d0..000000000 --- a/src/safety/kernel_signature.rs +++ /dev/null @@ -1,29 +0,0 @@ -#[derive(PartialEq, Eq, core::marker::ConstParamTy)] -pub enum CpuAndGpuKernelSignatures { - Match, - Mismatch, -} - -pub struct Assert; - -#[must_use] -pub const fn check(haystack: &[u8], needle: &[u8]) -> CpuAndGpuKernelSignatures { - let mut i = 0; - let mut j = 0; - - while i < needle.len() { - if j >= haystack.len() { - return CpuAndGpuKernelSignatures::Mismatch; - } - - if needle[i] == haystack[j] { - i += 1; - j += 1; - } else { - j = j + 1 - i; - i = 0; - } - } - - CpuAndGpuKernelSignatures::Match -} diff --git a/src/safety/mod.rs b/src/safety/mod.rs index cf7a8f718..7e078e34e 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -1,19 +1,13 @@ +mod aliasing; mod arch; -mod device_copy; -mod no_aliasing; -mod register_fit; +mod portable; mod stack_only; -#[cfg(any(feature = "alloc", doc))] -mod unified_heap; #[doc(hidden)] -pub mod kernel_signature; +pub mod ptx_entry_point; #[doc(hidden)] -pub mod type_layout; +pub mod ptx_kernel_signature; -pub use device_copy::SafeDeviceCopy; -pub use no_aliasing::NoAliasing; -pub use register_fit::FitsIntoDeviceRegister; +pub use aliasing::SafeMutableAliasing; +pub use portable::PortableBitSemantics; pub use stack_only::StackOnly; -#[cfg(any(feature = "alloc", doc))] -pub use unified_heap::UnifiedHeapOnly; diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs deleted file mode 100644 index 
22488efb8..000000000 --- a/src/safety/no_aliasing.rs +++ /dev/null @@ -1,25 +0,0 @@ -#[allow(clippy::module_name_repetitions)] -pub trait NoAliasing: private::NoAliasing {} -impl NoAliasing for T {} - -mod private { - pub auto trait NoAliasing {} - - impl !NoAliasing for *const T {} - impl !NoAliasing for *mut T {} - impl !NoAliasing for &mut T {} - - impl NoAliasing for core::marker::PhantomData {} - - impl NoAliasing for r#final::Final {} - impl NoAliasing - for crate::utils::aliasing::FinalCudaRepresentation - { - } - - impl NoAliasing - for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride - { - } - impl NoAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride {} -} diff --git a/src/safety/portable.rs b/src/safety/portable.rs new file mode 100644 index 000000000..5b438e2f7 --- /dev/null +++ b/src/safety/portable.rs @@ -0,0 +1,63 @@ +macro_rules! portable_bit_semantics_docs { + ($item:item) => { + /// Types whose in-memory bit representation on the CPU host is safe to copy + /// to and read back on the GPU device while maintaining the same semantics, + /// iff the type layout on the CPU matches the type layout on the GPU. + /// + /// For a type to implement [`PortableBitSemantics`], it + /// + /// * should have the same memory layout on both the CPU and GPU, and + /// + /// * must not contain any references to data that are exposed as safely + /// accessible on both ends but actually inaccessible on one. + /// + /// For instance, a reference `&u8` to host memory has the same well-defined + /// layout on both CPU and GPU (if their pointer sizes and alignments + /// match), but it is not portable since the host memory is generally + /// not accessible from the GPU. + /// + /// This trait is automatically implemented when the compiler determines + /// it's appropriate. + /// + /// Note that this trait is *sealed*, i.e. you cannot implement it on your + /// own custom types. + /// + /// Trait bounds usually combine [`PortableBitSemantics`] with + /// [`TypeGraphLayout`](const_type_layout::TypeGraphLayout) to check that + /// the type layout is indeed the same on both the host CPU and the GPU + /// device. + /// + /// Types that implement [`StackOnly`](crate::safety::StackOnly) and + /// [`TypeGraphLayout`](const_type_layout::TypeGraphLayout) satisfy both + /// of the above criteria and thus also implement [`PortableBitSemantics`]. + $item + }; +} + +#[cfg(not(doc))] +portable_bit_semantics_docs! { + #[allow(clippy::module_name_repetitions)] + pub trait PortableBitSemantics: sealed::PortableBitSemantics {} +} +#[cfg(doc)] +portable_bit_semantics_docs! 
{ + pub use sealed::PortableBitSemantics; +} + +#[cfg(not(doc))] +impl PortableBitSemantics for T {} + +mod sealed { + pub auto trait PortableBitSemantics {} + + impl !PortableBitSemantics for &T {} + impl !PortableBitSemantics for &mut T {} + impl !PortableBitSemantics for *const T {} + impl !PortableBitSemantics for *mut T {} + + impl PortableBitSemantics for core::marker::PhantomData {} + + impl PortableBitSemantics for crate::utils::ffi::DeviceConstPointer {} + impl PortableBitSemantics for crate::utils::ffi::DeviceMutPointer {} + impl PortableBitSemantics for crate::utils::ffi::DeviceOwnedPointer {} +} diff --git a/src/safety/ptx_entry_point.rs b/src/safety/ptx_entry_point.rs new file mode 100644 index 000000000..ab06a13d9 --- /dev/null +++ b/src/safety/ptx_entry_point.rs @@ -0,0 +1,62 @@ +#[derive(PartialEq, Eq, core::marker::ConstParamTy)] +pub enum HostAndDeviceKernelEntryPoint { + Match, + Mismatch, +} + +pub struct Assert; + +#[must_use] +pub const fn check(ptx: &[u8], entry_point: &[u8]) -> HostAndDeviceKernelEntryPoint { + const PTX_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; + const KERNEL_TYPE: &[u8] = b".visible .entry "; + + // Short-circuit to avoid extra errors when PTX compilation fails + if ptx.len() == PTX_ERROR_MESSAGE.len() && starts_with(ptx, PTX_ERROR_MESSAGE, 0) { + return HostAndDeviceKernelEntryPoint::Match; + } + + let mut j = 0; + + while j < ptx.len() { + let Some(j2) = find(ptx, KERNEL_TYPE, j) else { + return HostAndDeviceKernelEntryPoint::Mismatch; + }; + + if starts_with(ptx, entry_point, j2) { + return HostAndDeviceKernelEntryPoint::Match; + } + + j += 1; + } + + HostAndDeviceKernelEntryPoint::Mismatch +} + +const fn find(haystack: &[u8], needle: &[u8], from: usize) -> Option { + let mut i = 0; + let mut j = from; + + while i < needle.len() { + if j >= haystack.len() { + return None; + } + + if needle[i] == haystack[j] { + i += 1; + j += 1; + } else { + j = j + 1 - i; + i = 0; + } + } + + Some(j) +} + +const fn starts_with(haystack: &[u8], needle: &[u8], from: usize) -> bool { + let haystack_len = haystack.len() - from; + let check_len = if needle.len() < haystack_len { needle.len() } else { haystack_len }; + + unsafe { core::intrinsics::compare_bytes(haystack.as_ptr().add(from), needle.as_ptr(), check_len) == 0 } +} diff --git a/src/safety/ptx_kernel_signature.rs b/src/safety/ptx_kernel_signature.rs new file mode 100644 index 000000000..a8b298691 --- /dev/null +++ b/src/safety/ptx_kernel_signature.rs @@ -0,0 +1,41 @@ +use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; + +#[allow(clippy::module_name_repetitions)] +#[derive(PartialEq, Eq, core::marker::ConstParamTy)] +pub enum HostAndDeviceKernelSignatureTypeLayout { + Match, + Mismatch, +} + +pub struct Assert; + +#[must_use] +pub const fn check( + device: &'static [u8], +) -> HostAndDeviceKernelSignatureTypeLayout +where + [u8; serialised_type_graph_len::()]:, +{ + const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; + + // Short-circuit to avoid extra errors when PTX compilation fails + if equals(device, SIGNATURE_ERROR_MESSAGE) { + return HostAndDeviceKernelSignatureTypeLayout::Match; + } + + let host = serialise_type_graph::(); + + if equals(device, &host) { + HostAndDeviceKernelSignatureTypeLayout::Match + } else { + HostAndDeviceKernelSignatureTypeLayout::Mismatch + } +} + +const fn equals(device: &[u8], host: &[u8]) -> bool { + if device.len() != host.len() { + return false; + } + + unsafe { 
core::intrinsics::compare_bytes(device.as_ptr(), host.as_ptr(), device.len()) == 0 } +} diff --git a/src/safety/register_fit.rs b/src/safety/register_fit.rs deleted file mode 100644 index 1ddf33849..000000000 --- a/src/safety/register_fit.rs +++ /dev/null @@ -1,43 +0,0 @@ -pub trait FitsIntoDeviceRegister: private::FitsIntoDeviceRegister {} -impl FitsIntoDeviceRegister for T {} - -mod private { - pub trait FitsIntoDeviceRegister {} - impl FitsIntoDeviceRegister for T where - AssertTypeFitsInto64Bits<{ TypeSize::check::() }>: FitsInto64Bits - { - } - - #[derive(PartialEq, Eq, core::marker::ConstParamTy)] - pub enum TypeSize { - TypeFitsInto64Bits, - // FIXME: ConstParamTy variant with str ICEs in rustdoc - #[cfg(not(doc))] - TypeExeceeds64Bits(&'static str), - #[cfg(doc)] - TypeExeceeds64Bits, - } - - impl TypeSize { - pub const fn check() -> Self { - if core::mem::size_of::() <= core::mem::size_of::() { - Self::TypeFitsInto64Bits - } else { - #[cfg(not(doc))] - { - Self::TypeExeceeds64Bits(core::any::type_name::()) - } - #[cfg(doc)] - { - Self::TypeExeceeds64Bits - } - } - } - } - - pub enum AssertTypeFitsInto64Bits {} - - pub trait FitsInto64Bits {} - - impl FitsInto64Bits for AssertTypeFitsInto64Bits<{ TypeSize::TypeFitsInto64Bits }> {} -} diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index e96f48993..eac7f9456 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -1,40 +1,86 @@ -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// fn assert_stackonly(_x: impl StackOnly) {} -/// ``` -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly(42); -/// ``` -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly([42; 42]); -/// ``` -/// ```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly(vec![42]); -/// ``` -/// ```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly(&42); -/// ``` -#[allow(clippy::module_name_repetitions)] -pub trait StackOnly: sealed::StackOnly {} +macro_rules! stack_only_docs { + ($item:item) => { + /// Types which contain no pointers or references and can thus live entirely + /// on the stack. + /// + /// This trait is automatically implemented when the compiler determines + /// it's appropriate. + /// + /// Note that this trait is *sealed*, i.e. you cannot implement it on your + /// own custom types. + /// + /// Primitive types like [`u8`] and structs, tuples, and enums made only + /// from them implement [`StackOnly`]. + /// + /// In contrast, `&T`, `&mut T`, `*const T`, `*mut T`, and any type + /// containing a reference or a pointer do *not* implement [`StackOnly`]. 
+ /// + /// # Examples + /// + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// fn assert_stackonly(_x: impl StackOnly) {} + /// ``` + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(42); // ok + /// ``` + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly([42; 42]); // ok + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(vec![42]); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(&42); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// # use crate::utils::shared::r#static::ThreadBlockShared; + /// assert_stackonly(ThreadBlockShared::new_uninit()); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// # use crate::utils::shared::slice::ThreadBlockSharedSlice; + /// assert_stackonly(ThreadBlockSharedSlice::new_uninit_with_len(0)); // error + /// ``` + $item + }; +} + +#[cfg(not(doc))] +stack_only_docs! { + #[allow(clippy::module_name_repetitions)] + pub trait StackOnly: sealed::StackOnly {} +} +#[cfg(doc)] +stack_only_docs! { + pub use sealed::StackOnly; +} + +#[cfg(not(doc))] impl StackOnly for T {} mod sealed { pub auto trait StackOnly {} - impl !StackOnly for *const T {} - impl !StackOnly for *mut T {} - impl !StackOnly for &T {} - impl !StackOnly for &mut T {} + impl !StackOnly for &T {} + impl !StackOnly for &mut T {} + impl !StackOnly for *const T {} + impl !StackOnly for *mut T {} impl StackOnly for core::marker::PhantomData {} } diff --git a/src/safety/type_layout.rs b/src/safety/type_layout.rs deleted file mode 100644 index f225f0055..000000000 --- a/src/safety/type_layout.rs +++ /dev/null @@ -1,33 +0,0 @@ -use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; - -#[derive(PartialEq, Eq, core::marker::ConstParamTy)] -pub enum CpuAndGpuTypeLayouts { - Match, - Mismatch, -} - -pub struct Assert; - -#[must_use] -pub const fn check(device: &'static [u8]) -> CpuAndGpuTypeLayouts -where - [u8; serialised_type_graph_len::()]:, -{ - let host = serialise_type_graph::(); - - if host.len() != device.len() { - return CpuAndGpuTypeLayouts::Mismatch; - } - - let mut i = 0; - - while i < host.len() { - if host[i] != device[i] { - return CpuAndGpuTypeLayouts::Mismatch; - } - - i += 1; - } - - CpuAndGpuTypeLayouts::Match -} diff --git a/src/safety/unified_heap.rs b/src/safety/unified_heap.rs deleted file mode 100644 index 9eda2d550..000000000 --- a/src/safety/unified_heap.rs +++ /dev/null @@ -1,46 +0,0 @@ -#[doc(cfg(feature = "alloc"))] -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// ``` -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(42); -/// ``` -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only([42; 42]); -/// ``` -/// 
```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(vec![42]); -/// ``` -/// ```rust,compile_fail -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(&42); -/// ``` -#[allow(clippy::module_name_repetitions)] -pub trait UnifiedHeapOnly: sealed::UnifiedHeapOnly {} -impl UnifiedHeapOnly for T {} - -mod sealed { - use crate::utils::alloc::UnifiedAllocator; - - pub auto trait UnifiedHeapOnly {} - - impl !UnifiedHeapOnly for *const T {} - impl !UnifiedHeapOnly for *mut T {} - impl !UnifiedHeapOnly for &T {} - impl !UnifiedHeapOnly for &mut T {} - - impl UnifiedHeapOnly for core::marker::PhantomData {} - - impl UnifiedHeapOnly for alloc::boxed::Box {} - impl UnifiedHeapOnly for alloc::vec::Vec {} - impl UnifiedHeapOnly for hashbrown::HashMap {} -} diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs new file mode 100644 index 000000000..fa023cc66 --- /dev/null +++ b/src/utils/adapter.rs @@ -0,0 +1,490 @@ +#![allow(clippy::trait_duplication_in_bounds)] + +use core::ops::{Deref, DerefMut}; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +use crate::{ + alloc::NoCudaAlloc, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + safety::PortableBitSemantics, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; + +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct RustToCudaWithPortableBitCopySemantics( + T, +); + +impl From + for RustToCudaWithPortableBitCopySemantics +{ + fn from(value: T) -> Self { + Self(value) + } +} + +impl Deref + for RustToCudaWithPortableBitCopySemantics +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut + for RustToCudaWithPortableBitCopySemantics +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl RustToCudaWithPortableBitCopySemantics { + #[must_use] + pub const fn from_copy(value: &T) -> Self { + Self(*value) + } + + #[must_use] + pub const fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub const fn into_ref(&self) -> &T { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around 
`T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} + +unsafe impl RustToCuda + for RustToCudaWithPortableBitCopySemantics +{ + type CudaAllocation = NoCudaAlloc; + type CudaRepresentation = Self; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok((DeviceAccessible::from(*self), alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + +unsafe impl RustToCudaAsync + for RustToCudaWithPortableBitCopySemantics +{ + type CudaAllocationAsync = NoCudaAlloc; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok(( + crate::utils::r#async::Async::ready(DeviceAccessible::from(*self), stream), + alloc, + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending(this, stream, Box::new(|_this| Ok(())))?; + + Ok((r#async, alloc_tail)) + } +} + +unsafe impl CudaAsRust + for RustToCudaWithPortableBitCopySemantics +{ + type RustRepresentation = Self; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let mut uninit = core::mem::MaybeUninit::uninit(); + core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); + uninit.assume_init() + } +} + +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct RustToCudaWithPortableBitCloneSemantics< + T: Clone + PortableBitSemantics + TypeGraphLayout, +>(T); + +impl From + for RustToCudaWithPortableBitCloneSemantics +{ + fn from(value: T) -> Self { + Self(value) + } +} + +impl Deref + for RustToCudaWithPortableBitCloneSemantics +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut + for RustToCudaWithPortableBitCloneSemantics +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl RustToCudaWithPortableBitCloneSemantics { + #[must_use] + pub fn from_clone(value: &T) -> Self 
{ + Self(value.clone()) + } + + #[must_use] + pub fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub const fn into_ref(&self) -> &T { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} + +unsafe impl RustToCuda + for RustToCudaWithPortableBitCloneSemantics +{ + type CudaAllocation = NoCudaAlloc; + type CudaRepresentation = Self; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok((DeviceAccessible::from(self.clone()), alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + +unsafe impl RustToCudaAsync + for RustToCudaWithPortableBitCloneSemantics +{ + type CudaAllocationAsync = NoCudaAlloc; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok(( + crate::utils::r#async::Async::ready(DeviceAccessible::from(self.clone()), stream), + alloc, + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, 
O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending(this, stream, Box::new(|_this| Ok(())))?; + + Ok((r#async, alloc_tail)) + } +} + +unsafe impl CudaAsRust + for RustToCudaWithPortableBitCloneSemantics +{ + type RustRepresentation = Self; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let mut uninit = core::mem::MaybeUninit::uninit(); + core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); + uninit.assume_init() + } +} + +#[allow(clippy::module_name_repetitions)] +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct DeviceCopyWithPortableBitSemantics(T); + +unsafe impl rustacuda_core::DeviceCopy + for DeviceCopyWithPortableBitSemantics +{ +} + +impl From for DeviceCopyWithPortableBitSemantics { + fn from(value: T) -> Self { + Self(value) + } +} + +impl Deref for DeviceCopyWithPortableBitSemantics { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DeviceCopyWithPortableBitSemantics { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl DeviceCopyWithPortableBitSemantics { + #[must_use] + pub fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub const fn into_ref(&self) -> &T { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 361151ac2..097b4c0f4 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -1,47 +1,68 @@ +#[cfg(any(feature = "host", feature = "device"))] use core::{ borrow::{Borrow, BorrowMut}, 
convert::{AsMut, AsRef}, ops::{Deref, DerefMut}, }; -use rustacuda_core::DeviceCopy; +use const_type_layout::TypeLayout; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; #[repr(transparent)] -#[derive(Clone, TypeLayout)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, TypeLayout)] pub struct SplitSliceOverCudaThreadsConstStride(T); impl SplitSliceOverCudaThreadsConstStride { + #[cfg(feature = "host")] #[must_use] - pub fn new(inner: T) -> Self { + pub const fn new(inner: T) -> Self { Self(inner) } } -// Safety: If `T` is `DeviceCopy`, then the newtype struct also is `DeviceCopy` -unsafe impl DeviceCopy - for SplitSliceOverCudaThreadsConstStride -{ -} - -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_const_stride(slice: &[E]) -> &[E] { - let offset: usize = crate::device::utils::index() * STRIDE; + let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_const_stride_mut(slice: &mut [E]) -> &mut [E] { - let offset: usize = crate::device::utils::index() * STRIDE; + let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] +impl SplitSliceOverCudaThreadsConstStride { + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub const unsafe fn alias_unchecked(&self) -> &T { + &self.0 + } + + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. 
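+    ///
+    /// For illustration only, a hypothetical device-side sketch (not one of
+    /// this crate's doctests; the boxed `u32` slice payload is an assumption):
+    ///
+    /// ```rust,ignore
+    /// fn kernel(split: &mut SplitSliceOverCudaThreadsConstStride<Box<[u32]>, 2>) {
+    ///     // safe: `DerefMut` only exposes this thread's disjoint 2-element chunk
+    ///     for x in split.iter_mut() {
+    ///         *x = 42;
+    ///     }
+    ///     // Safety: only sound if no other thread accesses *any* element
+    ///     let whole: &mut Box<[u32]> = unsafe { split.alias_mut_unchecked() };
+    /// }
+    /// ```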
+ pub unsafe fn alias_mut_unchecked(&mut self) -> &mut T { + &mut self.0 + } +} + +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> Deref for SplitSliceOverCudaThreadsConstStride { @@ -52,7 +73,8 @@ impl, const STRIDE: usize> Deref } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> DerefMut for SplitSliceOverCudaThreadsConstStride { @@ -61,7 +83,8 @@ impl, const STRIDE: usize> DerefMut } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> AsRef<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -70,7 +93,8 @@ impl, const STRIDE: usize> AsRef<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> AsMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -79,7 +103,8 @@ impl, const STRIDE: usize> AsMut<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> Borrow<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -88,7 +113,8 @@ impl, const STRIDE: usize> Borrow<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> BorrowMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -97,7 +123,7 @@ impl, const STRIDE: usize> BorrowMut<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> Deref for SplitSliceOverCudaThreadsConstStride { @@ -108,7 +134,7 @@ impl, const STRIDE: usize> Deref } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> DerefMut for SplitSliceOverCudaThreadsConstStride { @@ -117,7 +143,7 @@ impl, const STRIDE: usize> DerefMut } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> AsRef<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -126,7 +152,7 @@ impl, const STRIDE: usize> AsRef<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> AsMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -135,7 +161,7 @@ impl, const STRIDE: usize> AsMut<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> Borrow<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -144,7 +170,7 @@ impl, const STRIDE: usize> Borrow<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> BorrowMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -156,21 +182,18 @@ impl, const STRIDE: usize> BorrowMut<[E]> unsafe impl RustToCuda for SplitSliceOverCudaThreadsConstStride { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] type CudaAllocation = T::CudaAllocation; type CudaRepresentation = SplitSliceOverCudaThreadsConstStride, STRIDE>; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow( 
+ unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow(alloc)?; @@ -181,23 +204,96 @@ unsafe impl RustToCuda } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.0.restore(alloc) } } +unsafe impl RustToCudaAsync + for SplitSliceOverCudaThreadsConstStride +{ + type CudaAllocationAsync = T::CudaAllocationAsync; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + crate::alloc::CombinedCudaAlloc, + )> { + let (r#async, alloc) = self.0.borrow_async(alloc, stream)?; + let (cuda_repr, completion) = unsafe { r#async.unwrap_unchecked()? }; + + let cuda_repr = + DeviceAccessible::from(SplitSliceOverCudaThreadsConstStride::new(cuda_repr)); + + let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) { + crate::utils::r#async::Async::pending( + cuda_repr, + stream, + crate::utils::r#async::NoCompletion, + )? + } else { + crate::utils::r#async::Async::ready(cuda_repr, stream) + }; + + Ok((r#async, alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: crate::alloc::CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + let (r#async, alloc_tail) = + T::restore_async(this.map_mut(|this| &mut this.0), alloc, stream)?; + + let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? 
}; + + std::mem::forget(inner); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending( + this, + stream, + Box::new(|this: &mut Self| on_completion(&mut this.0)), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = crate::utils::r#async::Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } + } +} + unsafe impl CudaAsRust for SplitSliceOverCudaThreadsConstStride, STRIDE> { type RustRepresentation = SplitSliceOverCudaThreadsConstStride; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - SplitSliceOverCudaThreadsConstStride::new(CudaAsRust::as_rust(&this.0)) + SplitSliceOverCudaThreadsConstStride(CudaAsRust::as_rust(&this.0)) } } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 8b0446e08..3928c87d1 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -1,47 +1,71 @@ +#[cfg(any(feature = "host", feature = "device"))] use core::{ borrow::{Borrow, BorrowMut}, convert::{AsMut, AsRef}, ops::{Deref, DerefMut}, }; -use rustacuda_core::DeviceCopy; +use const_type_layout::TypeLayout; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; #[repr(C)] -#[derive(Clone, TypeLayout)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, TypeLayout)] pub struct SplitSliceOverCudaThreadsDynamicStride { stride: usize, inner: T, } impl SplitSliceOverCudaThreadsDynamicStride { + #[cfg(feature = "host")] #[must_use] - pub fn new(inner: T, stride: usize) -> Self { + pub const fn new(inner: T, stride: usize) -> Self { Self { stride, inner } } } -// Safety: If `T` is `DeviceCopy`, then the newtype struct also is `DeviceCopy` -unsafe impl DeviceCopy for SplitSliceOverCudaThreadsDynamicStride {} - -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { - let offset: usize = crate::device::utils::index() * stride; + let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = slice.len().min(offset + stride).saturating_sub(offset); unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_dynamic_stride_mut(slice: &mut [E], stride: usize) -> &mut [E] { - let offset: usize = crate::device::utils::index() * stride; + let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = slice.len().min(offset + stride).saturating_sub(offset); unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] +impl SplitSliceOverCudaThreadsDynamicStride { + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub const unsafe fn alias_unchecked(&self) -> &T { + &self.inner + } + + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. 
+ /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub unsafe fn alias_mut_unchecked(&mut self) -> &mut T { + &mut self.inner + } +} + +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> Deref for SplitSliceOverCudaThreadsDynamicStride { type Target = [E]; @@ -50,42 +74,47 @@ impl> Deref for SplitSliceOverCudaThreadsDynamicStride } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> DerefMut for SplitSliceOverCudaThreadsDynamicStride { fn deref_mut(&mut self) -> &mut Self::Target { split_slice_dynamic_stride_mut(&mut self.inner, self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> AsRef<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_ref(&self) -> &[E] { split_slice_dynamic_stride(self.inner.as_ref(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> AsMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_mut(&mut self) -> &mut [E] { split_slice_dynamic_stride_mut(self.inner.as_mut(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> Borrow<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow(&self) -> &[E] { split_slice_dynamic_stride(self.inner.borrow(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow_mut(&mut self) -> &mut [E] { split_slice_dynamic_stride_mut(self.inner.borrow_mut(), self.stride) } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> Deref for SplitSliceOverCudaThreadsDynamicStride { type Target = [E]; @@ -94,35 +123,35 @@ impl> Deref for SplitSliceOverCudaThreadsDynamicStride } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> DerefMut for SplitSliceOverCudaThreadsDynamicStride { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.inner } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> AsRef<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_ref(&self) -> &[E] { self.inner.as_ref() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> AsMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_mut(&mut self) -> &mut [E] { self.inner.as_mut() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> Borrow<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow(&self) -> &[E] { self.inner.borrow() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow_mut(&mut self) -> &mut [E] { self.inner.borrow_mut() @@ -130,21 +159,18 @@ impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicSt } unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride { - #[cfg(feature = "host")] - 
#[doc(cfg(feature = "host"))] type CudaAllocation = T::CudaAllocation; type CudaRepresentation = SplitSliceOverCudaThreadsDynamicStride>; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.inner.borrow(alloc)?; @@ -158,23 +184,99 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.inner.restore(alloc) } } +unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDynamicStride { + type CudaAllocationAsync = T::CudaAllocationAsync; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + crate::alloc::CombinedCudaAlloc, + )> { + let (r#async, alloc) = self.inner.borrow_async(alloc, stream)?; + let (cuda_repr, completion) = unsafe { r#async.unwrap_unchecked()? }; + + let cuda_repr = DeviceAccessible::from(SplitSliceOverCudaThreadsDynamicStride::new( + cuda_repr, + self.stride, + )); + + let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) { + crate::utils::r#async::Async::pending( + cuda_repr, + stream, + crate::utils::r#async::NoCompletion, + )? + } else { + crate::utils::r#async::Async::ready(cuda_repr, stream) + }; + + Ok((r#async, alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: crate::alloc::CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + let (r#async, alloc_tail) = + T::restore_async(this.map_mut(|this| &mut this.inner), alloc, stream)?; + + let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? 
}; + + std::mem::forget(inner); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending( + this, + stream, + Box::new(|this: &mut Self| on_completion(&mut this.inner)), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = crate::utils::r#async::Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } + } +} + unsafe impl CudaAsRust for SplitSliceOverCudaThreadsDynamicStride> { type RustRepresentation = SplitSliceOverCudaThreadsDynamicStride; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - SplitSliceOverCudaThreadsDynamicStride::new(CudaAsRust::as_rust(&this.inner), this.stride) + SplitSliceOverCudaThreadsDynamicStride { + stride: this.stride, + inner: CudaAsRust::as_rust(&this.inner), + } } } diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs deleted file mode 100644 index f8d96d5e2..000000000 --- a/src/utils/aliasing/final.rs +++ /dev/null @@ -1,59 +0,0 @@ -use r#final::Final; - -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; - -#[doc(hidden)] -#[repr(transparent)] -#[derive(TypeLayout)] -#[allow(clippy::module_name_repetitions)] -pub struct FinalCudaRepresentation(DeviceAccessible); - -// Safety: If `T` is `CudaAsRust`, then the newtype struct is `DeviceCopy` -unsafe impl rustacuda_core::DeviceCopy for FinalCudaRepresentation {} - -unsafe impl RustToCuda for Final { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = T::CudaAllocation; - type CudaRepresentation = FinalCudaRepresentation; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::host::CombinedCudaAlloc, - )> { - let (cuda_repr, alloc) = (**self).borrow(alloc)?; - - Ok(( - DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)), - alloc, - )) - } - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( - &mut self, - alloc: crate::host::CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult { - // Safety: Final is a repr(transparent) newtype wrapper around T - let inner: &mut T = &mut *(self as *mut Self).cast(); - - inner.restore(alloc) - } -} - -unsafe impl CudaAsRust for FinalCudaRepresentation { - type RustRepresentation = Final; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - Final::new(CudaAsRust::as_rust(&this.0)) - } -} diff --git a/src/utils/aliasing/mod.rs b/src/utils/aliasing/mod.rs index de7c58e05..e7753cf92 100644 --- a/src/utils/aliasing/mod.rs +++ b/src/utils/aliasing/mod.rs @@ -1,8 +1,5 @@ mod r#const; mod dynamic; -mod r#final; pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; pub use r#const::SplitSliceOverCudaThreadsConstStride; - -pub(crate) use self::r#final::FinalCudaRepresentation; diff --git a/src/utils/alloc.rs b/src/utils/alloc.rs deleted file mode 100644 index 3bbcf225b..000000000 --- a/src/utils/alloc.rs +++ /dev/null @@ -1,67 +0,0 @@ -use alloc::alloc::{AllocError, Allocator, Layout}; -use core::ptr::NonNull; - -#[allow(clippy::module_name_repetitions)] -pub struct UnifiedAllocator; - -unsafe impl Allocator for UnifiedAllocator { 
- #[cfg(feature = "host")] - fn allocate(&self, layout: Layout) -> Result, AllocError> { - if layout.size() == 0 { - return Ok(NonNull::<[u8; 0]>::dangling()); - } - - match layout.align() { - 1 => alloc_unified_aligned::(layout.size()), - 2 => alloc_unified_aligned::(layout.size() >> 1), - 4 => alloc_unified_aligned::(layout.size() >> 2), - 8 => alloc_unified_aligned::(layout.size() >> 3), - _ => Err(AllocError), - } - } - - #[cfg(not(feature = "host"))] - fn allocate(&self, _layout: Layout) -> Result, AllocError> { - Err(AllocError) - } - - #[cfg(feature = "host")] - unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { - use rustacuda::{ - error::CudaResult, - memory::{cuda_free_unified, UnifiedPointer}, - }; - - if layout.size() == 0 { - return; - } - - let _: CudaResult<()> = cuda_free_unified(UnifiedPointer::wrap(ptr.as_ptr())); - } - - #[cfg(not(feature = "host"))] - unsafe fn deallocate(&self, _ptr: NonNull, _layout: Layout) { - // no-op - } -} - -#[cfg(feature = "host")] -fn alloc_unified_aligned( - size: usize, -) -> Result, AllocError> { - use rustacuda::memory::cuda_malloc_unified; - - match unsafe { cuda_malloc_unified::(size) } { - Ok(mut ptr) => { - let bytes: &mut [u8] = unsafe { - core::slice::from_raw_parts_mut( - ptr.as_raw_mut().cast(), - size * core::mem::align_of::(), - ) - }; - - NonNull::new(bytes).ok_or(AllocError) - }, - Err(_) => Err(AllocError), - } -} diff --git a/src/utils/async.rs b/src/utils/async.rs new file mode 100644 index 000000000..be4e2458c --- /dev/null +++ b/src/utils/async.rs @@ -0,0 +1,735 @@ +#[cfg(feature = "host")] +use std::{borrow::BorrowMut, future::Future, future::IntoFuture, marker::PhantomData, task::Poll}; + +#[cfg(feature = "host")] +use rustacuda::{ + error::CudaError, error::CudaResult, event::Event, event::EventFlags, + stream::StreamWaitEventFlags, +}; + +#[cfg(feature = "host")] +use crate::host::{CudaDropWrapper, Stream}; + +#[cfg(feature = "host")] +pub struct NoCompletion; +#[cfg(feature = "host")] +pub type CompletionFnMut<'a, T> = Box CudaResult<()> + 'a>; + +#[cfg(feature = "host")] +pub trait Completion>: sealed::Sealed { + type Completed: ?Sized; + + fn no_op() -> Self; + + #[doc(hidden)] + fn synchronize_on_drop(&self) -> bool; + + #[allow(clippy::missing_errors_doc)] // FIXME + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()>; +} +#[cfg(feature = "host")] +mod sealed { + pub trait Sealed {} +} + +#[cfg(feature = "host")] +impl Completion for NoCompletion { + type Completed = T; + + #[inline] + fn no_op() -> Self { + Self + } + + #[inline] + fn synchronize_on_drop(&self) -> bool { + false + } + + #[inline] + fn complete(self, _completed: &mut Self::Completed) -> CudaResult<()> { + Ok(()) + } +} +#[cfg(feature = "host")] +impl sealed::Sealed for NoCompletion {} + +#[cfg(feature = "host")] +impl<'a, T: ?Sized + BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { + type Completed = B; + + #[inline] + fn no_op() -> Self { + Box::new(|_value| Ok(())) + } + + #[inline] + fn synchronize_on_drop(&self) -> bool { + true + } + + #[inline] + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { + (self)(completed) + } +} +#[cfg(feature = "host")] +impl<'a, T: ?Sized> sealed::Sealed for CompletionFnMut<'a, T> {} + +#[cfg(feature = "host")] +impl, C: Completion> Completion for Option { + type Completed = C::Completed; + + #[inline] + fn no_op() -> Self { + None + } + + #[inline] + fn synchronize_on_drop(&self) -> bool { + self.as_ref().map_or(false, Completion::synchronize_on_drop) + 
} + + #[inline] + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { + self.map_or(Ok(()), |completion| completion.complete(completed)) + } +} +#[cfg(feature = "host")] +impl sealed::Sealed for Option {} + +#[cfg(feature = "host")] +pub struct Async<'a, 'stream, T: BorrowMut, C: Completion = NoCompletion> { + stream: Stream<'stream>, + value: T, + status: AsyncStatus<'a, T, C>, + _capture: PhantomData<&'a ()>, +} + +#[cfg(feature = "host")] +enum AsyncStatus<'a, T: BorrowMut, C: Completion> { + #[allow(clippy::type_complexity)] + Processing { + receiver: oneshot::Receiver>, + completion: C, + event: Option>, + _capture: PhantomData<&'a T>, + }, + Completed { + result: CudaResult<()>, + }, +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'stream, T, C> { + /// Wraps a `value` which is ready on `stream`. + #[must_use] + pub const fn ready(value: T, stream: Stream<'stream>) -> Self { + Self { + stream, + value, + status: AsyncStatus::Completed { result: Ok(()) }, + _capture: PhantomData::<&'a ()>, + } + } + + /// Wraps a still-pending `value` which is being computed on `stream` + /// such that its computation can be synchronised on. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + pub fn pending(value: T, stream: Stream<'stream>, completion: C) -> CudaResult { + let (sender, receiver) = oneshot::channel(); + stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + + Ok(Self { + stream, + value, + status: AsyncStatus::Processing { + receiver, + completion, + event: None, + _capture: PhantomData::<&'a T>, + }, + _capture: PhantomData::<&'a ()>, + }) + } + + /// Synchronises on this computation to block until it has completed and + /// the inner value can be safely returned and again be used in synchronous + /// operations. + /// + /// Calling `synchronize` after the computation has completed, e.g. after + /// calling [`rustacuda::stream::Stream::synchronize`], should be very + /// cheap. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + pub fn synchronize(self) -> CudaResult { + let (_stream, mut value, status) = self.destructure_into_parts(); + + let (receiver, completion) = match status { + AsyncStatus::Completed { result } => return result.map(|()| value), + AsyncStatus::Processing { + receiver, + completion, + event: _, + _capture, + } => (receiver, completion), + }; + + match receiver.recv() { + Ok(Ok(())) => (), + Ok(Err(err)) => return Err(err), + Err(oneshot::RecvError) => return Err(CudaError::AlreadyAcquired), + } + + completion.complete(value.borrow_mut())?; + + Ok(value) + } + + /// Moves the asynchronous data move to a different [`Stream`]. + /// + /// This method always adds a synchronisation barrier between the old and + /// and the new [`Stream`] to ensure that any usages of this [`Async`] + /// computations on the old [`Stream`] have completed before they can be + /// used on the new one. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. 
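+    ///
+    /// A hypothetical usage sketch (the two streams and `value` are
+    /// assumptions for illustration, not part of this diff):
+    ///
+    /// ```rust,ignore
+    /// // work on `value` was previously submitted on `stream_a`
+    /// let r#async = Async::pending(value, stream_a, NoCompletion)?;
+    /// // `stream_b` now waits on an event recorded on `stream_a` ...
+    /// let r#async = r#async.move_to_stream(stream_b)?;
+    /// // ... so synchronising the moved value also covers the old work
+    /// let value = r#async.synchronize()?;
+    /// ```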
+ pub fn move_to_stream<'stream_new>( + self, + stream: Stream<'stream_new>, + ) -> CudaResult> { + let (old_stream, mut value, status) = self.destructure_into_parts(); + + let completion = match status { + AsyncStatus::Completed { result } => { + result?; + C::no_op() + }, + AsyncStatus::Processing { + receiver, + completion, + event: _, + _capture, + } => match receiver.try_recv() { + Ok(Ok(())) => { + completion.complete(value.borrow_mut())?; + C::no_op() + }, + Ok(Err(err)) => return Err(err), + Err(oneshot::TryRecvError::Empty) => completion, + Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), + }, + }; + + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + event.record(&old_stream)?; + stream.wait_event(&event, StreamWaitEventFlags::DEFAULT)?; + + let (sender, receiver) = oneshot::channel(); + stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + + Ok(Async { + stream, + value, + status: AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData::<&'a T>, + }, + _capture: PhantomData::<&'a ()>, + }) + } + + #[allow(clippy::missing_errors_doc)] // FIXME + /// # Safety + /// + /// The returned inner value of type `T` may not yet have completed its + /// asynchronous work and may thus be in an inconsistent state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub unsafe fn unwrap_unchecked(self) -> CudaResult<(T, Option)> { + let (_stream, value, status) = self.destructure_into_parts(); + + match status { + AsyncStatus::Completed { result: Ok(()) } => Ok((value, None)), + AsyncStatus::Completed { result: Err(err) } => Err(err), + AsyncStatus::Processing { + receiver: _, + completion, + event: _, + _capture, + } => Ok((value, Some(completion))), + } + } + + pub const fn as_ref(&self) -> AsyncProj<'_, 'stream, &T> { + // Safety: this projection captures this async + unsafe { AsyncProj::new(&self.value, None) } + } + + pub fn as_mut(&mut self) -> AsyncProj<'_, 'stream, &mut T> { + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + &mut self.value, + Some(Box::new(|| { + let completion = match &mut self.status { + AsyncStatus::Completed { result } => { + (*result)?; + C::no_op() + }, + AsyncStatus::Processing { + receiver: _, + completion, + event: _, + _capture, + } => std::mem::replace(completion, C::no_op()), + }; + + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + + let (sender, receiver) = oneshot::channel(); + + self.stream + .add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + event.record(&self.stream)?; + + self.status = AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData::<&'a T>, + }; + + Ok(()) + })), + ) + } + } + + #[must_use] + fn destructure_into_parts(self) -> (Stream<'stream>, T, AsyncStatus<'a, T, C>) { + let this = std::mem::ManuallyDrop::new(self); + + // Safety: we destructure self into its droppable components, + // value and status, without dropping self itself + unsafe { + ( + this.stream, + std::ptr::read(&this.value), + (std::ptr::read(&this.status)), + ) + } + } +} + +#[cfg(feature = "host")] +impl< + 'a, + 'stream, + T: crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, + C: Completion>, + > Async<'a, 'stream, crate::host::HostAndDeviceConstRef<'a, T>, C> +where + 
crate::host::HostAndDeviceConstRef<'a, T>: BorrowMut, +{ + pub const fn extract_ref( + &self, + ) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceConstRef<'_, T>> { + // Safety: this projection captures this async + unsafe { AsyncProj::new(self.value.as_ref(), None) } + } +} + +#[cfg(feature = "host")] +impl< + 'a, + 'stream, + T: crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, + C: Completion>, + > Async<'a, 'stream, crate::host::HostAndDeviceMutRef<'a, T>, C> +where + crate::host::HostAndDeviceMutRef<'a, T>: BorrowMut, +{ + pub fn extract_ref(&self) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceConstRef<'_, T>> { + // Safety: this projection captures this async + unsafe { AsyncProj::new(self.value.as_ref(), None) } + } + + pub fn extract_mut( + &mut self, + ) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceMutRef<'_, T>> { + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + self.value.as_mut(), + Some(Box::new(|| { + let completion = match &mut self.status { + AsyncStatus::Completed { result } => { + (*result)?; + C::no_op() + }, + AsyncStatus::Processing { + receiver: _, + completion, + event: _, + _capture, + } => std::mem::replace(completion, C::no_op()), + }; + + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + + let (sender, receiver) = oneshot::channel(); + + self.stream + .add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + event.record(&self.stream)?; + + self.status = AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData, + }; + + Ok(()) + })), + ) + } + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Drop for Async<'a, 'stream, T, C> { + fn drop(&mut self) { + let AsyncStatus::Processing { + receiver, + completion, + event: _, + _capture, + } = std::mem::replace(&mut self.status, AsyncStatus::Completed { result: Ok(()) }) + else { + return; + }; + + if completion.synchronize_on_drop() && receiver.recv() == Ok(Ok(())) { + let _ = completion.complete(self.value.borrow_mut()); + } + } +} + +#[cfg(feature = "host")] +struct AsyncFuture<'a, 'stream, T: BorrowMut, C: Completion> { + _stream: PhantomData>, + value: Option, + completion: Option, + status: AsyncStatus<'a, T, NoCompletion>, +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Future + for AsyncFuture<'a, 'stream, T, C> +{ + type Output = CudaResult; + + fn poll( + self: core::pin::Pin<&mut Self>, + cx: &mut core::task::Context<'_>, + ) -> Poll { + // Safety: this function does not move out of `this` + let this = unsafe { self.get_unchecked_mut() }; + + match &mut this.status { + AsyncStatus::Processing { + receiver, + completion: _, + event: _, + _capture, + } => match std::pin::Pin::new(receiver).poll(cx) { + Poll::Ready(Ok(Ok(()))) => (), + Poll::Ready(Ok(Err(err))) => return Poll::Ready(Err(err)), + Poll::Ready(Err(oneshot::RecvError)) => { + return Poll::Ready(Err(CudaError::AlreadyAcquired)) + }, + Poll::Pending => return Poll::Pending, + }, + AsyncStatus::Completed { result: Ok(()) } => (), + AsyncStatus::Completed { result: Err(err) } => return Poll::Ready(Err(*err)), + } + + let Some(mut value) = this.value.take() else { + return Poll::Ready(Err(CudaError::AlreadyAcquired)); + }; + + if let Some(completion) = this.completion.take() { + completion.complete(value.borrow_mut())?; + } + + Poll::Ready(Ok(value)) + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> 
IntoFuture + for Async<'a, 'stream, T, C> +{ + type Output = CudaResult; + + type IntoFuture = impl Future; + + fn into_future(self) -> Self::IntoFuture { + let (_stream, value, status) = self.destructure_into_parts(); + + let (completion, status): (Option, AsyncStatus<'a, T, NoCompletion>) = match status { + AsyncStatus::Completed { result } => { + (None, AsyncStatus::Completed:: { result }) + }, + AsyncStatus::Processing { + receiver, + completion, + event, + _capture, + } => ( + Some(completion), + AsyncStatus::Processing:: { + receiver, + completion: NoCompletion, + event, + _capture: PhantomData::<&'a T>, + }, + ), + }; + + AsyncFuture { + _stream: PhantomData::>, + value: Some(value), + completion, + status, + } + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Drop + for AsyncFuture<'a, 'stream, T, C> +{ + fn drop(&mut self) { + let Some(mut value) = self.value.take() else { + return; + }; + + let AsyncStatus::Processing { + receiver, + completion: NoCompletion, + event: _, + _capture, + } = std::mem::replace(&mut self.status, AsyncStatus::Completed { result: Ok(()) }) + else { + return; + }; + + let Some(completion) = self.completion.take() else { + return; + }; + + if completion.synchronize_on_drop() && receiver.recv() == Ok(Ok(())) { + let _ = completion.complete(value.borrow_mut()); + } + } +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub struct AsyncProj<'a, 'stream, T: 'a> { + _capture: PhantomData<&'a ()>, + _stream: PhantomData>, + value: T, + use_callback: Option CudaResult<()> + 'a>>, +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { + #[must_use] + /// # Safety + /// + /// This projection must either capture an existing [`Async`] or come from + /// a source that ensures that the projected value can never (async) move + /// to a different [`Stream`]. + pub(crate) const unsafe fn new( + value: T, + use_callback: Option CudaResult<()> + 'a>>, + ) -> Self { + Self { + _capture: PhantomData::<&'a ()>, + _stream: PhantomData::>, + value, + use_callback, + } + } + + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub(crate) unsafe fn unwrap_unchecked(self) -> T { + self.value + } + + #[allow(clippy::type_complexity)] + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
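+    ///
+    /// Sketch of the intended crate-internal composition pattern (the
+    /// caller-side names are hypothetical):
+    ///
+    /// ```rust,ignore
+    /// // Safety: all further work on `value` stays on the same stream
+    /// let (value, use_callback) = unsafe { proj.unwrap_unchecked_with_use() };
+    /// /* enqueue more asynchronous work that mutates `value` */
+    /// if let Some(mut record_use) = use_callback {
+    ///     record_use()?; // report the mutable use back to the owning `Async`
+    /// }
+    /// ```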
+ pub(crate) unsafe fn unwrap_unchecked_with_use( + self, + ) -> (T, Option CudaResult<()> + 'a>>) { + (self.value, self.use_callback) + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { + #[must_use] + pub const fn proj_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::>, + value: &self.value, + use_callback: None, + } + } + + #[must_use] + pub fn proj_mut<'b>(&'b mut self) -> AsyncProj<'b, 'stream, &'b mut T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::>, + value: &mut self.value, + use_callback: self.use_callback.as_mut().map(|use_callback| { + let use_callback: Box CudaResult<()>> = Box::new(use_callback); + use_callback + }), + } + } + + pub(crate) fn record_mut_use(&mut self) -> CudaResult<()> { + self.use_callback + .as_mut() + .map_or(Ok(()), |use_callback| use_callback()) + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a T> { + #[must_use] + pub const fn as_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::>, + value: self.value, + use_callback: None, + } + } + + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub(crate) const unsafe fn unwrap_ref_unchecked(&self) -> &T { + self.value + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { + #[must_use] + pub fn as_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::>, + value: self.value, + use_callback: None, + } + } + + #[must_use] + pub fn as_mut<'b>(&'b mut self) -> AsyncProj<'b, 'stream, &'b mut T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::>, + value: self.value, + use_callback: self.use_callback.as_mut().map(|use_callback| { + let use_callback: Box CudaResult<()>> = Box::new(use_callback); + use_callback + }), + } + } + + #[allow(dead_code)] // FIXME + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub(crate) unsafe fn unwrap_ref_unchecked(&self) -> &T { + self.value + } + + #[allow(dead_code)] // FIXME + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
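+    ///
+    /// For example (hypothetical), a combinator might briefly reborrow the
+    /// projected value to enqueue one more operation on its stream:
+    ///
+    /// ```rust,ignore
+    /// // Safety: `inner` is only used for work submitted on the same stream
+    /// let inner: &mut T = unsafe { proj.unwrap_mut_unchecked() };
+    /// ```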
+ pub(crate) unsafe fn unwrap_mut_unchecked(&mut self) -> &mut T { + self.value + } +} diff --git a/src/utils/box.rs b/src/utils/box.rs deleted file mode 100644 index e3381f022..000000000 --- a/src/utils/box.rs +++ /dev/null @@ -1,83 +0,0 @@ -use alloc::boxed::Box; - -use const_type_layout::TypeGraphLayout; - -use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, - safety::SafeDeviceCopy, -}; - -#[cfg(feature = "host")] -use crate::{ - host::CombinedCudaAlloc, host::CudaAlloc, host::CudaDropWrapper, rustacuda::error::CudaResult, - rustacuda::memory::DeviceBox, utils::device_copy::SafeDeviceCopyWrapper, -}; - -#[doc(hidden)] -#[repr(transparent)] -#[derive(TypeLayout)] -#[allow(clippy::module_name_repetitions)] -pub struct BoxCudaRepresentation(*mut T) -where - T: SafeDeviceCopy + TypeGraphLayout; - -// Safety: This repr(C) struct only contains a device-owned pointer -unsafe impl rustacuda_core::DeviceCopy - for BoxCudaRepresentation -{ -} - -unsafe impl RustToCuda for Box { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = CudaDropWrapper>>; - type CudaRepresentation = BoxCudaRepresentation; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )> { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); - - Ok(( - DeviceAccessible::from(BoxCudaRepresentation( - device_box.as_device_ptr().as_raw_mut().cast(), - )), - CombinedCudaAlloc::new(device_box, alloc), - )) - } - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( - &mut self, - alloc: CombinedCudaAlloc, - ) -> CudaResult { - use rustacuda::memory::CopyDestination; - - let (alloc_front, alloc_tail) = alloc.split(); - - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut(&mut **self))?; - - core::mem::drop(alloc_front); - - Ok(alloc_tail) - } -} - -unsafe impl CudaAsRust for BoxCudaRepresentation { - type RustRepresentation = Box; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - alloc::boxed::Box::from_raw(this.0) - } -} diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs deleted file mode 100644 index 5ed008801..000000000 --- a/src/utils/boxed_slice.rs +++ /dev/null @@ -1,85 +0,0 @@ -use alloc::boxed::Box; - -use const_type_layout::TypeGraphLayout; - -use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, - safety::SafeDeviceCopy, -}; - -#[cfg(feature = "host")] -use crate::{ - host::CombinedCudaAlloc, host::CudaAlloc, host::CudaDropWrapper, rustacuda::error::CudaResult, - rustacuda::memory::DeviceBuffer, utils::device_copy::SafeDeviceCopyWrapper, -}; - -#[doc(hidden)] -#[allow(clippy::module_name_repetitions)] -#[derive(Debug, TypeLayout)] -#[repr(C)] -pub struct BoxedSliceCudaRepresentation(*mut T, usize) -where - T: SafeDeviceCopy + TypeGraphLayout; - -// Safety: This repr(C) struct only contains a device-owned pointer -unsafe impl rustacuda_core::DeviceCopy - for BoxedSliceCudaRepresentation -{ -} - -unsafe impl RustToCuda for Box<[T]> { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = CudaDropWrapper>>; - type CudaRepresentation = BoxedSliceCudaRepresentation; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, 
- ) -> CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )> { - let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( - SafeDeviceCopyWrapper::from_slice(self), - )?); - - Ok(( - DeviceAccessible::from(BoxedSliceCudaRepresentation( - device_buffer.as_mut_ptr().cast(), - device_buffer.len(), - )), - CombinedCudaAlloc::new(device_buffer, alloc), - )) - } - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( - &mut self, - alloc: CombinedCudaAlloc, - ) -> CudaResult { - use rustacuda::memory::CopyDestination; - - let (alloc_front, alloc_tail) = alloc.split(); - - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut_slice(self))?; - - core::mem::drop(alloc_front); - - Ok(alloc_tail) - } -} - -unsafe impl CudaAsRust for BoxedSliceCudaRepresentation { - type RustRepresentation = Box<[T]>; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) - } -} diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs deleted file mode 100644 index 289ef9969..000000000 --- a/src/utils/device_copy.rs +++ /dev/null @@ -1,112 +0,0 @@ -#![allow(clippy::trait_duplication_in_bounds)] - -use const_type_layout::TypeGraphLayout; - -use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, - safety::SafeDeviceCopy, -}; - -#[derive(Copy, Clone, Debug, TypeLayout)] -#[repr(transparent)] -pub struct SafeDeviceCopyWrapper(T) -where - T: SafeDeviceCopy + TypeGraphLayout; - -unsafe impl rustacuda_core::DeviceCopy - for SafeDeviceCopyWrapper -{ -} - -impl From for SafeDeviceCopyWrapper { - fn from(value: T) -> Self { - Self(value) - } -} - -impl SafeDeviceCopyWrapper { - pub fn into_inner(self) -> T { - self.0 - } - - pub fn from_ref(reference: &T) -> &Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { &*(reference as *const T).cast() } - } - - pub fn into_ref(&self) -> &T { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { &*(self as *const Self).cast() } - } - - pub fn from_mut(reference: &mut T) -> &mut Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { &mut *(reference as *mut T).cast() } - } - - pub fn into_mut(&mut self) -> &mut T { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { &mut *(self as *mut Self).cast() } - } - - pub fn from_slice(slice: &[T]) -> &[Self] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } - } - - pub fn into_slice(slice: &[Self]) -> &[T] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } - } - - pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } - } - - pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } - } -} - -unsafe impl RustToCuda for SafeDeviceCopyWrapper { - #[cfg(feature = "host")] - type CudaAllocation = crate::host::NullCudaAlloc; - type CudaRepresentation = Self; - - 
diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs
deleted file mode 100644
index 289ef9969..000000000
--- a/src/utils/device_copy.rs
+++ /dev/null
@@ -1,112 +0,0 @@
-#![allow(clippy::trait_duplication_in_bounds)]
-
-use const_type_layout::TypeGraphLayout;
-
-use crate::{
-    common::{CudaAsRust, DeviceAccessible, RustToCuda},
-    safety::SafeDeviceCopy,
-};
-
-#[derive(Copy, Clone, Debug, TypeLayout)]
-#[repr(transparent)]
-pub struct SafeDeviceCopyWrapper<T>(T)
-where
-    T: SafeDeviceCopy + TypeGraphLayout;
-
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy
-    for SafeDeviceCopyWrapper<T>
-{
-}
-
-impl<T: SafeDeviceCopy + TypeGraphLayout> From<T> for SafeDeviceCopyWrapper<T> {
-    fn from(value: T) -> Self {
-        Self(value)
-    }
-}
-
-impl<T: SafeDeviceCopy + TypeGraphLayout> SafeDeviceCopyWrapper<T> {
-    pub fn into_inner(self) -> T {
-        self.0
-    }
-
-    pub fn from_ref(reference: &T) -> &Self {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { &*(reference as *const T).cast() }
-    }
-
-    pub fn into_ref(&self) -> &T {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { &*(self as *const Self).cast() }
-    }
-
-    pub fn from_mut(reference: &mut T) -> &mut Self {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { &mut *(reference as *mut T).cast() }
-    }
-
-    pub fn into_mut(&mut self) -> &mut T {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { &mut *(self as *mut Self).cast() }
-    }
-
-    pub fn from_slice(slice: &[T]) -> &[Self] {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) }
-    }
-
-    pub fn into_slice(slice: &[Self]) -> &[T] {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) }
-    }
-
-    pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) }
-    }
-
-    pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) }
-    }
-}
-
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for SafeDeviceCopyWrapper<T> {
-    #[cfg(feature = "host")]
-    type CudaAllocation = crate::host::NullCudaAlloc;
-    type CudaRepresentation = Self;
-
-    #[cfg(feature = "host")]
-    #[allow(clippy::type_complexity)]
-    unsafe fn borrow<A: crate::host::CudaAlloc>(
-        &self,
-        alloc: A,
-    ) -> rustacuda::error::CudaResult<(
-        DeviceAccessible<Self::CudaRepresentation>,
-        crate::host::CombinedCudaAlloc<A, Self::CudaAllocation>,
-    )> {
-        let alloc = crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc);
-        Ok((DeviceAccessible::from(&self.0), alloc))
-    }
-
-    #[cfg(feature = "host")]
-    #[doc(cfg(feature = "host"))]
-    unsafe fn restore<A: crate::host::CudaAlloc>(
-        &mut self,
-        alloc: crate::host::CombinedCudaAlloc<A, Self::CudaAllocation>,
-    ) -> rustacuda::error::CudaResult<A> {
-        let (_alloc_front, alloc_tail): (crate::host::NullCudaAlloc, A) = alloc.split();
-
-        Ok(alloc_tail)
-    }
-}
-
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust for SafeDeviceCopyWrapper<T> {
-    type RustRepresentation = Self;
-
-    #[cfg(any(not(feature = "host"), doc))]
-    unsafe fn as_rust(this: &DeviceAccessible<Self>) -> Self::RustRepresentation {
-        let mut uninit = core::mem::MaybeUninit::uninit();
-        core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1);
-        uninit.assume_init()
-    }
-}
diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs
index a153da4d0..079dba419 100644
--- a/src/utils/exchange/buffer/common.rs
+++ b/src/utils/exchange/buffer/common.rs
@@ -1,7 +1,10 @@
-use const_type_layout::TypeGraphLayout;
-use rustacuda_core::DeviceCopy;
+use const_type_layout::{TypeGraphLayout, TypeLayout};
 
-use crate::{common::CudaAsRust, safety::SafeDeviceCopy};
+use crate::{
+    lend::CudaAsRust,
+    safety::{PortableBitSemantics, StackOnly},
+    utils::ffi::DeviceMutPointer,
+};
 
 use super::{CudaExchangeBuffer, CudaExchangeItem};
 
@@ -9,30 +12,30 @@ use super::{CudaExchangeBuffer, CudaExchangeItem};
 #[doc(hidden)]
 #[derive(TypeLayout)]
 #[repr(C)]
-pub struct CudaExchangeBufferCudaRepresentation<T, const M2D: bool, const M2H: bool>(
-    pub(super) *mut CudaExchangeItem<T, M2D, M2H>,
+pub struct CudaExchangeBufferCudaRepresentation<
+    T: StackOnly + PortableBitSemantics + TypeGraphLayout,
+    const M2D: bool,
+    const M2H: bool,
+>(
+    pub(super) DeviceMutPointer<CudaExchangeItem<T, M2D, M2H>>,
     pub(super) usize,
-)
-where
-    T: SafeDeviceCopy + TypeGraphLayout;
+);
 
-// Safety: `CudaExchangeBufferCudaRepresentation` is `DeviceCopy`
-// iff `T` is `SafeDeviceCopy`
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> DeviceCopy
-    for CudaExchangeBufferCudaRepresentation<T, M2D, M2H>
-{
-}
-
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> CudaAsRust
-    for CudaExchangeBufferCudaRepresentation<T, M2D, M2H>
+unsafe impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    CudaAsRust for CudaExchangeBufferCudaRepresentation<T, M2D, M2H>
 {
     type RustRepresentation = CudaExchangeBuffer<T, M2D, M2H>;
 
-    #[cfg(any(not(feature = "host"), doc))]
-    #[doc(cfg(not(feature = "host")))]
-    unsafe fn as_rust(this: &crate::common::DeviceAccessible<Self>) -> Self::RustRepresentation {
-        CudaExchangeBuffer(core::mem::ManuallyDrop::new(alloc::boxed::Box::from_raw(
-            core::slice::from_raw_parts_mut(this.0, this.1),
-        )))
+    #[cfg(feature = "device")]
+    unsafe fn as_rust(
+        this: &crate::utils::ffi::DeviceAccessible<Self>,
+    ) -> Self::RustRepresentation {
+        CudaExchangeBuffer {
+            inner: super::device::CudaExchangeBufferDevice(core::mem::ManuallyDrop::new(
+                crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(
+                    this.0 .0, this.1,
+                )),
+            )),
+        }
+    }
 }
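A std-only sketch of the `repr(transparent)` reference casts that the deleted `SafeDeviceCopyWrapper` (and the adapters that replace it) rely on: a transparent newtype shares its inner type's layout, so references and slices can be reinterpreted in place.

#[repr(transparent)]
struct Wrapper(u64);

fn from_ref(reference: &u64) -> &Wrapper {
    // Safety: `Wrapper` is a transparent newtype around `u64`, so both types
    // have identical size and alignment
    unsafe { &*core::ptr::from_ref(reference).cast() }
}

fn from_slice(slice: &[u64]) -> &[Wrapper] {
    // Safety: as above, the element layouts are identical
    unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) }
}

fn main() {
    let values = [1_u64, 2, 3];
    assert_eq!(from_ref(&values[0]).0, 1);
    assert_eq!(from_slice(&values).len(), 3);
}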
diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs
index d284e1193..5083263b3 100644
--- a/src/utils/exchange/buffer/device.rs
+++ b/src/utils/exchange/buffer/device.rs
@@ -2,23 +2,21 @@ use core::ops::{Deref, DerefMut};
 
 use const_type_layout::TypeGraphLayout;
 
-use crate::{common::RustToCuda, safety::SafeDeviceCopy};
+use crate::{
+    deps::alloc::boxed::Box,
+    safety::{PortableBitSemantics, StackOnly},
+};
 
-use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem};
+use super::CudaExchangeItem;
 
 #[allow(clippy::module_name_repetitions)]
-#[doc(cfg(not(feature = "host")))]
-/// When the `host` feature is set,
-/// [`CudaExchangeBuffer`](super::CudaExchangeBuffer)
-/// refers to
-/// [`CudaExchangeBufferHost`](super::CudaExchangeBufferHost)
-/// instead.
-/// [`CudaExchangeBufferDevice`](Self) is never exposed directly.
-pub struct CudaExchangeBufferDevice<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool>(
-    pub(super) core::mem::ManuallyDrop<alloc::boxed::Box<[CudaExchangeItem<T, M2D, M2H>]>>,
-);
+pub struct CudaExchangeBufferDevice<
+    T: StackOnly + PortableBitSemantics + TypeGraphLayout,
+    const M2D: bool,
+    const M2H: bool,
+>(pub(super) core::mem::ManuallyDrop<Box<[CudaExchangeItem<T, M2D, M2H>]>>);
 
-impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> Deref
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool> Deref
     for CudaExchangeBufferDevice<T, M2D, M2H>
 {
     type Target = [CudaExchangeItem<T, M2D, M2H>];
@@ -28,17 +26,10 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, co
     }
 }
 
-impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> DerefMut
-    for CudaExchangeBufferDevice<T, M2D, M2H>
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    DerefMut for CudaExchangeBufferDevice<T, M2D, M2H>
 {
     fn deref_mut(&mut self) -> &mut Self::Target {
         &mut self.0
     }
 }
-
-#[cfg(not(all(doc, feature = "host")))]
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> RustToCuda
-    for CudaExchangeBufferDevice<T, M2D, M2H>
-{
-    type CudaRepresentation = CudaExchangeBufferCudaRepresentation<T, M2D, M2H>;
-}
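A std-only sketch of why the device-side buffer above lives inside a `ManuallyDrop`: the `Box` is conjured from a device pointer that the host still owns, so its destructor must never run in the kernel.

use core::mem::ManuallyDrop;

fn main() {
    let mut suppressed: ManuallyDrop<Box<[u8]>> =
        ManuallyDrop::new(vec![0_u8; 4].into_boxed_slice());
    // The contents stay fully usable through Deref/DerefMut...
    suppressed[0] = 42;
    assert_eq!(suppressed[0], 42);
    // ...but the drop is suppressed; here we free explicitly to avoid a leak,
    // which the real device code deliberately never does
    unsafe { ManuallyDrop::drop(&mut suppressed) };
}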
diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs
index ad522629f..e252d0ce7 100644
--- a/src/utils/exchange/buffer/host.rs
+++ b/src/utils/exchange/buffer/host.rs
@@ -1,5 +1,4 @@
-use alloc::vec::Vec;
-use core::{
+use std::{
     cell::UnsafeCell,
     ops::{Deref, DerefMut},
 };
@@ -11,40 +10,51 @@ use rustacuda::{
 };
 
 use crate::{
-    common::{DeviceAccessible, RustToCuda},
-    host::{CombinedCudaAlloc, CudaAlloc, CudaDropWrapper, NullCudaAlloc},
-    safety::SafeDeviceCopy,
+    alloc::{CombinedCudaAlloc, CudaAlloc, NoCudaAlloc},
+    host::CudaDropWrapper,
+    safety::{PortableBitSemantics, StackOnly},
+    utils::{
+        adapter::DeviceCopyWithPortableBitSemantics,
+        ffi::{DeviceAccessible, DeviceMutPointer},
+        r#async::{Async, CompletionFnMut, NoCompletion},
+    },
 };
 
 use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem};
 
 #[allow(clippy::module_name_repetitions)]
-#[doc(cfg(feature = "host"))]
-/// When the `host` feature is **not** set,
-/// [`CudaExchangeBuffer`](super::CudaExchangeBuffer)
-/// refers to
-/// [`CudaExchangeBufferDevice`](super::CudaExchangeBufferDevice)
-/// instead.
-/// [`CudaExchangeBufferHost`](Self) is never exposed directly.
 pub struct CudaExchangeBufferHost<
-    T: SafeDeviceCopy + TypeGraphLayout,
+    T: StackOnly + PortableBitSemantics + TypeGraphLayout,
     const M2D: bool,
    const M2H: bool,
 > {
-    host_buffer: CudaDropWrapper<LockedBuffer<CudaExchangeItem<T, M2D, M2H>>>,
-    device_buffer: UnsafeCell<CudaDropWrapper<DeviceBuffer<CudaExchangeItem<T, M2D, M2H>>>>,
+    host_buffer: CudaDropWrapper<
+        LockedBuffer<DeviceCopyWithPortableBitSemantics<CudaExchangeItem<T, M2D, M2H>>>,
+    >,
+    device_buffer: UnsafeCell<
+        CudaDropWrapper<
+            DeviceBuffer<DeviceCopyWithPortableBitSemantics<CudaExchangeItem<T, M2D, M2H>>>,
+        >,
+    >,
 }
 
-impl<T: Clone + SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool>
-    CudaExchangeBufferHost<T, M2D, M2H>
+impl<
+        T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout,
+        const M2D: bool,
+        const M2H: bool,
+    > CudaExchangeBufferHost<T, M2D, M2H>
 {
     /// # Errors
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
     pub fn new(elem: &T, capacity: usize) -> CudaResult<Self> {
         // Safety: CudaExchangeItem is a `repr(transparent)` wrapper around T
-        let elem: &CudaExchangeItem<T, M2D, M2H> = unsafe { &*(elem as *const T).cast() };
+        let elem: &CudaExchangeItem<T, M2D, M2H> = unsafe { &*std::ptr::from_ref(elem).cast() };
 
-        let host_buffer = CudaDropWrapper::from(LockedBuffer::new(elem, capacity)?);
+        let host_buffer = CudaDropWrapper::from(LockedBuffer::new(
+            DeviceCopyWithPortableBitSemantics::from_ref(elem),
+            capacity,
+        )?);
         let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice(
             host_buffer.as_slice(),
         )?));
@@ -56,20 +66,30 @@ impl<T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D:
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
     CudaExchangeBufferHost<T, M2D, M2H>
 {
     /// # Errors
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
     pub fn from_vec(vec: Vec<T>) -> CudaResult<Self> {
-        let mut host_buffer_uninit =
-            CudaDropWrapper::from(unsafe { LockedBuffer::uninitialized(vec.len())? });
+        let host_buffer = unsafe {
+            let mut uninit: CudaDropWrapper<
+                LockedBuffer<DeviceCopyWithPortableBitSemantics<CudaExchangeItem<T, M2D, M2H>>>,
+            > = CudaDropWrapper::from(LockedBuffer::uninitialized(vec.len())?);
 
-        for (src, dst) in vec.into_iter().zip(host_buffer_uninit.iter_mut()) {
-            *dst = CudaExchangeItem(src);
-        }
+            let uninit_ptr: *mut DeviceCopyWithPortableBitSemantics<CudaExchangeItem<T, M2D, M2H>> =
+                uninit.as_mut_ptr();
+
+            for (i, src) in vec.into_iter().enumerate() {
+                uninit_ptr
+                    .add(i)
+                    .write(DeviceCopyWithPortableBitSemantics::from(CudaExchangeItem(
+                        src,
+                    )));
+            }
 
-        let host_buffer = host_buffer_uninit;
+            uninit
+        };
 
         let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice(
             host_buffer.as_slice(),
@@ -82,37 +102,34 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, co
     }
 }
 
-impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> Deref
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool> Deref
     for CudaExchangeBufferHost<T, M2D, M2H>
 {
     type Target = [CudaExchangeItem<T, M2D, M2H>];
 
     fn deref(&self) -> &Self::Target {
-        self.host_buffer.as_slice()
+        DeviceCopyWithPortableBitSemantics::into_slice(self.host_buffer.as_slice())
     }
 }
 
-impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> DerefMut
-    for CudaExchangeBufferHost<T, M2D, M2H>
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
    DerefMut for CudaExchangeBufferHost<T, M2D, M2H>
 {
     fn deref_mut(&mut self) -> &mut Self::Target {
-        self.host_buffer.as_mut_slice()
+        DeviceCopyWithPortableBitSemantics::into_mut_slice(self.host_buffer.as_mut_slice())
     }
 }
 
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> RustToCuda
-    for CudaExchangeBufferHost<T, M2D, M2H>
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    CudaExchangeBufferHost<T, M2D, M2H>
 {
-    type CudaAllocation = NullCudaAlloc;
-    type CudaRepresentation = CudaExchangeBufferCudaRepresentation<T, M2D, M2H>;
-
     #[allow(clippy::type_complexity)]
-    unsafe fn borrow<A: CudaAlloc>(
+    pub unsafe fn borrow<A: CudaAlloc>(
         &self,
         alloc: A,
     ) -> rustacuda::error::CudaResult<(
-        DeviceAccessible<Self::CudaRepresentation>,
-        CombinedCudaAlloc<A, Self::CudaAllocation>,
+        DeviceAccessible<CudaExchangeBufferCudaRepresentation<T, M2D, M2H>>,
+        CombinedCudaAlloc<A, NoCudaAlloc>,
     )> {
         // Safety: device_buffer is inside an UnsafeCell
         // borrow checks must be satisfied through LendToCuda
@@ -129,17 +146,17 @@ unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bo
-    unsafe fn restore<A: CudaAlloc>(
+    pub unsafe fn restore<A: CudaAlloc>(
         &mut self,
-        alloc: CombinedCudaAlloc<A, Self::CudaAllocation>,
+        alloc: CombinedCudaAlloc<A, NoCudaAlloc>,
     ) -> rustacuda::error::CudaResult<A> {
         let (_alloc_front, alloc_tail) = alloc.split();
@@ -155,3 +172,76 @@ unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bo
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    CudaExchangeBufferHost<T, M2D, M2H>
+{
+    #[allow(clippy::type_complexity)]
+    pub unsafe fn borrow_async<'stream, A: CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'_, 'stream, DeviceAccessible<CudaExchangeBufferCudaRepresentation<T, M2D, M2H>>>,
+        CombinedCudaAlloc<A, NoCudaAlloc>,
+    )> {
+        // Safety: device_buffer is inside an UnsafeCell
+        // borrow checks must be satisfied through LendToCuda
+        let device_buffer = &mut *self.device_buffer.get();
+
+        if M2D {
+            // Only move the buffer contents to the device if needed
+
+            rustacuda::memory::AsyncCopyDestination::async_copy_from(
+                &mut ***device_buffer,
+                self.host_buffer.as_slice(),
+                &stream,
+            )?;
+        }
+
+        let cuda_repr = DeviceAccessible::from(CudaExchangeBufferCudaRepresentation(
+            DeviceMutPointer(device_buffer.as_mut_ptr().cast()),
+            device_buffer.len(),
+        ));
+
+        let r#async = if M2D {
+            Async::pending(cuda_repr, stream, NoCompletion)?
+        } else {
+            Async::ready(cuda_repr, stream)
+        };
+
+        Ok((r#async, CombinedCudaAlloc::new(NoCudaAlloc, alloc)))
+    }
+
+    #[allow(clippy::type_complexity)]
+    pub unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>(
+        mut this: owning_ref::BoxRefMut<'a, O, Self>,
+        alloc: CombinedCudaAlloc<A, NoCudaAlloc>,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
+        A,
+    )> {
+        let (_alloc_front, alloc_tail) = alloc.split();
+
+        if M2H {
+            // Only move the buffer contents back to the host if needed
+
+            let this: &mut Self = &mut this;
+
+            rustacuda::memory::AsyncCopyDestination::async_copy_to(
+                &***this.device_buffer.get_mut(),
+                this.host_buffer.as_mut_slice(),
+                &stream,
+            )?;
+        }
+
+        let r#async = if M2H {
+            Async::<_, CompletionFnMut<'a, Self>>::pending(this, stream, Box::new(|_this| Ok(())))?
+        } else {
+            Async::ready(this, stream)
+        };
+
+        Ok((r#async, alloc_tail))
+    }
+}
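A hedged usage sketch of the host-side constructors above (this assumes the rust_cuda crate with the `host` feature and an already-initialised CUDA context; it is not runnable standalone).

use rust_cuda::utils::exchange::buffer::CudaExchangeBuffer;

fn demo() -> rustacuda::error::CudaResult<()> {
    // 1024 items cloned from one element, moved host -> device only
    let _input: CudaExchangeBuffer<f32, true, false> =
        CudaExchangeBuffer::new(&0.0_f32, 1024)?;

    // contents taken from a Vec, moved in both directions
    let _inout: CudaExchangeBuffer<u32, true, true> =
        CudaExchangeBuffer::from_vec(vec![0; 256])?;

    Ok(())
}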
diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs
index 3648f9d04..1736b30ea 100644
--- a/src/utils/exchange/buffer/mod.rs
+++ b/src/utils/exchange/buffer/mod.rs
@@ -1,62 +1,296 @@
+#[cfg(any(feature = "host", feature = "device"))]
+use core::{
+    mem::MaybeUninit,
+    ops::{Deref, DerefMut},
+};
+
+use const_type_layout::TypeLayout;
+
+use const_type_layout::TypeGraphLayout;
+
+use crate::safety::{PortableBitSemantics, StackOnly};
+
+#[cfg(any(feature = "host", feature = "device"))]
+use crate::{
+    alloc::NoCudaAlloc,
+    lend::{RustToCuda, RustToCudaAsync},
+};
+
+#[cfg(feature = "host")]
+use crate::{
+    alloc::{CombinedCudaAlloc, CudaAlloc},
+    utils::ffi::DeviceAccessible,
+    utils::r#async::{Async, CompletionFnMut},
+};
+
+#[cfg(any(feature = "host", feature = "device"))]
+use self::common::CudaExchangeBufferCudaRepresentation;
+
+#[cfg(any(feature = "host", feature = "device"))]
 mod common;
-#[cfg(any(not(feature = "host"), doc))]
+#[cfg(feature = "device")]
 mod device;
 #[cfg(feature = "host")]
 mod host;
 
-#[cfg(not(feature = "host"))]
+#[cfg(any(feature = "host", feature = "device"))]
 #[allow(clippy::module_name_repetitions)]
-pub use device::CudaExchangeBufferDevice as CudaExchangeBuffer;
+pub struct CudaExchangeBuffer<
+    T: StackOnly + PortableBitSemantics + TypeGraphLayout,
+    const M2D: bool,
+    const M2H: bool,
+> {
+    #[cfg(feature = "host")]
+    inner: host::CudaExchangeBufferHost<T, M2D, M2H>,
+    #[cfg(all(feature = "device", not(feature = "host")))]
+    inner: device::CudaExchangeBufferDevice<T, M2D, M2H>,
+}
+
+#[cfg(any(feature = "host", feature = "device"))]
+unsafe impl<
+        T: StackOnly + PortableBitSemantics + TypeGraphLayout + Sync,
+        const M2D: bool,
+        const M2H: bool,
+    > Sync for CudaExchangeBuffer<T, M2D, M2H>
+{
+}
+
 #[cfg(feature = "host")]
-#[allow(clippy::module_name_repetitions)]
-pub use host::CudaExchangeBufferHost as CudaExchangeBuffer;
+impl<
+        T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout,
+        const M2D: bool,
+        const M2H: bool,
+    > CudaExchangeBuffer<T, M2D, M2H>
+{
+    /// # Errors
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    pub fn new(elem: &T, capacity: usize) -> rustacuda::error::CudaResult<Self> {
+        Ok(Self {
+            inner: host::CudaExchangeBufferHost::new(elem, capacity)?,
+        })
+    }
+}
+
+#[cfg(feature = "host")]
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    CudaExchangeBuffer<T, M2D, M2H>
+{
+    /// # Errors
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    pub fn from_vec(vec: Vec<T>) -> rustacuda::error::CudaResult<Self> {
+        Ok(Self {
+            inner: host::CudaExchangeBufferHost::from_vec(vec)?,
+        })
+    }
+}
 
-#[cfg(doc)]
-pub use self::{device::CudaExchangeBufferDevice, host::CudaExchangeBufferHost};
+#[cfg(any(feature = "host", feature = "device"))]
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool> Deref
+    for CudaExchangeBuffer<T, M2D, M2H>
+{
+    type Target = [CudaExchangeItem<T, M2D, M2H>];
 
-use crate::safety::SafeDeviceCopy;
+    fn deref(&self) -> &Self::Target {
+        &self.inner
+    }
+}
 
-#[repr(transparent)]
-#[derive(Clone, Copy, TypeLayout)]
-pub struct CudaExchangeItem<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool>(T);
+#[cfg(any(feature = "host", feature = "device"))]
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    DerefMut for CudaExchangeBuffer<T, M2D, M2H>
+{
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.inner
+    }
+}
 
-// Safety: Transparent newtype wrapper around `SafeDeviceCopy`
-// is `DeviceCopy`
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool>
-    rustacuda_core::DeviceCopy for CudaExchangeItem<T, M2D, M2H>
+#[cfg(any(feature = "host", feature = "device"))]
+unsafe impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    RustToCuda for CudaExchangeBuffer<T, M2D, M2H>
 {
+    type CudaAllocation = NoCudaAlloc;
+    type CudaRepresentation = CudaExchangeBufferCudaRepresentation<T, M2D, M2H>;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> rustacuda::error::CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        CombinedCudaAlloc<A, Self::CudaAllocation>,
+    )> {
+        self.inner.borrow(alloc)
+    }
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn restore<A: CudaAlloc>(
+        &mut self,
+        alloc: CombinedCudaAlloc<A, Self::CudaAllocation>,
+    ) -> rustacuda::error::CudaResult<A> {
+        self.inner.restore(alloc)
+    }
 }
 
+#[cfg(any(feature = "host", feature = "device"))]
+unsafe impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    RustToCudaAsync for CudaExchangeBuffer<T, M2D, M2H>
+{
+    type CudaAllocationAsync = NoCudaAlloc;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow_async<'stream, A: CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
+        CombinedCudaAlloc<A, Self::CudaAllocationAsync>,
+    )> {
+        self.inner.borrow_async(alloc, stream)
+    }
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>(
+        this: owning_ref::BoxRefMut<'a, O, Self>,
+        alloc: CombinedCudaAlloc<A, Self::CudaAllocationAsync>,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
+        A,
+    )> {
+        let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) };
+
+        let (r#async, alloc_tail) = host::CudaExchangeBufferHost::restore_async(
+            this.map_mut(|this| &mut this.inner),
+            alloc,
+            stream,
+        )?;
+
+        let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? };
+
+        std::mem::forget(inner);
+        let this = std::mem::ManuallyDrop::into_inner(this_backup);
+
+        if let Some(on_completion) = on_completion {
+            let r#async = Async::<_, CompletionFnMut<'a, Self>>::pending(
+                this,
+                stream,
+                Box::new(|this: &mut Self| on_completion(&mut this.inner)),
+            )?;
+            Ok((r#async, alloc_tail))
+        } else {
+            let r#async = Async::ready(this, stream);
+            Ok((r#async, alloc_tail))
+        }
+    }
+}
+
+#[repr(transparent)]
+#[derive(Clone, Copy, TypeLayout)]
+pub struct CudaExchangeItem<
+    T: StackOnly + PortableBitSemantics + TypeGraphLayout,
+    const M2D: bool,
+    const M2H: bool,
+>(T);
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool>
+    CudaExchangeItem<T, M2D, true>
+{
+    #[cfg(feature = "host")]
+    pub const fn read(&self) -> &T {
         &self.0
     }
 
-    #[cfg(any(not(feature = "host"), doc))]
-    #[doc(cfg(not(feature = "host")))]
+    #[cfg(feature = "device")]
     pub fn write(&mut self, value: T) {
         self.0 = value;
     }
 }
 
-impl<T: SafeDeviceCopy + TypeGraphLayout, const M2H: bool> CudaExchangeItem<T, true, M2H> {
-    #[cfg(any(not(feature = "host"), doc))]
-    #[doc(cfg(not(feature = "host")))]
-    pub fn read(&self) -> &T {
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2H: bool>
+    CudaExchangeItem<T, true, M2H>
+{
+    #[cfg(feature = "device")]
+    pub const fn read(&self) -> &T {
         &self.0
     }
 
-    #[cfg(any(feature = "host", doc))]
-    #[doc(cfg(feature = "host"))]
+    #[cfg(feature = "host")]
     pub fn write(&mut self, value: T) {
         self.0 = value;
     }
 }
 
-impl<T: SafeDeviceCopy + TypeGraphLayout> AsMut<T> for CudaExchangeItem<T, true, true> {
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout> AsMut<T>
+    for CudaExchangeItem<T, true, true>
+{
     fn as_mut(&mut self) -> &mut T {
         &mut self.0
     }
 }
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2H: bool>
+    CudaExchangeItem<T, false, M2H>
+{
+    #[cfg(feature = "host")]
+    pub const fn as_scratch(&self) -> &T {
+        &self.0
+    }
+
+    #[cfg(feature = "host")]
+    pub fn as_scratch_mut(&mut self) -> &mut T {
+        &mut self.0
+    }
+}
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool>
+    CudaExchangeItem<T, M2D, false>
+{
+    #[cfg(feature = "device")]
+    pub const fn as_scratch(&self) -> &T {
+        &self.0
+    }
+
+    #[cfg(feature = "device")]
+    pub fn as_scratch_mut(&mut self) -> &mut T {
+        &mut self.0
+    }
+}
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout> CudaExchangeItem<T, true, false> {
+    #[cfg(feature = "host")]
+    pub const fn as_uninit(&self) -> &MaybeUninit<T> {
+        // Safety:
+        // - MaybeUninit is a transparent newtype union
+        // - CudaExchangeItem is a transparent newtype
+        unsafe { &*core::ptr::from_ref(self).cast() }
+    }
+
+    #[cfg(feature = "host")]
+    pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit<T> {
+        // Safety:
+        // - MaybeUninit is a transparent newtype union
+        // - CudaExchangeItem is a transparent newtype
+        unsafe { &mut *core::ptr::from_mut(self).cast() }
+    }
+}
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout> CudaExchangeItem<T, false, true> {
+    #[cfg(feature = "device")]
+    pub const fn as_uninit(&self) -> &MaybeUninit<T> {
+        // Safety:
+        // - MaybeUninit is a transparent newtype union
+        // - CudaExchangeItem is a transparent newtype
+        unsafe { &*core::ptr::from_ref(self).cast() }
+    }
+
+    #[cfg(feature = "device")]
+    pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit<T> {
+        // Safety:
+        // - MaybeUninit is a transparent newtype union
+        // - CudaExchangeItem is a transparent newtype
+        unsafe { &mut *core::ptr::from_mut(self).cast() }
+    }
+}
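A hedged sketch of the const-generic direction gating above (assumes the rust_cuda crate with the `host` feature; the device half is shown as comments because those methods only exist under the `device` feature).

use rust_cuda::utils::exchange::buffer::CudaExchangeItem;

fn host_side(
    input: &mut CudaExchangeItem<u32, true, false>,
    output: &CudaExchangeItem<u32, false, true>,
) {
    input.write(42); // host may write: this value moves host -> device
    let _result = output.read(); // host may read: this value moved device -> host

    // On the device the capabilities are mirrored:
    //   input.read()    -- M2D items are readable in the kernel
    //   output.write(v) -- M2H items are writable in the kernel
}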
diff --git a/src/utils/exchange/mod.rs b/src/utils/exchange/mod.rs
index ffca4bbf3..722e02559 100644
--- a/src/utils/exchange/mod.rs
+++ b/src/utils/exchange/mod.rs
@@ -1,5 +1,4 @@
 pub mod buffer;
 
 #[cfg(feature = "host")]
-#[doc(cfg(feature = "host"))]
 pub mod wrapper;
diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs
index 26958f491..bb137a4af 100644
--- a/src/utils/exchange/wrapper.rs
+++ b/src/utils/exchange/wrapper.rs
@@ -1,56 +1,152 @@
-use core::ops::{Deref, DerefMut};
+use std::ops::{Deref, DerefMut};
 
-use rustacuda::{error::CudaResult, memory::DeviceBox};
+use rustacuda::{
+    error::CudaResult,
+    memory::{AsyncCopyDestination, CopyDestination, DeviceBox, LockedBox},
+};
 
 use crate::{
-    common::{DeviceAccessible, RustToCuda},
-    host::{
-        CombinedCudaAlloc, EmptyCudaAlloc, HostAndDeviceConstRef, HostAndDeviceMutRef,
-        HostDeviceBox, NullCudaAlloc,
+    alloc::{EmptyCudaAlloc, NoCudaAlloc},
+    host::{CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceMutRef, Stream},
+    lend::{RustToCuda, RustToCudaAsync},
+    safety::SafeMutableAliasing,
+    utils::{
+        adapter::DeviceCopyWithPortableBitSemantics,
+        ffi::DeviceAccessible,
+        r#async::{Async, AsyncProj, CompletionFnMut, NoCompletion},
     },
 };
 
 #[allow(clippy::module_name_repetitions)]
 pub struct ExchangeWrapperOnHost<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> {
-    value: T,
-    device_box: HostDeviceBox<DeviceAccessible<<T as RustToCuda>::CudaRepresentation>>,
+    value: Box<T>,
+    device_box: CudaDropWrapper<
+        DeviceBox<
+            DeviceCopyWithPortableBitSemantics<
+                DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+            >,
+        >,
+    >,
+    locked_cuda_repr: CudaDropWrapper<
+        LockedBox<
+            DeviceCopyWithPortableBitSemantics<
+                DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+            >,
+        >,
+    >,
 }
 
 #[allow(clippy::module_name_repetitions)]
 pub struct ExchangeWrapperOnDevice<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> {
-    value: T,
-    device_box: HostDeviceBox<DeviceAccessible<<T as RustToCuda>::CudaRepresentation>>,
-    cuda_repr: DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
-    null_alloc: CombinedCudaAlloc<<T as RustToCuda>::CudaAllocation, NullCudaAlloc>,
+    value: Box<T>,
+    device_box: CudaDropWrapper<
+        DeviceBox<
+            DeviceCopyWithPortableBitSemantics<
+                DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+            >,
+        >,
+    >,
+    locked_cuda_repr: CudaDropWrapper<
+        LockedBox<
+            DeviceCopyWithPortableBitSemantics<
+                DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+            >,
+        >,
+    >,
 }
 
 impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> ExchangeWrapperOnHost<T> {
     /// # Errors
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
     pub fn new(value: T) -> CudaResult<Self> {
-        let (cuda_repr, _null_alloc) = unsafe { value.borrow(NullCudaAlloc) }?;
+        // Safety: The uninitialised memory is never exposed
+        // To access the device memory, [`Self::move_to_device`] has to
+        // be called first, which initialises the memory.
+        let device_box = CudaDropWrapper::from(unsafe { DeviceBox::uninitialized() }?);
 
-        let device_box = DeviceBox::new(&cuda_repr)?.into();
+        let (cuda_repr, _null_alloc) = unsafe { value.borrow(NoCudaAlloc) }?;
+        let locked_cuda_repr = unsafe {
+            let mut uninit = CudaDropWrapper::from(LockedBox::<
+                DeviceCopyWithPortableBitSemantics<
+                    DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+                >,
+            >::uninitialized()?);
+            uninit
+                .as_mut_ptr()
+                .write(DeviceCopyWithPortableBitSemantics::from(cuda_repr));
+            uninit
+        };
 
-        Ok(Self { value, device_box })
+        Ok(Self {
+            value: Box::new(value),
+            device_box,
+            locked_cuda_repr,
+        })
     }
 
+    /// Moves the data synchronously to the CUDA device, where it can then be
+    /// lent out immutably via [`ExchangeWrapperOnDevice::as_ref`], or mutably
+    /// via [`ExchangeWrapperOnDevice::as_mut_async`](Async::as_mut_async).
+    ///
     /// # Errors
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
     pub fn move_to_device(mut self) -> CudaResult<ExchangeWrapperOnDevice<T>> {
-        let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NullCudaAlloc) }?;
+        let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NoCudaAlloc) }?;
+        **self.locked_cuda_repr = DeviceCopyWithPortableBitSemantics::from(cuda_repr);
+
+        self.device_box.copy_from(&**self.locked_cuda_repr)?;
 
-        self.device_box.copy_from(&cuda_repr)?;
+        let _: NoCudaAlloc = null_alloc.into();
 
         Ok(ExchangeWrapperOnDevice {
             value: self.value,
             device_box: self.device_box,
-            cuda_repr,
-            null_alloc,
+            locked_cuda_repr: self.locked_cuda_repr,
         })
     }
 }
 
+impl<T: RustToCudaAsync<CudaAllocationAsync: EmptyCudaAlloc, CudaAllocation: EmptyCudaAlloc>>
+    ExchangeWrapperOnHost<T>
+{
+    #[allow(clippy::needless_lifetimes)] // keep 'stream explicit
+    /// Moves the data asynchronously to the CUDA device.
+    ///
+    /// # Errors
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    pub fn move_to_device_async<'stream>(
+        mut self,
+        stream: Stream<'stream>,
+    ) -> CudaResult<Async<'static, 'stream, ExchangeWrapperOnDevice<T>, NoCompletion>> {
+        let (cuda_repr, _null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?;
+        let (cuda_repr, _completion): (_, Option<NoCompletion>) =
+            unsafe { cuda_repr.unwrap_unchecked()? };
+
+        **self.locked_cuda_repr = DeviceCopyWithPortableBitSemantics::from(cuda_repr);
+
+        // Safety: The device value is not safely exposed until either
+        // - the passed-in [`Stream`] is synchronised
+        // - the kernel is launched on the passed-in [`Stream`]
+        unsafe {
+            self.device_box
+                .async_copy_from(&*self.locked_cuda_repr, &stream)
+        }?;
+
+        Async::pending(
+            ExchangeWrapperOnDevice {
+                value: self.value,
+                device_box: self.device_box,
+                locked_cuda_repr: self.locked_cuda_repr,
+            },
+            stream,
+            NoCompletion,
+        )
+    }
+}
+
 impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> Deref for ExchangeWrapperOnHost<T> {
     type Target = T;
 
@@ -66,28 +162,200 @@ impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> DerefMut for ExchangeWrapper
 }
 
 impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> ExchangeWrapperOnDevice<T> {
+    /// Moves the data synchronously back to the host CPU.
+    ///
     /// # Errors
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
     pub fn move_to_host(mut self) -> CudaResult<ExchangeWrapperOnHost<T>> {
-        let _null_alloc: NullCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?;
+        let null_alloc = NoCudaAlloc.into();
+
+        // Reflect deep changes back to the CPU
+        let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?;
+
+        // Note: Shallow changes are not reflected back to the CPU
 
         Ok(ExchangeWrapperOnHost {
             value: self.value,
             device_box: self.device_box,
+            locked_cuda_repr: self.locked_cuda_repr,
         })
     }
 
+    #[must_use]
     pub fn as_ref(
         &self,
     ) -> HostAndDeviceConstRef<DeviceAccessible<<T as RustToCuda>::CudaRepresentation>> {
-        // Safety: `device_box` contains exactly the device copy of `cuda_repr`
-        unsafe { HostAndDeviceConstRef::new(&self.device_box, &self.cuda_repr) }
+        // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr`
+        unsafe {
+            HostAndDeviceConstRef::new_unchecked(
+                &self.device_box,
+                (**self.locked_cuda_repr).into_ref(),
+            )
+        }
     }
+}
+
+impl<T: RustToCudaAsync<CudaAllocationAsync: EmptyCudaAlloc, CudaAllocation: EmptyCudaAlloc>>
+    ExchangeWrapperOnDevice<T>
+{
+    #[allow(clippy::needless_lifetimes)] // keep 'stream explicit
+    /// Moves the data asynchronously back to the host CPU.
+    ///
+    /// # Errors
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    pub fn move_to_host_async<'stream>(
+        self,
+        stream: Stream<'stream>,
+    ) -> CudaResult<
+        Async<
+            'static,
+            'stream,
+            ExchangeWrapperOnHost<T>,
+            CompletionFnMut<'static, ExchangeWrapperOnHost<T>>,
+        >,
+    > {
+        let null_alloc = NoCudaAlloc.into();
+
+        let value = owning_ref::BoxRefMut::new(self.value);
+
+        // Reflect deep changes back to the CPU
+        let (r#async, _null_alloc): (_, NoCudaAlloc) =
+            unsafe { RustToCudaAsync::restore_async(value, null_alloc, stream) }?;
+        let (value, on_complete) = unsafe { r#async.unwrap_unchecked()? };
+
+        let value = value.into_owner();
+
+        // Note: Shallow changes are not reflected back to the CPU
+
+        if let Some(on_complete) = on_complete {
+            Async::<_, CompletionFnMut<'static, ExchangeWrapperOnHost<T>>>::pending(
+                ExchangeWrapperOnHost {
+                    value,
+                    device_box: self.device_box,
+                    locked_cuda_repr: self.locked_cuda_repr,
+                },
+                stream,
+                Box::new(|on_host: &mut ExchangeWrapperOnHost<T>| on_complete(&mut on_host.value)),
+            )
+        } else {
+            Ok(Async::ready(
+                ExchangeWrapperOnHost {
+                    value,
+                    device_box: self.device_box,
+                    locked_cuda_repr: self.locked_cuda_repr,
+                },
+                stream,
+            ))
+        }
+    }
+}
+
+impl<
+        'a,
+        'stream,
+        T: RustToCudaAsync<CudaAllocationAsync: EmptyCudaAlloc, CudaAllocation: EmptyCudaAlloc>,
+    > Async<'a, 'stream, ExchangeWrapperOnDevice<T>, NoCompletion>
+{
+    /// Moves the data asynchronously back to the host CPU.
+    ///
+    /// # Errors
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    pub fn move_to_host_async(
+        self,
+        stream: Stream<'stream>,
+    ) -> CudaResult<
+        Async<
+            'static,
+            'stream,
+            ExchangeWrapperOnHost<T>,
+            CompletionFnMut<'static, ExchangeWrapperOnHost<T>>,
+        >,
+    > {
+        let (this, completion): (_, Option<NoCompletion>) = unsafe { self.unwrap_unchecked()? };
+
+        let null_alloc = NoCudaAlloc.into();
+
+        let value = owning_ref::BoxRefMut::new(this.value);
+
+        // Reflect deep changes back to the CPU
+        let (r#async, _null_alloc): (_, NoCudaAlloc) =
+            unsafe { RustToCudaAsync::restore_async(value, null_alloc, stream) }?;
+        let (value, on_complete) = unsafe { r#async.unwrap_unchecked()? };
+
+        let value = value.into_owner();
+
+        // Note: Shallow changes are not reflected back to the CPU
+
+        let on_host = ExchangeWrapperOnHost {
+            value,
+            device_box: this.device_box,
+            locked_cuda_repr: this.locked_cuda_repr,
+        };
+
+        if let Some(on_complete) = on_complete {
+            Async::<_, CompletionFnMut<'static, ExchangeWrapperOnHost<T>>>::pending(
+                on_host,
+                stream,
+                Box::new(|on_host: &mut ExchangeWrapperOnHost<T>| on_complete(&mut on_host.value)),
+            )
+        } else if matches!(completion, Some(NoCompletion)) {
+            Async::<_, CompletionFnMut<'static, ExchangeWrapperOnHost<T>>>::pending(
+                on_host,
+                stream,
+                Box::new(|_on_host: &mut ExchangeWrapperOnHost<T>| Ok(())),
+            )
+        } else {
+            Ok(Async::ready(on_host, stream))
+        }
+    }
+
+    #[must_use]
+    pub fn as_ref_async(
+        &self,
+    ) -> AsyncProj<
+        '_,
+        'stream,
+        HostAndDeviceConstRef<'_, DeviceAccessible<<T as RustToCuda>::CudaRepresentation>>,
+    > {
+        let this = unsafe { self.as_ref().unwrap_unchecked() };
+
+        // Safety: this projection captures this async
+        unsafe {
+            AsyncProj::new(
+                HostAndDeviceConstRef::new_unchecked(
+                    &*(this.device_box),
+                    (**(this.locked_cuda_repr)).into_ref(),
+                ),
+                None,
+            )
+        }
+    }
 
-    pub fn as_mut(
+    #[must_use]
+    pub fn as_mut_async(
         &mut self,
-    ) -> HostAndDeviceMutRef<DeviceAccessible<<T as RustToCuda>::CudaRepresentation>> {
-        // Safety: `device_box` contains exactly the device copy of `cuda_repr`
-        unsafe { HostAndDeviceMutRef::new(&mut self.device_box, &mut self.cuda_repr) }
+    ) -> AsyncProj<
+        '_,
+        'stream,
+        HostAndDeviceMutRef<'_, DeviceAccessible<<T as RustToCuda>::CudaRepresentation>>,
+    >
+    where
+        T: SafeMutableAliasing,
+    {
+        let (this, use_callback) = unsafe { self.as_mut().unwrap_unchecked_with_use() };
+
+        // Safety: this projection captures this async
+        unsafe {
+            AsyncProj::new(
+                HostAndDeviceMutRef::new_unchecked(
+                    &mut *(this.device_box),
+                    (**(this.locked_cuda_repr)).into_mut(),
+                ),
+                use_callback,
+            )
+        }
     }
 }
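A hedged end-to-end sketch of the exchange wrapper above (assumes the rust_cuda crate, an initialised CUDA context, and a hypothetical `Data` type that implements `RustToCuda` with an empty allocation; not runnable standalone).

use rust_cuda::utils::exchange::wrapper::ExchangeWrapperOnHost;

fn round_trip(data: Data) -> rustacuda::error::CudaResult<()> {
    let on_host = ExchangeWrapperOnHost::new(data)?;

    // one explicit host -> device copy
    let on_device = on_host.move_to_device()?;

    // the device copy can now be lent to a kernel via `on_device.as_ref()`

    // one explicit device -> host copy: deep changes are restored,
    // shallow changes are not
    let on_host = on_device.move_to_host()?;
    let _data: &Data = &on_host; // the wrapper derefs to `Data` again

    Ok(())
}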
diff --git a/src/utils/ffi.rs b/src/utils/ffi.rs
new file mode 100644
index 000000000..52d7f691d
--- /dev/null
+++ b/src/utils/ffi.rs
@@ -0,0 +1,208 @@
+use core::marker::PhantomData;
+#[cfg(feature = "device")]
+use core::{
+    convert::{AsMut, AsRef},
+    ops::{Deref, DerefMut},
+};
+#[cfg(feature = "host")]
+use std::fmt;
+
+use const_type_layout::{TypeGraphLayout, TypeLayout};
+
+use crate::safety::PortableBitSemantics;
+#[cfg(feature = "host")]
+use crate::{lend::CudaAsRust, utils::adapter::RustToCudaWithPortableBitCopySemantics};
+
+#[cfg_attr(any(feature = "device", doc), derive(Debug))]
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceAccessible<T: PortableBitSemantics>(T);
+
+#[cfg(feature = "host")]
+impl<T: PortableBitSemantics> From<T> for DeviceAccessible<T> {
+    fn from(value: T) -> Self {
+        Self(value)
+    }
+}
+
+#[cfg(feature = "host")]
+impl<T: Copy + PortableBitSemantics + TypeGraphLayout> From<&T>
+    for DeviceAccessible<RustToCudaWithPortableBitCopySemantics<T>>
+{
+    fn from(value: &T) -> Self {
+        Self(RustToCudaWithPortableBitCopySemantics::from_copy(value))
+    }
+}
+
+#[cfg(all(feature = "host", not(doc)))]
+impl<T: CudaAsRust> fmt::Debug for DeviceAccessible<T> {
+    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+        fmt.debug_struct(stringify!(DeviceAccessible))
+            .finish_non_exhaustive()
+    }
+}
+
+#[cfg(feature = "device")]
+impl<T: PortableBitSemantics> Deref for DeviceAccessible<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+#[cfg(feature = "device")]
+impl<T: PortableBitSemantics> DerefMut for DeviceAccessible<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceConstRef<'r, T: PortableBitSemantics + 'r> {
+    #[cfg_attr(feature = "host", allow(dead_code))]
+    pub(crate) pointer: DeviceConstPointer<T>,
+    pub(crate) reference: PhantomData<&'r T>,
+}
+
+impl<'r, T: PortableBitSemantics> Copy for DeviceConstRef<'r, T> {}
+
+impl<'r, T: PortableBitSemantics> Clone for DeviceConstRef<'r, T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+#[cfg(feature = "device")]
+impl<'r, T: PortableBitSemantics> AsRef<T> for DeviceConstRef<'r, T> {
+    fn as_ref(&self) -> &T {
+        unsafe { &*self.pointer.0 }
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceMutRef<'r, T: PortableBitSemantics + 'r> {
+    #[cfg_attr(feature = "host", allow(dead_code))]
+    pub(crate) pointer: DeviceMutPointer<T>,
+    pub(crate) reference: PhantomData<&'r mut T>,
+}
+
+#[cfg(feature = "device")]
+impl<'r, T: PortableBitSemantics> AsRef<T> for DeviceMutRef<'r, T> {
+    fn as_ref(&self) -> &T {
+        unsafe { &*self.pointer.0 }
+    }
+}
+
+#[cfg(feature = "device")]
+impl<'r, T: PortableBitSemantics> AsMut<T> for DeviceMutRef<'r, T> {
+    fn as_mut(&mut self) -> &mut T {
+        unsafe { &mut *self.pointer.0 }
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceOwnedRef<'r, T: PortableBitSemantics> {
+    #[cfg_attr(feature = "host", allow(dead_code))]
+    pub(crate) pointer: DeviceOwnedPointer<T>,
+    pub(crate) reference: PhantomData<&'r mut ()>,
+    pub(crate) marker: PhantomData<T>,
+}
+
+#[cfg(feature = "device")]
+impl<'r, T: PortableBitSemantics> AsRef<T> for DeviceOwnedRef<'r, T> {
+    fn as_ref(&self) -> &T {
+        unsafe { &*self.pointer.0 }
+    }
+}
+
+#[cfg(feature = "device")]
+impl<'r, T: PortableBitSemantics> AsMut<T> for DeviceOwnedRef<'r, T> {
+    fn as_mut(&mut self) -> &mut T {
+        unsafe { &mut *self.pointer.0 }
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceConstPointer<T: ?Sized>(pub(crate) *const T);
+
+impl<T: ?Sized> Copy for DeviceConstPointer<T> {}
+
+impl<T: ?Sized> Clone for DeviceConstPointer<T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T> DeviceConstPointer<[T]> {
+    #[must_use]
+    pub fn into_raw_parts(self) -> (DeviceConstPointer<T>, usize) {
+        let (data, len) = self.0.to_raw_parts();
+        (DeviceConstPointer(data.cast()), len)
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceMutPointer<T: ?Sized>(pub(crate) *mut T);
+
+impl<T: ?Sized> Copy for DeviceMutPointer<T> {}
+
+impl<T: ?Sized> Clone for DeviceMutPointer<T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T: ?Sized> DeviceMutPointer<T> {
+    #[must_use]
+    pub const fn as_const(self) -> DeviceConstPointer<T> {
+        DeviceConstPointer(self.0.cast_const())
+    }
+}
+
+impl<T> DeviceMutPointer<[T]> {
+    #[must_use]
+    pub fn into_raw_parts(self) -> (DeviceMutPointer<T>, usize) {
+        let (data, len) = self.0.to_raw_parts();
+        (DeviceMutPointer(data.cast()), len)
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceOwnedPointer<T: ?Sized>(pub(crate) *mut T);
+
+impl<T: ?Sized> Copy for DeviceOwnedPointer<T> {}
+
+impl<T: ?Sized> Clone for DeviceOwnedPointer<T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T: ?Sized> DeviceOwnedPointer<T> {
+    #[must_use]
+    pub const fn as_const(self) -> DeviceConstPointer<T> {
+        DeviceConstPointer(self.0.cast_const())
+    }
+
+    #[must_use]
+    pub const fn as_mut(self) -> DeviceMutPointer<T> {
+        DeviceMutPointer(self.0)
+    }
+}
+
+impl<T> DeviceOwnedPointer<[T]> {
+    #[must_use]
+    pub fn into_raw_parts(self) -> (DeviceOwnedPointer<T>, usize) {
+        let (data, len) = self.0.to_raw_parts();
+        (DeviceOwnedPointer(data.cast()), len)
+    }
+}
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
index 303e96262..e41a3c4ee 100644
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@@ -1,10 +1,6 @@
+pub mod adapter;
 pub mod aliasing;
-#[cfg(any(feature = "alloc", doc))]
-#[doc(cfg(feature = "alloc"))]
-pub mod alloc;
-pub mod device_copy;
+pub mod r#async;
 pub mod exchange;
-
-mod r#box;
-mod boxed_slice;
-mod option;
+pub mod ffi;
+pub mod shared;
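A std-only sketch of the design behind the pointer wrappers above: a `repr(transparent)` newtype around a raw pointer keeps the exact ABI of the pointer itself, so it can cross the host/device FFI boundary unchanged (`MutPtr` here is a hypothetical stand-in for `DeviceMutPointer`).

#[repr(transparent)]
struct MutPtr<T>(*mut T);

impl<T> MutPtr<T> {
    // Mirrors the `as_const` conversion above
    const fn as_const(self) -> *const T {
        self.0.cast_const()
    }
}

fn main() {
    let mut value = 7_u32;
    let ptr = MutPtr(&mut value as *mut u32);
    assert_eq!(unsafe { *ptr.as_const() }, 7);
    // The wrapper is exactly pointer-sized, i.e. FFI-compatible
    assert_eq!(
        core::mem::size_of::<MutPtr<u32>>(),
        core::mem::size_of::<*mut u32>()
    );
}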
diff --git a/src/utils/option.rs b/src/utils/option.rs
deleted file mode 100644
index 7ef601137..000000000
--- a/src/utils/option.rs
+++ /dev/null
@@ -1,116 +0,0 @@
-use core::mem::MaybeUninit;
-
-use const_type_layout::TypeGraphLayout;
-
-use crate::{
-    common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaProxy},
-    safety::SafeDeviceCopy,
-    utils::device_copy::SafeDeviceCopyWrapper,
-};
-
-#[cfg(feature = "host")]
-use crate::{host::CombinedCudaAlloc, host::CudaAlloc, rustacuda::error::CudaResult};
-
-#[doc(hidden)]
-#[allow(clippy::module_name_repetitions)]
-#[derive(TypeLayout)]
-#[repr(C)]
-pub struct OptionCudaRepresentation<T: CudaAsRust> {
-    maybe: MaybeUninit<DeviceAccessible<T>>,
-    present: bool,
-}
-
-// Safety: Since the CUDA representation of T is DeviceCopy,
-// the full enum is also DeviceCopy
-unsafe impl<T: CudaAsRust> rustacuda_core::DeviceCopy for OptionCudaRepresentation<T> {}
-
-unsafe impl<T: RustToCuda> RustToCuda for Option<T> {
-    #[cfg(feature = "host")]
-    #[doc(cfg(feature = "host"))]
-    type CudaAllocation = Option<<T as RustToCuda>::CudaAllocation>;
-    type CudaRepresentation = OptionCudaRepresentation<<T as RustToCuda>::CudaRepresentation>;
-
-    #[cfg(feature = "host")]
-    #[doc(cfg(feature = "host"))]
-    #[allow(clippy::type_complexity)]
-    unsafe fn borrow<A: CudaAlloc>(
-        &self,
-        alloc: A,
-    ) -> CudaResult<(
-        DeviceAccessible<Self::CudaRepresentation>,
-        CombinedCudaAlloc<A, Self::CudaAllocation>,
-    )> {
-        let (cuda_repr, alloc) = match self {
-            None => (
-                OptionCudaRepresentation {
-                    maybe: MaybeUninit::uninit(),
-                    present: false,
-                },
-                CombinedCudaAlloc::new(None, alloc),
-            ),
-            Some(value) => {
-                let (cuda_repr, alloc) = value.borrow(alloc)?;
-
-                let (alloc_front, alloc_tail) = alloc.split();
-
-                (
-                    OptionCudaRepresentation {
-                        maybe: MaybeUninit::new(cuda_repr),
-                        present: true,
-                    },
-                    CombinedCudaAlloc::new(Some(alloc_front), alloc_tail),
-                )
-            },
-        };
-
-        Ok((DeviceAccessible::from(cuda_repr), alloc))
-    }
-
-    #[cfg(feature = "host")]
-    #[doc(cfg(feature = "host"))]
-    unsafe fn restore<A: CudaAlloc>(
-        &mut self,
-        alloc: CombinedCudaAlloc<A, Self::CudaAllocation>,
-    ) -> CudaResult<A> {
-        let (alloc_front, alloc_tail) = alloc.split();
-
-        match (self, alloc_front) {
-            (Some(value), Some(alloc_front)) => {
-                value.restore(CombinedCudaAlloc::new(alloc_front, alloc_tail))
-            },
-            _ => Ok(alloc_tail),
-        }
-    }
-}
-
-unsafe impl<T: CudaAsRust> CudaAsRust for OptionCudaRepresentation<T> {
-    type RustRepresentation = Option<<T as CudaAsRust>::RustRepresentation>;
-
-    #[cfg(any(not(feature = "host"), doc))]
-    #[doc(cfg(not(feature = "host")))]
-    unsafe fn as_rust(this: &DeviceAccessible<Self>) -> Self::RustRepresentation {
-        if this.present {
-            Some(CudaAsRust::as_rust(this.maybe.assume_init_ref()))
-        } else {
-            None
-        }
-    }
-}
-
-impl<T: SafeDeviceCopy + TypeGraphLayout> RustToCudaProxy<Option<T>>
-    for Option<SafeDeviceCopyWrapper<T>>
-{
-    fn from_ref(val: &Option<T>) -> &Self {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype
-        unsafe { &*(val as *const Option<T>).cast() }
-    }
-
-    fn from_mut(val: &mut Option<T>) -> &mut Self {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype
-        unsafe { &mut *(val as *mut Option<T>).cast() }
-    }
-
-    fn into(self) -> Option<T> {
-        self.map(SafeDeviceCopyWrapper::into_inner)
-    }
-}
diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs
new file mode 100644
index 000000000..b01dda26d
--- /dev/null
+++ b/src/utils/shared/mod.rs
@@ -0,0 +1,14 @@
+mod slice;
+mod r#static;
+
+pub use slice::ThreadBlockSharedSlice;
+
+#[allow(clippy::module_name_repetitions)]
+pub use r#static::ThreadBlockShared;
+
+#[doc(hidden)]
+#[cfg(feature = "device")]
+pub use slice::init;
+
+#[cfg(feature = "host")]
+pub(crate) use slice::SharedMemorySize;
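A std-only sketch of the `Option` representation deleted a few hunks above: a `present` flag plus a `MaybeUninit` payload replaces Rust's niche-optimised `Option` layout, so the bytes are well-defined across the FFI boundary.

use core::mem::MaybeUninit;

#[repr(C)]
struct OptionRepr<T> {
    maybe: MaybeUninit<T>,
    present: bool,
}

fn lower<T>(value: Option<T>) -> OptionRepr<T> {
    match value {
        None => OptionRepr { maybe: MaybeUninit::uninit(), present: false },
        Some(value) => OptionRepr { maybe: MaybeUninit::new(value), present: true },
    }
}

fn raise<T>(repr: OptionRepr<T>) -> Option<T> {
    if repr.present {
        // Safety: `maybe` was initialised exactly when `present` was set
        Some(unsafe { repr.maybe.assume_init() })
    } else {
        None
    }
}

fn main() {
    assert_eq!(raise(lower(Some(3))), Some(3));
    assert_eq!(raise(lower::<u8>(None)), None);
}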
diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs
new file mode 100644
index 000000000..72ed7fde1
--- /dev/null
+++ b/src/utils/shared/slice.rs
@@ -0,0 +1,169 @@
+use core::alloc::Layout;
+
+use const_type_layout::TypeGraphLayout;
+
+#[allow(clippy::module_name_repetitions)]
+#[repr(transparent)]
+pub struct ThreadBlockSharedSlice<T: 'static + TypeGraphLayout> {
+    shared: *mut [T],
+}
+
+impl<T: 'static + TypeGraphLayout> ThreadBlockSharedSlice<T> {
+    #[cfg(feature = "host")]
+    #[must_use]
+    pub fn new_uninit_with_len(len: usize) -> Self {
+        Self {
+            shared: Self::dangling_slice_with_len(len),
+        }
+    }
+
+    #[cfg(feature = "host")]
+    #[must_use]
+    pub fn with_len(mut self, len: usize) -> Self {
+        self.shared = Self::dangling_slice_with_len(len);
+        self
+    }
+
+    #[cfg(feature = "host")]
+    #[must_use]
+    pub fn with_len_mut(&mut self, len: usize) -> &mut Self {
+        self.shared = Self::dangling_slice_with_len(len);
+        self
+    }
+
+    #[cfg(feature = "host")]
+    fn dangling_slice_with_len(len: usize) -> *mut [T] {
+        core::ptr::slice_from_raw_parts_mut(core::ptr::NonNull::dangling().as_ptr(), len)
+    }
+
+    #[must_use]
+    pub fn len(&self) -> usize {
+        core::ptr::metadata(self.shared)
+    }
+
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    #[must_use]
+    pub fn layout(&self) -> Layout {
+        // Safety: the length of self.shared is always initialised
+        unsafe { Layout::for_value_raw(self.shared) }
+    }
+
+    #[cfg(feature = "device")]
+    #[must_use]
+    pub const fn as_mut_ptr(&self) -> *mut T {
+        self.shared.cast()
+    }
+
+    #[cfg(feature = "device")]
+    #[must_use]
+    pub const fn as_mut_slice_ptr(&self) -> *mut [T] {
+        self.shared
+    }
+
+    #[cfg(feature = "device")]
+    /// # Safety
+    ///
+    /// The provided `index` must not be out of bounds.
+    #[inline]
+    #[must_use]
+    pub unsafe fn index_mut_unchecked<I: core::slice::SliceIndex<[T]>>(
+        &self,
+        index: I,
+    ) -> *mut <I as core::slice::SliceIndex<[T]>>::Output {
+        self.shared.get_unchecked_mut(index)
+    }
+}
+
+#[cfg(feature = "device")]
+impl<T: 'static + TypeGraphLayout> ThreadBlockSharedSlice<T> {
+    /// # Safety
+    ///
+    /// Exposing the [`ThreadBlockSharedSlice`] must be preceded by exactly one
+    /// call to [`init`].
+    pub(crate) unsafe fn with_uninit_for_len<F: FnOnce(&mut Self) -> Q, Q>(
+        len: usize,
+        inner: F,
+    ) -> Q {
+        let base: *mut u8;
+
+        unsafe {
+            core::arch::asm!(
+                "mov.u64 {base}, %rust_cuda_dynamic_shared;",
+                base = out(reg64) base,
+            );
+        }
+
+        let aligned_base = base.byte_add(base.align_offset(core::mem::align_of::<T>()));
+
+        let data: *mut T = aligned_base.cast();
+
+        let new_base = data.add(len).cast::<u8>();
+
+        unsafe {
+            core::arch::asm!(
+                "mov.u64 %rust_cuda_dynamic_shared, {new_base};",
+                new_base = in(reg64) new_base,
+            );
+        }
+
+        let shared = core::ptr::slice_from_raw_parts_mut(data, len);
+
+        inner(&mut Self { shared })
+    }
+}
+
+#[cfg(feature = "device")]
+/// # Safety
+///
+/// The thread-block shared dynamic memory must be initialised once and
+/// only once per kernel.
+pub unsafe fn init() {
+    unsafe {
+        core::arch::asm!(".reg .u64 %rust_cuda_dynamic_shared;");
+        core::arch::asm!(
+            "cvta.shared.u64 %rust_cuda_dynamic_shared, rust_cuda_dynamic_shared_base;",
+        );
+    }
+}
+
+#[cfg(feature = "device")]
+core::arch::global_asm!(".extern .shared .align 8 .b8 rust_cuda_dynamic_shared_base[];");
+
+#[cfg(feature = "host")]
+pub struct SharedMemorySize {
+    last_align: usize,
+    total_size: usize,
+}
+
+#[cfg(feature = "host")]
+impl SharedMemorySize {
+    #[must_use]
+    pub const fn new() -> Self {
+        Self {
+            // we allocate the shared memory with an alignment of 8
+            last_align: 8,
+            total_size: 0,
+        }
+    }
+
+    pub fn add(&mut self, layout: core::alloc::Layout) {
+        if layout.align() > self.last_align {
+            // in the worst case, we are one element of the smaller alignment
+            // into the larger alignment, so we need to pad the entire rest
+            let pessimistic_padding = layout.align() - self.last_align;
+
+            self.total_size += pessimistic_padding;
+        }
+
+        self.last_align = layout.align();
+        self.total_size += layout.size();
+    }
+
+    pub const fn total(self) -> usize {
+        self.total_size
+    }
+}
diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs
new file mode 100644
index 000000000..62c3a0c49
--- /dev/null
+++ b/src/utils/shared/static.rs
@@ -0,0 +1,58 @@
+#[repr(transparent)]
+pub struct ThreadBlockShared<T: 'static> {
+    #[cfg_attr(not(feature = "device"), allow(dead_code))]
+    shared: *mut T,
+}
+
+impl<T: 'static> ThreadBlockShared<T> {
+    #[cfg(any(feature = "host", feature = "device"))]
+    #[must_use]
+    #[allow(clippy::inline_always, clippy::missing_const_for_fn)]
+    #[inline(always)]
+    pub fn new_uninit() -> Self {
+        #[cfg(feature = "host")]
+        {
+            Self {
+                shared: core::ptr::NonNull::dangling().as_ptr(),
+            }
+        }
+
+        #[cfg(feature = "device")]
+        {
+            let shared: *mut T;
+
+            unsafe {
+                core::arch::asm!(
+                    ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];",
+                    "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;",
+                    reg = out(reg64) shared,
+                    align = const(core::mem::align_of::<T>()),
+                    size = const(core::mem::size_of::<T>()),
+                );
+            }
+
+            Self { shared }
+        }
+    }
+
+    #[cfg(feature = "device")]
+    #[must_use]
+    pub const fn as_mut_ptr(&self) -> *mut T {
+        self.shared
+    }
+}
+
+impl<T, const N: usize> ThreadBlockShared<[T; N]> {
+    #[cfg(feature = "device")]
+    /// # Safety
+    ///
+    /// The provided `index` must not be out of bounds.
+    #[inline]
+    #[must_use]
+    pub unsafe fn index_mut_unchecked<I: core::slice::SliceIndex<[T]>>(
+        &self,
+        index: I,
+    ) -> *mut <I as core::slice::SliceIndex<[T]>>::Output {
+        core::ptr::slice_from_raw_parts_mut(self.shared.cast::<T>(), N).get_unchecked_mut(index)
+    }
+}
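A hedged device-side sketch of the two shared-memory primitives added above (assumes the rust_cuda crate with the `device` feature, inside a kernel that has already performed the one-time dynamic shared-memory `init`; not compilable on its own).

use rust_cuda::utils::shared::{ThreadBlockShared, ThreadBlockSharedSlice};

fn kernel_body(tile: &ThreadBlockSharedSlice<f32>, flag: &ThreadBlockShared<u32>) {
    // Raw pointers, because every thread in the block aliases this memory;
    // synchronising writes between threads remains the caller's job
    unsafe { *tile.index_mut_unchecked(0) = 1.0 };
    unsafe { *flag.as_mut_ptr() = 1 };
}

On the host side, `SharedMemorySize` accumulates the worst-case dynamic shared-memory footprint: starting from the 8-byte base alignment, adding a 1-byte `u8` layout and then an 8-byte `u64` layout totals 1 + 7 (pessimistic padding) + 8 = 16 bytes.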