diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 57b6377f7..fcf0fd63c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,6 +23,16 @@ jobs:
         rust: [nightly]
 
     steps:
+      - name: Install CUDA
+        run: |
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
+          sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
+          curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb
+          sudo dpkg -i cuda-keyring_1.0-1_all.deb
+          sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
+          sudo apt-get update -q
+          sudo apt-get install cuda -y --no-install-recommends
+
       - name: Checkout the Repository
         uses: actions/checkout@v2
 
@@ -40,53 +50,26 @@ jobs:
           sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+")
           rm llvm.sh
           cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force
+
+      - name: Install cargo-hack
+        uses: taiki-e/install-action@cargo-hack
 
-      - name: Check without features on CPU
-        run: |
-          cargo check
-
-      - name: Check with alloc feature on CPU
-        run: |
-          cargo check \
-            --features alloc
-
-      - name: Check with derive feature on CPU
-        run: |
-          cargo check \
-            --features derive
-
-      - name: Check with host feature on CPU
-        run: |
-          cargo check \
-            --features host
-
-      - name: Check with host,derive,alloc features on CPU
+      - name: Check feature powerset on the CPU
         run: |
-          cargo check \
-            --features host,derive,alloc
+          cargo hack check --feature-powerset --optional-deps \
+            --skip device \
+            --keep-going
 
-      - name: Check without features on CUDA
+      - name: Check feature powerset on CUDA
         run: |
-          cargo check \
+          cargo hack check --feature-powerset --optional-deps \
+            --skip host \
+            --keep-going \
             --target nvptx64-nvidia-cuda
 
-      - name: Check with alloc feature on CUDA
-        run: |
-          cargo check \
-            --target nvptx64-nvidia-cuda \
-            --features alloc
-
-      - name: Check with derive feature on CUDA
-        run: |
-          cargo check \
-            --target nvptx64-nvidia-cuda \
-            --features derive
-
       - name: Check all workspace targets
         run: |
-          cargo check \
-            --workspace \
-            --all-targets
+          cargo check --workspace --all-targets
 
   test:
     name: Test Suite
@@ -157,6 +140,16 @@ jobs:
         rust: [nightly]
 
     steps:
+      - name: Install CUDA
+        run: |
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
+          sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
+          curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb
+          sudo dpkg -i cuda-keyring_1.0-1_all.deb
+          sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
+          sudo apt-get update -q
+          sudo apt-get install cuda -y --no-install-recommends
+
       - name: Checkout the Repository
         uses: actions/checkout@v2
 
@@ -176,58 +169,24 @@ jobs:
           sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+")
           rm llvm.sh
           cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force
 
-      - name: Check the code style without features on CPU
-        run: |
-          cargo clippy \
-            -- -D warnings
-
-      - name: Check the code style with alloc feature on CPU
-        run: |
-          cargo clippy \
-            --features alloc \
-            -- -D warnings
-
-      - name: Check the code style with derive feature on CPU
-        run: |
-          cargo clippy \
-            --features derive \
-            -- -D warnings
-
-      - name: Check the code style with host feature on CPU
-        run: |
-          cargo clippy \
-            --features host \
-            -- -D warnings
-
-      - name: Check the code style with host,derive,alloc features on CPU
-        run: |
-          cargo clippy \
-            --features host,derive,alloc \
-            -- -D warnings
-
-      - name: Check the code style without features on CUDA
-        run: |
-          cargo clippy \
-            --target nvptx64-nvidia-cuda \
-            -- -D warnings
+      - name: Install cargo-hack
+        uses: taiki-e/install-action@cargo-hack
 
-      - name: Check the code style with alloc feature on CUDA
+      - name: Check feature powerset on the CPU
         run: |
-          cargo clippy \
-            --target nvptx64-nvidia-cuda \
-            --features alloc \
+          cargo hack clippy --feature-powerset --optional-deps \
+            --skip device \
+            --keep-going \
             -- -D warnings
-
-      - name: Check the code style with derive feature on CUDA
+
+      - name: Check feature powerset on CUDA
         run: |
-          cargo clippy \
+          cargo hack clippy --feature-powerset --optional-deps \
+            --skip host \
+            --keep-going \
             --target nvptx64-nvidia-cuda \
-            --features derive \
             -- -D warnings
 
-      - name: Check the code style for all workspace targets
+      - name: Check all workspace targets
         run: |
-          cargo clippy \
-            --workspace \
-            --all-targets \
-            -- -D warnings
+          cargo clippy --workspace --all-targets -- -D warnings
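A note on the new check matrix: `cargo hack --feature-powerset` enumerates every feature combination, and the `--skip device` / `--skip host` flags mirror the fact that the two features belong to disjoint builds (CPU vs. the `nvptx64-nvidia-cuda` target). As a hedged sketch of how such an invariant can be made explicit in code (a hypothetical guard, not necessarily present in rust-cuda):

```rust
// Hypothetical compile-time guard: if a feature-powerset run (or a user)
// ever enables both sides at once, fail the build with a clear message.
#[cfg(all(feature = "host", feature = "device"))]
compile_error!("the `host` and `device` features are mutually exclusive");
```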
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 904e1a65c..c54f606d5 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -59,8 +59,8 @@ jobs:
           ./grcov . -s . --binary-path ./target/debug/deps \
             -t lcov -o coverage.lcov --branch \
             --keep-only "src/*" \
-            --keep-only "rust-cuda-ptx-jit/*" \
             --keep-only "rust-cuda-derive/*" \
+            --keep-only "rust-cuda-kernel/*" \
             --ignore-not-existing \
             --excl-line GRCOV_EXCL_LINE \
             --excl-start GRCOV_EXCL_START \
diff --git a/.github/workflows/rustdoc.yml b/.github/workflows/rustdoc.yml
index 285fc57c2..5c756572c 100644
--- a/.github/workflows/rustdoc.yml
+++ b/.github/workflows/rustdoc.yml
@@ -28,6 +28,8 @@ jobs:
         run: |
           RUSTDOCFLAGS="\
             --enable-index-page \
+            --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.1/ \
+            --extern-html-root-url final=https://docs.rs/final/0.1.1/ \
             --extern-html-root-url rustacuda=https://docs.rs/rustacuda/0.1.3/ \
             --extern-html-root-url rustacuda_core=https://docs.rs/rustacuda_core/0.1.2/ \
             --extern-html-root-url rustacuda_derive=https://docs.rs/rustacuda_derive/0.1.2/ \
diff --git a/.gitignore b/.gitignore
index 767dae236..218ca8786 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ Cargo.lock
 
 # These are backup files generated by rustfmt
 **/*.rs.bk
+
+# cargo expand dev output files
+**/expanded.rs
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 93f713cad..d12ff8221 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -4,5 +4,11 @@
     "rust-analyzer.updates.askBeforeDownload": false,
     "rust-analyzer.checkOnSave.command": "reap-clippy",
     "rust-analyzer.cargo.allFeatures": false,
-    "rust-analyzer.cargo.features": ["alloc", "derive", "host"],
+    "rust-analyzer.cargo.features": [
+        "derive",
+        "final",
+        "host",
+        "kernel"
+    ],
+    "rust-analyzer.showUnlinkedFileNotification": false,
 }
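The Cargo.toml rework below switches every optional dependency to the explicit `dep:` syntax and splits the old `host`-centric feature set into `derive`, `device`, `final`, `host`, and `kernel`. On the Rust side this usually pairs with feature-gated modules so that host-only code never reaches a device build; a minimal sketch with assumed names (not the crate's actual module layout):

```rust
// Sketch only: the `host` feature pulls in rustacuda, regex, oneshot, and
// safer_owning_ref, so the host API surface compiles only when it is enabled.
#[cfg(feature = "host")]
mod host_api {
    pub fn launch_kernels() { /* rustacuda-backed launch code would live here */ }
}

// Device-side helpers are compiled only for `device` builds on nvptx64.
#[cfg(feature = "device")]
mod device_api {
    pub fn kernel_entry() { /* PTX-side helpers would live here */ }
}
```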
diff --git a/Cargo.toml b/Cargo.toml
index e8c86665b..acb60681a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,10 +1,10 @@
 [workspace]
 members = [
-    ".", "rust-cuda-derive", "rust-cuda-ptx-jit",
-    "examples/single-source", "examples/derive",
+    ".", "rust-cuda-derive", "rust-cuda-kernel",
+    "examples/derive", "examples/print", "examples/single-source",
 ]
 default-members = [
-    ".", "rust-cuda-derive", "rust-cuda-ptx-jit"
+    ".", "rust-cuda-derive", "rust-cuda-kernel",
 ]
 
 [package]
@@ -13,29 +13,32 @@
 version = "0.1.0"
 authors = ["Juniper Tyree "]
 license = "MIT OR Apache-2.0"
 edition = "2021"
-rust-version = "1.75" # nightly
+rust-version = "1.77" # nightly
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [features]
 default = []
-alloc = ["hashbrown"]
-host = ["rustacuda", "rust-cuda-ptx-jit/host"]
-derive = ["rustacuda_derive", "rust-cuda-derive"]
+derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"]
+device = []
+final = ["dep:final"]
+host = ["dep:rustacuda", "dep:regex", "dep:oneshot", "dep:safer_owning_ref"]
+kernel = ["dep:rust-cuda-kernel"]
 
 [dependencies]
-rustacuda_core = "0.1.2"
+rustacuda_core = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc" }
 
-rustacuda = { version = "0.1.3", optional = true }
-rustacuda_derive = { version = "0.1.2", optional = true }
+rustacuda = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true }
+rustacuda_derive = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true }
 
-const-type-layout = { version = "0.2.0", features = ["derive"] }
+regex = { version = "1.10", optional = true }
 
-final = "0.1.1"
-hashbrown = { version = "0.14", default-features = false, features = ["inline-more"], optional = true }
+const-type-layout = { git = "https://github.com/juntyr/const-type-layout", branch = "compress", features = ["derive"] }
 
-rust-cuda-derive = { path = "rust-cuda-derive", optional = true }
-rust-cuda-ptx-jit = { path = "rust-cuda-ptx-jit" }
+safer_owning_ref = { version = "0.5", optional = true }
+oneshot = { version = "0.1", optional = true, features = ["std", "async"] }
+
+final = { version = "0.1.1", optional = true }
 
-[dev-dependencies]
-hashbrown = { version = "0.14", default-features = false, features = ["inline-more"] }
+rust-cuda-derive = { path = "rust-cuda-derive", optional = true }
+rust-cuda-kernel = { path = "rust-cuda-kernel", optional = true }
diff --git a/README.md b/README.md
index e9b24ddbb..5080b7033 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,11 @@
-# rust-cuda   [![CI Status]][workflow] [![Rust Doc]][docs] [![License Status]][fossa] [![Code Coverage]][codecov] [![Gitpod Ready-to-Code]][gitpod]
+# rust-cuda   [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License Status]][fossa] [![Code Coverage]][codecov] [![Gitpod Ready-to-Code]][gitpod]
 
 [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main
 [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain
 
+[MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange
+[repo]: https://github.com/juntyr/rust-cuda
+
 [Rust Doc]: https://img.shields.io/badge/docs-main-blue
 [docs]: https://juntyr.github.io/rust-cuda/
diff --git a/examples/derive/Cargo.toml b/examples/derive/Cargo.toml
index e59a344af..1b000fe8c 100644
--- a/examples/derive/Cargo.toml
+++ b/examples/derive/Cargo.toml
@@ -1,12 +1,11 @@
 [package]
 name = "derive"
 version = "0.1.0"
-authors = ["Juniper Tyree "]
+authors = ["Juniper Tyree "]
 license = "MIT OR Apache-2.0"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-const-type-layout = { version = "0.2.0" }
-rust-cuda = { path = "../../", features = ["derive", "host"] }
+rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] }
diff --git a/examples/derive/src/lib.rs b/examples/derive/src/lib.rs
index 814e30f61..6960eadeb 100644
--- a/examples/derive/src/lib.rs
+++ b/examples/derive/src/lib.rs
@@ -1,14 +1,15 @@
 #![deny(clippy::pedantic)]
 #![feature(const_type_name)]
-#![feature(offset_of)]
 
-#[derive(rust_cuda::common::LendRustToCuda)]
+#[derive(rc::lend::LendRustToCuda)]
+#[cuda(crate = "rc")]
 struct Inner<T> {
     #[cuda(embed)]
     inner: T,
 }
 
-#[derive(rust_cuda::common::LendRustToCuda)]
+#[derive(rc::lend::LendRustToCuda)]
+#[cuda(crate = "rc")]
 struct Outer<T> {
     #[cuda(embed)]
     inner: Inner<T>,
 }
diff --git a/examples/print/.cargo/config.toml b/examples/print/.cargo/config.toml
new file mode 100644
index 000000000..4a98afe58
--- /dev/null
+++ b/examples/print/.cargo/config.toml
@@ -0,0 +1,2 @@
+[target.nvptx64-nvidia-cuda]
+rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"]
diff --git a/examples/print/Cargo.toml b/examples/print/Cargo.toml
new file mode 100644
index 000000000..b7f864b58
--- /dev/null
+++ b/examples/print/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "print"
+version = "0.1.0"
+authors = ["Juniper Tyree "]
+license = "MIT OR Apache-2.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[target.'cfg(target_os = "cuda")'.dependencies]
+rust-cuda = { path = "../../", features = ["kernel", "device"] }
+
+[target.'cfg(not(target_os = "cuda"))'.dependencies]
+rust-cuda = { path = "../../", features = ["kernel", "host"] }
diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs
new file mode 100644
index 000000000..c99ae0df9
--- /dev/null
+++ b/examples/print/src/main.rs
@@ -0,0 +1,109 @@
+#![deny(clippy::pedantic)]
+#![cfg_attr(target_os = "cuda", no_std)]
+#![cfg_attr(target_os = "cuda", no_main)]
+#![cfg_attr(target_os = "cuda", feature(abi_ptx))]
+#![cfg_attr(target_os = "cuda", feature(alloc_error_handler))]
+#![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))]
+#![feature(const_type_name)]
+#![feature(cfg_version)]
+#![feature(type_alias_impl_trait)]
+#![feature(decl_macro)]
+
+extern crate alloc;
+
+#[derive(Copy, Clone, rust_cuda::deps::const_type_layout::TypeLayout)]
+#[layout(crate = "rust_cuda::deps::const_type_layout")]
+#[repr(C)]
+pub enum Action {
+    Print,
+    Panic,
+    AllocError,
+}
+
+#[rust_cuda::kernel::kernel(use link! for impl)]
+#[kernel(allow(ptx::local_memory_use))]
+pub fn kernel(action: rust_cuda::kernel::param::PerThreadShallowCopy<Action>) {
+    match action {
+        Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"),
+        Action::Panic => panic!("panic! from CUDA kernel"),
+        Action::AllocError => {
+            ::alloc::alloc::handle_alloc_error(::core::alloc::Layout::new::())
+        },
+    }
+}
+
+#[cfg(not(target_os = "cuda"))]
+fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> {
+    // Link the non-generic CUDA kernel
+    struct KernelPtx;
+    link! { impl kernel for KernelPtx }
+
+    // Initialize the CUDA API
+    rust_cuda::deps::rustacuda::init(rust_cuda::deps::rustacuda::CudaFlags::empty())?;
+
+    // Get the first CUDA GPU device
+    let device = rust_cuda::deps::rustacuda::device::Device::get_device(0)?;
+
+    // Create a CUDA context associated with this device
+    let _context = rust_cuda::host::CudaDropWrapper::from(
+        rust_cuda::deps::rustacuda::context::Context::create_and_push(
+            rust_cuda::deps::rustacuda::context::ContextFlags::MAP_HOST
+                | rust_cuda::deps::rustacuda::context::ContextFlags::SCHED_AUTO,
+            device,
+        )?,
+    );
+
+    // Create a new CUDA stream to submit kernels to
+    let mut stream =
+        rust_cuda::host::CudaDropWrapper::from(rust_cuda::deps::rustacuda::stream::Stream::new(
+            rust_cuda::deps::rustacuda::stream::StreamFlags::NON_BLOCKING,
+            None,
+        )?);
+
+    // Create a new instance of the CUDA kernel and prepare the launch config
+    let mut kernel = rust_cuda::kernel::TypedPtxKernel::<kernel>::new::<KernelPtx>(None);
+    let config = rust_cuda::kernel::LaunchConfig {
+        grid: rust_cuda::deps::rustacuda::function::GridSize::x(1),
+        block: rust_cuda::deps::rustacuda::function::BlockSize::x(4),
+        ptx_jit: false,
+    };
+
+    // Launch the CUDA kernel on the stream and synchronise until it completes
+    rust_cuda::host::Stream::with(&mut stream, |stream| {
+        println!("Launching print kernel ...");
+        kernel.launch1(stream, &config, Action::Print)?;
+        println!("Launching panic kernel ...");
+        kernel.launch1(stream, &config, Action::Panic)?;
+        println!("Launching alloc error kernel ...");
+        kernel.launch1(stream, &config, Action::AllocError)
+    })?;
+
+    Ok(())
+}
+
+#[cfg(target_os = "cuda")]
+mod cuda_prelude {
+    use rust_cuda::device::alloc::PTXAllocator;
+
+    #[global_allocator]
+    static _GLOBAL_ALLOCATOR: PTXAllocator = PTXAllocator;
+
+    #[panic_handler]
+    fn panic(info: &::core::panic::PanicInfo) -> ! {
+        // Pretty-format and print the panic message,
+        // but don't allow dynamic formatting or panic payload downcasting
+        rust_cuda::device::utils::pretty_print_panic_info(info, false, false);
+
+        // Safety: no mutable data is shared with the kernel
+        unsafe { rust_cuda::device::utils::exit() }
+    }
+
+    #[alloc_error_handler]
+    #[track_caller]
+    fn alloc_error_handler(layout: ::core::alloc::Layout) -> ! {
+        rust_cuda::device::utils::pretty_print_alloc_error(layout);
+
+        // Safety: no mutable data is shared with the kernel
+        unsafe { rust_cuda::device::utils::exit() }
+    }
+}
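The print example above launches one grid block of four threads, so each `launch1` call runs the kernel body once per thread. The single-source example below distinguishes threads with `rc::device::thread::Thread::this().index()`; a hedged sketch of that partitioning pattern (simplified, using only the APIs visible in this diff):

```rust
// Sketch: each thread claims one slot of a dynamically sized shared slice,
// guarding against launches with more threads than slots.
fn fill_slice(dynamic: &mut rc::utils::shared::ThreadBlockSharedSlice<u32>) {
    let index = rc::device::thread::Thread::this().index();
    if index < dynamic.len() {
        // Safety: every thread writes to a distinct index.
        unsafe {
            *dynamic.index_mut_unchecked(index) = index as u32;
        }
    }
}
```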
diff --git a/examples/single-source/.cargo/config.toml b/examples/single-source/.cargo/config.toml
index 48db9d693..4a98afe58 100644
--- a/examples/single-source/.cargo/config.toml
+++ b/examples/single-source/.cargo/config.toml
@@ -1,5 +1,2 @@
 [target.nvptx64-nvidia-cuda]
-rustflags = ["-Clink-args=--arch sm_35", "-Clink-arg=-O3", "-Clink-arg=--lto"]
-
-[unstable]
-features = ["all"]
+rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"]
diff --git a/examples/single-source/Cargo.toml b/examples/single-source/Cargo.toml
index 128da7cef..1a27dd30e 100644
--- a/examples/single-source/Cargo.toml
+++ b/examples/single-source/Cargo.toml
@@ -1,17 +1,14 @@
 [package]
 name = "single-source"
 version = "0.1.0"
-authors = ["Juniper Tyree "]
+authors = ["Juniper Tyree "]
 license = "MIT OR Apache-2.0"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
-[dependencies]
-const-type-layout = { version = "0.2.0" }
-
 [target.'cfg(target_os = "cuda")'.dependencies]
-rust-cuda = { path = "../../", features = ["derive"] }
+rc = { package = "rust-cuda", path = "../../", features = ["derive", "kernel", "device"] }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
-rust-cuda = { path = "../../", features = ["derive", "host"] }
+rc = { package = "rust-cuda", path = "../../", features = ["derive", "kernel", "host"] }
diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs
index 36c0736c6..3861190d2 100644
--- a/examples/single-source/src/main.rs
+++ b/examples/single-source/src/main.rs
@@ -3,86 +3,117 @@
 #![cfg_attr(target_os = "cuda", no_main)]
 #![cfg_attr(target_os = "cuda", feature(abi_ptx))]
 #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))]
-#![cfg_attr(target_os = "cuda", feature(stdsimd))]
 #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))]
 #![feature(const_type_name)]
-#![feature(offset_of)]
+#![feature(cfg_version)]
+#![feature(type_alias_impl_trait)]
+#![feature(associated_type_bounds)]
+#![feature(decl_macro)]
+#![recursion_limit = "1024"]
 
 extern crate alloc;
 
-#[macro_use]
-extern crate const_type_layout;
-
 #[cfg(not(target_os = "cuda"))]
 fn main() {}
 
 #[repr(C)]
-#[derive(TypeLayout)]
+#[derive(rc::deps::const_type_layout::TypeLayout)]
+#[layout(crate = "rc::deps::const_type_layout")]
 pub struct Dummy(i32);
 
-#[derive(rust_cuda::common::LendRustToCuda)]
+#[derive(Clone, rc::lend::LendRustToCuda)]
+#[cuda(crate = "rc")]
 #[allow(dead_code)]
 pub struct Wrapper<T> {
     #[cuda(embed)]
     inner: T,
 }
 
-#[derive(rust_cuda::common::LendRustToCuda)]
+#[derive(Clone, rc::lend::LendRustToCuda)]
+#[cuda(crate = "rc")]
 pub struct Empty([u8; 0]);
 
 #[repr(C)]
-#[derive(TypeLayout)]
+#[derive(rc::deps::const_type_layout::TypeLayout)]
+#[layout(crate = "rc::deps::const_type_layout")]
 pub struct Tuple(u32, i32);
 
-#[rust_cuda::common::kernel(use link_kernel! as impl Kernel<KernelArgs> for Launcher)]
-pub fn kernel<'a, T: rust_cuda::common::RustToCuda>(
-    #[kernel(pass = SafeDeviceCopy)] _x: &Dummy,
-    #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy<Wrapper<T>>,
-    #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy<Wrapper<T>>,
-    #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64,
-    #[kernel(pass = LendRustToCuda)] _: Wrapper<T>,
-    #[kernel(pass = SafeDeviceCopy)] Tuple(_s, mut __t): Tuple,
-) where
-    <T as rust_cuda::common::RustToCuda>::CudaRepresentation: rust_cuda::safety::StackOnly,
-{
-}
-
-#[cfg(not(target_os = "cuda"))]
-mod host {
-    use super::{Kernel, KernelArgs};
-
-    #[allow(dead_code)]
-    struct Launcher(core::marker::PhantomData);
-
-    link_kernel!(crate::Empty);
-    link_kernel!(rust_cuda::utils::device_copy::SafeDeviceCopyWrapper);
+#[repr(C)]
+#[derive(Copy, Clone, rc::deps::const_type_layout::TypeLayout)]
+#[layout(crate = "rc::deps::const_type_layout")]
+pub struct Triple(i32, i32, i32);
+
+#[rc::kernel::kernel(pub use link! for impl)]
+#[kernel(crate = "rc")]
+#[kernel(
+    allow(ptx::double_precision_use),
+    forbid(ptx::local_memory_use, ptx::register_spills)
+)]
+pub fn kernel<
+    'a,
+    T: 'static
+        + Send
+        + Sync
+        + Clone
+        + rc::lend::RustToCuda<
+            CudaRepresentation: rc::safety::StackOnly,
+            CudaAllocation: rc::alloc::EmptyCudaAlloc,
+        > + rc::safety::StackOnly,
+>(
+    _x: &rc::kernel::param::PerThreadShallowCopy<Dummy>,
+    _z: &rc::kernel::param::DeepPerThreadBorrow<Wrapper<T>>,
+    _v @ _w: &'a rc::kernel::param::ShallowInteriorMutable<core::sync::atomic::AtomicU64>,
+    _: rc::kernel::param::DeepPerThreadBorrow<Wrapper<T>>,
+    q @ Triple(s, mut __t, _u): rc::kernel::param::PerThreadShallowCopy<Triple>,
+    shared3: &mut rc::utils::shared::ThreadBlockShared<u32>,
+    dynamic: &mut rc::utils::shared::ThreadBlockSharedSlice<Dummy>,
+) {
+    let shared = rc::utils::shared::ThreadBlockShared::<[Tuple; 3]>::new_uninit();
+    let shared2 = rc::utils::shared::ThreadBlockShared::<[Tuple; 3]>::new_uninit();
+
+    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
+    unsafe {
+        (*shared.index_mut_unchecked(1)).0 = (f64::from(s) * 2.0) as u32;
+    }
+    unsafe {
+        (*shared2.index_mut_unchecked(2)).1 = q.0 + q.1 + q.2;
+    }
 
-    impl rust_cuda::host::Launcher for Launcher {
-        type CompilationWatcher = ();
-        type KernelTraitObject = dyn Kernel;
+    unsafe {
+        *shared3.as_mut_ptr() = 12;
+    }
 
-        fn get_launch_package(&mut self) -> rust_cuda::host::LaunchPackage {
-            unimplemented!()
+    let index = rc::device::thread::Thread::this().index();
+    if index < dynamic.len() {
+        unsafe {
+            *dynamic.index_mut_unchecked(index) = Dummy(42);
         }
     }
 }
 
+#[cfg(not(target_os = "cuda"))]
+mod host {
+    // Link several instances of the generic CUDA kernel
+    struct KernelPtx<'a, T>(std::marker::PhantomData<&'a T>);
+    crate::link! { impl kernel<'a, crate::Empty> for KernelPtx }
+    crate::link! { impl kernel<'a, rc::utils::adapter::RustToCudaWithPortableBitCopySemantics> for KernelPtx }
+}
+
 #[cfg(target_os = "cuda")]
 mod cuda_prelude {
-    use core::arch::nvptx;
-
-    use rust_cuda::device::utils;
+    use rc::device::alloc::PTXAllocator;
 
     #[global_allocator]
-    static _GLOBAL_ALLOCATOR: utils::PTXAllocator = utils::PTXAllocator;
+    static _GLOBAL_ALLOCATOR: PTXAllocator = PTXAllocator;
 
     #[panic_handler]
     fn panic(_: &::core::panic::PanicInfo) -> ! {
-        unsafe { nvptx::trap() }
+        rc::device::utils::abort()
     }
 
     #[alloc_error_handler]
     fn alloc_error_handler(_: core::alloc::Layout) -> !
{ - unsafe { nvptx::trap() } + rc::device::utils::abort() } } diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 4b8677df4..fc214dea7 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -1,9 +1,10 @@ [package] name = "rust-cuda-derive" version = "0.1.0" -authors = ["Juniper Tyree "] +authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" +rust-version = "1.77" # nightly # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -11,16 +12,7 @@ edition = "2021" proc-macro = true [dependencies] -syn = { version = "1.0", features = ["full"] } +syn = { version = "1.0", features = ["full", "fold"] } quote = "1.0" proc-macro2 = "1.0" proc-macro-error = "1.0" -regex = "1.5" -lazy_static = "1.4" -serde_json = "1.0" -cargo_metadata = { version = "0.18", features = ["builder"] } -strip-ansi-escapes = "0.2" -colored = "2.0" - -seahash = "4.1" -ptx-builder = { git = "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs deleted file mode 100644 index 506d8ea03..000000000 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ /dev/null @@ -1,403 +0,0 @@ -use std::{ - env, fs, - io::{Read, Write}, - path::{Path, PathBuf}, - sync::atomic::{AtomicBool, Ordering}, -}; - -use colored::Colorize; -use proc_macro::TokenStream; -use ptx_builder::{ - builder::{BuildStatus, Builder, MessageFormat, Profile}, - error::{BuildErrorKind, Error, Result}, -}; - -use super::utils::skip_kernel_compilation; - -mod config; -mod error; - -use config::{CheckKernelConfig, LinkKernelConfig}; -use error::emit_ptx_build_error; - -pub fn check_kernel(tokens: TokenStream) -> TokenStream { - proc_macro_error::set_dummy(quote! { - "ERROR in this PTX compilation" - }); - - let CheckKernelConfig { - args, - crate_name, - crate_path, - } = match syn::parse_macro_input::parse(tokens) { - Ok(config) => config, - Err(err) => { - abort_call_site!( - "check_kernel!(ARGS NAME PATH) expects ARGS identifier, NAME and PATH string \ - literals: {:?}", - err - ) - }, - }; - - let kernel_ptx = compile_kernel(&args, &crate_name, &crate_path, Specialisation::Check); - - match kernel_ptx { - Some(kernel_ptx) => quote!(#kernel_ptx).into(), - None => quote!("ERROR in this PTX compilation").into(), - } -} - -#[allow(clippy::module_name_repetitions, clippy::too_many_lines)] -pub fn link_kernel(tokens: TokenStream) -> TokenStream { - proc_macro_error::set_dummy(quote! { - const PTX_STR: &'static str = "ERROR in this PTX compilation"; - }); - - let LinkKernelConfig { - kernel, - args, - crate_name, - crate_path, - specialisation, - } = match syn::parse_macro_input::parse(tokens) { - Ok(config) => config, - Err(err) => { - abort_call_site!( - "link_kernel!(KERNEL ARGS NAME PATH SPECIALISATION) expects KERNEL and ARGS \ - identifiers, NAME and PATH string literals, and SPECIALISATION tokens: {:?}", - err - ) - }, - }; - - if skip_kernel_compilation() { - return quote! { - const PTX_STR: &'static str = "CLIPPY skips specialised PTX compilation"; - } - .into(); - } - - let Some(mut kernel_ptx) = compile_kernel( - &args, - &crate_name, - &crate_path, - Specialisation::Link(&specialisation), - ) else { - return (quote! 
{ - const PTX_STR: &'static str = "ERROR in this PTX compilation"; - }) - .into(); - }; - - let kernel_layout_name = if specialisation.is_empty() { - format!("{kernel}_type_layout_kernel") - } else { - format!( - "{kernel}_type_layout_kernel_{:016x}", - seahash::hash(specialisation.as_bytes()) - ) - }; - - let mut type_layouts = Vec::new(); - - let type_layout_start_pattern = format!("\n\t// .globl\t{kernel_layout_name}"); - - if let Some(type_layout_start) = kernel_ptx.find(&type_layout_start_pattern) { - const BEFORE_PARAM_PATTERN: &str = ".global .align 1 .b8 "; - const PARAM_LEN_PATTERN: &str = "["; - const LEN_BYTES_PATTERN: &str = "] = {"; - const AFTER_BYTES_PATTERN: &str = "};"; - - let after_type_layout_start = type_layout_start + type_layout_start_pattern.len(); - - let Some(type_layout_middle) = kernel_ptx[after_type_layout_start..] - .find(&format!(".visible .entry {kernel_layout_name}")) - .map(|i| after_type_layout_start + i) - else { - abort_call_site!( - "Kernel compilation generated invalid PTX: incomplete type layout information" - ) - }; - - let mut next_type_layout = after_type_layout_start; - - while let Some(param_start_offset) = - kernel_ptx[next_type_layout..type_layout_middle].find(BEFORE_PARAM_PATTERN) - { - let param_start = next_type_layout + param_start_offset + BEFORE_PARAM_PATTERN.len(); - - if let Some(len_start_offset) = - kernel_ptx[param_start..type_layout_middle].find(PARAM_LEN_PATTERN) - { - let len_start = param_start + len_start_offset + PARAM_LEN_PATTERN.len(); - - if let Some(bytes_start_offset) = - kernel_ptx[len_start..type_layout_middle].find(LEN_BYTES_PATTERN) - { - let bytes_start = len_start + bytes_start_offset + LEN_BYTES_PATTERN.len(); - - if let Some(bytes_end_offset) = - kernel_ptx[bytes_start..type_layout_middle].find(AFTER_BYTES_PATTERN) - { - let param = &kernel_ptx[param_start..(param_start + len_start_offset)]; - let len = &kernel_ptx[len_start..(len_start + bytes_start_offset)]; - let bytes = &kernel_ptx[bytes_start..(bytes_start + bytes_end_offset)]; - - let param = quote::format_ident!("{}", param); - - let Ok(len) = len.parse::() else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout \ - length" - ) - }; - let Ok(bytes) = bytes - .split(", ") - .map(std::str::FromStr::from_str) - .collect::, _>>() - else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout \ - byte" - ) - }; - - if bytes.len() != len { - abort_call_site!( - "Kernel compilation generated invalid PTX: type layout length \ - mismatch" - ); - } - - let byte_str = syn::LitByteStr::new(&bytes, proc_macro2::Span::call_site()); - - type_layouts.push(quote! { - const #param: &[u8; #len] = #byte_str; - }); - - next_type_layout = - bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); - } else { - next_type_layout = bytes_start; - } - } else { - next_type_layout = len_start; - } - } else { - next_type_layout = param_start; - } - } - - let Some(type_layout_end) = kernel_ptx[type_layout_middle..] - .find('}') - .map(|i| type_layout_middle + i + '}'.len_utf8()) - else { - abort_call_site!("Kernel compilation generated invalid PTX") - }; - - kernel_ptx.replace_range(type_layout_start..type_layout_end, ""); - } - - (quote! 
{ const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() -} - -fn compile_kernel( - args: &syn::Ident, - crate_name: &str, - crate_path: &Path, - specialisation: Specialisation, -) -> Option { - if let Ok(rust_flags) = proc_macro::tracked_env::var("RUSTFLAGS") { - env::set_var( - "RUSTFLAGS", - rust_flags - .replace("-Zinstrument-coverage", "") - .replace("-Cinstrument-coverage", ""), - ); - } - - let specialisation_var = format!( - "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", - crate_name, - args.to_string().to_uppercase() - ); - - match build_kernel_with_specialisation(crate_path, &specialisation_var, specialisation) { - Ok(kernel_path) => { - let mut file = fs::File::open(&kernel_path) - .unwrap_or_else(|_| panic!("Failed to open kernel file at {:?}.", &kernel_path)); - - let mut kernel_ptx = String::new(); - - file.read_to_string(&mut kernel_ptx) - .unwrap_or_else(|_| panic!("Failed to read kernel file at {:?}.", &kernel_path)); - - colored::control::set_override(true); - eprintln!( - "{} {} compiling a PTX crate.", - "[PTX]".bright_black().bold(), - "Finished".green().bold() - ); - colored::control::unset_override(); - - Some(kernel_ptx) - }, - Err(err) => { - eprintln!("{err:?}"); - emit_ptx_build_error(); - None - }, - } -} - -#[allow(clippy::too_many_lines)] -fn build_kernel_with_specialisation( - kernel_path: &Path, - env_var: &str, - specialisation: Specialisation, -) -> Result { - match specialisation { - Specialisation::Check => env::set_var(env_var, "chECK"), - Specialisation::Link(specialisation) => env::set_var(env_var, specialisation), - }; - - let result = (|| { - let mut builder = Builder::new(kernel_path)?; - - builder = match specialisation { - Specialisation::Check => builder.set_profile(Profile::Debug), - Specialisation::Link(_) => builder.set_profile(Profile::Release), - }; - - builder = builder.set_message_format(MessageFormat::Json { - render_diagnostics: false, - short: false, - ansi: true, - }); - - let specialisation_prefix = match specialisation { - Specialisation::Check => String::from("chECK"), - Specialisation::Link(specialisation) => { - format!("{:016x}", seahash::hash(specialisation.as_bytes())) - }, - }; - builder = builder.set_prefix(specialisation_prefix.clone()); - - let any_output = AtomicBool::new(false); - let crate_name = String::from(builder.get_crate_name()); - - match builder.build_live( - |stdout_line| { - if let Ok(cargo_metadata::Message::CompilerMessage(mut message)) = - serde_json::from_str(stdout_line) - { - if any_output - .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed) - .is_ok() - { - colored::control::set_override(true); - eprintln!( - "{} of {} ({})", - "[PTX]".bright_black().bold(), - crate_name.bold(), - specialisation_prefix.to_ascii_lowercase(), - ); - colored::control::unset_override(); - } - - if let Some(rendered) = &mut message.message.rendered { - colored::control::set_override(true); - let prefix = " | ".bright_black().bold().to_string(); - colored::control::unset_override(); - - let glue = String::from('\n') + &prefix; - - let mut lines = rendered - .split('\n') - .rev() - .skip_while(|l| l.trim().is_empty()) - .collect::>(); - lines.reverse(); - - let mut prefixed = prefix + &lines.join(&glue); - - std::mem::swap(rendered, &mut prefixed); - } - - eprintln!("{}", serde_json::to_string(&message.message).unwrap()); - } - }, - |stderr_line| { - if stderr_line.trim().is_empty() { - return; - } - - if any_output - .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed) - .is_ok() - { - 
colored::control::set_override(true); - eprintln!( - "{} of {} ({})", - "[PTX]".bright_black().bold(), - crate_name.bold(), - specialisation_prefix.to_ascii_lowercase(), - ); - colored::control::unset_override(); - } - - colored::control::set_override(true); - eprintln!( - " {} {}", - "|".bright_black().bold(), - stderr_line.replace(" ", "") - ); - colored::control::unset_override(); - }, - )? { - BuildStatus::Success(output) => { - let ptx_path = output.get_assembly_path(); - - let mut specialised_ptx_path = ptx_path.clone(); - - specialised_ptx_path.set_extension(format!("{specialisation_prefix}.ptx")); - - fs::copy(&ptx_path, &specialised_ptx_path).map_err(|err| { - Error::from(BuildErrorKind::BuildFailed(vec![format!( - "Failed to copy kernel from {ptx_path:?} to {specialised_ptx_path:?}: \ - {err}" - )])) - })?; - - if let Specialisation::Link(specialisation) = specialisation { - fs::OpenOptions::new() - .append(true) - .open(&specialised_ptx_path) - .and_then(|mut file| writeln!(file, "\n// {specialisation}")) - .map_err(|err| { - Error::from(BuildErrorKind::BuildFailed(vec![format!( - "Failed to write specialisation to {specialised_ptx_path:?}: {err}" - )])) - })?; - } - - Ok(specialised_ptx_path) - }, - BuildStatus::NotNeeded => Err(Error::from(BuildErrorKind::BuildFailed(vec![format!( - "Kernel build for specialisation {:?} was not needed.", - &specialisation - )]))), - } - })(); - - env::remove_var(env_var); - - result -} - -#[derive(Copy, Clone, Debug)] -enum Specialisation<'a> { - Check, - Link(&'a str), -} diff --git a/rust-cuda-derive/src/kernel/mod.rs b/rust-cuda-derive/src/kernel/mod.rs deleted file mode 100644 index c44f1dd2f..000000000 --- a/rust-cuda-derive/src/kernel/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -pub mod link; -pub mod specialise; -pub mod wrapper; - -mod utils; diff --git a/rust-cuda-derive/src/kernel/specialise/mod.rs b/rust-cuda-derive/src/kernel/specialise/mod.rs deleted file mode 100644 index 337508b5b..000000000 --- a/rust-cuda-derive/src/kernel/specialise/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -pub mod call; -pub mod entry; -pub mod ty; diff --git a/rust-cuda-derive/src/kernel/specialise/ty.rs b/rust-cuda-derive/src/kernel/specialise/ty.rs deleted file mode 100644 index 9b5a06955..000000000 --- a/rust-cuda-derive/src/kernel/specialise/ty.rs +++ /dev/null @@ -1,54 +0,0 @@ -use proc_macro::TokenStream; - -pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { - let SpecialiseTypeConfig { kernel, typedef } = match syn::parse_macro_input::parse(tokens) { - Ok(config) => config, - Err(err) => { - abort_call_site!( - "specialise_kernel_type!(KERNEL::TYPEDEF) expects KERNEL and TYPEDEF identifiers: \ - {:?}", - err - ) - }, - }; - - let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { - Ok(crate_name) => crate_name.to_uppercase(), - Err(err) => abort_call_site!("Failed to read crate name: {:?}", err), - }; - - let specialisation_var = format!( - "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", - crate_name, - kernel.to_string().to_uppercase() - ); - - match proc_macro::tracked_env::var(&specialisation_var) { - Ok(specialisation) => { - match format!("<() as {kernel}{specialisation}>::{typedef}").parse() { - Ok(parsed_specialisation) => parsed_specialisation, - Err(err) => abort_call_site!("Failed to parse specialisation: {:?}", err), - } - }, - Err(err) => abort_call_site!( - "Failed to read specialisation from {:?}: {:?}", - &specialisation_var, - err - ), - } -} - -struct SpecialiseTypeConfig { - kernel: syn::Ident, - typedef: syn::Ident, 
-} - -impl syn::parse::Parse for SpecialiseTypeConfig { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - let kernel: syn::Ident = input.parse()?; - let _dc: syn::token::Colon2 = input.parse()?; - let typedef: syn::Ident = input.parse()?; - - Ok(Self { kernel, typedef }) - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs deleted file mode 100644 index c07486c2b..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ /dev/null @@ -1,32 +0,0 @@ -pub(super) struct KernelConfig { - pub(super) visibility: Option, - pub(super) linker: syn::Ident, - pub(super) kernel: syn::Ident, - pub(super) args: syn::Ident, - pub(super) launcher: syn::Ident, -} - -impl syn::parse::Parse for KernelConfig { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - let visibility: Option = input.parse()?; - let _use: syn::token::Use = input.parse()?; - let linker: syn::Ident = input.parse()?; - let _bang: syn::token::Bang = input.parse()?; - let _as: syn::token::As = input.parse()?; - let _impl: syn::token::Impl = input.parse()?; - let kernel: syn::Ident = input.parse()?; - let _lt_token: syn::token::Lt = input.parse()?; - let args: syn::Ident = input.parse()?; - let _gt_token: syn::token::Gt = input.parse()?; - let _for: syn::token::For = input.parse()?; - let launcher: syn::Ident = input.parse()?; - - Ok(Self { - visibility, - linker, - kernel, - args, - launcher, - }) - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs b/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs deleted file mode 100644 index 4c725601b..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs +++ /dev/null @@ -1,77 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::{DeclGenerics, FunctionInputs, ImplGenerics, KernelConfig}; - -pub(in super::super) fn quote_args_trait( - KernelConfig { - visibility, args, .. - }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_trait_params: generic_params, - generic_close_token, - generic_trait_where_clause: generic_where_clause, - .. - }: &DeclGenerics, - ImplGenerics { - impl_generics, - ty_generics, - where_clause, - }: &ImplGenerics, - FunctionInputs { func_inputs, .. }: &FunctionInputs, -) -> TokenStream { - let func_input_typedefs = (0..func_inputs.len()) - .map(|i| { - let type_ident = quote::format_ident!("__T_{}", i); - - quote! { - type #type_ident; - } - }) - .collect::>(); - - let func_input_types = func_inputs - .iter() - .enumerate() - .map(|(i, arg)| { - let pat_type = match arg { - syn::FnArg::Typed(pat_type) => pat_type, - syn::FnArg::Receiver(_) => unreachable!(), - }; - - let type_ident = quote::format_ident!("__T_{}", i); - let arg_type = match &*pat_type.ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - - quote! { - type #type_ident = #arg_type; - } - }) - .collect::>(); - - quote! 
{ - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #args #generic_start_token #generic_params #generic_close_token - #generic_where_clause - { - #(#func_input_typedefs)* - } - - // #args must always be pub in CUDA kernel as it is used to define the - // public kernel entry point signature - #[cfg(target_os = "cuda")] - #[allow(clippy::missing_safety_doc)] - pub unsafe trait #args #generic_start_token #generic_params #generic_close_token - #generic_where_clause - { - #(#func_input_typedefs)* - } - - unsafe impl #impl_generics #args #ty_generics for () #where_clause { - #(#func_input_types)* - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs deleted file mode 100644 index dadda41ec..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ /dev/null @@ -1,92 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use crate::kernel::utils::skip_kernel_compilation; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; - -pub(super) fn quote_get_ptx_str( - FuncIdent { - func_ident, - func_ident_hash, - .. - }: &FuncIdent, - config @ KernelConfig { args, .. }: &KernelConfig, - generics @ DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, - inputs: &FunctionInputs, - func_params: &[syn::Ident], - macro_type_ids: &[syn::Ident], -) -> TokenStream { - let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { - Ok(crate_name) => crate_name.to_uppercase(), - Err(err) => abort_call_site!("Failed to read crate name: {:?}.", err), - }; - - let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") - .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); - - let cpu_func_lifetime_erased_types = - super::kernel_func_raw::generate_launch_types(config, generics, inputs, macro_type_ids).1; - - let matching_kernel_assert = if skip_kernel_compilation() { - quote!() - } else { - quote::quote_spanned! { func_ident.span()=> - const _: ::rust_cuda::safety::kernel_signature::Assert<{ - ::rust_cuda::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = ::rust_cuda::safety::kernel_signature::Assert::<{ - ::rust_cuda::safety::kernel_signature::check( - PTX_STR.as_bytes(), - concat!(".visible .entry ", rust_cuda::host::specialise_kernel_call!( - #func_ident_hash #generic_start_token - #($#macro_type_ids),* - #generic_close_token - )).as_bytes() - ) - }>; - } - }; - - let type_layout_asserts = if skip_kernel_compilation() { - Vec::new() - } else { - cpu_func_lifetime_erased_types - .iter() - .zip(func_params.iter()) - .map(|(ty, param)| { - let layout_param = syn::Ident::new( - &format!("__{func_ident_hash}_{param}_layout").to_uppercase(), - param.span(), - ); - - quote::quote_spanned! { ty.span()=> - const _: ::rust_cuda::safety::type_layout::Assert<{ - ::rust_cuda::safety::type_layout::CpuAndGpuTypeLayouts::Match - }> = ::rust_cuda::safety::type_layout::Assert::<{ - ::rust_cuda::safety::type_layout::check::<#ty>(#layout_param) - }>; - } - }) - .collect::>() - }; - - quote! 
{ - fn get_ptx_str() -> &'static str { - rust_cuda::host::link_kernel!{ - #func_ident #args #crate_name #crate_manifest_dir #generic_start_token - #($#macro_type_ids),* - #generic_close_token - } - - #matching_kernel_assert - - #(#type_layout_asserts)* - - PTX_STR - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs deleted file mode 100644 index 7cad78e05..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ /dev/null @@ -1,169 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; - -pub(super) fn quote_kernel_func( - KernelConfig { args, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_wrapper_params, - generic_close_token, - generic_wrapper_where_clause, - .. - }: &DeclGenerics, - inputs @ FunctionInputs { func_inputs, .. }: &FunctionInputs, - fn_ident @ FuncIdent { func_ident, .. }: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], - macro_type_ids: &[syn::Ident], -) -> TokenStream { - let new_func_inputs = func_inputs - .iter() - .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote! { - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident - }; - - if let syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - .. - }) = &**ty - { - quote! { - #(#attrs)* #pat #colon_token #and_token #lifetime #mutability #syn_type - } - } else { - quote! { #(#attrs)* #pat #colon_token #syn_type } - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect::>(); - - let raw_func_input_wrap = generate_raw_func_input_wrap(inputs, fn_ident, func_params); - - quote! { - #(#func_attrs)* - #[allow(clippy::needless_lifetimes)] - fn #func_ident #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause - { - // impls check adapted from Nikolai Vazquez's `impls` crate: - // https://docs.rs/impls/1.0.3/src/impls/lib.rs.html#584-602 - const fn __check_is_sync(_x: &T) -> bool { - trait IsSyncMarker { - const SYNC: bool = false; - } - impl IsSyncMarker for T {} - struct CheckIs(::core::marker::PhantomData); - #[allow(dead_code)] - impl CheckIs { - const SYNC: bool = true; - } - - >::SYNC - } - - #raw_func_input_wrap - } - } -} - -#[allow(clippy::too_many_lines)] -fn generate_raw_func_input_wrap( - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - FuncIdent { func_ident_raw, .. }: &FuncIdent, - func_params: &[syn::Ident], -) -> TokenStream { - func_inputs - .iter() - .zip(func_params) - .zip(func_input_cuda_types.iter()) - .rev() - .fold( - quote! { - self.#func_ident_raw(#(#func_params),*) - }, - |inner, ((arg, param), (cuda_mode, _ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => match cuda_mode { - InputCudaType::SafeDeviceCopy => { - if let syn::Type::Reference(..) = &**ty { - let pat_box = quote::format_ident!("__{}_box", param); - - // DeviceCopy mode only supports immutable references - quote! 
{ - let mut #pat_box = rust_cuda::host::HostDeviceBox::from( - rust_cuda::rustacuda::memory::DeviceBox::new( - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) - )? - ); - #[allow(clippy::redundant_closure_call)] - // Safety: `#pat_box` contains exactly the device copy of `#pat` - let __result = (|#pat| { #inner })(unsafe { - rust_cuda::host::HostAndDeviceConstRef::new( - &#pat_box, rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) - ) - }); - - #[allow(invalid_reference_casting)] - if !__check_is_sync(#pat) { - // Safety: - // * Since `#ty` is `!Sync`, it contains interior mutability - // * Therefore, part of the 'immutable' device copy may have - // been mutated - // * If all mutation was confined to interior mutability, - // then passing these changes on is safe (and expected) - // * If any mutations occured outside interior mutability, - // then UB occurred, in the kernel (we're not the cause) - #pat_box.copy_to(unsafe { &mut *(#pat as *const _ as *mut _) })?; - } - - ::core::mem::drop(#pat_box); - __result - } - } else { - quote! { { - let #pat = rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from(#pat); - #inner - } } - } - }, - InputCudaType::LendRustToCuda => { - if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { - if mutability.is_some() { - quote! { rust_cuda::host::LendToCuda::lend_to_cuda_mut( - #pat, |#pat| { #inner } - ) } - } else { - quote! { rust_cuda::host::LendToCuda::lend_to_cuda( - #pat, |#pat| { #inner } - ) } - } - } else { - quote! { rust_cuda::host::LendToCuda::move_to_cuda( - #pat, |#pat| { #inner } - ) } - } - }, - }, - syn::FnArg::Receiver(_) => unreachable!(), - }, - ) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/launch_types.rs deleted file mode 100644 index 0fed7282f..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/launch_types.rs +++ /dev/null @@ -1,106 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use crate::kernel::utils::r2c_move_lifetime; - -use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; - -pub(in super::super) fn generate_launch_types( - KernelConfig { args, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - macro_type_ids: &[syn::Ident], -) -> (Vec, Vec, Vec) { - let mut cpu_func_types_launch = Vec::with_capacity(func_inputs.len()); - let mut cpu_func_lifetime_erased_types = Vec::with_capacity(func_inputs.len()); - let mut cpu_func_unboxed_types = Vec::with_capacity(func_inputs.len()); - - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .enumerate() - .for_each(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. }) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote::quote_spanned! { ty.span()=> - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident - }; - - cpu_func_unboxed_types.push(syn_type.clone()); - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! 
{ ty.span()=> - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation - > - }, - }; - - cpu_func_types_launch.push( - if let syn::Type::Reference(syn::TypeReference { - mutability, - lifetime, - .. - }) = &**ty - { - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<#lifetime, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - quote! { #cuda_type } - }, - ); - - cpu_func_lifetime_erased_types.push( - if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<'static, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<'static, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<'static, #cuda_type> - } - } else { - cuda_type - }, - ); - }, - syn::FnArg::Receiver(_) => unreachable!(), - }); - - ( - cpu_func_types_launch, - cpu_func_lifetime_erased_types, - cpu_func_unboxed_types, - ) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs deleted file mode 100644 index ab352b4c8..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs +++ /dev/null @@ -1,111 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; - -mod launch_types; -mod raw_func_types; -mod type_wrap; - -pub(super) use launch_types::generate_launch_types; -use raw_func_types::generate_raw_func_types; -use type_wrap::generate_func_input_and_ptx_jit_wraps; - -#[allow(clippy::too_many_arguments)] -pub(super) fn quote_kernel_func_raw( - config @ KernelConfig { args, .. }: &KernelConfig, - decl_generics @ DeclGenerics { - generic_start_token, - generic_wrapper_params, - generic_close_token, - generic_wrapper_where_clause, - .. - }: &DeclGenerics, - func_inputs: &FunctionInputs, - FuncIdent { func_ident_raw, .. }: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], - macro_type_ids: &[syn::Ident], -) -> TokenStream { - let new_func_inputs_raw = - generate_raw_func_types(config, decl_generics, func_inputs, macro_type_ids); - let (func_input_wrap, func_cpu_ptx_jit_wrap) = - generate_func_input_and_ptx_jit_wraps(func_inputs); - let (cpu_func_types_launch, cpu_func_lifetime_erased_types, cpu_func_unboxed_types) = - generate_launch_types(config, decl_generics, func_inputs, macro_type_ids); - - quote! { - #(#func_attrs)* - #[allow(clippy::extra_unused_type_parameters)] - fn #func_ident_raw #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_raw),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause - { - let rust_cuda::host::LaunchPackage { - kernel, watcher, config, stream - } = rust_cuda::host::Launcher::get_launch_package(self); - - let kernel_jit_result = if config.ptx_jit { - rust_cuda::ptx_jit::compilePtxJITwithArguments! 
{ - kernel.compile_with_ptx_jit_args(#(#func_cpu_ptx_jit_wrap),*) - }? - } else { - kernel.compile_with_ptx_jit_args(None)? - }; - - let function = match kernel_jit_result { - rust_cuda::host::KernelJITResult::Recompiled(function) => { - // Call launcher hook on kernel compilation - ::on_compile(function, watcher)?; - - function - }, - rust_cuda::host::KernelJITResult::Cached(function) => function, - }; - - #[allow(clippy::redundant_closure_call)] - (|#(#func_params: #cpu_func_types_launch),*| { - #[deny(improper_ctypes)] - mod __rust_cuda_ffi_safe_assert { - use super::#args; - - extern "C" { #( - #[allow(dead_code)] - static #func_params: #cpu_func_lifetime_erased_types; - )* } - } - - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - - #[allow(dead_code)] - fn assert_impl_no_aliasing() {} - - #[allow(dead_code)] - fn assert_impl_fits_into_device_register< - T: rust_cuda::safety::FitsIntoDeviceRegister, - >(_val: &T) {} - - #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_aliasing::<#cpu_func_unboxed_types>();)* - #(assert_impl_fits_into_device_register(&#func_params);)* - } - - let rust_cuda::host::LaunchConfig { - grid, block, shared_memory_size, ptx_jit: _, - } = config; - - unsafe { stream.launch(function, grid, block, shared_memory_size, - &[ - #( - &#func_params as *const _ as *mut ::std::ffi::c_void - ),* - ] - ) }?; - - stream.synchronize() - })(#(#func_input_wrap),*) - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs deleted file mode 100644 index 380048ec5..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs +++ /dev/null @@ -1,93 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use crate::kernel::utils::r2c_move_lifetime; - -use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; - -pub(super) fn generate_raw_func_types( - KernelConfig { args, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - macro_type_ids: &[syn::Ident], -) -> Vec { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote! { - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident - }; - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote! { - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote! { - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation - > - }, - }; - - if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. - }) = &**ty - { - let wrapped_type = if mutability.is_some() { - if matches!(cuda_mode, InputCudaType::SafeDeviceCopy) { - abort!( - mutability.span(), - "Cannot mutably alias a `SafeDeviceCopy` kernel parameter." - ); - } - - quote!( - rust_cuda::host::HostAndDeviceMutRef<#lifetime, #cuda_type> - ) - } else { - quote!( - rust_cuda::host::HostAndDeviceConstRef<#lifetime, #cuda_type> - ) - }; - - quote! 
{ - #(#attrs)* #mutability #pat #colon_token #wrapped_type - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - - let wrapped_type = quote! { - rust_cuda::host::HostAndDeviceOwned<#lifetime, #cuda_type> - }; - - quote! { - #(#attrs)* #pat #colon_token #wrapped_type - } - } else { - quote! { #(#attrs)* #pat #colon_token #cuda_type } - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs deleted file mode 100644 index 432930731..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs +++ /dev/null @@ -1,38 +0,0 @@ -use proc_macro2::TokenStream; - -use crate::kernel::wrapper::InputCudaType; - -use super::super::super::super::FunctionInputs; - -pub(super) fn generate_func_input_and_ptx_jit_wraps( - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, Vec) { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .map(|(arg, (cuda_mode, ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { - #[allow(clippy::if_same_then_else)] - let func_input = if let syn::Type::Reference(_) = &**ty { - quote! { #pat.for_device() } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote! { #pat.for_device() } - } else { - quote! { #pat } - }; - - let ptx_load = if ptx_jit.0 { - quote! { ConstLoad[#pat.for_host()] } - } else { - quote! { Ignore[#pat] } - }; - - (func_input, ptx_load) - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs deleted file mode 100644 index 7ab891e7e..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ /dev/null @@ -1,105 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; - -mod get_ptx_str; -mod kernel_func; -mod kernel_func_raw; -mod new_kernel; - -use get_ptx_str::quote_get_ptx_str; -use kernel_func::quote_kernel_func; -use kernel_func_raw::quote_kernel_func_raw; -use new_kernel::quote_new_kernel; - -pub(in super::super) fn quote_cpu_linker_macro( - config @ KernelConfig { - visibility, - kernel, - linker, - launcher, - .. - }: &KernelConfig, - decl_generics @ DeclGenerics { - generic_start_token, - generic_trait_params: generic_params, - generic_close_token, - .. - }: &DeclGenerics, - func_inputs: &FunctionInputs, - func_ident: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], -) -> TokenStream { - let macro_types = generic_params - .iter() - .enumerate() - .map(|(i, generic)| { - let generic_ident = quote::format_ident!("__g_{}", i); - - match generic { - syn::GenericParam::Type(_) => quote!($#generic_ident:ty), - syn::GenericParam::Const(_) => quote!($#generic_ident:expr), - syn::GenericParam::Lifetime(_) => unreachable!(), - } - }) - .collect::>(); - - let macro_type_ids = (0..generic_params.len()) - .map(|i| quote::format_ident!("__g_{}", i)) - .collect::>(); - - let cpu_linker_macro_visibility = if visibility.is_some() { - quote! { #[macro_export] } - } else { - quote! 
{} - }; - - let get_ptx_str = quote_get_ptx_str( - func_ident, - config, - decl_generics, - func_inputs, - func_params, - ¯o_type_ids, - ); - let new_kernel = quote_new_kernel(config, decl_generics, func_ident, ¯o_type_ids); - let kernel_func = quote_kernel_func( - config, - decl_generics, - func_inputs, - func_ident, - func_params, - func_attrs, - ¯o_type_ids, - ); - let kernel_func_raw = quote_kernel_func_raw( - config, - decl_generics, - func_inputs, - func_ident, - func_params, - func_attrs, - ¯o_type_ids, - ); - - quote! { - #[cfg(not(target_os = "cuda"))] - #cpu_linker_macro_visibility - macro_rules! #linker { - (#(#macro_types),* $(,)?) => { - unsafe impl #kernel #generic_start_token #($#macro_type_ids),* #generic_close_token - for #launcher #generic_start_token #($#macro_type_ids),* #generic_close_token - { - #get_ptx_str - - #new_kernel - - #kernel_func - - #kernel_func_raw - } - }; - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs deleted file mode 100644 index fa32591db..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs +++ /dev/null @@ -1,33 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::super::{DeclGenerics, FuncIdent, KernelConfig}; - -pub(super) fn quote_new_kernel( - KernelConfig { kernel, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, - FuncIdent { - func_ident_hash, .. - }: &FuncIdent, - macro_type_ids: &[syn::Ident], -) -> TokenStream { - quote! { - fn new_kernel() -> rust_cuda::rustacuda::error::CudaResult< - rust_cuda::host::TypedKernel - > { - let ptx = Self::get_ptx_str(); - let entry_point = rust_cuda::host::specialise_kernel_call!( - #func_ident_hash #generic_start_token - #($#macro_type_ids),* - #generic_close_token - ); - - rust_cuda::host::TypedKernel::new(ptx, entry_point) - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs deleted file mode 100644 index cad3cdc6a..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ /dev/null @@ -1,186 +0,0 @@ -use proc_macro2::TokenStream; - -use crate::kernel::utils::r2c_move_lifetime; - -use super::super::{ - DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, -}; - -pub(in super::super) fn quote_cpu_wrapper( - config @ KernelConfig { - visibility, kernel, .. - }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_trait_params, - generic_close_token, - generic_trait_where_clause, - generic_wrapper_params, - generic_wrapper_where_clause, - .. - }: &DeclGenerics, - impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, - func_inputs: &FunctionInputs, - FuncIdent { - func_ident, - func_ident_raw, - .. - }: &FuncIdent, - func_attrs: &[syn::Attribute], -) -> TokenStream { - let launcher_predicate = quote! { - Self: Sized + rust_cuda::host::Launcher< - KernelTraitObject = dyn #kernel #ty_generics - > - }; - - let generic_wrapper_where_clause = match generic_wrapper_where_clause { - Some(syn::WhereClause { - where_token, - predicates, - }) if !predicates.is_empty() => { - let comma = if predicates.empty_or_trailing() { - quote!() - } else { - quote!(,) - }; - - quote! { - #where_token #predicates #comma #launcher_predicate - } - }, - _ => quote! 
{ - where #launcher_predicate - }, - }; - - let (new_func_inputs_decl, new_func_inputs_raw_decl) = - generate_new_func_inputs_decl(config, impl_generics, func_inputs); - - quote! { - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #kernel #generic_start_token #generic_trait_params #generic_close_token - #generic_trait_where_clause - { - fn get_ptx_str() -> &'static str where #launcher_predicate; - - fn new_kernel() -> rust_cuda::rustacuda::error::CudaResult< - rust_cuda::host::TypedKernel - > where #launcher_predicate; - - #(#func_attrs)* - fn #func_ident #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_decl),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause; - - #(#func_attrs)* - fn #func_ident_raw #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_raw_decl),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause; - } - } -} - -fn generate_new_func_inputs_decl( - KernelConfig { args, .. }: &KernelConfig, - ImplGenerics { ty_generics, .. }: &ImplGenerics, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, Vec) { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => ( - syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: pat.clone(), - colon_token: *colon_token, - ty: { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = syn::parse_quote!(<() as #args #ty_generics>::#type_ident); - - if let syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - .. - }) = &**ty - { - Box::new(syn::Type::Reference(syn::TypeReference { - and_token: *and_token, - lifetime: lifetime.clone(), - mutability: *mutability, - elem: syn_type, - })) - } else { - syn_type - } - }, - }), - syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: pat.clone(), - colon_token: *colon_token, - ty: { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type: Box = - syn::parse_quote!(<() as #args #ty_generics>::#type_ident); - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => syn::parse_quote!( - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - ), - InputCudaType::LendRustToCuda => syn::parse_quote!( - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation - > - ), - }; - - if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. 
- }) = &**ty - { - let wrapped_type = if mutability.is_some() { - syn::parse_quote!( - rust_cuda::host::HostAndDeviceMutRef<#lifetime, #cuda_type> - ) - } else { - syn::parse_quote!( - rust_cuda::host::HostAndDeviceConstRef<#lifetime, #cuda_type> - ) - }; - - Box::new(wrapped_type) - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - - let wrapped_type = syn::parse_quote!( - rust_cuda::host::HostAndDeviceOwned<#lifetime, #cuda_type> - ); - - Box::new(wrapped_type) - } else { - cuda_type - } - }, - }), - ), - syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs deleted file mode 100644 index 628642fc0..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs +++ /dev/null @@ -1,25 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::{DeclGenerics, FuncIdent}; - -pub(in super::super) fn quote_cuda_generic_function( - DeclGenerics { - generic_start_token, - generic_kernel_params: generic_params, - generic_close_token, - generic_kernel_where_clause: generic_where_clause, - .. - }: &DeclGenerics, - func_inputs: &syn::punctuated::Punctuated, - FuncIdent { func_ident, .. }: &FuncIdent, - func_attrs: &[syn::Attribute], - func_block: &syn::Block, -) -> TokenStream { - quote! { - #[cfg(target_os = "cuda")] - #(#func_attrs)* - fn #func_ident #generic_start_token #generic_params #generic_close_token (#func_inputs) - #generic_where_clause - #func_block - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs deleted file mode 100644 index d017efae1..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ /dev/null @@ -1,249 +0,0 @@ -use proc_macro2::TokenStream; -use quote::quote_spanned; -use syn::spanned::Spanned; - -use super::super::{FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; - -#[allow(clippy::too_many_lines)] -pub(in super::super) fn quote_cuda_wrapper( - config @ KernelConfig { args, .. }: &KernelConfig, - inputs @ FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - FuncIdent { - func_ident, - func_ident_hash, - .. - }: &FuncIdent, - func_attrs: &[syn::Attribute], - func_params: &[syn::Ident], -) -> TokenStream { - let (ptx_func_inputs, ptx_func_types) = specialise_ptx_func_inputs(config, inputs); - let ptx_func_unboxed_types = specialise_ptx_unboxed_types(config, inputs); - - let func_layout_params = func_params - .iter() - .map(|ident| { - syn::Ident::new( - &format!("__{func_ident_hash}_{ident}_layout").to_uppercase(), - ident.span(), - ) - }) - .collect::>(); - - let ptx_func_input_unwrap = func_inputs - .iter().zip(func_input_cuda_types.iter()).enumerate() - .rev() - .fold(quote! { - #func_ident(#(#func_params),*) - }, |inner, (i, (arg, (cuda_mode, ptx_jit)))| match arg { - syn::FnArg::Typed(syn::PatType { - pat, - ty, - .. - }) => { - // Emit PTX JIT load markers - let ptx_jit_load = if ptx_jit.0 { - quote! { - rust_cuda::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) - } - } else { quote! {} }; - - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote::quote_spanned! 
{ ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) - }; - - match cuda_mode { - InputCudaType::SafeDeviceCopy => if let syn::Type::Reference( - syn::TypeReference { and_token, .. } - ) = &**ty { - // DeviceCopy mode only supports immutable references - quote! { #ptx_jit_load; { let #pat: #and_token #syn_type = #pat.as_ref().into_ref(); #inner } } - } else { - quote! { { let #pat: #syn_type = #pat.into_inner(); #inner } } - }, - InputCudaType::LendRustToCuda => if let syn::Type::Reference( - syn::TypeReference { and_token, mutability, ..} - ) = &**ty { - if mutability.is_some() { - quote! { - #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_borrow_from_rust_mut( - #pat, |#pat: #and_token #mutability rust_cuda::device::ShallowCopy<#syn_type>| { #inner }, - ) - } - } else { - quote! { - #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_borrow_from_rust( - #pat, |#pat: #and_token rust_cuda::device::ShallowCopy<#syn_type>| { #inner }, - ) - } - } - } else { - quote! { - #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_moved_from_rust( - #pat, |#pat: #syn_type| { #inner }, - ) - } - } - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }); - - let func_type_layout_ident = quote::format_ident!("{}_type_layout", func_ident); - - quote! { - #[cfg(target_os = "cuda")] - #[rust_cuda::device::specialise_kernel_entry(#args)] - #[no_mangle] - #(#func_attrs)* - pub unsafe extern "ptx-kernel" fn #func_type_layout_ident(#(#func_params: &mut &[u8]),*) { - #( - #[no_mangle] - static #func_layout_params: [ - u8; rust_cuda::const_type_layout::serialised_type_graph_len::<#ptx_func_types>() - ] = rust_cuda::const_type_layout::serialise_type_graph::<#ptx_func_types>(); - - *#func_params = &#func_layout_params; - )* - } - - #[cfg(target_os = "cuda")] - #[rust_cuda::device::specialise_kernel_entry(#args)] - #[no_mangle] - #(#func_attrs)* - pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { - #[deny(improper_ctypes)] - mod __rust_cuda_ffi_safe_assert { - use super::#args; - - extern "C" { #( - #[allow(dead_code)] - static #func_params: #ptx_func_types; - )* } - } - - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - - #[allow(dead_code)] - fn assert_impl_no_aliasing() {} - - #[allow(dead_code)] - fn assert_impl_fits_into_device_register< - T: rust_cuda::safety::FitsIntoDeviceRegister, - >(_val: &T) {} - - #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_aliasing::<#ptx_func_unboxed_types>();)* - #(assert_impl_fits_into_device_register(&#func_params);)* - } - - #ptx_func_input_unwrap - } - } -} - -fn specialise_ptx_func_inputs( - KernelConfig { args, .. }: &KernelConfig, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, Vec) { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { - syn::FnArg::Typed( - fn_arg @ syn::PatType { - attrs, - pat, - colon_token, - ty, - }, - ) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote::quote_spanned! { ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) - }; - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! 
{ ty.span()=> - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation - > - }, - }; - - let ty = if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. - }) = &**ty - { - let lifetime = quote_spanned! { lifetime.span()=> - 'static - }; - - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<#lifetime, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = quote_spanned! { ty.span()=> - 'static - }; - - quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - cuda_type - }; - - let fn_arg = quote::quote_spanned! { fn_arg.span()=> - #(#attrs)* #pat #colon_token #ty - }; - - (fn_arg, ty) - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip() -} - -fn specialise_ptx_unboxed_types( - KernelConfig { args, .. }: &KernelConfig, - FunctionInputs { func_inputs, .. }: &FunctionInputs, -) -> Vec { - func_inputs - .iter() - .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. }) => { - let type_ident = quote::format_ident!("__T_{}", i); - - quote::quote_spanned! { ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs deleted file mode 100644 index 4dd9b4096..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -pub mod args_trait; -pub mod cpu_linker_macro; -pub mod cpu_wrapper; -pub mod cuda_generic_function; -pub mod cuda_wrapper; diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs deleted file mode 100644 index ceeee1e3e..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs +++ /dev/null @@ -1,93 +0,0 @@ -use syn::spanned::Spanned; - -use super::InputCudaType; - -pub(super) enum KernelInputAttribute { - PassType(proc_macro2::Span, InputCudaType), - PtxJit(proc_macro2::Span, bool), -} - -impl syn::parse::Parse for KernelInputAttribute { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - let ident: syn::Ident = input.parse()?; - - match &*ident.to_string() { - "pass" => { - let eq: syn::token::Eq = input.parse()?; - let mode: syn::Ident = input.parse()?; - - let cuda_type = match &*mode.to_string() { - "SafeDeviceCopy" => InputCudaType::SafeDeviceCopy, - "LendRustToCuda" => InputCudaType::LendRustToCuda, - _ => abort!( - mode.span(), - "Unexpected CUDA transfer mode `{:?}`: Expected `SafeDeviceCopy` or \ - `LendRustToCuda`.", - mode - ), - }; - - Ok(KernelInputAttribute::PassType( - ident - .span() - .join(eq.span()) - .unwrap() - .join(mode.span()) - .unwrap(), - cuda_type, - )) - }, - "jit" => { - let eq: Option = input.parse()?; - - let (ptx_jit, span) = if eq.is_some() { - let value: syn::LitBool = input.parse()?; - - ( - value.value(), - ident - .span() - .join(eq.span()) - .unwrap() - .span() - .join(value.span()) - .unwrap(), - ) - } else { - (true, ident.span()) - }; - - Ok(KernelInputAttribute::PtxJit(span, ptx_jit)) - }, - _ => abort!( - ident.span(), - "Unexpected kernel attribute `{:?}`: Expected `pass` or `jit`.", - ident - ), - } - } -} - -pub(super) struct 
KernelInputAttributes(Vec<KernelInputAttribute>); - -impl syn::parse::Parse for KernelInputAttributes { - fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> { - let content; - let _parens = syn::parenthesized!(content in input); - - syn::punctuated::Punctuated::< - KernelInputAttribute, syn::token::Comma - >::parse_separated_nonempty(&content).map(|punctuated| { - Self(punctuated.into_iter().collect()) - }) - } -} - -impl IntoIterator for KernelInputAttributes { - type IntoIter = std::vec::IntoIter<KernelInputAttribute>; - type Item = KernelInputAttribute; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs deleted file mode 100644 index f3cc1a4d8..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ /dev/null @@ -1,229 +0,0 @@ -use syn::spanned::Spanned; - -use crate::kernel::utils::r2c_move_lifetime; - -use super::{InputCudaType, InputPtxJit}; - -mod attribute; -use attribute::{KernelInputAttribute, KernelInputAttributes}; - -pub(super) struct FunctionInputs { - pub(super) func_inputs: syn::punctuated::Punctuated<syn::FnArg, syn::token::Comma>, - pub(super) func_input_cuda_types: Vec<(InputCudaType, InputPtxJit)>, -} - -pub(super) fn parse_function_inputs( - func: &syn::ItemFn, - generic_params: &mut syn::punctuated::Punctuated<syn::GenericParam, syn::token::Comma>, -) -> FunctionInputs { - let mut implicit_lifetime_id: usize = 0; - - let (func_inputs, func_input_cuda_types): ( - syn::punctuated::Punctuated<syn::FnArg, syn::token::Comma>, - Vec<(InputCudaType, InputPtxJit)>, - ) = func - .sig - .inputs - .iter() - .enumerate() - .map(|(i, arg)| match arg { - receiver @ syn::FnArg::Receiver(_) => { - abort!(receiver.span(), "Kernel function must not have a receiver.") - }, - syn::FnArg::Typed( - input @ syn::PatType { - attrs, - pat, - colon_token, - ty, - }, - ) => { - let mut cuda_type: Option<InputCudaType> = None; - let mut ptx_jit: Option<InputPtxJit> = None; - - let attrs = attrs - .iter() - .filter(|attr| match attr.path.get_ident() { - Some(ident) if ident == "kernel" => { - let attrs: KernelInputAttributes = - match syn::parse_macro_input::parse(attr.tokens.clone().into()) { - Ok(data) => data, - Err(err) => abort!(attr.span(), err), - }; - - for attr in attrs { - match attr { - KernelInputAttribute::PassType(_span, pass_type) - if cuda_type.is_none() => - { - cuda_type = Some(pass_type); - }, - KernelInputAttribute::PassType(span, _pass_type) => { - abort!(span, "Duplicate CUDA transfer mode declaration."); - }, - KernelInputAttribute::PtxJit(span, jit) - if ptx_jit.is_none() => - { - if !matches!(&**ty, syn::Type::Reference(_)) && jit { - abort!( - span, - "Only reference types can be PTX JIT loaded." - ); - } - - ptx_jit = Some(InputPtxJit(jit)); - }, - KernelInputAttribute::PtxJit(span, _jit) => { - abort!(span, "Duplicate PTX JIT declaration."); - }, - } - } - - false - }, - _ => true, - }) - .cloned() - .collect(); - - let cuda_type = cuda_type.unwrap_or_else(|| { - abort!( - input.span(), - "Kernel function input must specify its CUDA transfer mode using \ - #[kernel(pass = ...)]."
- ); - }); - - let ty = ensure_reference_type_lifetime( - i, - ty, - &cuda_type, - &mut implicit_lifetime_id, - generic_params, - ); - - ( - syn::FnArg::Typed(syn::PatType { - attrs, - pat: pat.clone(), - colon_token: *colon_token, - ty, - }), - (cuda_type, ptx_jit.unwrap_or(InputPtxJit(false))), - ) - }, - }) - .unzip(); - - FunctionInputs { - func_inputs, - func_input_cuda_types, - } -} - -#[allow(clippy::unnecessary_box_returns)] -fn ensure_reference_type_lifetime( - i: usize, - ty: &syn::Type, - cuda_type: &InputCudaType, - implicit_lifetime_id: &mut usize, - generic_params: &mut syn::punctuated::Punctuated, -) -> Box { - match ty { - syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - elem, - }) => { - let lifetime = lifetime.clone().unwrap_or_else(|| { - let lifetime = syn::Lifetime::new( - &format!("'__r2c_lt_{implicit_lifetime_id}"), - lifetime.span(), - ); - - generic_params.insert( - *implicit_lifetime_id, - syn::GenericParam::Lifetime(syn::LifetimeDef { - attrs: Vec::new(), - lifetime: lifetime.clone(), - colon_token: None, - bounds: syn::punctuated::Punctuated::new(), - }), - ); - - *implicit_lifetime_id += 1; - - lifetime - }); - - let elem = if matches!(cuda_type, InputCudaType::LendRustToCuda) { - (|| { - if let syn::Type::Path(syn::TypePath { - path: syn::Path { segments, .. }, - qself: None, - }) = &**elem - { - if let Some(syn::PathSegment { - ident, - arguments: - syn::PathArguments::AngleBracketed( - syn::AngleBracketedGenericArguments { args, .. }, - ), - }) = segments.last() - { - if ident == "ShallowCopy" && segments.len() == 1 { - match args.last() { - Some(syn::GenericArgument::Type(elem)) if args.len() == 1 => { - return Box::new(elem.clone()); - }, - _ => { - abort!( - args.span(), - "`ShallowCopy` takes exactly one generic type \ - argument." - ); - }, - } - } - } - } - - emit_warning!( - elem.span(), - "RustToCuda kernel parameters should be explicitly wrapped with the \ - `ShallowCopy` marker to communicate their aliasing behaviour." 
- ); - - elem.clone() - })() - } else { - elem.clone() - }; - - Box::new(syn::Type::Reference(syn::TypeReference { - and_token: *and_token, - lifetime: Some(lifetime), - mutability: *mutability, - elem, - })) - }, - ty => { - if matches!(cuda_type, InputCudaType::LendRustToCuda) { - generic_params.insert( - *implicit_lifetime_id, - syn::GenericParam::Lifetime(syn::LifetimeDef { - attrs: Vec::new(), - lifetime: r2c_move_lifetime(i, ty), - colon_token: None, - bounds: syn::punctuated::Punctuated::new(), - }), - ); - - *implicit_lifetime_id += 1; - } - - Box::new(ty.clone()) - }, - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs deleted file mode 100644 index 6f63af892..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ /dev/null @@ -1,325 +0,0 @@ -use std::hash::{Hash, Hasher}; - -use proc_macro::TokenStream; - -mod config; -mod generate; -mod inputs; -mod parse; - -use config::KernelConfig; -use generate::{ - args_trait::quote_args_trait, cpu_linker_macro::quote_cpu_linker_macro, - cpu_wrapper::quote_cpu_wrapper, cuda_generic_function::quote_cuda_generic_function, - cuda_wrapper::quote_cuda_wrapper, -}; -use inputs::{parse_function_inputs, FunctionInputs}; -use parse::parse_kernel_fn; -use proc_macro2::Span; -use syn::spanned::Spanned; - -#[allow(clippy::too_many_lines)] -pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { - let mut hasher = seahash::SeaHasher::new(); - - attr.to_string().hash(&mut hasher); - func.to_string().hash(&mut hasher); - - let kernel_hash = hasher.finish(); - - let config: KernelConfig = match syn::parse_macro_input::parse(attr) { - Ok(config) => config, - Err(err) => { - abort_call_site!( - "#[kernel(pub? use LINKER! as impl KERNEL for LAUNCHER)] expects LINKER, \ - KERNEL, ARGS and LAUNCHER identifiers: {:?}", - err - ) - }, - }; - - let func = parse_kernel_fn(func); - - let mut generic_kernel_params = func.sig.generics.params.clone(); - let mut func_inputs = parse_function_inputs(&func, &mut generic_kernel_params); - - let (generic_start_token, generic_close_token) = if generic_kernel_params.is_empty() { - (None, None) - } else if let (Some(start), Some(close)) = - (func.sig.generics.lt_token, func.sig.generics.gt_token) - { - (Some(start), Some(close)) - } else { - (Some(syn::parse_quote!(<)), Some(syn::parse_quote!(>))) - }; - - let generic_trait_params = generic_kernel_params - .iter() - .filter(|generic_param| !matches!(generic_param, syn::GenericParam::Lifetime(_))) - .cloned() - .collect(); - let generic_wrapper_params = generic_kernel_params - .iter() - .filter(|generic_param| matches!(generic_param, syn::GenericParam::Lifetime(_))) - .cloned() - .collect(); - - let generic_kernel_where_clause = &func.sig.generics.where_clause; - let generic_trait_where_clause = generic_kernel_where_clause.as_ref().map( - |syn::WhereClause { - where_token, - predicates, - }: &syn::WhereClause| { - let predicates = predicates - .iter() - .filter(|predicate| !matches!(predicate, syn::WherePredicate::Lifetime(_))) - .cloned() - .collect(); - - syn::WhereClause { - where_token: *where_token, - predicates, - } - }, - ); - let generic_wrapper_where_clause = generic_kernel_where_clause.as_ref().map( - |syn::WhereClause { - where_token, - predicates, - }: &syn::WhereClause| { - let predicates = predicates - .iter() - .filter(|predicate| matches!(predicate, syn::WherePredicate::Lifetime(_))) - .cloned() - .collect(); - - syn::WhereClause { - where_token: *where_token, - predicates, - } - }, - ); 
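// The deleted `kernel` entry point above derives a crate-unique entry-point
// symbol by hashing the macro's raw input with seahash and appending the
// digest to the function name (see `func_ident_hash` below). A standalone
// sketch of that mangling scheme; `hashed_kernel_ident` is an invented helper,
// and seahash, quote, and proc-macro2 are assumed dependencies:
use std::hash::{Hash, Hasher};

fn hashed_kernel_ident(attr: &str, func: &str, ident: &str) -> proc_macro2::Ident {
    // Hash the attribute and function token text, as the macro does:
    let mut hasher = seahash::SeaHasher::new();
    attr.hash(&mut hasher);
    func.hash(&mut hasher);
    let kernel_hash = hasher.finish();

    // `<name>_` plus 16 hex digits gives a stable, collision-resistant symbol:
    quote::format_ident!("{}_{:016x}", ident, kernel_hash)
}

fn main() {
    // Identical macro input always produces the same mangled name:
    let a = hashed_kernel_ident("", "fn kernel(x: &u32) {}", "kernel");
    let b = hashed_kernel_ident("", "fn kernel(x: &u32) {}", "kernel");
    assert_eq!(a, b);
    println!("{a}");
}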
- - let decl_generics = DeclGenerics { - generic_start_token: &generic_start_token, - generic_trait_params: &generic_trait_params, - generic_close_token: &generic_close_token, - generic_trait_where_clause: &generic_trait_where_clause, - generic_wrapper_params: &generic_wrapper_params, - generic_wrapper_where_clause: &generic_wrapper_where_clause, - generic_kernel_params: &generic_kernel_params, - generic_kernel_where_clause, - }; - let trait_generics = syn::Generics { - lt_token: generic_start_token, - params: generic_trait_params.clone(), - gt_token: generic_close_token, - where_clause: generic_trait_where_clause.clone(), - }; - let impl_generics = { - let (impl_generics, ty_generics, where_clause) = trait_generics.split_for_impl(); - - ImplGenerics { - impl_generics, - ty_generics, - where_clause, - } - }; - - let func_ident = FuncIdent { - func_ident: &func.sig.ident, - func_ident_raw: quote::format_ident!("{}_raw", &func.sig.ident), - func_ident_hash: quote::format_ident!("{}_{:016x}", &func.sig.ident, kernel_hash), - }; - - let func_params = func_inputs - .func_inputs - .iter() - .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { pat, .. }) => match ident_from_pat(pat) { - Some(ident) => ident, - None => syn::Ident::new(&format!("{}_arg_{i}", func_ident.func_ident), pat.span()), - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect::>(); - - let pat_func_inputs = func_inputs - .func_inputs - .iter_mut() - .zip(&func_params) - .map(|(arg, ident)| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - colon_token, - ty, - .. - }) => { - let ident_fn_arg = syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: Box::new(syn::Pat::Ident(syn::PatIdent { - attrs: Vec::new(), - by_ref: None, - mutability: None, - ident: ident.clone(), - subpat: None, - })), - colon_token: *colon_token, - ty: ty.clone(), - }); - - std::mem::replace(arg, ident_fn_arg) - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect(); - - let args_trait = quote_args_trait(&config, &decl_generics, &impl_generics, &func_inputs); - let cpu_wrapper = quote_cpu_wrapper( - &config, - &decl_generics, - &impl_generics, - &func_inputs, - &func_ident, - &func.attrs, - ); - let cpu_cuda_check = quote_generic_check(&func_ident, &config); - let cpu_linker_macro = quote_cpu_linker_macro( - &config, - &decl_generics, - &func_inputs, - &func_ident, - &func_params, - &func.attrs, - ); - let cuda_wrapper = quote_cuda_wrapper( - &config, - &func_inputs, - &func_ident, - &func.attrs, - &func_params, - ); - let cuda_generic_function = quote_cuda_generic_function( - &decl_generics, - &pat_func_inputs, - &func_ident, - &func.attrs, - &func.block, - ); - - (quote! 
{ - #args_trait - #cpu_wrapper - - #cpu_cuda_check - - #cpu_linker_macro - - #cuda_wrapper - #cuda_generic_function - }) - .into() -} - -enum InputCudaType { - SafeDeviceCopy, - LendRustToCuda, -} - -struct InputPtxJit(bool); - -#[allow(clippy::struct_field_names)] -struct DeclGenerics<'f> { - generic_start_token: &'f Option, - generic_trait_params: &'f syn::punctuated::Punctuated, - generic_close_token: &'f Option, - generic_trait_where_clause: &'f Option, - generic_wrapper_params: &'f syn::punctuated::Punctuated, - generic_wrapper_where_clause: &'f Option, - generic_kernel_params: &'f syn::punctuated::Punctuated, - generic_kernel_where_clause: &'f Option, -} - -struct ImplGenerics<'f> { - #[allow(clippy::struct_field_names)] - impl_generics: syn::ImplGenerics<'f>, - ty_generics: syn::TypeGenerics<'f>, - where_clause: Option<&'f syn::WhereClause>, -} - -#[allow(clippy::struct_field_names)] -struct FuncIdent<'f> { - func_ident: &'f syn::Ident, - func_ident_raw: syn::Ident, - func_ident_hash: syn::Ident, -} - -fn ident_from_pat(pat: &syn::Pat) -> Option { - match pat { - syn::Pat::Lit(_) - | syn::Pat::Macro(_) - | syn::Pat::Path(_) - | syn::Pat::Range(_) - | syn::Pat::Rest(_) - | syn::Pat::Verbatim(_) - | syn::Pat::Wild(_) => None, - syn::Pat::Ident(syn::PatIdent { ident, .. }) => Some(ident.clone()), - syn::Pat::Box(syn::PatBox { pat, .. }) - | syn::Pat::Reference(syn::PatReference { pat, .. }) - | syn::Pat::Type(syn::PatType { pat, .. }) => ident_from_pat(pat), - syn::Pat::Or(syn::PatOr { cases, .. }) => ident_from_pat_iter(cases.iter()), - syn::Pat::Slice(syn::PatSlice { elems, .. }) - | syn::Pat::TupleStruct(syn::PatTupleStruct { - pat: syn::PatTuple { elems, .. }, - .. - }) - | syn::Pat::Tuple(syn::PatTuple { elems, .. }) => ident_from_pat_iter(elems.iter()), - syn::Pat::Struct(syn::PatStruct { fields, .. }) => { - ident_from_pat_iter(fields.iter().map(|field| &*field.pat)) - }, - _ => Err(()).ok(), - } -} - -fn ident_from_pat_iter<'p, I: Iterator>(iter: I) -> Option { - iter.filter_map(ident_from_pat) - .fold(None, |acc: Option<(String, Span)>, ident| { - if let Some((mut str_acc, span_acc)) = acc { - str_acc.push('_'); - str_acc.push_str(ident.to_string().trim_matches('_')); - - Some((str_acc, span_acc.join(ident.span()).unwrap())) - } else { - Some((ident.to_string(), ident.span())) - } - }) - .map(|(string, span)| syn::Ident::new(&string, span)) -} - -fn quote_generic_check( - FuncIdent { - func_ident_hash, .. - }: &FuncIdent, - KernelConfig { args, .. }: &KernelConfig, -) -> proc_macro2::TokenStream { - let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { - Ok(crate_name) => crate_name.to_uppercase(), - Err(err) => abort_call_site!("Failed to read crate name: {:?}.", err), - }; - - let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") - .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); - - quote::quote_spanned! 
{ func_ident_hash.span()=> - #[cfg(not(target_os = "cuda"))] - const _: ::rust_cuda::safety::kernel_signature::Assert<{ - ::rust_cuda::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = ::rust_cuda::safety::kernel_signature::Assert::<{ - ::rust_cuda::safety::kernel_signature::check( - rust_cuda::host::check_kernel!(#args #crate_name #crate_manifest_dir).as_bytes(), - concat!(".visible .entry ", stringify!(#func_ident_hash)).as_bytes() - ) - }>; - } -} diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index d5d8f3018..514bbf66e 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -1,7 +1,46 @@ -#![deny(clippy::pedantic)] -#![feature(box_patterns)] -#![feature(proc_macro_tracked_env)] -#![feature(proc_macro_span)] +//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License +//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod +//! Ready-to-Code]][gitpod] +//! +//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main +//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +//! +//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +//! [repo]: https://github.com/juntyr/rust-cuda +//! +//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue +//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda_derive/ +//! +//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield +//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield +//! +//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx +//! [codecov]: https://codecov.io/gh/juntyr/rust-cuda +//! +//! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod +//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda +//! +//! `rust-cuda-derive` provides the +//! [`#[derive(LendRustToCuda)]`](LendRustToCuda) derive macro for the +//! [`rust_cuda::lend::RustToCuda`] +//! utility trait, which enables the usage of the +//! [`rust_cuda::lend::LendToCuda`] +//! trait that allows Rust data structures to be shared with CUDA kernels. +//! +//! The async variants of both traits are *optionally* implemented as well. +//! +//! [`rust_cuda::lend::RustToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html +//! 
[`rust_cuda::lend::LendToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.LendToCuda.html + +#![deny(clippy::complexity)] +#![deny(clippy::correctness)] +#![warn(clippy::nursery)] +#![warn(clippy::pedantic)] +#![deny(clippy::perf)] +#![deny(clippy::style)] +#![deny(clippy::suspicious)] +#![deny(unsafe_code)] +#![deny(missing_docs)] #![feature(if_let_guard)] #![feature(let_chains)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] @@ -13,16 +52,68 @@ extern crate proc_macro_error; use proc_macro::TokenStream; -mod kernel; mod rust_to_cuda; -// cargo expand --target x86_64-unknown-linux-gnu --ugly \ -// | rustfmt --config max_width=160 > out.rs -// cargo expand --target nvptx64-nvidia-cuda --ugly \ -// | rustfmt --config max_width=160 > out.rs - #[proc_macro_error] #[proc_macro_derive(LendRustToCuda, attributes(cuda))] +/// Provides the [`#[derive(LendRustToCuda)]`](LendRustToCuda) +/// derive macro for the +/// [`rust_cuda::lend::RustToCuda`] +/// utility trait, which enables the usage of the +/// [`rust_cuda::lend::LendToCuda`] +/// trait that allows Rust data structures to be shared with CUDA kernels. +/// +/// At the moment, only +/// [`struct`](https://doc.rust-lang.org/std/keyword.struct.html)s are supported +/// by this derive macro. +/// +/// The derive also accepts a `#[cuda(...)]` attribute. You can annotate the +/// entire struct with the `#[cuda(...)]` attribute to configure the implementation as +/// follows: +/// +/// - `#[cuda(crate = "<crate-path>")]` changes the path to the [`rust-cuda`] +/// crate that the derive uses, which by default is `rust_cuda`. +/// - `#[cuda(bound = "<where-predicate>")]` adds the provided predicate to the +/// where clause of the trait implementation. +/// - `#[cuda(free = "<type>")]` removes the auto-added trait bounds for the +/// type parameter `<type>` from the trait implementation, e.g. when +/// implementing a wrapper around [`std::marker::PhantomData`] which should +/// implement the trait for any `T`. +/// - `#[cuda(async = <bool>)]` explicitly enables or disables the async +/// implementation of the trait, [`rust_cuda::lend::RustToCudaAsync`]. By +/// default, `#[cuda(async = true)]` is set. +/// - `#[cuda(layout::ATTR = "VALUE")]` adds the `#[layout(ATTR = "VALUE")]` +/// attribute to the [`#[derive(const_type_layout::TypeLayout)]`] derive for +/// this struct's [`rust_cuda::lend::RustToCuda::CudaRepresentation`]. +/// - `#[cuda(ignore)]` removes all subsequent attributes from the generated +/// [`rust_cuda::lend::RustToCuda::CudaRepresentation`] struct. +/// +/// Additionally, the `#[cuda(...)]` attribute can also be applied individually +/// to the fields of the struct to customise the implementation as follows: +/// +/// - `#[cuda(embed)]` signals that this field has a non-identity CUDA +/// representation and should be embedded by using the +/// [`rust_cuda::lend::RustToCuda`] implementation of this field's type. When +/// this attribute is not specified, the field must instead implement +/// [`Copy`], [`rust_cuda::safety::PortableBitSemantics`], and +/// [`const_type_layout::TypeGraphLayout`]. +/// - `#[cuda(embed = "<proxy-type>")]` works like `#[cuda(embed)]` but can be +/// used when the field's type does not implement +/// [`rust_cuda::lend::RustToCuda`] itself, but some `<proxy-type>` exists, +/// which implements [`rust_cuda::lend::RustToCudaProxy`] for the field's +/// type. +/// - `#[cuda(ignore)]` removes all subsequent attributes from this field in the +/// generated [`rust_cuda::lend::RustToCuda::CudaRepresentation`] struct.
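// An illustrative combination of the attributes documented above; the type
// and field names are invented, and this assumes that the derive and traits
// are re-exported at `rust_cuda::lend` as in the links below:
#[derive(rust_cuda::lend::LendRustToCuda)]
#[cuda(crate = "rust_cuda", async = false)]
struct Simulation<T: rust_cuda::lend::RustToCuda> {
    // Embedded through T's own RustToCuda implementation:
    #[cuda(embed)]
    habitat: T,
    // No attribute: must be Copy + PortableBitSemantics + TypeGraphLayout:
    step_count: u64,
}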
+/// +/// [`rust_cuda::lend::RustToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html +/// [`rust_cuda::lend::LendToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.LendToCuda.html +/// [`rust-cuda`]: https://juntyr.github.io/rust-cuda/rust_cuda +/// [`rust_cuda::lend::RustToCudaAsync`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCudaAsync.html +/// [`#[derive(const_type_layout::TypeLayout)]`]: https://docs.rs/const-type-layout/0.2.1/const_type_layout/derive.TypeLayout.html +/// [`rust_cuda::lend::RustToCuda::CudaRepresentation`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html#associatedtype.CudaRepresentation +/// [`rust_cuda::safety::PortableBitSemantics`]: https://juntyr.github.io/rust-cuda/rust_cuda/safety/trait.PortableBitSemantics.html +/// [`const_type_layout::TypeGraphLayout`]: https://docs.rs/const-type-layout/0.2.1/const_type_layout/trait.TypeGraphLayout.html +/// [`rust_cuda::lend::RustToCudaProxy`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCudaProxy.html pub fn rust_to_cuda_derive(input: TokenStream) -> TokenStream { // Note: We cannot report a more precise span yet let ast = match syn::parse(input) { @@ -33,44 +124,3 @@ pub fn rust_to_cuda_derive(input: TokenStream) -> TokenStream { // Build the implementation of the `RustToCuda` and `CudaAsRust` traits rust_to_cuda::impl_rust_to_cuda(&ast) } - -#[proc_macro_error] -#[proc_macro_attribute] -pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { - kernel::wrapper::kernel(attr, func) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { - kernel::specialise::ty::specialise_kernel_type(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { - kernel::specialise::call::specialise_kernel_call(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro_attribute] -pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStream { - kernel::specialise::entry::specialise_kernel_entry(attr, func) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn check_kernel(tokens: TokenStream) -> TokenStream { - kernel::link::check_kernel(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn link_kernel(tokens: TokenStream) -> TokenStream { - kernel::link::link_kernel(tokens) -} diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 0ddca9b28..18fd867c1 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -1,37 +1,55 @@ use proc_macro2::TokenStream; use quote::{format_ident, quote, ToTokens}; -use super::field_ty::CudaReprFieldTy; +use crate::rust_to_cuda::field_ty::CudaReprFieldTy; #[allow(clippy::too_many_arguments, clippy::too_many_lines)] pub fn impl_field_copy_init_and_expand_alloc_type( + crate_path: &syn::Path, field: &syn::Field, field_index: usize, cuda_repr_field_ty: &CudaReprFieldTy, mut combined_cuda_alloc_type: TokenStream, + mut combined_cuda_alloc_async_type: TokenStream, r2c_field_declarations: &mut Vec<TokenStream>, + r2c_field_async_declarations: &mut Vec<TokenStream>, + r2c_field_async_completions: &mut Vec<syn::Ident>, r2c_field_initialisations: &mut Vec<TokenStream>, r2c_field_destructors: &mut Vec<TokenStream>, + r2c_field_async_destructors: &mut Vec<TokenStream>, + r2c_field_async_completion_calls: &mut Vec<TokenStream>, c2r_field_initialisations: &mut Vec<TokenStream>, -) ->
TokenStream { +) -> (TokenStream, TokenStream) { + #[allow(clippy::option_if_let_else)] let field_accessor = match &field.ident { Some(ident) => quote! { #ident }, None => proc_macro2::Literal::usize_unsuffixed(field_index).to_token_stream(), }; + #[allow(clippy::option_if_let_else)] let field_repr_ident = match &field.ident { Some(ident) => format_ident!("field_{}_repr", ident), None => format_ident!("field_{}_repr", field_index), }; + #[allow(clippy::option_if_let_else)] + let field_completion_ident = match &field.ident { + Some(ident) => format_ident!("field_{}_completion", ident), + None => format_ident!("field_{}_completion", field_index), + }; let optional_field_ident = field.ident.as_ref().map(|ident| quote! { #ident: }); match cuda_repr_field_ty { CudaReprFieldTy::SafeDeviceCopy => { r2c_field_declarations.push(quote! { - let #field_repr_ident = rust_cuda::common::DeviceAccessible::from( + let #field_repr_ident = #crate_path::utils::ffi::DeviceAccessible::from( + &self.#field_accessor, + ); + }); + r2c_field_async_declarations.push(quote! { + let #field_repr_ident = #crate_path::utils::ffi::DeviceAccessible::from( &self.#field_accessor, ); }); @@ -42,23 +60,37 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! { #optional_field_ident { - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor).into_inner() + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor).into_inner() }, }); }, CudaReprFieldTy::RustToCuda { field_ty } => { combined_cuda_alloc_type = quote! { - rust_cuda::host::CombinedCudaAlloc< - <#field_ty as rust_cuda::common::RustToCuda>::CudaAllocation, + #crate_path::alloc::CombinedCudaAlloc< + <#field_ty as #crate_path::lend::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; + combined_cuda_alloc_async_type = quote! { + #crate_path::alloc::CombinedCudaAlloc< + <#field_ty as #crate_path::lend::RustToCudaAsync>::CudaAllocationAsync, + #combined_cuda_alloc_async_type + > + }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( + &self.#field_accessor, + alloc_front, + )?; + }); + r2c_field_async_declarations.push(quote! { + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCudaAsync::borrow_async( &self.#field_accessor, alloc_front, + stream, )?; + let (#field_repr_ident, #field_completion_ident) = #field_repr_ident.unwrap_unchecked()?; }); r2c_field_initialisations.push(quote! { @@ -66,33 +98,72 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCuda::restore( + let alloc_front = #crate_path::lend::RustToCuda::restore( &mut self.#field_accessor, alloc_front, )?; }); + r2c_field_async_destructors.push(quote! { + let this_backup = unsafe { + ::core::mem::ManuallyDrop::new(::core::ptr::read(&this)) + }; + let (r#async, alloc_front) = #crate_path::lend::RustToCudaAsync::restore_async( + this.map_mut(|this| &mut this.#field_accessor), + alloc_front, + stream, + )?; + let (value, #field_completion_ident) = r#async.unwrap_unchecked()?; + ::core::mem::forget(value); + let this = ::core::mem::ManuallyDrop::into_inner(this_backup); + }); + + r2c_field_async_completion_calls.push(quote! 
{ + #crate_path::utils::r#async::Completion::< + #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, _> + >::complete( + #field_completion_ident, &mut this.#field_accessor, + )?; + }); + + r2c_field_async_completions.push(field_completion_ident); c2r_field_initialisations.push(quote! { #optional_field_ident { - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor) }, }); }, CudaReprFieldTy::RustToCudaProxy { proxy_ty, field_ty } => { combined_cuda_alloc_type = quote! { - rust_cuda::host::CombinedCudaAlloc< - <#proxy_ty as rust_cuda::common::RustToCuda>::CudaAllocation, + #crate_path::alloc::CombinedCudaAlloc< + <#proxy_ty as #crate_path::lend::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; + combined_cuda_alloc_async_type = quote! { + #crate_path::alloc::CombinedCudaAlloc< + <#proxy_ty as #crate_path::lend::RustToCudaAsync>::CudaAllocationAsync, + #combined_cuda_alloc_async_type + > + }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( + < + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> + >::from_ref(&self.#field_accessor), + alloc_front, + )?; + }); + r2c_field_async_declarations.push(quote! { + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCudaAsync::borrow_async( < - #proxy_ty as rust_cuda::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, + stream, )?; + let (#field_repr_ident, #field_completion_ident) = #field_repr_ident.unwrap_unchecked()?; }); r2c_field_initialisations.push(quote! { @@ -100,23 +171,50 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCuda::restore( + let alloc_front = #crate_path::lend::RustToCuda::restore( < - #proxy_ty as rust_cuda::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, )?; }); + r2c_field_async_destructors.push(quote! { + let this_backup = unsafe { + ::core::mem::ManuallyDrop::new(::core::ptr::read(&this)) + }; + let (r#async, alloc_front) = #crate_path::lend::RustToCudaAsync::restore_async( + this.map_mut(|this| < + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> + >::from_mut(&mut this.#field_accessor)), + alloc_front, + stream, + )?; + let (value, #field_completion_ident) = r#async.unwrap_unchecked()?; + ::core::mem::forget(value); + let this = ::core::mem::ManuallyDrop::into_inner(this_backup); + }); + + r2c_field_async_completion_calls.push(quote! { + #crate_path::utils::r#async::Completion::< + #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, _> + >::complete( + #field_completion_ident, < + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> + >::from_mut(&mut this.#field_accessor), + )?; + }); + + r2c_field_async_completions.push(field_completion_ident); c2r_field_initialisations.push(quote! 
{ #optional_field_ident { - rust_cuda::common::RustToCudaProxy::<#field_ty>::into( - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::lend::RustToCudaProxy::<#field_ty>::into( + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor) ) }, }); }, } - combined_cuda_alloc_type + (combined_cuda_alloc_type, combined_cuda_alloc_async_type) } diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index 8416d3c17..c9fe48b77 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -12,7 +12,10 @@ pub enum CudaReprFieldTy { }, } -pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprFieldTy { +pub fn swap_field_type_and_filter_attrs( + crate_path: &syn::Path, + field: &mut syn::Field, +) -> CudaReprFieldTy { let mut cuda_repr_field_ty: Option = None; let mut field_ty = field.ty.clone(); @@ -33,8 +36,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField field_ty: Box::new(field_ty.clone()), }); field_ty = parse_quote! { - rust_cuda::common::DeviceAccessible< - <#field_ty as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::utils::ffi::DeviceAccessible< + <#field_ty as #crate_path::lend::RustToCuda>::CudaRepresentation > }; } else { @@ -54,8 +57,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField Ok(proxy_ty) => { let old_field_ty = Box::new(field_ty.clone()); field_ty = parse_quote! { - rust_cuda::common::DeviceAccessible< - <#proxy_ty as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::utils::ffi::DeviceAccessible< + <#proxy_ty as #crate_path::lend::RustToCuda>::CudaRepresentation > }; cuda_repr_field_ty = Some(CudaReprFieldTy::RustToCudaProxy { @@ -66,7 +69,7 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField Err(err) => emit_error!( s.span(), "[rust-cuda]: Invalid #[cuda(embed = \ - \"\")] field attribute: {}.", + \"\")] field attribute: {}.", err ), } @@ -80,8 +83,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cdua(embed)] / \ - #[cuda(embed = \"\")] field attribute" + "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ + #[cuda(embed = \"\")] field attribute" ); } } @@ -89,8 +92,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cdua(embed)] / \ - #[cuda(embed = \"\")] field attribute." + "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ + #[cuda(embed = \"\")] field attribute." ); } @@ -100,12 +103,13 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField } }); + #[allow(clippy::option_if_let_else)] let cuda_repr_field_ty = if let Some(cuda_repr_field_ty) = cuda_repr_field_ty { cuda_repr_field_ty } else { field_ty = parse_quote! 
{ - rust_cuda::common::DeviceAccessible< - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#field_ty> + #crate_path::utils::ffi::DeviceAccessible< + #crate_path::utils::adapter::RustToCudaWithPortableBitCopySemantics<#field_ty> > }; diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index 8b21246d2..f090f5c70 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -4,7 +4,14 @@ use syn::spanned::Spanned; #[allow(clippy::too_many_lines)] pub fn expand_cuda_struct_generics_where_requested_in_attrs( ast: &syn::DeriveInput, -) -> (Vec, syn::Generics, Vec) { +) -> ( + Vec, + syn::Generics, + syn::Generics, + Vec, + bool, + syn::Path, +) { let mut type_params = ast .generics .type_params() @@ -13,6 +20,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( let mut struct_attrs_cuda = ast.attrs.clone(); let mut struct_generics_cuda = ast.generics.clone(); + let mut struct_generics_cuda_async = ast.generics.clone(); let mut struct_layout_attrs = Vec::new(); for ty in &type_params { @@ -23,6 +31,8 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } let mut r2c_ignore = false; + let mut r2c_async_impl = None; + let mut crate_path = None; struct_attrs_cuda.retain(|attr| { if attr.path.is_ident("cuda") { @@ -36,11 +46,17 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( path, lit: syn::Lit::Str(s), .. - })) if path.is_ident("bound") => match syn::parse_str(&s.value()) { - Ok(bound) => struct_generics_cuda - .make_where_clause() - .predicates - .push(bound), + })) if path.is_ident("bound") => match syn::parse_str::(&s.value()) { + Ok(bound) => { + struct_generics_cuda + .make_where_clause() + .predicates + .push(bound.clone()); + struct_generics_cuda_async + .make_where_clause() + .predicates + .push(bound); + }, Err(err) => emit_error!( s.span(), "[rust-cuda]: Invalid #[cuda(bound = \"\")] \ @@ -78,11 +94,46 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } }, syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { - path: - syn::Path { - leading_colon: None, - segments, - }, + path, + lit: syn::Lit::Bool(b), + .. + })) if path.is_ident("async") => if r2c_async_impl.is_none() { + r2c_async_impl = Some(b.value()); + } else { + emit_error!( + b.span(), + "[rust-cuda]: Duplicate #[cuda(async)] attribute.", + ); + }, + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path, + lit: syn::Lit::Str(s), + .. + })) if path.is_ident("crate") => match syn::parse_str::(&s.value()) { + Ok(new_crate_path) => { + if crate_path.is_none() { + crate_path = Some( + syn::parse_quote_spanned! { s.span() => #new_crate_path }, + ); + } else { + emit_error!( + s.span(), + "[rust-cuda]: Duplicate #[cuda(crate)] attribute.", + ); + } + }, + Err(err) => emit_error!( + s.span(), + "[rust-cuda]: Invalid #[cuda(crate = \ + \"\")] attribute: {}.", + err + ), + }, + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path: syn::Path { + leading_colon: None, + segments, + }, lit: syn::Lit::Str(s), .. })) if segments.len() == 2 @@ -108,9 +159,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(bound = \ - \"\")] / #[cuda(layout::ATTR = \"VALUE\")] \ - struct attribute." 
+ "[rust-cuda]: Expected #[cuda(crate = \"\")] / #[cuda(bound = \"\")] / #[cuda(free = \"\")] / #[cuda(async = )] / #[cuda(layout::ATTR = \"VALUE\")] / #[cuda(ignore)] struct attribute." ); }, } @@ -118,8 +167,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(bound = \ - \"\")] / #[cuda(layout::ATTR = \"VALUE\")] struct attribute." + "[rust-cuda]: Expected #[cuda(crate = \"\")] / #[cuda(bound = \"\")] / #[cuda(free = \"\")] / #[cuda(async = )] / #[cuda(layout::ATTR = \"VALUE\")] / #[cuda(ignore)] struct attribute." ); } @@ -129,14 +177,29 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } }); + let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda)); + for ty in &type_params { struct_generics_cuda .make_where_clause() .predicates .push(syn::parse_quote! { - #ty: ::rust_cuda::common::RustToCuda + #ty: #crate_path::lend::RustToCuda + }); + struct_generics_cuda_async + .make_where_clause() + .predicates + .push(syn::parse_quote! { + #ty: #crate_path::lend::RustToCudaAsync }); } - (struct_attrs_cuda, struct_generics_cuda, struct_layout_attrs) + ( + struct_attrs_cuda, + struct_generics_cuda, + struct_generics_cuda_async, + struct_layout_attrs, + r2c_async_impl.unwrap_or(true), + crate_path, + ) } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 8b99e4f73..e0a67b7e3 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -1,7 +1,9 @@ use proc_macro2::TokenStream; use quote::quote; +#[allow(clippy::too_many_arguments)] pub fn cuda_struct_declaration( + crate_path: &syn::Path, struct_attrs_cuda: &[syn::Attribute], struct_layout_attrs: &[syn::Attribute], struct_vis_cuda: &syn::Visibility, @@ -10,7 +12,7 @@ pub fn cuda_struct_declaration( struct_fields_cuda: &syn::Fields, struct_semi_cuda: Option, ) -> TokenStream { - let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); + let (_impl_generics, _ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); let struct_repr = if struct_attrs_cuda .iter() @@ -21,29 +23,30 @@ pub fn cuda_struct_declaration( quote! { #[repr(C)] } }; + #[allow(clippy::option_if_let_else)] let struct_fields_where_clause = if let Some(struct_semi_cuda) = struct_semi_cuda { quote!(#struct_fields_cuda #where_clause #struct_semi_cuda) } else { quote!(#where_clause #struct_fields_cuda) }; + let const_type_layout_crate_path = quote! { #crate_path::deps::const_type_layout }.to_string(); + quote! { #[allow(dead_code)] #[doc(hidden)] #(#struct_attrs_cuda)* - #[derive(rust_cuda::const_type_layout::TypeLayout)] + #[derive(#crate_path::deps::const_type_layout::TypeLayout)] #struct_repr #(#struct_layout_attrs)* + #[layout(crate = #const_type_layout_crate_path)] #struct_vis_cuda struct #struct_name_cuda #struct_generics_cuda #struct_fields_where_clause - - // #[derive(DeviceCopy)] can interfer with type parameters - unsafe impl #impl_generics rust_cuda::rustacuda_core::DeviceCopy - for #struct_name_cuda #ty_generics #where_clause {} } } #[allow(clippy::too_many_arguments)] pub fn rust_to_cuda_trait( + crate_path: &syn::Path, struct_name: &syn::Ident, struct_name_cuda: &syn::Ident, struct_generics_cuda: &syn::Generics, @@ -70,22 +73,22 @@ pub fn rust_to_cuda_trait( let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); quote! 
 #[allow(clippy::too_many_arguments)]
 pub fn rust_to_cuda_trait(
+    crate_path: &syn::Path,
     struct_name: &syn::Ident,
     struct_name_cuda: &syn::Ident,
     struct_generics_cuda: &syn::Generics,
@@ -70,22 +73,22 @@ pub fn rust_to_cuda_trait(
     let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl();
 
     quote!
{
-    unsafe impl #impl_generics rust_cuda::common::RustToCuda for #struct_name #ty_generics
+    unsafe impl #impl_generics #crate_path::lend::RustToCuda for #struct_name #ty_generics
         #where_clause
     {
         type CudaRepresentation = #struct_name_cuda #ty_generics;
 
-        #[cfg(not(target_os = "cuda"))]
         type CudaAllocation = #combined_cuda_alloc_type;
 
         #[cfg(not(target_os = "cuda"))]
-        unsafe fn borrow<CudaAllocType: rust_cuda::host::CudaAlloc>(
-            &self, alloc: CudaAllocType
-        ) -> rust_cuda::rustacuda::error::CudaResult<(
-            rust_cuda::common::DeviceAccessible<Self::CudaRepresentation>,
-            rust_cuda::host::CombinedCudaAlloc<Self::CudaAllocation, CudaAllocType>
+        unsafe fn borrow<CudaAllocType: #crate_path::alloc::CudaAlloc>(
+            &self,
+            alloc: CudaAllocType,
+        ) -> #crate_path::deps::rustacuda::error::CudaResult<(
+            #crate_path::utils::ffi::DeviceAccessible<Self::CudaRepresentation>,
+            #crate_path::alloc::CombinedCudaAlloc<Self::CudaAllocation, CudaAllocType>
         )> {
-            let alloc_front = rust_cuda::host::NullCudaAlloc;
+            let alloc_front = #crate_path::alloc::NoCudaAlloc;
             let alloc_tail = alloc;
 
             #(#r2c_field_declarations)*
@@ -93,18 +96,18 @@ pub fn rust_to_cuda_trait(
             let borrow = #rust_to_cuda_struct_construction;
 
             Ok((
-                rust_cuda::common::DeviceAccessible::from(borrow),
-                rust_cuda::host::CombinedCudaAlloc::new(alloc_front, alloc_tail)
+                #crate_path::utils::ffi::DeviceAccessible::from(borrow),
+                #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail)
             ))
         }
 
         #[cfg(not(target_os = "cuda"))]
-        unsafe fn restore<CudaAllocType: rust_cuda::host::CudaAlloc>(
+        unsafe fn restore<CudaAllocType: #crate_path::alloc::CudaAlloc>(
             &mut self,
-            alloc: rust_cuda::host::CombinedCudaAlloc<
+            alloc: #crate_path::alloc::CombinedCudaAlloc<
                 Self::CudaAllocation, CudaAllocType
             >,
-        ) -> rust_cuda::rustacuda::error::CudaResult<CudaAllocType> {
+        ) -> #crate_path::deps::rustacuda::error::CudaResult<CudaAllocType> {
             let (alloc_front, alloc_tail) = alloc.split();
 
             #(#r2c_field_destructors)*
@@ -115,7 +118,130 @@ pub fn rust_to_cuda_trait(
     }
 }
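// Example (sketch): the impl emitted by `rust_to_cuda_trait` for the
// hypothetical `Point` above has this outline; `A` abbreviates the
// generated `CudaAllocType` parameter and the elided bodies stand in
// for the per-field declarations and destructors:
//
//     unsafe impl RustToCuda for Point {
//         type CudaRepresentation = PointCudaRepresentation;
//         type CudaAllocation = /* #combined_cuda_alloc_type */;
//
//         unsafe fn borrow<A: CudaAlloc>(&self, alloc: A)
//             -> CudaResult<(DeviceAccessible<Self::CudaRepresentation>,
//                            CombinedCudaAlloc<Self::CudaAllocation, A>)>
//         { /* per-field declarations, then struct construction */ }
//
//         unsafe fn restore<A: CudaAlloc>(&mut self, alloc: /* .. */)
//             -> CudaResult<A>
//         { /* per-field destructors, in reverse order */ }
//     }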
+#[allow(clippy::too_many_arguments)]
+pub fn rust_to_cuda_async_trait(
+    crate_path: &syn::Path,
+    struct_name: &syn::Ident,
+    struct_name_cuda: &syn::Ident,
+    struct_generics_cuda_async: &syn::Generics,
+    struct_fields_cuda: &syn::Fields,
+    combined_cuda_alloc_async_type: &TokenStream,
+    r2c_field_async_declarations: &[TokenStream],
+    r2c_field_async_completions: &[syn::Ident],
+    r2c_field_initialisations: &[TokenStream],
+    r2c_field_async_destructors: &[TokenStream],
+    r2c_field_async_completion_calls: &[TokenStream],
+) -> TokenStream {
+    let rust_to_cuda_struct_construction = match struct_fields_cuda {
+        syn::Fields::Named(_) => quote! {
+            #struct_name_cuda {
+                #(#r2c_field_initialisations)*
+            }
+        },
+        syn::Fields::Unnamed(_) => quote! {
+            #struct_name_cuda (
+                #(#r2c_field_initialisations)*
+            )
+        },
+        syn::Fields::Unit => quote! { #struct_name_cuda },
+    };
+
+    let async_borrow_completion = if r2c_field_async_completions.is_empty() {
+        quote! { #crate_path::utils::r#async::Async::ready(borrow, stream) }
+    } else {
+        quote! {
+            if #(#r2c_field_async_completions.is_none())&&* {
+                #crate_path::utils::r#async::Async::ready(borrow, stream)
+            } else {
+                #crate_path::utils::r#async::Async::pending(
+                    borrow, stream, #crate_path::utils::r#async::NoCompletion,
+                )?
+            }
+        }
+    };
+
+    let async_restore_completion = if r2c_field_async_completions.is_empty() {
+        quote! { #crate_path::utils::r#async::Async::ready(this, stream) }
+    } else {
+        quote! {
+            if #(#r2c_field_async_completions.is_none())&&* {
+                #crate_path::utils::r#async::Async::ready(this, stream)
+            } else {
+                #crate_path::utils::r#async::Async::<
+                    _, #crate_path::utils::r#async::CompletionFnMut<Self>,
+                >::pending(
+                    this, stream, #crate_path::deps::alloc::boxed::Box::new(|this| {
+                        #(#r2c_field_async_completion_calls)*
+                        Ok(())
+                    }),
+                )?
+            }
+        }
+    };
+
+    let (impl_generics, ty_generics, where_clause) = struct_generics_cuda_async.split_for_impl();
+
+    quote! {
+        unsafe impl #impl_generics #crate_path::lend::RustToCudaAsync for #struct_name #ty_generics
+            #where_clause
+        {
+            type CudaAllocationAsync = #combined_cuda_alloc_async_type;
+
+            #[cfg(not(target_os = "cuda"))]
+            unsafe fn borrow_async<'stream, CudaAllocType: #crate_path::alloc::CudaAlloc>(
+                &self,
+                alloc: CudaAllocType,
+                stream: #crate_path::host::Stream<'stream>,
+            ) -> #crate_path::deps::rustacuda::error::CudaResult<(
+                #crate_path::utils::r#async::Async<
+                    '_, 'stream,
+                    #crate_path::utils::ffi::DeviceAccessible<Self::CudaRepresentation>,
+                >,
+                #crate_path::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, CudaAllocType>,
+            )> {
+                let alloc_front = #crate_path::alloc::NoCudaAlloc;
+                let alloc_tail = alloc;
+
+                #(#r2c_field_async_declarations)*
+
+                let borrow = #rust_to_cuda_struct_construction;
+                let borrow = #crate_path::utils::ffi::DeviceAccessible::from(borrow);
+
+                let r#async = #async_borrow_completion;
+                let alloc = #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail);
+
+                Ok((r#async, alloc))
+            }
+
+            #[cfg(not(target_os = "cuda"))]
+            unsafe fn restore_async<'a, 'stream, CudaAllocType: #crate_path::alloc::CudaAlloc, CudaRestoreOwner>(
+                this: #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, Self>,
+                alloc: #crate_path::alloc::CombinedCudaAlloc<
+                    Self::CudaAllocationAsync, CudaAllocType
+                >,
+                stream: #crate_path::host::Stream<'stream>,
+            ) -> #crate_path::deps::rustacuda::error::CudaResult<(
+                #crate_path::utils::r#async::Async<
+                    'a, 'stream,
+                    #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, Self>,
+                    #crate_path::utils::r#async::CompletionFnMut<'a, Self>,
+                >,
+                CudaAllocType,
+            )> {
+                let (alloc_front, alloc_tail) = alloc.split();
+
+                #(#r2c_field_async_destructors)*
+
+                let r#async = #async_restore_completion;
+
+                Ok((r#async, alloc_tail))
+            }
+        }
+    }
+}
 
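// Completion semantics (sketch): a field only contributes an entry to
// `r2c_field_async_completions` if its transfer can still be in flight.
// The generated check therefore degenerates to an immediate
// `Async::ready` when every completion slot is `None`; for a single
// hypothetical field `a` the borrow side reads roughly:
//
//     if a_completion.is_none() {
//         Async::ready(borrow, stream)
//     } else {
//         Async::pending(borrow, stream, NoCompletion)?
//     }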
 pub fn cuda_as_rust_trait(
+    crate_path: &syn::Path,
     struct_name: &syn::Ident,
     struct_name_cuda: &syn::Ident,
     struct_generics_cuda: &syn::Generics,
@@ -139,14 +265,14 @@ pub fn cuda_as_rust_trait(
     let (impl_generics, ty_generics, where_clause) = &struct_generics_cuda.split_for_impl();
 
     quote!
{
-    unsafe impl #impl_generics rust_cuda::common::CudaAsRust
+    unsafe impl #impl_generics #crate_path::lend::CudaAsRust
         for #struct_name_cuda #ty_generics #where_clause
     {
         type RustRepresentation = #struct_name #ty_generics;
 
         #[cfg(target_os = "cuda")]
         unsafe fn as_rust(
-            this: &rust_cuda::common::DeviceAccessible<Self>,
+            this: &#crate_path::utils::ffi::DeviceAccessible<Self>,
         ) -> #struct_name #ty_generics {
             #cuda_as_rust_struct_construction
         }
diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs
index 18589b78a..615c81edf 100644
--- a/rust-cuda-derive/src/rust_to_cuda/mod.rs
+++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs
@@ -10,7 +10,7 @@ fn get_cuda_repr_ident(rust_repr_ident: &proc_macro2::Ident) -> proc_macro2::Ide
     format_ident!("{}CudaRepresentation", rust_repr_ident)
 }
 
-#[allow(clippy::module_name_repetitions)]
+#[allow(clippy::module_name_repetitions, clippy::too_many_lines)]
 pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
     let (mut struct_fields_cuda, struct_semi_cuda) = if let syn::Data::Struct(s) = &ast.data {
         (s.fields.clone(), s.semi_token)
@@ -21,12 +21,28 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
     let struct_name = &ast.ident;
     let struct_name_cuda = get_cuda_repr_ident(struct_name);
 
+    let (
+        struct_attrs_cuda,
+        struct_generics_cuda,
+        struct_generics_cuda_async,
+        struct_layout_attrs,
+        r2c_async_impl,
+        crate_path,
+    ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast);
+
     let mut combined_cuda_alloc_type: TokenStream = quote! {
-        rust_cuda::host::NullCudaAlloc
+        #crate_path::alloc::NoCudaAlloc
+    };
+    let mut combined_cuda_alloc_async_type: TokenStream = quote! {
+        #crate_path::alloc::NoCudaAlloc
     };
 
     let mut r2c_field_declarations: Vec<TokenStream> = Vec::new();
+    let mut r2c_field_async_declarations: Vec<TokenStream> = Vec::new();
+    let mut r2c_field_async_completions: Vec<syn::Ident> = Vec::new();
     let mut r2c_field_initialisations: Vec<TokenStream> = Vec::new();
     let mut r2c_field_destructors: Vec<TokenStream> = Vec::new();
+    let mut r2c_field_async_destructors: Vec<TokenStream> = Vec::new();
+    let mut r2c_field_async_completion_calls: Vec<TokenStream> = Vec::new();
 
     let mut c2r_field_initialisations: Vec<TokenStream> = Vec::new();
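// Allocation threading (sketch, under the assumption that each field
// wraps the running allocation type): for hypothetical fields `a` and
// `b` the combined allocation type grows like
//
//     NoCudaAlloc
//     CombinedCudaAlloc<AllocA, NoCudaAlloc>
//     CombinedCudaAlloc<AllocB, CombinedCudaAlloc<AllocA, NoCudaAlloc>>
//
// which is why the destructors collected below must run in reverse
// declaration order: each `split()` peels one layer off the front.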
@@ -40,32 +56,41 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
             ..
         }) => {
             let mut r2c_field_destructors_reverse: Vec<TokenStream> = Vec::new();
+            let mut r2c_field_async_destructors_reverse: Vec<TokenStream> = Vec::new();
 
             for (field_index, field) in fields.iter_mut().enumerate() {
-                let cuda_repr_field_ty = field_ty::swap_field_type_and_filter_attrs(field);
-
-                combined_cuda_alloc_type = field_copy::impl_field_copy_init_and_expand_alloc_type(
-                    field,
-                    field_index,
-                    &cuda_repr_field_ty,
-                    combined_cuda_alloc_type,
-                    &mut r2c_field_declarations,
-                    &mut r2c_field_initialisations,
-                    &mut r2c_field_destructors_reverse,
-                    &mut c2r_field_initialisations,
-                );
+                let cuda_repr_field_ty =
+                    field_ty::swap_field_type_and_filter_attrs(&crate_path, field);
+
+                (combined_cuda_alloc_type, combined_cuda_alloc_async_type) =
+                    field_copy::impl_field_copy_init_and_expand_alloc_type(
+                        &crate_path,
+                        field,
+                        field_index,
+                        &cuda_repr_field_ty,
+                        combined_cuda_alloc_type,
+                        combined_cuda_alloc_async_type,
+                        &mut r2c_field_declarations,
+                        &mut r2c_field_async_declarations,
+                        &mut r2c_field_async_completions,
+                        &mut r2c_field_initialisations,
+                        &mut r2c_field_destructors_reverse,
+                        &mut r2c_field_async_destructors_reverse,
+                        &mut r2c_field_async_completion_calls,
+                        &mut c2r_field_initialisations,
+                    );
             }
 
             // The fields must be deallocated in the reverse order of their allocation
             r2c_field_destructors.extend(r2c_field_destructors_reverse.into_iter().rev());
+            r2c_field_async_destructors
+                .extend(r2c_field_async_destructors_reverse.into_iter().rev());
         },
         syn::Fields::Unit => (),
     }
 
-    let (struct_attrs_cuda, struct_generics_cuda, struct_layout_attrs) =
-        generics::expand_cuda_struct_generics_where_requested_in_attrs(ast);
-
     let cuda_struct_declaration = r#impl::cuda_struct_declaration(
+        &crate_path,
         &struct_attrs_cuda,
         &struct_layout_attrs,
         &ast.vis,
@@ -76,6 +101,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
     );
 
     let rust_to_cuda_trait_impl = r#impl::rust_to_cuda_trait(
+        &crate_path,
        struct_name,
         &struct_name_cuda,
         &struct_generics_cuda,
@@ -86,7 +112,26 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
         &r2c_field_destructors,
     );
 
+    let rust_to_cuda_async_trait_impl = if r2c_async_impl {
+        r#impl::rust_to_cuda_async_trait(
+            &crate_path,
+            struct_name,
+            &struct_name_cuda,
+            &struct_generics_cuda_async,
+            &struct_fields_cuda,
+            &combined_cuda_alloc_async_type,
+            &r2c_field_async_declarations,
+            &r2c_field_async_completions,
+            &r2c_field_initialisations,
+            &r2c_field_async_destructors,
+            &r2c_field_async_completion_calls,
+        )
+    } else {
+        TokenStream::new()
+    };
+
     let cuda_as_rust_trait_impl = r#impl::cuda_as_rust_trait(
+        &crate_path,
         struct_name,
         &struct_name_cuda,
         &struct_generics_cuda,
@@ -99,6 +144,8 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
 
         #rust_to_cuda_trait_impl
 
+        #rust_to_cuda_async_trait_impl
+
         #cuda_as_rust_trait_impl
     })
     .into()
diff --git a/rust-cuda-kernel/Cargo.toml b/rust-cuda-kernel/Cargo.toml
new file mode 100644
index 000000000..b944bf875
--- /dev/null
+++ b/rust-cuda-kernel/Cargo.toml
@@ -0,0 +1,31 @@
+[package]
+name = "rust-cuda-kernel"
+version = "0.1.0"
+authors = ["Juniper Tyree <juniper.tyree@helsinki.fi>"]
+license = "MIT OR Apache-2.0"
+edition = "2021"
+rust-version = "1.77" # nightly
+links = "libnvptxcompiler_static"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[lib]
+proc-macro = true
+
+[dependencies]
+syn = { version = "1.0", features = ["full", "fold"] }
+quote = "1.0"
+proc-macro2 = "1.0"
+proc-macro-error = "1.0"
+regex = "1.5"
+lazy_static = "1.4" +serde_json = "1.0" +cargo_metadata = { version = "0.18", features = ["builder"] } +strip-ansi-escapes = "0.2" +colored = "2.0" +thiserror = "1.0" +seahash = "4.1" +ptx-builder = { git = "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } + +[build-dependencies] +find_cuda_helper = "0.2" diff --git a/rust-cuda-kernel/build.rs b/rust-cuda-kernel/build.rs new file mode 100644 index 000000000..f7aa5b1a9 --- /dev/null +++ b/rust-cuda-kernel/build.rs @@ -0,0 +1,5 @@ +fn main() { + find_cuda_helper::include_cuda(); + + println!("cargo:rustc-link-lib=nvptxcompiler_static"); +} diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-kernel/src/kernel/link/config.rs similarity index 65% rename from rust-cuda-derive/src/kernel/link/config.rs rename to rust-cuda-kernel/src/kernel/link/config.rs index cdfd0b575..02297ba7d 100644 --- a/rust-cuda-derive/src/kernel/link/config.rs +++ b/rust-cuda-kernel/src/kernel/link/config.rs @@ -1,18 +1,23 @@ -use std::path::PathBuf; +use std::{collections::HashMap, path::PathBuf}; + +use quote::quote; + +use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; #[allow(clippy::module_name_repetitions)] pub(super) struct LinkKernelConfig { pub(super) kernel: syn::Ident, - pub(super) args: syn::Ident, + pub(super) kernel_hash: syn::Ident, pub(super) crate_name: String, pub(super) crate_path: PathBuf, pub(super) specialisation: String, + pub(super) ptx_lint_levels: HashMap, } impl syn::parse::Parse for LinkKernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { let kernel: syn::Ident = input.parse()?; - let args: syn::Ident = input.parse()?; + let kernel_hash: syn::Ident = input.parse()?; let name: syn::LitStr = input.parse()?; let path: syn::LitStr = input.parse()?; @@ -35,31 +40,48 @@ impl syn::parse::Parse for LinkKernelConfig { String::new() }; + let attrs = syn::punctuated::Punctuated::< + syn::MetaList, + syn::token::Comma, + >::parse_separated_nonempty(input)?; + + let mut ptx_lint_levels = HashMap::new(); + + for syn::MetaList { path, nested, .. 
} in attrs {
+            parse_ptx_lint_level(&path, &nested, &mut ptx_lint_levels);
+        }
+
+        proc_macro_error::abort_if_dirty();
+
         Ok(Self {
             kernel,
-            args,
+            kernel_hash,
             crate_name: name.value(),
             crate_path: PathBuf::from(path.value()),
             specialisation,
+            ptx_lint_levels,
         })
     }
 }
 
 #[allow(clippy::module_name_repetitions)]
 pub(super) struct CheckKernelConfig {
-    pub(super) args: syn::Ident,
+    pub(super) kernel: syn::Ident,
+    pub(super) kernel_hash: syn::Ident,
     pub(super) crate_name: String,
     pub(super) crate_path: PathBuf,
 }
 
 impl syn::parse::Parse for CheckKernelConfig {
     fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> {
-        let args: syn::Ident = input.parse()?;
+        let kernel: syn::Ident = input.parse()?;
+        let kernel_hash: syn::Ident = input.parse()?;
         let name: syn::LitStr = input.parse()?;
         let path: syn::LitStr = input.parse()?;
 
         Ok(Self {
-            args,
+            kernel,
+            kernel_hash,
             crate_name: name.value(),
             crate_path: PathBuf::from(path.value()),
         })
diff --git a/rust-cuda-derive/src/kernel/link/error.rs b/rust-cuda-kernel/src/kernel/link/error.rs
similarity index 91%
rename from rust-cuda-derive/src/kernel/link/error.rs
rename to rust-cuda-kernel/src/kernel/link/error.rs
index 0c83e19a5..811269ccc 100644
--- a/rust-cuda-derive/src/kernel/link/error.rs
+++ b/rust-cuda-kernel/src/kernel/link/error.rs
@@ -22,15 +22,14 @@ pub fn emit_ptx_build_error() {
 
     let call_site = proc_macro::Span::call_site();
 
-    let (byte_start, byte_end) =
-        if let Some(captures) = PROC_MACRO_SPAN_REGEX.captures(&format!("{call_site:?}")) {
+    let (byte_start, byte_end) = PROC_MACRO_SPAN_REGEX
+        .captures(&format!("{call_site:?}"))
+        .map_or((0_u32, 0_u32), |captures| {
             (
                 captures["start"].parse().unwrap_or(0_u32),
                 captures["end"].parse().unwrap_or(0_u32),
             )
-        } else {
-            (0_u32, 0_u32)
-        };
+        });
 
     let span = DiagnosticSpanBuilder::default()
         .file_name(
diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs
new file mode 100644
index 000000000..bbe243c9f
--- /dev/null
+++ b/rust-cuda-kernel/src/kernel/link/mod.rs
@@ -0,0 +1,842 @@
+use std::{
+    collections::HashMap,
+    env,
+    ffi::CString,
+    fmt::Write as FmtWrite,
+    fs,
+    io::{Read, Write},
+    os::raw::c_int,
+    path::{Path, PathBuf},
+    ptr::addr_of_mut,
+    sync::atomic::{AtomicBool, Ordering},
+};
+
+use colored::Colorize;
+use proc_macro::TokenStream;
+use proc_macro2::Span;
+use ptx_builder::{
+    builder::{BuildStatus, Builder, MessageFormat, Profile},
+    error::{BuildErrorKind, Error, Result},
+};
+use quote::quote;
+
+use crate::kernel::{
+    lints::{LintLevel, PtxLint},
+    utils::skip_kernel_compilation,
+    KERNEL_TYPE_LAYOUT_IDENT, KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY,
+    PTX_CSTR_IDENT,
+};
+
+mod config;
+mod error;
+mod ptx_compiler_sys;
+
+use config::{CheckKernelConfig, LinkKernelConfig};
+use error::emit_ptx_build_error;
+use ptx_compiler_sys::NvptxError;
+
+pub fn check_kernel(tokens: TokenStream) -> TokenStream {
+    proc_macro_error::set_dummy(
+        quote!
{::core::compile_error!("rust-cuda PTX kernel check failed");},
+    );
+
+    let CheckKernelConfig {
+        kernel,
+        kernel_hash,
+        crate_name,
+        crate_path,
+    } = match syn::parse_macro_input::parse(tokens) {
+        Ok(config) => config,
+        Err(err) => {
+            abort_call_site!(
+                "check_kernel!(KERNEL HASH NAME PATH) expects KERNEL and HASH identifiers, and \
+                 NAME and PATH string literals: {:?}",
+                err
+            )
+        },
+    };
+
+    let kernel_ptx = compile_kernel_ptx(&kernel, &crate_name, &crate_path, Specialisation::Check);
+
+    let Some(kernel_ptx) = kernel_ptx else {
+        return quote!(::core::compile_error!("rust-cuda PTX kernel check failed");).into();
+    };
+
+    check_kernel_ptx_and_report(
+        &kernel_ptx,
+        Specialisation::Check,
+        &kernel_hash,
+        &HashMap::new(),
+    );
+
+    quote!().into()
+}
+
+#[allow(clippy::module_name_repetitions)]
+pub fn compile_kernel(tokens: TokenStream) -> TokenStream {
+    let ptx_cstr_ident = syn::Ident::new(PTX_CSTR_IDENT, Span::call_site());
+    let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, Span::call_site());
+
+    proc_macro_error::set_dummy(quote! {
+        const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation";
+
+        const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout {
+            HostAndDeviceKernelSignatureTypeLayout::Match
+        }
+
+        ::core::compile_error!("rust-cuda PTX kernel compilation failed");
+    });
+
+    let LinkKernelConfig {
+        kernel,
+        kernel_hash,
+        crate_name,
+        crate_path,
+        specialisation,
+        ptx_lint_levels,
+    } = match syn::parse_macro_input::parse(tokens) {
+        Ok(config) => config,
+        Err(err) => {
+            abort_call_site!(
+                "compile_kernel!(KERNEL HASH NAME PATH SPECIALISATION LINTS,*) expects KERNEL and \
+                 HASH identifiers, NAME and PATH string literals, and SPECIALISATION and LINTS \
+                 tokens: {:?}",
+                err
+            )
+        },
+    };
+
+    if skip_kernel_compilation() {
+        return quote! {
+            const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"CLIPPY skips specialised PTX compilation";
+        }
+        .into();
+    }
+
+    let Some(mut kernel_ptx) = compile_kernel_ptx(
+        &kernel,
+        &crate_name,
+        &crate_path,
+        Specialisation::Link(&specialisation),
+    ) else {
+        return (quote! {
+            const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation";
+
+            const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout {
+                HostAndDeviceKernelSignatureTypeLayout::Match
+            }
+
+            ::core::compile_error!("rust-cuda PTX kernel compilation failed");
+        })
+        .into();
+    };
+
+    let type_layouts = extract_ptx_kernel_layout(&mut kernel_ptx);
+    remove_kernel_type_use_from_ptx(&mut kernel_ptx);
+
+    check_kernel_ptx_and_report(
+        &kernel_ptx,
+        Specialisation::Link(&specialisation),
+        &kernel_hash,
+        &ptx_lint_levels,
+    );
+
+    let kernel_ptx = match CString::new(kernel_ptx) {
+        Ok(kernel_ptx) => kernel_ptx,
+        Err(err) => abort_call_site!(
+            "Kernel compilation generated invalid PTX: internal nul byte: {:?}",
+            err
+        ),
+    };
+
+    let kernel_ptx = proc_macro::Literal::c_string(&kernel_ptx);
+    let kernel_ptx = proc_macro2::TokenStream::from(proc_macro::TokenStream::from(
+        proc_macro::TokenTree::Literal(kernel_ptx),
+    ));
+
+    (quote!
{ const #ptx_cstr_ident: &'static ::core::ffi::CStr = #kernel_ptx; #(#type_layouts)* })
+    .into()
+}
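// Example (sketch): on success, the `compile_kernel` expansion above
// boils down to a PTX C-string constant plus one layout-check function
// per kernel parameter, roughly as follows (identifiers illustrative):
//
//     const PTX_CSTR: &'static ::core::ffi::CStr = c".version 7.5\n...";
//     const fn param_0() -> HostAndDeviceKernelSignatureTypeLayout { /* .. */ }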
+
+fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec<proc_macro2::TokenStream> {
+    const BEFORE_PARAM_PATTERN: &str = ".visible .global .align 1 .b8 ";
+    const PARAM_LEN_PATTERN: &str = "[";
+    const LEN_BYTES_PATTERN: &str = "] = {";
+    const AFTER_BYTES_PATTERN: &str = "};";
+
+    let mut type_layouts = Vec::new();
+
+    while let Some(type_layout_start) = kernel_ptx.find(BEFORE_PARAM_PATTERN) {
+        let param_start = type_layout_start + BEFORE_PARAM_PATTERN.len();
+
+        let Some(len_start_offset) = kernel_ptx[param_start..].find(PARAM_LEN_PATTERN) else {
+            abort_call_site!("Kernel compilation generated invalid PTX: missing type layout data")
+        };
+        let len_start = param_start + len_start_offset + PARAM_LEN_PATTERN.len();
+
+        let Some(bytes_start_offset) = kernel_ptx[len_start..].find(LEN_BYTES_PATTERN) else {
+            abort_call_site!("Kernel compilation generated invalid PTX: missing type layout length")
+        };
+        let bytes_start = len_start + bytes_start_offset + LEN_BYTES_PATTERN.len();
+
+        let Some(bytes_end_offset) = kernel_ptx[bytes_start..].find(AFTER_BYTES_PATTERN) else {
+            abort_call_site!("Kernel compilation generated invalid PTX: invalid type layout data")
+        };
+        let param = &kernel_ptx[param_start..(param_start + len_start_offset)];
+        let len = &kernel_ptx[len_start..(len_start + bytes_start_offset)];
+        let bytes = &kernel_ptx[bytes_start..(bytes_start + bytes_end_offset)];
+
+        let param = quote::format_ident!("{}", param);
+
+        let Ok(len) = len.parse::<usize>() else {
+            abort_call_site!("Kernel compilation generated invalid PTX: invalid type layout length")
+        };
+        let Ok(bytes) = bytes
+            .split(", ")
+            .map(std::str::FromStr::from_str)
+            .collect::<std::result::Result<Vec<u8>, _>>()
+        else {
+            abort_call_site!("Kernel compilation generated invalid PTX: invalid type layout byte")
+        };
+
+        if bytes.len() != len {
+            abort_call_site!(
+                "Kernel compilation generated invalid PTX: type layout length mismatch"
+            );
+        }
+
+        // let mut ascii_escaped_bytes = Vec::new();
+        // for b in &bytes {
+        //     ascii_escaped_bytes.extend(std::ascii::escape_default(*b));
+        // }
+        // emit_call_site_warning!("{}", std::str::from_utf8(&ascii_escaped_bytes).unwrap());
+
+        let mut zeros = 0;
+        for b in &bytes {
+            if *b == 0 {
+                zeros += 1;
+            } else {
+                zeros = 0;
+            }
+        }
+
+        #[allow(clippy::cast_precision_loss)] // FIXME
+        {
+            emit_call_site_warning!("type layout: {}B (can do {:.02} compression)", bytes.len(), (bytes.len() as f64) / ((bytes.len() - zeros) as f64));
+        }
+
+        let byte_str = syn::LitByteStr::new(&bytes[..bytes.len()-zeros], proc_macro2::Span::call_site());
+
+        type_layouts.push(quote!
{
+            const fn #param() -> HostAndDeviceKernelSignatureTypeLayout {
+                if check_serialised_type_graph::(#byte_str) {
+                    HostAndDeviceKernelSignatureTypeLayout::Match
+                } else {
+                    HostAndDeviceKernelSignatureTypeLayout::Mismatch
+                }
+            }
+        });
+
+        let type_layout_end = bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len();
+
+        kernel_ptx.replace_range(type_layout_start..type_layout_end, "");
+    }
+
+    type_layouts
+}
+
+fn remove_kernel_type_use_from_ptx(kernel_ptx: &mut String) {
+    while let Some(kernel_type_layout_start) = kernel_ptx.find(KERNEL_TYPE_USE_START_CANARY) {
+        let kernel_type_layout_start = kernel_ptx[..kernel_type_layout_start]
+            .rfind('\n')
+            .unwrap_or(kernel_type_layout_start);
+
+        let Some(kernel_type_layout_end_offset) =
+            kernel_ptx[kernel_type_layout_start..].find(KERNEL_TYPE_USE_END_CANARY)
+        else {
+            abort_call_site!(
+                "Kernel compilation generated invalid PTX: incomplete type layout use section"
+            );
+        };
+
+        let kernel_type_layout_end_offset = kernel_type_layout_end_offset
+            + kernel_ptx[kernel_type_layout_start + kernel_type_layout_end_offset..]
+                .find('\n')
+                .unwrap_or(KERNEL_TYPE_USE_END_CANARY.len());
+
+        let kernel_type_layout_end = kernel_type_layout_start + kernel_type_layout_end_offset;
+
+        kernel_ptx.replace_range(kernel_type_layout_start..kernel_type_layout_end, "");
+    }
+}
+
+#[allow(clippy::too_many_lines)]
+fn check_kernel_ptx_and_report(
+    kernel_ptx: &str,
+    specialisation: Specialisation,
+    kernel_hash: &proc_macro2::Ident,
+    ptx_lint_levels: &HashMap<PtxLint, LintLevel>,
+) {
+    let (result, error_log, info_log, binary, version, drop) =
+        check_kernel_ptx(kernel_ptx, specialisation, kernel_hash, ptx_lint_levels);
+
+    let ptx_compiler = match &version {
+        Ok((major, minor)) => format!("PTX compiler v{major}.{minor}"),
+        Err(_) => String::from("PTX compiler"),
+    };
+
+    let mut errors = String::new();
+
+    if let Err(err) = drop {
+        let _ = errors.write_fmt(format_args!("Error dropping the {ptx_compiler}: {err}\n"));
+    }
+
+    if let Err(err) = version {
+        let _ = errors.write_fmt(format_args!(
+            "Error fetching the version of the {ptx_compiler}: {err}\n"
+        ));
+    }
+
+    let ptx_source_code = {
+        let mut max_lines = kernel_ptx.chars().filter(|c| *c == '\n').count() + 1;
+        let mut indent = 0;
+        while max_lines > 0 {
+            max_lines /= 10;
+            indent += 1;
+        }
+
+        format!(
+            "PTX source code:\n{}",
+            kernel_ptx
+                .lines()
+                .enumerate()
+                .map(|(i, l)| format!("{:indent$}| {l}", i + 1))
+                .collect::<Vec<_>>()
+                .join("\n")
+        )
+    };
+
+    match binary {
+        Ok(None) => (),
+        Ok(Some(binary)) => {
+            if ptx_lint_levels
+                .get(&PtxLint::DumpAssembly)
+                .map_or(false, |level| *level > LintLevel::Allow)
+            {
+                const HEX: [char; 16] = [
+                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+                ];
+
+                let mut binary_hex = String::with_capacity(binary.len() * 2);
+                for byte in binary {
+                    binary_hex.push(HEX[usize::from(byte >> 4)]);
+                    binary_hex.push(HEX[usize::from(byte & 0x0F)]);
+                }
+
+                if ptx_lint_levels
+                    .get(&PtxLint::DumpAssembly)
+                    .map_or(false, |level| *level > LintLevel::Warn)
+                {
+                    emit_call_site_error!(
+                        "{} compiled binary:\n{}\n\n{}",
+                        ptx_compiler,
+                        binary_hex,
+                        ptx_source_code
+                    );
+                } else {
+                    emit_call_site_warning!(
+                        "{} compiled binary:\n{}\n\n{}",
+                        ptx_compiler,
+                        binary_hex,
+                        ptx_source_code
+                    );
+                }
+            }
+        },
+        Err(err) => {
+            let _ = errors.write_fmt(format_args!(
+                "Error fetching the compiled binary from {ptx_compiler}: {err}\n"
+            ));
+        },
+    }
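// Example (sketch): the `DumpAssembly` gate above is driven by the
// user-facing lint attributes; a (hypothetical) kernel annotated with
//
//     #[kernel(warn(ptx::dump_assembly))]
//
// has its compiled binary hex-dumped as a warning, while
// `deny(ptx::dump_assembly)` turns the same dump into a compile error.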
+
+    match info_log {
+        Ok(None) => (),
+        Ok(Some(info_log)) => emit_call_site_warning!(
+            "{} info log:\n{}\n{}",
+            ptx_compiler,
+            info_log,
+            ptx_source_code
+        ),
+        Err(err) => {
+            let _ = errors.write_fmt(format_args!(
+                "Error fetching the info log of the {ptx_compiler}: {err}\n"
+            ));
+        },
+    };
+
+    let error_log = match error_log {
+        Ok(None) => String::new(),
+        Ok(Some(error_log)) => {
+            format!("{ptx_compiler} error log:\n{error_log}\n{ptx_source_code}")
+        },
+        Err(err) => {
+            let _ = errors.write_fmt(format_args!(
+                "Error fetching the error log of the {ptx_compiler}: {err}\n"
+            ));
+            String::new()
+        },
+    };
+
+    if let Err(err) = result {
+        let _ = errors.write_fmt(format_args!("Error compiling the PTX source code: {err}\n"));
+    }
+
+    if !error_log.is_empty() || !errors.is_empty() {
+        abort_call_site!(
+            "{error_log}{}{errors}",
+            if !error_log.is_empty() && !errors.is_empty() {
+                "\n\n"
+            } else {
+                ""
+            }
+        );
+    }
+}
+
+#[allow(clippy::type_complexity)]
+#[allow(clippy::too_many_lines)]
+fn check_kernel_ptx(
+    kernel_ptx: &str,
+    specialisation: Specialisation,
+    kernel_hash: &proc_macro2::Ident,
+    ptx_lint_levels: &HashMap<PtxLint, LintLevel>,
+) -> (
+    Result<(), NvptxError>,
+    Result<Option<String>, NvptxError>,
+    Result<Option<String>, NvptxError>,
+    Result<Option<Vec<u8>>, NvptxError>,
+    Result<(u32, u32), NvptxError>,
+    Result<(), NvptxError>,
+) {
+    let compiler = {
+        let mut compiler = std::ptr::null_mut();
+        #[allow(unsafe_code)] // FFI
+        if let Err(err) = NvptxError::try_err_from(unsafe {
+            ptx_compiler_sys::nvPTXCompilerCreate(
+                addr_of_mut!(compiler),
+                kernel_ptx.len() as ptx_compiler_sys::size_t,
+                kernel_ptx.as_ptr().cast(),
+            )
+        }) {
+            abort_call_site!("PTX compiler creation failed: {}", err);
+        }
+        compiler
+    };
+
+    let result = (|| {
+        let kernel_name = match specialisation {
+            Specialisation::Check => format!("{kernel_hash}_chECK"),
+            Specialisation::Link("") => format!("{kernel_hash}_kernel"),
+            Specialisation::Link(specialisation) => format!(
+                "{kernel_hash}_kernel_{:016x}",
+                seahash::hash(specialisation.as_bytes())
+            ),
+        };
+        let kernel_name = CString::new(kernel_name).unwrap();
+
+        let mut options = vec![c"--entry", kernel_name.as_c_str()];
+
+        if ptx_lint_levels
+            .values()
+            .any(|level| *level > LintLevel::Warn)
+        {
+            let mut options = options.clone();
+
+            if ptx_lint_levels
+                .get(&PtxLint::Verbose)
+                .map_or(false, |level| *level > LintLevel::Warn)
+            {
+                options.push(c"--verbose");
+            }
+            if ptx_lint_levels
+                .get(&PtxLint::DoublePrecisionUse)
+                .map_or(false, |level| *level > LintLevel::Warn)
+            {
options.push(c"--warn-on-double-precision-use"); + } + if ptx_lint_levels + .get(&PtxLint::LocalMemoryUse) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(c"--warn-on-local-memory-usage"); + } + if ptx_lint_levels + .get(&PtxLint::RegisterSpills) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(c"--warn-on-spills"); + } + if ptx_lint_levels + .get(&PtxLint::DynamicStackSize) + .map_or(true, |level| *level < LintLevel::Warn) + { + options.push(c"--suppress-stack-size-warning"); + } + + let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerCompile( + compiler, + c_int::try_from(options_ptrs.len()).unwrap(), + options_ptrs.as_ptr().cast(), + ) + }) + })(); + + let error_log = (|| { + let mut error_log_size = 0; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetErrorLogSize(compiler, addr_of_mut!(error_log_size)) + })?; + + if error_log_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut error_log: Vec = vec![0; error_log_size as usize]; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetErrorLog(compiler, error_log.as_mut_ptr().cast()) + })?; + + Ok(Some(String::from_utf8_lossy(&error_log).into_owned())) + })(); + + let info_log = (|| { + let mut info_log_size = 0; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetInfoLogSize(compiler, addr_of_mut!(info_log_size)) + })?; + + if info_log_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut info_log: Vec = vec![0; info_log_size as usize]; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetInfoLog(compiler, info_log.as_mut_ptr().cast()) + })?; + + Ok(Some(String::from_utf8_lossy(&info_log).into_owned())) + })(); + + let binary = (|| { + if result.is_err() { + return Ok(None); + } + + let mut binary_size = 0; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetCompiledProgramSize( + compiler, + addr_of_mut!(binary_size), + ) + })?; + + if binary_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut binary: Vec = vec![0; binary_size as usize]; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetCompiledProgram(compiler, binary.as_mut_ptr().cast()) + })?; + + Ok(Some(binary)) + })(); + + let version = (|| { + let mut major = 0; + let mut minor = 0; + + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetVersion(addr_of_mut!(major), addr_of_mut!(minor)) + })?; + + Ok((major, minor)) + })(); + + let drop = { + let mut compiler = compiler; + #[allow(unsafe_code)] // FFI + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerDestroy(addr_of_mut!(compiler)) + }) + }; + + (result, error_log, info_log, binary, version, drop) +} + +fn compile_kernel_ptx( + kernel: &syn::Ident, + crate_name: &str, + crate_path: &Path, + specialisation: Specialisation, +) -> Option { + if let Ok(rust_flags) = proc_macro::tracked_env::var("RUSTFLAGS") { + env::set_var( + "RUSTFLAGS", + rust_flags + .replace("-Zinstrument-coverage", "") + .replace("-Cinstrument-coverage", ""), + ); + } + + let specialisation_var = format!( + 
"RUST_CUDA_DERIVE_SPECIALISE_{}_{}", + crate_name, + kernel.to_string().to_uppercase() + ); + + match build_kernel_with_specialisation(crate_path, &specialisation_var, specialisation) { + Ok(kernel_path) => { + let mut file = fs::File::open(&kernel_path) + .unwrap_or_else(|_| panic!("Failed to open kernel file at {:?}.", &kernel_path)); + + let mut kernel_ptx = String::new(); + + file.read_to_string(&mut kernel_ptx) + .unwrap_or_else(|_| panic!("Failed to read kernel file at {:?}.", &kernel_path)); + + colored::control::set_override(true); + eprintln!( + "{} {} compiling a PTX crate.", + "[PTX]".bright_black().bold(), + "Finished".green().bold() + ); + colored::control::unset_override(); + + Some(kernel_ptx) + }, + Err(err) => { + eprintln!("{err}"); + emit_ptx_build_error(); + None + }, + } +} + +#[allow(clippy::too_many_lines)] +fn build_kernel_with_specialisation( + kernel_path: &Path, + env_var: &str, + specialisation: Specialisation, +) -> Result { + match specialisation { + Specialisation::Check => env::set_var(env_var, "chECK"), + Specialisation::Link(specialisation) => env::set_var(env_var, specialisation), + }; + + let result = (|| { + let mut builder = Builder::new(kernel_path)?; + + builder = match specialisation { + Specialisation::Check => builder.set_profile(Profile::Debug), + Specialisation::Link(_) => builder.set_profile(Profile::Release), + }; + + builder = builder.set_message_format(MessageFormat::Json { + render_diagnostics: false, + short: false, + ansi: true, + }); + + let specialisation_prefix = match specialisation { + Specialisation::Check => String::from("chECK"), + Specialisation::Link(specialisation) => { + format!("{:016x}", seahash::hash(specialisation.as_bytes())) + }, + }; + builder = builder.set_prefix(specialisation_prefix.clone()); + + let any_output = AtomicBool::new(false); + let crate_name = String::from(builder.get_crate_name()); + + let build = builder.build_live( + |stdout_line| { + if let Ok(cargo_metadata::Message::CompilerMessage(mut message)) = + serde_json::from_str(stdout_line) + { + if any_output + .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + colored::control::set_override(true); + eprintln!( + "{} of {} ({})", + "[PTX]".bright_black().bold(), + crate_name.bold(), + specialisation_prefix.to_ascii_lowercase(), + ); + colored::control::unset_override(); + } + + if let Some(rendered) = &mut message.message.rendered { + colored::control::set_override(true); + let prefix = " | ".bright_black().bold().to_string(); + colored::control::unset_override(); + + let glue = String::from('\n') + &prefix; + + let mut lines = rendered + .split('\n') + .rev() + .skip_while(|l| l.trim().is_empty()) + .collect::>(); + lines.reverse(); + + let mut prefixed = prefix + &lines.join(&glue); + + std::mem::swap(rendered, &mut prefixed); + } + + eprintln!("{}", serde_json::to_string(&message.message).unwrap()); + } + }, + |stderr_line| { + if stderr_line.trim().is_empty() { + return; + } + + if any_output + .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + colored::control::set_override(true); + eprintln!( + "{} of {} ({})", + "[PTX]".bright_black().bold(), + crate_name.bold(), + specialisation_prefix.to_ascii_lowercase(), + ); + colored::control::unset_override(); + } + + colored::control::set_override(true); + eprintln!( + " {} {}", + "|".bright_black().bold(), + stderr_line.replace(" ", "") + ); + colored::control::unset_override(); + }, + )?; + + match build { + BuildStatus::Success(output) 
=> { + let ptx_path = output.get_assembly_path(); + + let mut specialised_ptx_path = ptx_path.clone(); + + specialised_ptx_path.set_extension(format!("{specialisation_prefix}.ptx")); + + fs::copy(&ptx_path, &specialised_ptx_path).map_err(|err| { + Error::from(BuildErrorKind::BuildFailed(vec![format!( + "Failed to copy kernel from {ptx_path:?} to {specialised_ptx_path:?}: \ + {err}" + )])) + })?; + + if let Specialisation::Link(specialisation) = specialisation { + fs::OpenOptions::new() + .append(true) + .open(&specialised_ptx_path) + .and_then(|mut file| writeln!(file, "\n// {specialisation}")) + .map_err(|err| { + Error::from(BuildErrorKind::BuildFailed(vec![format!( + "Failed to write specialisation to {specialised_ptx_path:?}: {err}" + )])) + })?; + } + + Ok(specialised_ptx_path) + }, + BuildStatus::NotNeeded => Err(Error::from(BuildErrorKind::BuildFailed(vec![format!( + "Kernel build for specialisation {:?} was not needed.", + &specialisation + )]))), + } + })(); + + env::remove_var(env_var); + + result +} + +#[derive(Copy, Clone, Debug)] +enum Specialisation<'a> { + Check, + Link(&'a str), +} diff --git a/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs new file mode 100644 index 000000000..7fffc7b4c --- /dev/null +++ b/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs @@ -0,0 +1,275 @@ +use thiserror::Error; + +#[allow(non_camel_case_types)] +pub type size_t = ::std::os::raw::c_ulonglong; + +#[repr(C)] +pub struct NvptxCompiler { + _private: [u8; 0], +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Error)] +#[non_exhaustive] +pub enum NvptxError { + #[error("Invalid compiler handle")] + InvalidCompilerHandle, + #[error("Invalid PTX input")] + InvalidInput, + #[error("Compilation failure")] + CompilationFailure, + #[error("Internal error")] + Internal, + #[error("Out of memory")] + OutOfMemory, + #[error("Incomplete compiler invocation")] + CompilerInvocationIncomplete, + #[error("Unsupported PTX version")] + UnsupportedPtxVersion, + #[error("Unsupported dev-side sync")] + UnsupportedDevSideSync, + #[error("Unknown error code")] + UnknownError, +} + +impl NvptxError { + const NVPTXCOMPILE_ERROR_COMPILATION_FAILURE: NvptxCompileResult = 3; + const NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE: NvptxCompileResult = 6; + const NVPTXCOMPILE_ERROR_INTERNAL: NvptxCompileResult = 4; + const NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE: NvptxCompileResult = 1; + const NVPTXCOMPILE_ERROR_INVALID_INPUT: NvptxCompileResult = 2; + const NVPTXCOMPILE_ERROR_OUT_OF_MEMORY: NvptxCompileResult = 5; + const NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC: NvptxCompileResult = 8; + const NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION: NvptxCompileResult = 7; + const NVPTXCOMPILE_SUCCESS: NvptxCompileResult = 0; + + pub const fn try_err_from(result: NvptxCompileResult) -> Result<(), Self> { + match result { + Self::NVPTXCOMPILE_SUCCESS => Ok(()), + Self::NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE => Err(Self::InvalidCompilerHandle), + Self::NVPTXCOMPILE_ERROR_INVALID_INPUT => Err(Self::InvalidInput), + Self::NVPTXCOMPILE_ERROR_COMPILATION_FAILURE => Err(Self::CompilationFailure), + Self::NVPTXCOMPILE_ERROR_INTERNAL => Err(Self::Internal), + Self::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY => Err(Self::OutOfMemory), + Self::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE => { + Err(Self::CompilerInvocationIncomplete) + }, + Self::NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION => Err(Self::UnsupportedPtxVersion), + 
Self::NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC => Err(Self::UnsupportedDevSideSync),
+            _ => Err(Self::UnknownError),
+        }
+    }
+}
+
+/// [`NvptxCompilerHandle`] represents a handle to the PTX Compiler.
+///
+/// To compile a PTX program string, an instance of [`NvptxCompiler`]
+/// must be created and the handle to it must be obtained using the
+/// API [`nvPTXCompilerCreate`]. Then the compilation can be done
+/// using the API [`nvPTXCompilerCompile`].
+pub type NvptxCompilerHandle = *mut NvptxCompiler;
+
+/// The [`NvptxCompiler`] APIs return the [`NvptxCompileResult`] codes to
+/// indicate the call result.
+pub type NvptxCompileResult = ::std::os::raw::c_int;
+
+extern "C" {
+    /// Queries the current major and minor version of PTX Compiler APIs being
+    /// used.
+    ///
+    /// # Parameters
+    /// - [out] `major`: Major version of the PTX Compiler APIs
+    /// - [out] `minor`: Minor version of the PTX Compiler APIs
+    ///
+    /// # Returns
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`]
+    ///
+    /// # Note
+    /// The version of PTX Compiler APIs follows the CUDA Toolkit versioning.
+    /// The PTX ISA version supported by a PTX Compiler API version is listed
+    /// [here](https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes).
+    pub fn nvPTXCompilerGetVersion(
+        major: *mut ::std::os::raw::c_uint,
+        minor: *mut ::std::os::raw::c_uint,
+    ) -> NvptxCompileResult;
+
+    /// Obtains the handle to an instance of the PTX compiler
+    /// initialized with the given PTX program `ptxCode`.
+    ///
+    /// # Parameters
+    /// - [out] `compiler`: Returns a handle to PTX compiler initialized with
+    ///   the PTX program `ptxCode`
+    /// - [in] `ptxCodeLen`: Size of the PTX program `ptxCode` passed as a
+    ///   string
+    /// - [in] `ptxCode`: The PTX program which is to be compiled passed as a
+    ///   string
+    ///
+    /// # Returns
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`]
+    pub fn nvPTXCompilerCreate(
+        compiler: *mut NvptxCompilerHandle,
+        ptxCodeLen: size_t,
+        ptxCode: *const ::std::os::raw::c_char,
+    ) -> NvptxCompileResult;
+
+    /// Destroys and cleans up the already created PTX compiler.
+    ///
+    /// # Parameters
+    /// - [in] `compiler`: A handle to the PTX compiler which is to be
+    ///   destroyed.
+    ///
+    /// # Returns
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`]
+    /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`]
+    pub fn nvPTXCompilerDestroy(compiler: *mut NvptxCompilerHandle) -> NvptxCompileResult;
+
+    /// Compile a PTX program with the given compiler options.
+    ///
+    /// # Parameters
+    /// - [in, out] `compiler`: A handle to PTX compiler initialized with the
+    ///   PTX program which is to be compiled. The compiled program can be
+    ///   accessed using the handle.
+    /// - [in] `numCompileOptions`: Length of the array `compileOptions`
+    /// - [in] `compileOptions`: Compiler options with which compilation should
+    ///   be done. The compiler options string is a null terminated character
+    ///   array. A valid list of compiler options is available at
+    ///   [link](http://docs.nvidia.com/cuda/ptx-compiler-api/index.html#compile-options).
+    ///
+    /// # Note
+    /// `--gpu-name` (`-arch`) is a mandatory option.
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILATION_FAILURE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION`] + pub fn nvPTXCompilerCompile( + compiler: NvptxCompilerHandle, + numCompileOptions: ::std::os::raw::c_int, + compileOptions: *const *const ::std::os::raw::c_char, + ) -> NvptxCompileResult; + + /// Obtains the size of the image of the compiled program. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `binaryImageSize`: The size of the image of the compiled program + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// + /// # Note + /// The [`nvPTXCompilerCompile`] function should be invoked for the handle + /// before calling this API. Otherwise, + /// [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// is returned. + pub fn nvPTXCompilerGetCompiledProgramSize( + compiler: NvptxCompilerHandle, + binaryImageSize: *mut size_t, + ) -> NvptxCompileResult; + + /// Obtains the image of the compiled program. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `binaryImage`: The image of the compiled program. The caller + /// should allocate memory for `binaryImage`. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// + /// # Note + /// The [`nvPTXCompilerCompile`] function should be invoked for the handle + /// before calling this API. Otherwise, + /// [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// is returned. + pub fn nvPTXCompilerGetCompiledProgram( + compiler: NvptxCompilerHandle, + binaryImage: *mut ::std::os::raw::c_void, + ) -> NvptxCompileResult; + + /// Query the size of the error message that was seen previously for the + /// handle. + /// + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `errorLogSize`: The size of the error log in bytes which was + /// produced in previous call to [`nvPTXCompilerCompile`]. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + pub fn nvPTXCompilerGetErrorLogSize( + compiler: NvptxCompilerHandle, + errorLogSize: *mut size_t, + ) -> NvptxCompileResult; + + /// Query the error message that was seen previously for the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `errorLog`: The error log which was produced in previous call to + /// [`nvPTXCompilerCompile`]. 
The caller should allocate memory for + /// `errorLog`. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + pub fn nvPTXCompilerGetErrorLog( + compiler: NvptxCompilerHandle, + errorLog: *mut ::std::os::raw::c_char, + ) -> NvptxCompileResult; + + /// Query the size of the information message that was seen previously for + /// the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `infoLogSize`: The size of the information log in bytes which + /// was produced in previous call to [`nvPTXCompilerCompile`]. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + pub fn nvPTXCompilerGetInfoLogSize( + compiler: NvptxCompilerHandle, + infoLogSize: *mut size_t, + ) -> NvptxCompileResult; + + /// Query the information message that was seen previously for the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `infoLog`: The information log which was produced in previous + /// call to [`nvPTXCompilerCompile`]. The caller should allocate memory + /// for `infoLog`. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + pub fn nvPTXCompilerGetInfoLog( + compiler: NvptxCompilerHandle, + infoLog: *mut ::std::os::raw::c_char, + ) -> NvptxCompileResult; +} diff --git a/rust-cuda-kernel/src/kernel/lints.rs b/rust-cuda-kernel/src/kernel/lints.rs new file mode 100644 index 000000000..5fbe415b2 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/lints.rs @@ -0,0 +1,171 @@ +use std::{collections::HashMap, fmt}; + +use syn::spanned::Spanned; + +#[allow(clippy::too_many_lines)] +pub fn parse_ptx_lint_level( + path: &syn::Path, + nested: &syn::punctuated::Punctuated, + ptx_lint_levels: &mut HashMap, +) { + let level = match path.get_ident() { + Some(ident) if ident == "allow" => LintLevel::Allow, + Some(ident) if ident == "warn" => LintLevel::Warn, + Some(ident) if ident == "deny" => LintLevel::Deny, + Some(ident) if ident == "forbid" => LintLevel::Forbid, + _ => { + emit_error!( + path.span(), + "[rust-cuda]: Invalid lint #[kernel(())] attribute: unknown lint \ + level, must be one of `allow`, `warn`, `deny`, `forbid`.", + ); + + return; + }, + }; + + for meta in nested { + let syn::NestedMeta::Meta(syn::Meta::Path(path)) = meta else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute.", + level, + ); + continue; + }; + + if path.leading_colon.is_some() + || path.segments.empty_or_trailing() + || path.segments.len() != 2 + { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { + ident: namespace, + arguments: syn::PathArguments::None, + }) = path.segments.first() + else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + }; + + if namespace != "ptx" { + emit_error!( + meta.span(), + "[rust-cuda]: 
Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { + ident: lint, + arguments: syn::PathArguments::None, + }) = path.segments.last() + else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + }; + + let lint = match lint { + l if l == "verbose" => PtxLint::Verbose, + l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, + l if l == "local_memory_use" => PtxLint::LocalMemoryUse, + l if l == "register_spills" => PtxLint::RegisterSpills, + l if l == "dump_assembly" => PtxLint::DumpAssembly, + l if l == "dynamic_stack_size" => PtxLint::DynamicStackSize, + _ => { + emit_error!( + meta.span(), + "[rust-cuda]: Unknown PTX kernel lint `ptx::{}`.", + lint, + ); + continue; + }, + }; + + match ptx_lint_levels.get(&lint) { + None => (), + Some(LintLevel::Forbid) if level < LintLevel::Forbid => { + emit_error!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) incompatible with previous forbid.", + level, + lint, + ); + continue; + }, + Some(previous) => { + emit_warning!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) overwrites previous {}.", + level, + lint, + previous, + ); + }, + } + + ptx_lint_levels.insert(lint, level); + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub enum LintLevel { + Allow, + Warn, + Deny, + Forbid, +} + +impl fmt::Display for LintLevel { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Allow => fmt.write_str("allow"), + Self::Warn => fmt.write_str("warn"), + Self::Deny => fmt.write_str("deny"), + Self::Forbid => fmt.write_str("forbid"), + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub enum PtxLint { + Verbose, + DoublePrecisionUse, + LocalMemoryUse, + RegisterSpills, + DumpAssembly, + DynamicStackSize, +} + +impl fmt::Display for PtxLint { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Verbose => fmt.write_str("verbose"), + Self::DoublePrecisionUse => fmt.write_str("double_precision_use"), + Self::LocalMemoryUse => fmt.write_str("local_memory_use"), + Self::RegisterSpills => fmt.write_str("register_spills"), + Self::DumpAssembly => fmt.write_str("dump_assembly"), + Self::DynamicStackSize => fmt.write_str("dynamic_stack_size"), + } + } +} diff --git a/rust-cuda-kernel/src/kernel/mod.rs b/rust-cuda-kernel/src/kernel/mod.rs new file mode 100644 index 000000000..86ffbd8fd --- /dev/null +++ b/rust-cuda-kernel/src/kernel/mod.rs @@ -0,0 +1,11 @@ +pub mod link; +pub mod specialise; +pub mod wrapper; + +mod lints; +mod utils; + +const KERNEL_TYPE_USE_START_CANARY: &str = "// //"; +const KERNEL_TYPE_USE_END_CANARY: &str = "// //"; +const KERNEL_TYPE_LAYOUT_IDENT: &str = "KERNEL_SIGNATURE_LAYOUT"; +const PTX_CSTR_IDENT: &str = "PTX_CSTR"; diff --git a/rust-cuda-derive/src/kernel/specialise/call.rs b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs similarity index 68% rename from rust-cuda-derive/src/kernel/specialise/call.rs rename to rust-cuda-kernel/src/kernel/specialise/entry_point.rs index 34eb0dc35..b429a9297 100644 --- a/rust-cuda-derive/src/kernel/specialise/call.rs +++ b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs @@ -1,7 +1,10 @@ +use std::ffi::CString; + use proc_macro::TokenStream; +use quote::quote; #[allow(clippy::module_name_repetitions)] -pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { +pub fn 
specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { let SpecialiseMangleConfig { kernel, specialisation, @@ -9,13 +12,14 @@ pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "specialise_kernel_call!(KERNEL SPECIALISATION) expects KERNEL identifier and \ - SPECIALISATION tokens: {:?}", + "specialise_kernel_entry_point!(KERNEL SPECIALISATION) expects KERNEL identifier \ + and SPECIALISATION tokens: {:?}", err ) }, }; + #[allow(clippy::option_if_let_else)] let mangled_kernel_ident = if let Some(specialisation) = specialisation { format!( "{kernel}_kernel_{:016x}", @@ -25,7 +29,16 @@ pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { format!("{kernel}_kernel") }; - (quote! { #mangled_kernel_ident }).into() + let mangled_kernel_ident = match CString::new(mangled_kernel_ident) { + Ok(mangled_kernel_ident) => mangled_kernel_ident, + Err(err) => abort_call_site!( + "Kernel compilation generated invalid kernel entry point: internal nul byte: {:?}", + err + ), + }; + + let mangled_kernel_ident = proc_macro::Literal::c_string(&mangled_kernel_ident); + proc_macro::TokenTree::Literal(mangled_kernel_ident).into() } struct SpecialiseMangleConfig { diff --git a/rust-cuda-derive/src/kernel/specialise/entry.rs b/rust-cuda-kernel/src/kernel/specialise/function.rs similarity index 82% rename from rust-cuda-derive/src/kernel/specialise/entry.rs rename to rust-cuda-kernel/src/kernel/specialise/function.rs index e8bce23b9..44d8b8a81 100644 --- a/rust-cuda-derive/src/kernel/specialise/entry.rs +++ b/rust-cuda-kernel/src/kernel/specialise/function.rs @@ -1,12 +1,13 @@ use std::env::VarError; use proc_macro::TokenStream; +use quote::quote; #[allow(clippy::module_name_repetitions)] -pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStream { +pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { let mut func: syn::ItemFn = syn::parse(func).unwrap_or_else(|err| { abort_call_site!( - "#[specialise_kernel_entry(...)] must be wrapped around a function: {:?}", + "#[specialise_kernel_function(...)] must be wrapped around a function: {:?}", err ) }); @@ -14,7 +15,7 @@ pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStr let kernel: syn::Ident = match syn::parse_macro_input::parse(attr) { Ok(kernel) => kernel, Err(err) => abort_call_site!( - "#[specialise_kernel_entry(KERNEL)] expects KERNEL identifier: {:?}", + "#[specialise_kernel_function(KERNEL)] expects KERNEL identifier: {:?}", err ), }; @@ -33,7 +34,7 @@ pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStr func.sig.ident = match proc_macro::tracked_env::var(&specialisation_var).as_deref() { Ok("") => quote::format_ident!("{}_kernel", func.sig.ident), Ok("chECK") => { - let func_ident = func.sig.ident; + let func_ident = quote::format_ident!("{}_chECK", func.sig.ident); return (quote! 
{ #[cfg(target_os = "cuda")] diff --git a/rust-cuda-kernel/src/kernel/specialise/mod.rs b/rust-cuda-kernel/src/kernel/specialise/mod.rs new file mode 100644 index 000000000..e5dcd518e --- /dev/null +++ b/rust-cuda-kernel/src/kernel/specialise/mod.rs @@ -0,0 +1,3 @@ +pub mod entry_point; +pub mod function; +pub mod param_type; diff --git a/rust-cuda-kernel/src/kernel/specialise/param_type.rs b/rust-cuda-kernel/src/kernel/specialise/param_type.rs new file mode 100644 index 000000000..a398e5eac --- /dev/null +++ b/rust-cuda-kernel/src/kernel/specialise/param_type.rs @@ -0,0 +1,292 @@ +use proc_macro::TokenStream; +use quote::ToTokens; + +#[allow(clippy::module_name_repetitions)] +pub fn specialise_kernel_param_type(tokens: TokenStream) -> TokenStream { + let SpecialiseTypeConfig { + mut ty, + generics, + kernel, + } = match syn::parse_macro_input::parse(tokens) { + Ok(config) => config, + Err(err) => { + abort_call_site!( + "specialise_kernel_param_type!(TY for GENERICS in KERNEL) expects TY type, \ + GENERICS generics, and KERNEL identifier: {:?}", + err + ) + }, + }; + + let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { + Ok(crate_name) => crate_name.to_uppercase(), + Err(err) => abort_call_site!("Failed to read crate name: {:?}", err), + }; + + let specialisation_var = format!( + "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", + crate_name, + kernel.to_string().to_uppercase() + ); + + let specialisation = match proc_macro::tracked_env::var(&specialisation_var) { + Ok(specialisation) => specialisation, + Err(err) => abort_call_site!( + "Failed to read specialisation from {:?}: {:?}", + &specialisation_var, + err + ), + }; + let specialisation = match syn::parse_str(&specialisation) { + _ if specialisation.is_empty() => syn::PathArguments::None, + Ok(specialisation) => syn::PathArguments::AngleBracketed(specialisation), + Err(err) => abort_call_site!("Failed to parse specialisation: {:?}", err), + }; + + if let syn::PathArguments::AngleBracketed(syn::AngleBracketedGenericArguments { + args, .. + }) = specialisation + { + if generics.params.len() != args.len() { + abort_call_site!( + "Mismatch specialising {} with {}", + generics.split_for_impl().1.to_token_stream(), + args.to_token_stream() + ); + } + + // replace all lifetimes with 'static + ty = syn::fold::Fold::fold_type( + &mut FoldLifetimeAllStatic { + r#static: syn::parse_quote!('static), + }, + ty, + ); + + for (generic, arg) in generics.params.into_iter().zip(args.into_iter()) { + match (generic, arg) { + ( + syn::GenericParam::Lifetime(syn::LifetimeDef { + lifetime: _generic, .. + }), + syn::GenericArgument::Lifetime(_arg), + ) => { + // all lifetimes are already replaced with 'static above + }, + ( + syn::GenericParam::Const(syn::ConstParam { ident: generic, .. }), + syn::GenericArgument::Const(arg), + ) => { + ty = syn::fold::Fold::fold_type(&mut FoldConstGeneric { generic, arg }, ty); + }, + ( + syn::GenericParam::Type(syn::TypeParam { ident: generic, .. 
}), + syn::GenericArgument::Type(arg), + ) => { + ty = syn::fold::Fold::fold_type(&mut FoldTypeGeneric { generic, arg }, ty); + }, + (generic, arg) => abort_call_site!( + "Mismatch specialising {} with {}", + generic.to_token_stream(), + arg.to_token_stream() + ), + } + } + } else if !generics.params.is_empty() { + abort_call_site!( + "Missing specialisation for {}", + generics.split_for_impl().1.to_token_stream() + ); + } + + ty.into_token_stream().into() +} + +struct SpecialiseTypeConfig { + ty: syn::Type, + generics: syn::Generics, + kernel: syn::Ident, +} + +impl syn::parse::Parse for SpecialiseTypeConfig { + fn parse(input: syn::parse::ParseStream) -> syn::Result { + let ty: syn::Type = input.parse()?; + let _for: syn::token::For = input.parse()?; + let generics: syn::Generics = input.parse()?; + let _in: syn::token::In = input.parse()?; + let kernel: syn::Ident = input.parse()?; + + Ok(Self { + ty, + generics, + kernel, + }) + } +} + +struct FoldLifetimeAllStatic { + r#static: syn::Lifetime, +} + +impl syn::fold::Fold for FoldLifetimeAllStatic { + fn fold_type_reference(&mut self, r#ref: syn::TypeReference) -> syn::TypeReference { + let syn::TypeReference { + and_token, + lifetime: _, + mutability, + elem, + } = r#ref; + + syn::fold::fold_type_reference( + self, + syn::TypeReference { + and_token, + lifetime: Some(self.r#static.clone()), + mutability, + elem, + }, + ) + } + + fn fold_lifetime(&mut self, lt: syn::Lifetime) -> syn::Lifetime { + let mut r#static = self.r#static.clone(); + r#static.set_span(lt.span()); + r#static + } +} + +struct FoldConstGeneric { + generic: syn::Ident, + arg: syn::Expr, +} + +impl syn::fold::Fold for FoldConstGeneric { + fn fold_generic_argument(&mut self, arg: syn::GenericArgument) -> syn::GenericArgument { + let syn::GenericArgument::Type(syn::Type::Path(syn::TypePath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + })) = arg + else { + return syn::fold::fold_generic_argument(self, arg); + }; + + if let Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && segments.len() == 1 + && ident == &self.generic + { + return syn::GenericArgument::Const(self.arg.clone()); + } + + syn::fold::fold_generic_argument( + self, + syn::GenericArgument::Type(syn::Type::Path(syn::TypePath { + qself: None, + path: syn::Path { + leading_colon: None, + segments, + }, + })), + ) + } + + fn fold_expr(&mut self, expr: syn::Expr) -> syn::Expr { + let syn::Expr::Path(syn::ExprPath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + attrs, + }) = expr + else { + return syn::fold::fold_expr(self, expr); + }; + + if let Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && segments.len() == 1 + && ident == &self.generic + { + return self.arg.clone(); + } + + syn::fold::fold_expr( + self, + syn::Expr::Path(syn::ExprPath { + qself: None, + path: syn::Path { + leading_colon: None, + segments, + }, + attrs, + }), + ) + } +} + +struct FoldTypeGeneric { + generic: syn::Ident, + arg: syn::Type, +} + +impl syn::fold::Fold for FoldTypeGeneric { + fn fold_type(&mut self, ty: syn::Type) -> syn::Type { + let syn::Type::Path(syn::TypePath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + }) = ty + else { + return syn::fold::fold_type(self, ty); + }; + + if let Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && ident == &self.generic + { + return if 
segments.len() > 1 { + syn::Type::Path(syn::TypePath { + qself: Some(syn::QSelf { + lt_token: syn::parse_quote!(<), + ty: Box::new(self.arg.clone()), + position: 0, + as_token: None, + gt_token: syn::parse_quote!(>), + }), + path: syn::Path { + leading_colon: syn::parse_quote!(::), + segments: segments.into_iter().skip(1).collect(), + }, + }) + } else { + self.arg.clone() + }; + } + + syn::fold::fold_type( + self, + syn::Type::Path(syn::TypePath { + qself: None, + path: syn::Path { + leading_colon: None, + segments, + }, + }), + ) + } +} diff --git a/rust-cuda-derive/src/kernel/utils.rs b/rust-cuda-kernel/src/kernel/utils.rs similarity index 69% rename from rust-cuda-derive/src/kernel/utils.rs rename to rust-cuda-kernel/src/kernel/utils.rs index 5afd05858..c73876f09 100644 --- a/rust-cuda-derive/src/kernel/utils.rs +++ b/rust-cuda-kernel/src/kernel/utils.rs @@ -1,5 +1,3 @@ -use syn::spanned::Spanned; - pub fn skip_kernel_compilation() -> bool { let mut skip_compilation = false; @@ -13,7 +11,3 @@ pub fn skip_kernel_compilation() -> bool { skip_compilation } - -pub fn r2c_move_lifetime(arg: usize, ty: &syn::Type) -> syn::Lifetime { - syn::Lifetime::new(&format!("'__r2c_move_lt_{arg}"), ty.span()) -} diff --git a/rust-cuda-kernel/src/kernel/wrapper/config.rs b/rust-cuda-kernel/src/kernel/wrapper/config.rs new file mode 100644 index 000000000..66807f2d1 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/config.rs @@ -0,0 +1,17 @@ +pub(super) struct KernelConfig { + pub(super) visibility: Option, + pub(super) link: syn::Ident, +} + +impl syn::parse::Parse for KernelConfig { + fn parse(input: syn::parse::ParseStream) -> syn::Result { + let visibility: Option = input.parse()?; + let _use: syn::token::Use = input.parse()?; + let link: syn::Ident = input.parse()?; + let _bang: syn::token::Bang = input.parse()?; + let _for: syn::token::For = input.parse()?; + let _impl: syn::token::Impl = input.parse()?; + + Ok(Self { visibility, link }) + } +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs new file mode 100644 index 000000000..0799f4cc7 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -0,0 +1,95 @@ +use proc_macro2::TokenStream; +use syn::spanned::Spanned; +use quote::quote; + +use crate::kernel::wrapper::{DeclGenerics, FuncIdent}; + +pub(in super::super) fn quote_cuda_generic_function( + crate_path: &syn::Path, + DeclGenerics { + generic_start_token, + generic_kernel_params: generic_params, + generic_close_token, + .. + }: &DeclGenerics, + func_inputs: &syn::punctuated::Punctuated, + FuncIdent { func_ident, .. 
}: &FuncIdent, + func_attrs: &[syn::Attribute], + func_block: &syn::Block, +) -> TokenStream { + let mut generic_params = (*generic_params).clone(); + + let kernel_func_inputs = func_inputs + .iter() + .enumerate() + .map( + |( + i, + syn::PatType { + attrs, + ty, + pat, + colon_token, + }, + )| { + let (ty, lt) = if let syn::Type::Reference(syn::TypeReference { + and_token, + lifetime, + mutability, + elem, + }) = &**ty + { + let lifetime = lifetime.clone().unwrap_or_else(|| { + let lifetime = + syn::Lifetime::new(&format!("'__rust_cuda_lt_{i}"), ty.span()); + generic_params.insert( + 0, + syn::GenericParam::Lifetime(syn::LifetimeDef { + attrs: Vec::new(), + colon_token: None, + lifetime: lifetime.clone(), + bounds: syn::punctuated::Punctuated::new(), + }), + ); + lifetime + }); + let lt = quote!(#lifetime); + ( + syn::Type::Reference(syn::TypeReference { + and_token: *and_token, + lifetime: Some(lifetime), + mutability: *mutability, + elem: elem.clone(), + }), + lt, + ) + } else { + (syn::Type::clone(ty), quote!('_)) + }; + + let ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> + <#ty as #crate_path::kernel::CudaKernelParameter>::DeviceType<#lt> + }; + + syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ty), + pat: pat.clone(), + colon_token: *colon_token, + }) + }, + ) + .collect::>(); + + let generic_start_token = generic_start_token.unwrap_or_default(); + let generic_close_token = generic_close_token.unwrap_or_default(); + + quote! { + #[cfg(target_os = "cuda")] + #(#func_attrs)* + fn #func_ident #generic_start_token #generic_params #generic_close_token ( + #(#kernel_func_inputs),* + ) + #func_block + } +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs new file mode 100644 index 000000000..ff7e2ee48 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -0,0 +1,131 @@ +use proc_macro2::TokenStream; +use syn::spanned::Spanned; +use quote::quote; + +use crate::kernel::{ + wrapper::{FuncIdent, FunctionInputs, ImplGenerics}, + KERNEL_TYPE_LAYOUT_IDENT, KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY, +}; + +#[allow(clippy::too_many_lines)] +pub(in super::super) fn quote_cuda_wrapper( + crate_path: &syn::Path, + inputs @ FunctionInputs { func_inputs }: &FunctionInputs, + func @ FuncIdent { + func_ident, + func_ident_hash, + .. + }: &FuncIdent, + impl_generics @ ImplGenerics { + impl_generics: generics, + .. + }: &ImplGenerics, + func_attrs: &[syn::Attribute], + func_params: &[syn::Ident], +) -> TokenStream { + let (ffi_inputs, ffi_types) = + specialise_ffi_input_types(crate_path, inputs, func, impl_generics); + + let ffi_param_ptx_jit_wrap = func_inputs.iter().enumerate().rev().fold( + quote! { + #func_ident(#(#func_params),*) + }, + |inner, (i, syn::PatType { pat, ty, .. })| { + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_param_type!(#ty for #generics in #func_ident) + }; + + // Load the device param from its FFI representation + // To allow some parameters to also inject PTX JIT load markers here, + // we pass them the param index i + quote::quote_spanned! 
{ ty.span()=> + unsafe { + < + #specialised_ty as #crate_path::kernel::CudaKernelParameter + >::with_ffi_as_device::<_, #i>( + #pat, |#pat: < + #specialised_ty as #crate_path::kernel::CudaKernelParameter + >::DeviceType::<'_>| { #inner } + ) + } + } + }, + ); + + let private_func_params = func_params + .iter() + .map(|param| { + let mut private = syn::Ident::clone(param); + private.set_span(proc_macro::Span::def_site().into()); + private + }) + .collect::>(); + + let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, func_ident.span()); + let ffi_signature_ty = quote! { extern "C" fn(#(#ffi_types),*) }; + + quote! { + #[cfg(target_os = "cuda")] + #[#crate_path::device::specialise_kernel_function(#func_ident)] + #[no_mangle] + #[allow(unused_unsafe)] + #(#func_attrs)* + pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ffi_inputs),*) { + extern "C" { #( + #[allow(dead_code)] + #[deny(improper_ctypes)] + static #private_func_params: #ffi_types; + )* } + + unsafe { + // Initialise the dynamically-sized thread-block shared memory + // and the thread-local offset pointer that points to it + #crate_path::utils::shared::init(); + } + + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } + #[no_mangle] + static #ffi_signature_ident: [ + u8; #crate_path::deps::const_type_layout::serialised_type_graph_len::<#ffi_signature_ty>() + ] = #crate_path::deps::const_type_layout::serialise_type_graph::<#ffi_signature_ty>(); + unsafe { ::core::ptr::read_volatile(&#ffi_signature_ident) }; + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_END_CANARY); } + + #ffi_param_ptx_jit_wrap + } + } +} + +fn specialise_ffi_input_types( + crate_path: &syn::Path, + FunctionInputs { func_inputs }: &FunctionInputs, + FuncIdent { func_ident, .. }: &FuncIdent, + ImplGenerics { impl_generics, .. }: &ImplGenerics, +) -> (Vec, Vec) { + func_inputs + .iter() + .map(|syn::PatType { + attrs, + pat, + colon_token, + ty, + }| { + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_param_type!(#ty for #impl_generics in #func_ident) + }; + + let ffi_ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> + <#specialised_ty as #crate_path::kernel::CudaKernelParameter>::FfiType<'static, 'static> + }; + + let ffi_param = syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ffi_ty.clone()), + pat: pat.clone(), + colon_token: *colon_token, + }); + + (ffi_param, ffi_ty) + }) + .unzip() +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs new file mode 100644 index 000000000..757f22470 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs @@ -0,0 +1,72 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; + +pub(in super::super) fn quote_host_kernel_ty( + crate_path: &syn::Path, + DeclGenerics { + generic_kernel_params, + generic_start_token, + generic_close_token, + .. + }: &DeclGenerics, + ImplGenerics { ty_generics, .. }: &ImplGenerics, + FunctionInputs { func_inputs }: &FunctionInputs, + FuncIdent { func_ident, .. }: &FuncIdent, + func_params: &[syn::Ident], + func_attrs: &[syn::Attribute], +) -> TokenStream { + let cuda_kernel_param_tys = func_inputs + .iter() + .map(|syn::PatType { ty, .. 
}| &**ty) + .collect::>(); + + let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); + + let full_generics = generic_kernel_params + .iter() + .map(|param| match param { + syn::GenericParam::Type(syn::TypeParam { ident, .. }) + | syn::GenericParam::Const(syn::ConstParam { ident, .. }) => quote!(#ident), + syn::GenericParam::Lifetime(syn::LifetimeDef { lifetime, .. }) => quote!(#lifetime), + }) + .collect::>(); + + let mut private_func_ident = syn::Ident::clone(func_ident); + private_func_ident.set_span(proc_macro::Span::def_site().into()); + + let ty_turbofish = ty_generics.as_turbofish(); + + quote! { + #[cfg(not(target_os = "cuda"))] + #[allow(non_camel_case_types)] + pub type #func_ident #generic_start_token + #generic_kernel_params + #generic_close_token = impl Fn( + &mut #crate_path::kernel::Launcher<#func_ident #generic_start_token + #(#full_generics),* + #generic_close_token>, + #(#cuda_kernel_param_tys),* + ); + + #[cfg(not(target_os = "cuda"))] + #(#func_attrs)* + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + fn #private_func_ident #generic_start_token + #generic_kernel_params + #generic_close_token ( + #launcher: &mut #crate_path::kernel::Launcher<#func_ident #generic_start_token + #(#full_generics),* + #generic_close_token>, + #func_inputs + ) { + let _: #func_ident <#(#full_generics),*> = #private_func_ident #ty_turbofish; + + #( + let _ = #func_params; + )* + } + } +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs new file mode 100644 index 000000000..1813942d8 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs @@ -0,0 +1,50 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::kernel::wrapper::{FunctionInputs, ImplGenerics}; + +pub(in super::super) fn quote_args_trait( + args: &syn::Ident, + ImplGenerics { + impl_generics, + ty_generics, + }: &ImplGenerics, + FunctionInputs { func_inputs }: &FunctionInputs, +) -> TokenStream { + let func_input_typedefs = (0..func_inputs.len()) + .map(|i| { + let type_ident = quote::format_ident!("__T_{}", i); + + quote! { + type #type_ident; + } + }) + .collect::>(); + + let func_input_types = func_inputs + .iter() + .enumerate() + .map(|(i, pat_type)| { + let type_ident = quote::format_ident!("__T_{}", i); + let arg_type = match &*pat_type.ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, + }; + + quote! { + type #type_ident = #arg_type; + } + }) + .collect::>(); + + quote! { + #[allow(non_camel_case_types)] + pub trait #args #impl_generics { + #(#func_input_typedefs)* + } + + impl #impl_generics #args #ty_generics for () { + #(#func_input_types)* + } + } +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs new file mode 100644 index 000000000..ef65d5596 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -0,0 +1,151 @@ +use proc_macro2::TokenStream; +use syn::spanned::Spanned; +use quote::quote; + +use crate::kernel::{ + utils::skip_kernel_compilation, + wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}, + KERNEL_TYPE_LAYOUT_IDENT, PTX_CSTR_IDENT, +}; + +#[allow(clippy::too_many_arguments)] +pub(super) fn quote_get_ptx( + crate_path: &syn::Path, + FuncIdent { + func_ident, + func_ident_hash, + .. 
+ }: &FuncIdent, + generics @ DeclGenerics { + generic_start_token, + generic_close_token, + .. + }: &DeclGenerics, + impl_generics: &ImplGenerics, + inputs: &FunctionInputs, + func_params: &[syn::Ident], + macro_type_ids: &[syn::Ident], + ptx_lint_levels: &TokenStream, +) -> TokenStream { + let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { + Ok(crate_name) => crate_name.to_uppercase(), + Err(err) => abort_call_site!("Failed to read crate name: {:?}.", err), + }; + + let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") + .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); + + let args = syn::Ident::new("KernelArgs", proc_macro::Span::def_site().into()); + let args_trait = super::args_trait::quote_args_trait(&args, impl_generics, inputs); + + let cpu_func_lifetime_erased_types = + generate_lifetime_erased_types(crate_path, &args, generics, inputs, macro_type_ids); + + let ptx_cstr_ident = syn::Ident::new(PTX_CSTR_IDENT, func_ident.span()); + + let matching_kernel_assert = if skip_kernel_compilation() { + quote!() + } else { + quote::quote_spanned! { func_ident.span()=> + const _: #crate_path::safety::ptx_entry_point::Assert<{ + #crate_path::safety::ptx_entry_point::HostAndDeviceKernelEntryPoint::Match + }> = #crate_path::safety::ptx_entry_point::Assert::<{ + #crate_path::safety::ptx_entry_point::check( + #ptx_cstr_ident.to_bytes(), + #crate_path::kernel::specialise_kernel_entry_point!( + #func_ident_hash #generic_start_token + #($#macro_type_ids),* + #generic_close_token + ).to_bytes(), + ) + }>; + } + }; + + let signature_layout_assert = if skip_kernel_compilation() { + quote!() + } else { + let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, func_ident.span()); + let ffi_signature_ty = quote! { extern "C" fn(#(#cpu_func_lifetime_erased_types),*) }; + + quote::quote_spanned! { func_ident.span()=> + const _: #crate_path::safety::ptx_kernel_signature::Assert<{ + #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match + }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ + #ffi_signature_ident::<#ffi_signature_ty>() + }>; + } + }; + + let private_func_params = func_params + .iter() + .map(|param| { + let mut private = syn::Ident::clone(param); + private.set_span(proc_macro::Span::def_site().into()); + private + }) + .collect::>(); + + quote! { + fn get_ptx() -> &'static ::core::ffi::CStr { + // FIXME: don't use imports here + use #crate_path::deps::const_type_layout::{TypeGraphLayout, check_serialised_type_graph}; + use #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout; + + #args_trait + + extern "C" { #( + #[allow(dead_code)] + #[deny(improper_ctypes)] + static #private_func_params: #cpu_func_lifetime_erased_types; + )* } + + #crate_path::kernel::compile_kernel!{ + #func_ident #func_ident_hash #crate_name #crate_manifest_dir #generic_start_token + #($#macro_type_ids),* + #generic_close_token #ptx_lint_levels + } + + #matching_kernel_assert + + #signature_layout_assert + + #ptx_cstr_ident + } + } +} + +fn generate_lifetime_erased_types( + crate_path: &syn::Path, + args: &syn::Ident, + DeclGenerics { + generic_start_token, + generic_close_token, + .. + }: &DeclGenerics, + FunctionInputs { func_inputs }: &FunctionInputs, + macro_type_ids: &[syn::Ident], +) -> Vec { + func_inputs + .iter() + .enumerate() + .map(|(i, syn::PatType { ty, .. 
})| { + let type_ident = quote::format_ident!("__T_{}", i); + + let mut specialised_ty = quote::quote_spanned! { ty.span()=> + <() as #args #generic_start_token + #($#macro_type_ids),* + #generic_close_token>::#type_ident + }; + // the args trait has to unbox outer lifetimes, so we need to add them back in here + if let syn::Type::Reference(syn::TypeReference { and_token, lifetime, mutability, .. }) = &**ty { + let lifetime = quote::quote_spanned! { lifetime.span()=> 'static }; + + specialised_ty = quote! { #and_token #lifetime #mutability #specialised_ty }; + } + + quote::quote_spanned! { ty.span()=> + <#specialised_ty as #crate_path::kernel::CudaKernelParameter>::FfiType<'static, 'static> + } + }).collect() +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs new file mode 100644 index 000000000..353e6c5dc --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs @@ -0,0 +1,111 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; + +mod args_trait; +mod get_ptx; + +use get_ptx::quote_get_ptx; + +#[allow(clippy::too_many_arguments)] // FIXME +pub(in super::super) fn quote_host_link_macro( + crate_path: &syn::Path, + KernelConfig { + visibility, link, .. + }: &KernelConfig, + decl_generics @ DeclGenerics { + generic_start_token, + generic_close_token, + generic_kernel_params, + .. + }: &DeclGenerics, + impl_generics: &ImplGenerics, + func_inputs: &FunctionInputs, + func_ident @ FuncIdent { + func_ident: func_ident_name, + func_ident_hash, + .. + }: &FuncIdent, + func_params: &[syn::Ident], + ptx_lint_levels: &TokenStream, +) -> TokenStream { + let macro_generics = generic_kernel_params + .iter() + .enumerate() + .map(|(i, generic)| { + let generic_ident = quote::format_ident!("__g_{}", i); + + match generic { + syn::GenericParam::Type(_) => quote!($#generic_ident:ty), + syn::GenericParam::Const(_) => quote!($#generic_ident:expr), + syn::GenericParam::Lifetime(_) => quote!($#generic_ident:lifetime), + } + }) + .collect::>(); + + let macro_generic_ids = (0..generic_kernel_params.len()) + .map(|i| quote::format_ident!("__g_{}", i)) + .collect::>(); + + let macro_only_lt_generic_ids = generic_kernel_params + .iter() + .enumerate() + .filter_map(|(i, generic)| { + let generic_ident = quote::format_ident!("__g_{}", i); + + match generic { + syn::GenericParam::Type(_) | syn::GenericParam::Const(_) => None, + syn::GenericParam::Lifetime(_) => Some(generic_ident), + } + }) + .collect::>(); + + let macro_non_lt_generic_ids = generic_kernel_params + .iter() + .enumerate() + .filter_map(|(i, generic)| { + let generic_ident = quote::format_ident!("__g_{}", i); + + match generic { + syn::GenericParam::Type(_) | syn::GenericParam::Const(_) => Some(generic_ident), + syn::GenericParam::Lifetime(_) => None, + } + }) + .collect::>(); + + let get_ptx = quote_get_ptx( + crate_path, + func_ident, + decl_generics, + impl_generics, + func_inputs, + func_params, + ¯o_non_lt_generic_ids, + ptx_lint_levels, + ); + + quote! { + #[cfg(not(target_os = "cuda"))] + #visibility macro #link( + impl #func_ident_name #generic_start_token + #(#macro_generics),* $(,)? 
+ #generic_close_token for $ptx:ident + ) { + unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::kernel::CompiledKernelPtx< + #func_ident_name #generic_start_token #($#macro_generic_ids),* #generic_close_token + > for $ptx #generic_start_token #($#macro_generic_ids),* #generic_close_token + { + #get_ptx + + fn get_entry_point() -> &'static ::core::ffi::CStr { + #crate_path::kernel::specialise_kernel_entry_point!( + #func_ident_hash #generic_start_token + #($#macro_non_lt_generic_ids),* + #generic_close_token + ) + } + } + } + } +} diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs new file mode 100644 index 000000000..829cb0433 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs @@ -0,0 +1,4 @@ +pub mod cuda_generic_function; +pub mod cuda_wrapper; +pub mod host_kernel_ty; +pub mod host_link_macro; diff --git a/rust-cuda-kernel/src/kernel/wrapper/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/mod.rs new file mode 100644 index 000000000..9dffacc51 --- /dev/null +++ b/rust-cuda-kernel/src/kernel/wrapper/mod.rs @@ -0,0 +1,354 @@ +use std::{ + collections::HashMap, + hash::{Hash, Hasher}, +}; + +use proc_macro::TokenStream; + +mod config; +mod generate; +mod parse; + +use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; + +use config::KernelConfig; +use generate::{ + cuda_generic_function::quote_cuda_generic_function, cuda_wrapper::quote_cuda_wrapper, + host_kernel_ty::quote_host_kernel_ty, host_link_macro::quote_host_link_macro, +}; +use parse::parse_kernel_fn; +use proc_macro2::{Ident, Span}; +use syn::spanned::Spanned; +use quote::quote; + +#[allow(clippy::too_many_lines)] +pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { + let mut hasher = seahash::SeaHasher::new(); + + attr.to_string().hash(&mut hasher); + func.to_string().hash(&mut hasher); + + let kernel_hash = hasher.finish(); + + let config: KernelConfig = match syn::parse_macro_input::parse(attr) { + Ok(config) => config, + Err(err) => { + abort_call_site!( + "#[kernel(pub? use LINK! for impl)] expects LINK macro identifier: {:?}", + err + ) + }, + }; + + let mut func = parse_kernel_fn(func); + + let mut crate_path = None; + let mut ptx_lint_levels = HashMap::new(); + + func.attrs.retain(|attr| { + if attr.path.is_ident("kernel") { + if let Ok(syn::Meta::List(list)) = attr.parse_meta() { + for meta in &list.nested { + match meta { + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path, + lit: syn::Lit::Str(s), + .. + })) if path.is_ident("crate") => match syn::parse_str::(&s.value()) { + Ok(new_crate_path) => { + if crate_path.is_none() { + crate_path = Some( + syn::parse_quote_spanned! { s.span() => #new_crate_path }, + ); + + continue; + } + + emit_error!( + s.span(), + "[rust-cuda]: Duplicate #[kernel(crate)] attribute.", + ); + }, + Err(err) => emit_error!( + s.span(), + "[rust-cuda]: Invalid #[kernel(crate = \ + \"\")] attribute: {}.", + err + ), + }, + syn::NestedMeta::Meta(syn::Meta::List(syn::MetaList { + path, + nested, + .. + })) if path.is_ident("allow") || path.is_ident("warn") || path.is_ident("deny") || path.is_ident("forbid") => { + parse_ptx_lint_level(path, nested, &mut ptx_lint_levels); + }, + _ => { + emit_error!( + meta.span(), + "[rust-cuda]: Expected #[kernel(crate = \"\")] or #[kernel(allow/warn/deny/forbid())] function attribute." 
+ );
+ }
+ }
+ }
+ } else {
+ emit_error!(
+ attr.span(),
+ "[rust-cuda]: Expected #[kernel(crate = \"\")] or #[kernel(allow/warn/deny/forbid())] function attribute."
+ );
+ }
+
+ false
+ } else {
+ true
+ }
+ });
+
+ let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda));
+
+ let _ = ptx_lint_levels.try_insert(PtxLint::Verbose, LintLevel::Allow);
+ let _ = ptx_lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn);
+ let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUse, LintLevel::Warn);
+ let _ = ptx_lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn);
+ let _ = ptx_lint_levels.try_insert(PtxLint::DumpAssembly, LintLevel::Allow);
+ let _ = ptx_lint_levels.try_insert(PtxLint::DynamicStackSize, LintLevel::Warn);
+
+ let ptx_lint_levels = {
+ let (lints, levels): (Vec<_>, Vec<_>) = ptx_lint_levels
+ .into_iter()
+ .map(|(lint, level)| {
+ (
+ Ident::new(&lint.to_string(), Span::call_site()),
+ Ident::new(&level.to_string(), Span::call_site()),
+ )
+ })
+ .unzip();
+
+ quote! {
+ #(#levels(ptx::#lints)),*
+ }
+ };
+
+ let mut func_inputs = FunctionInputs {
+ func_inputs: func
+ .sig
+ .inputs
+ .into_iter()
+ .map(|arg| match arg {
+ syn::FnArg::Typed(arg) => arg,
+ syn::FnArg::Receiver(_) => {
+ unreachable!("already checked that no receiver arg exists")
+ },
+ })
+ .collect(),
+ };
+
+ let generic_kernel_params = func.sig.generics.params.clone();
+ let (generic_start_token, generic_close_token) =
+ (func.sig.generics.lt_token, func.sig.generics.gt_token);
+
+ let generic_trait_params = generic_kernel_params
+ .iter()
+ .filter(|generic_param| !matches!(generic_param, syn::GenericParam::Lifetime(_)))
+ .cloned()
+ .collect();
+
+ let decl_generics = DeclGenerics {
+ generic_start_token: &generic_start_token,
+ generic_close_token: &generic_close_token,
+ generic_kernel_params: &generic_kernel_params,
+ };
+ let trait_generics = syn::Generics {
+ lt_token: generic_start_token,
+ params: generic_trait_params,
+ gt_token: generic_close_token,
+ where_clause: None,
+ };
+ let (impl_generics, ty_generics, _where_clause) = trait_generics.split_for_impl();
+ let impl_generics = ImplGenerics {
+ impl_generics,
+ ty_generics,
+ };
+
+ let func_ident = FuncIdent {
+ func_ident: &func.sig.ident,
+ func_ident_hash: quote::format_ident!("{}_{:016x}", &func.sig.ident, kernel_hash),
+ };
+
+ let func_params = func_inputs
+ .func_inputs
+ .iter()
+ .enumerate()
+ .map(|(i, syn::PatType { pat, .. })| match ident_from_pat(pat) {
+ Some(ident) => ident,
+ None => syn::Ident::new(&format!("{}_arg_{i}", func_ident.func_ident), pat.span()),
+ })
+ .collect::<Vec<_>>();
+
+ let pat_func_inputs = func_inputs
+ .func_inputs
+ .iter_mut()
+ .zip(&func_params)
+ .map(|(arg, ident)| {
+ let syn::PatType {
+ attrs,
+ colon_token,
+ ty,
+ ..
+ } = arg; + + let ident_fn_arg = syn::PatType { + attrs: attrs.clone(), + pat: Box::new(syn::Pat::Ident(syn::PatIdent { + attrs: Vec::new(), + by_ref: None, + mutability: None, + ident: ident.clone(), + subpat: None, + })), + colon_token: *colon_token, + ty: ty.clone(), + }; + + std::mem::replace(arg, ident_fn_arg) + }) + .collect(); + + let host_kernel_ty = quote_host_kernel_ty( + &crate_path, + &decl_generics, + &impl_generics, + &func_inputs, + &func_ident, + &func_params, + &func.attrs, + ); + let host_generic_kernel_check = quote_generic_check(&crate_path, &func_ident); + let host_link_macro = quote_host_link_macro( + &crate_path, + &config, + &decl_generics, + &impl_generics, + &func_inputs, + &func_ident, + &func_params, + &ptx_lint_levels, + ); + let cuda_wrapper = quote_cuda_wrapper( + &crate_path, + &func_inputs, + &func_ident, + &impl_generics, + &func.attrs, + &func_params, + ); + let cuda_generic_function = quote_cuda_generic_function( + &crate_path, + &decl_generics, + &pat_func_inputs, + &func_ident, + &func.attrs, + &func.block, + ); + + (quote! { + #host_kernel_ty + + #host_generic_kernel_check + + #host_link_macro + + #cuda_wrapper + #cuda_generic_function + }) + .into() +} + +struct FunctionInputs { + func_inputs: syn::punctuated::Punctuated, +} + +#[allow(clippy::struct_field_names)] +struct DeclGenerics<'f> { + generic_start_token: &'f Option, + generic_close_token: &'f Option, + generic_kernel_params: &'f syn::punctuated::Punctuated, +} + +struct ImplGenerics<'f> { + #[allow(clippy::struct_field_names)] + impl_generics: syn::ImplGenerics<'f>, + ty_generics: syn::TypeGenerics<'f>, +} + +#[allow(clippy::struct_field_names)] +struct FuncIdent<'f> { + func_ident: &'f syn::Ident, + func_ident_hash: syn::Ident, +} + +fn ident_from_pat(pat: &syn::Pat) -> Option { + match pat { + syn::Pat::Lit(_) + | syn::Pat::Macro(_) + | syn::Pat::Path(_) + | syn::Pat::Range(_) + | syn::Pat::Rest(_) + | syn::Pat::Verbatim(_) + | syn::Pat::Wild(_) => None, + syn::Pat::Ident(syn::PatIdent { ident, .. }) => Some(ident.clone()), + syn::Pat::Box(syn::PatBox { pat, .. }) + | syn::Pat::Reference(syn::PatReference { pat, .. }) + | syn::Pat::Type(syn::PatType { pat, .. }) => ident_from_pat(pat), + syn::Pat::Or(syn::PatOr { cases, .. }) => ident_from_pat_iter(cases.iter()), + syn::Pat::Slice(syn::PatSlice { elems, .. }) + | syn::Pat::TupleStruct(syn::PatTupleStruct { + pat: syn::PatTuple { elems, .. }, + .. + }) + | syn::Pat::Tuple(syn::PatTuple { elems, .. }) => ident_from_pat_iter(elems.iter()), + syn::Pat::Struct(syn::PatStruct { fields, .. }) => { + ident_from_pat_iter(fields.iter().map(|field| &*field.pat)) + }, + _ => Err(()).ok(), + } +} + +fn ident_from_pat_iter<'p, I: Iterator>(iter: I) -> Option { + iter.filter_map(ident_from_pat) + .fold(None, |acc: Option<(String, Span)>, ident| { + if let Some((mut str_acc, span_acc)) = acc { + str_acc.push('_'); + str_acc.push_str(ident.to_string().trim_matches('_')); + + Some((str_acc, span_acc.join(ident.span()).unwrap())) + } else { + Some((ident.to_string(), ident.span())) + } + }) + .map(|(string, span)| syn::Ident::new(&string, span)) +} + +fn quote_generic_check( + crate_path: &syn::Path, + FuncIdent { + func_ident, + func_ident_hash, + .. 
+ }: &FuncIdent, +) -> proc_macro2::TokenStream { + let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { + Ok(crate_name) => crate_name.to_uppercase(), + Err(err) => abort_call_site!("Failed to read crate name: {:?}.", err), + }; + + let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") + .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); + + quote::quote_spanned! { func_ident_hash.span()=> + #[cfg(not(target_os = "cuda"))] + #crate_path::kernel::check_kernel! { + #func_ident #func_ident_hash #crate_name #crate_manifest_dir + } + } +} diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-kernel/src/kernel/wrapper/parse.rs similarity index 66% rename from rust-cuda-derive/src/kernel/wrapper/parse.rs rename to rust-cuda-kernel/src/kernel/wrapper/parse.rs index 936143cf2..8d1662772 100644 --- a/rust-cuda-derive/src/kernel/wrapper/parse.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/parse.rs @@ -23,7 +23,7 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { if func.sig.asyncness.is_some() { abort!( func.sig.asyncness.span(), - "Kernel function must not (yet) be async." + "Kernel function must not be async." ); } @@ -41,6 +41,20 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { ); } + for param in &func.sig.inputs { + if let syn::FnArg::Receiver(receiver) = param { + abort!(receiver.span(), "Kernel function must not have a receiver."); + } + } + + if func.sig.inputs.len() > 12 { + emit_warning!( + func.sig.inputs.span(), + "Kernel function has too many arguments, {} were found but at most 12 are supported.", + func.sig.inputs.len() + ); + } + match &func.sig.output { syn::ReturnType::Default => (), syn::ReturnType::Type(_, box syn::Type::Tuple(tuple)) if tuple.elems.is_empty() => (), @@ -50,5 +64,12 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { ), }; + if let Some(r#where) = &func.sig.generics.where_clause { + abort!( + r#where.span(), + "Kernel function must not have a where clause, use type generic bounds instead." + ); + } + func } diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs new file mode 100644 index 000000000..e6d5cf3ac --- /dev/null +++ b/rust-cuda-kernel/src/lib.rs @@ -0,0 +1,207 @@ +//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License +//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod +//! Ready-to-Code]][gitpod] +//! +//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main +//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +//! +//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +//! [repo]: https://github.com/juntyr/rust-cuda +//! +//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue +//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda_kernel/ +//! +//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield +//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield +//! +//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx +//! [codecov]: https://codecov.io/gh/juntyr/rust-cuda +//! +//! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod +//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda +//! 
+//! `rust-cuda-kernel` provides the [`#[kernel]`](macro@kernel) attribute +//! macro. When applied to a function, it compiles it as a CUDA kernel that +//! can be *safely* called from Rust code on the host. + +#![deny(clippy::complexity)] +#![deny(clippy::correctness)] +#![warn(clippy::nursery)] +#![warn(clippy::pedantic)] +#![deny(clippy::perf)] +#![deny(clippy::style)] +#![deny(clippy::suspicious)] +#![deny(unsafe_code)] +#![warn(missing_docs)] +#![feature(box_patterns)] +#![feature(proc_macro_tracked_env)] +#![feature(proc_macro_span)] +#![feature(let_chains)] +#![feature(map_try_insert)] +#![feature(proc_macro_def_site)] +#![feature(proc_macro_c_str_literals)] +#![feature(cfg_version)] +#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] + +extern crate proc_macro; + +#[macro_use] +extern crate proc_macro_error; + +use proc_macro::TokenStream; + +mod kernel; + +#[proc_macro_error] +#[proc_macro_attribute] +/// Provides the [`#[kernel]`](macro@kernel) attribute macro. When applied to a +/// function, it compiles it as a CUDA kernel that can be *safely* called from +/// Rust code on the host. +/// +/// The annotated function must be public, not const, not async, not have an +/// explicit ABI, not be variadic, not have a receiver (e.g. `&self`), and +/// return the unit type `()`. At the moment, the kernel function must also +/// not use a where clause – use type generic bounds instead. +/// +/// While the [`#[kernel]`](macro@kernel) attribute supports functions with any +/// number of arguments, [`rust_cuda::kernel::TypedPtxKernel`] only supports +/// launching kernels with up to 12 parameters at the moment. +/// +/// The [`#[kernel]`](macro@kernel) attribute uses the following syntax: +/// +/// ```rust,ignore +/// #[kernel(pub? use link! for impl)] +/// fn my_kernel(/* parameters */) { +/// /* kernel code */ +/// } +/// ``` +/// +/// where `link` is the name of a macro that will be generated to manually link +/// specific monomorphised instantiations of the (optionally generic) kernel +/// function, and the optional `pub` controls whether this macro is public or +/// private. +/// +/// Note that all kernel parameters must implement the sealed +/// [`rust_cuda::kernel::CudaKernelParameter`] trait. +/// +/// To use a specific monomorphised instantiation of the kernel, the generated +/// `link!` macro must be invoked with the following syntax: +/// +/// ```rust,ignore +/// struct KernelPtx; +/// link! { impl my_kernel for KernelPtx } +/// ``` +/// for the non-generic kernel function `my_kernel` and a non-generic marker +/// type `KernelPtx`, which can be used as the generic `Kernel` type parameter +/// for [`rust_cuda::kernel::TypedPtxKernel`] to instantiate and launch the +/// kernel. Specifically, the [`rust_cuda::kernel::CompiledKernelPtx`] trait is +/// implemented for the `KernelPtx` type. +/// +/// If the kernel function is generic, the following syntax is used instead: +/// ```rust,ignore +/// #[kernel(pub? use link! for impl)] +/// fn my_kernel<'a, A, B: Bounded, const N: usize>(/* parameters */) { +/// /* kernel code */ +/// } +/// +/// struct KernelPtx<'a, A, B: Bounded, const N: usize>(/* ... */); +/// link! { impl my_kernel<'a, u32, MyStruct, 42> for KernelPtx } +/// link! { impl my_kernel<'a, bool, MyOtherStruct, 24> for KernelPtx } +/// ``` +/// +/// If the kernel generic space is closed, the `link!` macro can be made +/// private and all instantiations must be requested in the same crate that +/// defines the kernel function. 
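+///
+/// For example (an illustrative sketch, not taken from this crate; the names
+/// `histogram` and `HistogramPtx` are placeholders), a closed set of
+/// instantiations can be linked privately right next to the kernel:
+///
+/// ```rust,ignore
+/// #[kernel(use link! for impl)]
+/// fn histogram<const BINS: usize>(/* parameters */) {
+///     /* kernel code */
+/// }
+///
+/// struct HistogramPtx<const BINS: usize>;
+/// link! { impl histogram<64> for HistogramPtx }
+/// link! { impl histogram<256> for HistogramPtx }
+/// ```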
+/// If downstream code should be allowed to use and compile new specific
+/// monomorphised instantiations of the kernel, the `link!` macro should be
+/// publicly exported. Then, downstream code can define its own `MyKernelPtx`
+/// marker types for which the kernel is linked and which can be passed to
+/// [`rust_cuda::kernel::CompiledKernelPtx`]-generic code in the
+/// kernel-defining crate to construct the requested
+/// [`rust_cuda::kernel::TypedPtxKernel`].
+///
+/// Inside the scope of the [`#[kernel]`](macro@kernel) attribute, a helper
+/// `#[kernel(...)]` attribute can be applied to the kernel function:
+///
+/// - `#[kernel(crate = "<crate-path>")]` changes the path to the [`rust-cuda`]
+///   crate that the kernel compilation uses, which by default is `rust_cuda`.
+/// - `#[kernel(allow/warn/deny/forbid(<lint>))]` checks the specified
+///   CUDA-specific lint for each kernel compilation, using default Rust
+///   semantics for allowing, warning on, denying, or forbidding a lint. The
+///   following lints are supported:
+///   - `ptx::double_precision_use`: check for any uses of [`f64`] operations
+///     inside the compiled PTX binary, as they are often significantly less
+///     performant on NVIDIA GPUs than [`f32`] operations. By default,
+///     `#[kernel(warn(ptx::double_precision_use))]` is set.
+///   - `ptx::local_memory_use`: check for any usage of local memory, which
+///     may slow down kernel execution. By default,
+///     `#[kernel(warn(ptx::local_memory_use))]` is set.
+///   - `ptx::register_spills`: check for any spills of registers to local
+///     memory. While using fewer registers can allow more kernels to be run
+///     in parallel, register spills may also point to missed optimisations.
+///     By default, `#[kernel(warn(ptx::register_spills))]` is set.
+///   - `ptx::dynamic_stack_size`: check if the PTX compiler is unable to
+///     statically determine the size of the required kernel function stack.
+///     When the static stack size is known, the compiler may be able to keep
+///     it entirely within the fast register file. However, when the stack
+///     size is dynamic, more costly memory load and store operations are
+///     needed. By default, `#[kernel(warn(ptx::dynamic_stack_size))]` is set.
+///   - `ptx::verbose`: utility lint to output verbose PTX compiler messages
+///     as warnings (`warn`) or errors (`deny` or `forbid`) or to not output
+///     them (`allow`). By default, `#[kernel(allow(ptx::verbose))]` is set.
+///   - `ptx::dump_assembly`: utility lint to output the compiled PTX assembly
+///     code as a warning (`warn`) or an error (`deny` or `forbid`) or to not
+///     output it (`allow`). By default, `#[kernel(allow(ptx::dump_assembly))]`
+///     is set.
+///
+/// [`rust_cuda::kernel::TypedPtxKernel`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/struct.TypedPtxKernel.html
+/// [`rust_cuda::kernel::CudaKernelParameter`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/trait.CudaKernelParameter.html
+/// [`rust_cuda::kernel::CompiledKernelPtx`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/trait.CompiledKernelPtx.html
+/// [`rust-cuda`]: https://juntyr.github.io/rust-cuda/rust_cuda
+pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream {
+ kernel::wrapper::kernel(attr, func)
+}
+
+#[doc(hidden)]
+#[proc_macro_error]
+#[proc_macro]
+/// Helper macro to specialise the generic kernel param types when compiling
+/// the specialised kernel for CUDA.
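+///
+/// As an illustrative sketch only (the kernel name `my_kernel` and the
+/// parameter type are hypothetical): if the specialisation environment
+/// variable for `my_kernel` holds `<'a, u32, 4>`, then an invocation like
+///
+/// ```rust,ignore
+/// specialise_kernel_param_type!(&'a [T; N] for <'a, T, const N: usize> in my_kernel)
+/// ```
+///
+/// resolves to `&'static [u32; 4]`: all lifetimes are folded to `'static`,
+/// and the type and const generic parameters are substituted with the
+/// matching specialisation arguments.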
+pub fn specialise_kernel_param_type(tokens: TokenStream) -> TokenStream { + kernel::specialise::param_type::specialise_kernel_param_type(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +/// Helper macro to specialise the CUDA kernel entry point name, used on the +/// host for linking to it. +pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { + kernel::specialise::entry_point::specialise_kernel_entry_point(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro_attribute] +/// Helper macro to specialise the name of the CUDA kernel function item, used +/// to give each specialised version a unique ident when compiling for CUDA. +pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { + kernel::specialise::function::specialise_kernel_function(attr, func) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +/// Helper macro to cheaply check the generic CUDA kernel, used on the host to +/// provide code error feedback even when no specialised kernel is linked. +pub fn check_kernel(tokens: TokenStream) -> TokenStream { + kernel::link::check_kernel(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +/// Helper macro to compile a specialised CUDA kernel and produce its PTX +/// assembly code, which is used on the host when linking specialised kernels. +pub fn compile_kernel(tokens: TokenStream) -> TokenStream { + kernel::link::compile_kernel(tokens) +} diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml deleted file mode 100644 index f2a4cd09a..000000000 --- a/rust-cuda-ptx-jit/Cargo.toml +++ /dev/null @@ -1,17 +0,0 @@ -[package] -name = "rust-cuda-ptx-jit" -version = "0.1.0" -authors = ["Juniper Tyree "] -license = "MIT OR Apache-2.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[features] -default = [] -host = ["regex", "rustacuda", "lazy_static"] - -[dependencies] -rustacuda = { version = "0.1.3", optional = true } -regex = { version = "1.5", optional = true } -lazy_static = { version = "1.4", optional = true } diff --git a/rust-cuda-ptx-jit/src/device.rs b/rust-cuda-ptx-jit/src/device.rs deleted file mode 100644 index 533021b90..000000000 --- a/rust-cuda-ptx-jit/src/device.rs +++ /dev/null @@ -1,13 +0,0 @@ -#[macro_export] -#[doc(hidden)] -#[doc(cfg(not(feature = "host")))] -macro_rules! PtxJITConstLoad { - ([$index:literal] => $reference:expr) => { - unsafe { - ::core::arch::asm!( - concat!("// //"), - in(reg32) *($reference as *const _ as *const u32), - ) - } - }; -} diff --git a/rust-cuda-ptx-jit/src/host/arguments.rs b/rust-cuda-ptx-jit/src/host/arguments.rs deleted file mode 100644 index 0a67d42ea..000000000 --- a/rust-cuda-ptx-jit/src/host/arguments.rs +++ /dev/null @@ -1,48 +0,0 @@ -#[macro_export] -#[doc(hidden)] -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] -macro_rules! compilePtxJITwithArguments { - // Invocation without arguments fast track - ($compiler:ident ()) => { - $crate::compilePtxJITwithArguments!($compiler.with_arguments ()) - }; - // Invocation without arguments fast track - ($compiler:ident $(. $path:ident)+ ()) => { - $compiler$(.$path)+(None) - }; - // Invocation with arguments is forwarded to incremental muncher - ($compiler:ident ( $($args:tt)* )) => { - $crate::compilePtxJITwithArguments!($compiler.with_arguments ( $($args)* )) - }; - // Invocation with arguments is forwarded to incremental muncher - ($compiler:ident $(. 
$path:ident)+ ( $($args:tt)* )) => { - $crate::compilePtxJITwithArguments!(@munch None $compiler$(.$path)+ => [, $($args)*] =>) - }; - // Muncher base case: no `ConstLoad[$expr]` arguments - (@munch None $compiler:ident $(. $path:ident)+ => [] => $($rubbish:expr),*) => { - $compiler$(.$path)+(None) - }; - // Muncher base case: at least one `ConstLoad[$expr]` argument - (@munch Some $compiler:ident $(. $path:ident)+ => [] => $($exprs:expr),*) => { - $compiler$(.$path)+(Some(&[$($exprs),*])) - }; - // Muncher helper case: first `ConstLoad[$expr]` argument is recognised (redirect) - (@munch None $compiler:ident $(. $path:ident)+ => [, ConstLoad [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [, ConstLoad [ $head ] $($tail)*] => $($exprs),*) - }; - // Muncher recursive case: much one `Ignore[$expr]` argument (no `ConstLoad[$expr]`s so far) - (@munch None $compiler:ident $(. $path:ident)+ => [, Ignore [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch None $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* None) - }; - // Muncher recursive case: much one `Ignore[$expr]` argument (some `ConstLoad[$expr]`s already) - (@munch Some $compiler:ident $(. $path:ident)+ => [, Ignore [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* None) - }; - // Muncher recursive case: much one `ConstLoad[$expr]` (some `ConstLoad[$expr]`s already) - (@munch Some $compiler:ident $(. $path:ident)+ => [, ConstLoad [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* Some(unsafe { - ::std::slice::from_raw_parts($head as *const _ as *const u8, ::std::mem::size_of_val($head)) - })) - }; -} diff --git a/rust-cuda-ptx-jit/src/host/compiler/regex.rs b/rust-cuda-ptx-jit/src/host/compiler/regex.rs deleted file mode 100644 index 5cff3bdc9..000000000 --- a/rust-cuda-ptx-jit/src/host/compiler/regex.rs +++ /dev/null @@ -1,46 +0,0 @@ -#[allow(unused_imports)] -use regex::bytes::Regex; - -lazy_static::lazy_static! { - pub static ref CONST_MARKER_REGEX: Regex = { - Regex::new( - r"(?-u)// %r\d+)-(?P\d+)> //" - ).unwrap() - }; - - pub static ref CONST_BASE_REGISTER_REGEX: Regex = { - Regex::new( - r"(?-u)ld\.global\.u32\s*(?P%r\d+)\s*,\s*\[(?P%r[ds]?\d+)]\s*;", - ).unwrap() - }; - - pub static ref CONST_LOAD_INSTRUCTION_REGEX: Regex = { - Regex::new( - r"(?x-u)(?P - ld\.global - (?:\.(?Pv[24]))? - \. - (?P[suf]) - (?P8|16|32|64) - \s* - (?P - (?:%[rf][sd]?\d+) | - (?:\{(?:\s*%[rf][sd]?\d+,)*\s*%[rf][sd]?\d+\s*\}) - ) - ,\s* - \[ - (?P%r[ds]?\d+) - (?: - \+ - (?P\d+) - )? 
- \] - \s*; - )", - ).unwrap() - }; - - pub static ref REGISTER_REGEX: Regex = { - Regex::new(r"(?-u)(?P%[rf][sd]?\d+)").unwrap() - }; -} diff --git a/rust-cuda-ptx-jit/src/host/kernel.rs b/rust-cuda-ptx-jit/src/host/kernel.rs deleted file mode 100644 index 02baabfcf..000000000 --- a/rust-cuda-ptx-jit/src/host/kernel.rs +++ /dev/null @@ -1,58 +0,0 @@ -use std::{ffi::CStr, mem::ManuallyDrop}; - -use rustacuda::{error::CudaResult, function::Function, module::Module}; - -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] -pub struct CudaKernel { - module: ManuallyDrop>, - function: ManuallyDrop>, -} - -impl CudaKernel { - /// # Errors - /// - /// Returns a `CudaError` if `ptx` is not a valid PTX source, or it does - /// not contain an entry point named `entry_point`. - pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { - let module = Box::new(Module::load_from_string(ptx)?); - - let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); - - let function = match function { - Ok(function) => function, - Err(err) => { - if let Err((_err, module)) = Module::drop(*module) { - std::mem::forget(module); - } - - return Err(err); - }, - }; - - Ok(Self { - function: ManuallyDrop::new(function), - module: ManuallyDrop::new(module), - }) - } - - #[must_use] - pub fn get_function(&self) -> &Function { - &self.function - } -} - -impl Drop for CudaKernel { - fn drop(&mut self) { - { - // Ensure that self.function is dropped before self.module as - // it borrows data from the module and must not outlive it - let _function = unsafe { ManuallyDrop::take(&mut self.function) }; - } - - if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) - { - std::mem::forget(module); - } - } -} diff --git a/rust-cuda-ptx-jit/src/host/mod.rs b/rust-cuda-ptx-jit/src/host/mod.rs deleted file mode 100644 index d0d9ffb53..000000000 --- a/rust-cuda-ptx-jit/src/host/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -pub mod compiler; -pub mod kernel; - -mod arguments; diff --git a/rust-cuda-ptx-jit/src/lib.rs b/rust-cuda-ptx-jit/src/lib.rs deleted file mode 100644 index ae6080a3e..000000000 --- a/rust-cuda-ptx-jit/src/lib.rs +++ /dev/null @@ -1,14 +0,0 @@ -#![deny(clippy::pedantic)] -#![cfg_attr(not(feature = "host"), no_std)] -#![feature(doc_cfg)] -#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] - -#[cfg(feature = "host")] -mod host; - -#[cfg(feature = "host")] -pub use host::{compiler::PtxJITCompiler, compiler::PtxJITResult, kernel::CudaKernel}; - -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] -mod device; diff --git a/rust-toolchain b/rust-toolchain index 512b40786..7734bcf14 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1,5 +1,4 @@ [toolchain] -# Pin version pin until const traits are back -channel = "nightly-2023-11-10" +channel = "nightly" components = [ "cargo", "rustfmt", "clippy" ] targets = [ "x86_64-unknown-linux-gnu", "nvptx64-nvidia-cuda" ] diff --git a/src/alloc.rs b/src/alloc.rs new file mode 100644 index 000000000..80d0ee840 --- /dev/null +++ b/src/alloc.rs @@ -0,0 +1,67 @@ +#![allow(clippy::module_name_repetitions)] + +pub trait EmptyCudaAlloc: From + Into + sealed::empty::Sealed {} + +pub trait CudaAlloc: sealed::alloc::Sealed {} + +impl CudaAlloc for Option {} +impl sealed::alloc::Sealed for Option {} + +pub struct NoCudaAlloc; +impl CudaAlloc for NoCudaAlloc {} +impl sealed::alloc::Sealed for NoCudaAlloc {} +impl EmptyCudaAlloc for NoCudaAlloc {} +impl sealed::empty::Sealed for 
NoCudaAlloc {} + +pub struct SomeCudaAlloc(()); +impl CudaAlloc for SomeCudaAlloc {} +impl sealed::alloc::Sealed for SomeCudaAlloc {} +impl !EmptyCudaAlloc for SomeCudaAlloc {} +impl !sealed::empty::Sealed for SomeCudaAlloc {} + +pub struct CombinedCudaAlloc(A, B); +impl CudaAlloc for CombinedCudaAlloc {} +impl sealed::alloc::Sealed for CombinedCudaAlloc {} +impl EmptyCudaAlloc + for CombinedCudaAlloc +{ +} +impl sealed::empty::Sealed + for CombinedCudaAlloc +{ +} +impl From + for CombinedCudaAlloc +{ + fn from(_: NoCudaAlloc) -> Self { + Self(A::from(NoCudaAlloc), B::from(NoCudaAlloc)) + } +} +impl From> + for NoCudaAlloc +{ + fn from(val: CombinedCudaAlloc) -> Self { + let _: (Self, Self) = (val.0.into(), val.1.into()); + Self + } +} +impl CombinedCudaAlloc { + #[must_use] + pub const fn new(front: A, tail: B) -> Self { + Self(front, tail) + } + + pub fn split(self) -> (A, B) { + (self.0, self.1) + } +} + +pub(crate) mod sealed { + pub(super) mod empty { + pub trait Sealed {} + } + + pub mod alloc { + pub trait Sealed {} + } +} diff --git a/src/common.rs b/src/common.rs deleted file mode 100644 index b2d398e09..000000000 --- a/src/common.rs +++ /dev/null @@ -1,186 +0,0 @@ -#[cfg(any(not(feature = "host"), doc))] -use core::convert::{AsMut, AsRef}; -use core::marker::PhantomData; - -#[cfg(feature = "host")] -use alloc::fmt; -#[cfg(not(feature = "host"))] -use core::ops::{Deref, DerefMut}; -#[cfg(feature = "host")] -use core::{mem::MaybeUninit, ptr::copy_nonoverlapping}; - -use const_type_layout::TypeGraphLayout; -use rustacuda_core::DeviceCopy; - -#[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::LendRustToCuda; - -#[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::kernel; - -#[cfg(feature = "host")] -use crate::{safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; - -#[repr(transparent)] -#[cfg_attr(not(feature = "host"), derive(Debug))] -#[derive(TypeLayout)] -pub struct DeviceAccessible(T); - -unsafe impl DeviceCopy for DeviceAccessible {} - -#[cfg(feature = "host")] -impl From for DeviceAccessible { - fn from(value: T) -> Self { - Self(value) - } -} - -#[cfg(feature = "host")] -impl From<&T> for DeviceAccessible> { - fn from(value: &T) -> Self { - let value = unsafe { - let mut uninit = MaybeUninit::uninit(); - copy_nonoverlapping(value, uninit.as_mut_ptr(), 1); - uninit.assume_init() - }; - - Self(SafeDeviceCopyWrapper::from(value)) - } -} - -#[cfg(feature = "host")] -impl fmt::Debug for DeviceAccessible { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - fmt.debug_struct(stringify!(DeviceAccessible)) - .finish_non_exhaustive() - } -} - -#[cfg(not(feature = "host"))] -impl Deref for DeviceAccessible { - type Target = T; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -#[cfg(not(feature = "host"))] -impl DerefMut for DeviceAccessible { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -/// # Safety -/// -/// This is an internal trait and should ONLY be derived automatically using -/// `#[derive(LendRustToCuda)]` -pub unsafe trait RustToCuda { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation: crate::host::CudaAlloc; - type CudaRepresentation: CudaAsRust + TypeGraphLayout; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually 
- /// The returned `Self::CudaRepresentation` must NEVER be accessed on the - /// CPU as it contains a GPU-resident copy of `self`. - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::host::CombinedCudaAlloc, - )>; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - #[allow(clippy::type_complexity)] - unsafe fn restore( - &mut self, - alloc: crate::host::CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult; -} - -/// # Safety -/// -/// This is an internal trait and should NEVER be implemented manually -pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { - type RustRepresentation: RustToCuda; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation; -} - -pub trait RustToCudaProxy: RustToCuda { - fn from_ref(val: &T) -> &Self; - fn from_mut(val: &mut T) -> &mut Self; - - fn into(self) -> T; -} - -#[repr(transparent)] -#[derive(Clone, Copy, TypeLayout)] -pub struct DeviceConstRef<'r, T: DeviceCopy + 'r> { - #[cfg_attr(feature = "host", allow(dead_code))] - pub(super) pointer: *const T, - pub(super) reference: PhantomData<&'r T>, -} - -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceConstRef<'r, T> {} - -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] -impl<'r, T: DeviceCopy> AsRef for DeviceConstRef<'r, T> { - fn as_ref(&self) -> &T { - unsafe { &*self.pointer } - } -} - -#[repr(transparent)] -#[derive(TypeLayout)] -pub struct DeviceMutRef<'r, T: DeviceCopy + 'r> { - #[cfg_attr(feature = "host", allow(dead_code))] - pub(super) pointer: *mut T, - pub(super) reference: PhantomData<&'r mut T>, -} - -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceMutRef<'r, T> {} - -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] -impl<'r, T: DeviceCopy> AsRef for DeviceMutRef<'r, T> { - fn as_ref(&self) -> &T { - unsafe { &*self.pointer } - } -} - -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] -impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { - fn as_mut(&mut self) -> &mut T { - unsafe { &mut *self.pointer } - } -} diff --git a/src/deps.rs b/src/deps.rs new file mode 100644 index 000000000..50fd38f3f --- /dev/null +++ b/src/deps.rs @@ -0,0 +1,12 @@ +#[doc(hidden)] +pub extern crate alloc; + +pub extern crate const_type_layout; + +#[cfg(feature = "host")] +pub extern crate owning_ref; + +#[cfg(feature = "host")] +pub extern crate rustacuda; + +pub extern crate rustacuda_core; diff --git a/src/device/alloc.rs b/src/device/alloc.rs new file mode 100644 index 000000000..bca59a1eb --- /dev/null +++ b/src/device/alloc.rs @@ -0,0 +1,21 @@ +#[cfg(all(feature = "device", not(doc)))] +use core::arch::nvptx; + +use crate::deps::alloc::alloc::{GlobalAlloc, Layout}; + +/// Memory allocator using CUDA malloc/free +pub struct PTXAllocator; + +unsafe impl GlobalAlloc for PTXAllocator { + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + nvptx::malloc(layout.size()).cast() + } + + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn dealloc(&self, ptr: *mut u8, _layout: 
Layout) { + nvptx::free(ptr.cast()); + } +} diff --git a/src/device/mod.rs b/src/device/mod.rs index 225bc8252..df20ae5a8 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,115 +1,7 @@ -use core::{ - mem::ManuallyDrop, - ops::{Deref, DerefMut}, -}; - -#[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::{specialise_kernel_entry, specialise_kernel_type}; - -use crate::{ - common::{CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceMutRef, RustToCuda}, - safety::SafeDeviceCopy, -}; +#[doc(hidden)] +#[cfg(feature = "kernel")] +pub use rust_cuda_kernel::{specialise_kernel_function, specialise_kernel_param_type}; +pub mod alloc; +pub mod thread; pub mod utils; - -pub trait BorrowFromRust: RustToCuda { - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr` is the - /// `DeviceConstRef` borrowed on the CPU using the corresponding - /// `LendToCuda::lend_to_cuda`. - unsafe fn with_borrow_from_rust) -> O>( - cuda_repr: DeviceConstRef::CudaRepresentation>>, - inner: F, - ) -> O; - - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr_mut` is the - /// `DeviceMutRef` borrowed on the CPU using the corresponding - /// `LendToCuda::lend_to_cuda_mut`. - /// Furthermore, since different GPU threads can access heap storage - /// mutably inside the safe `inner` scope, there must not be any - /// aliasing between concurrently running threads. - unsafe fn with_borrow_from_rust_mut) -> O>( - cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O; - - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr` is the - /// `DeviceMutRef` borrowed on the CPU using the corresponding - /// `LendToCuda::move_to_cuda`. - unsafe fn with_moved_from_rust O>( - cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O - where - Self: Sized + SafeDeviceCopy, - ::CudaRepresentation: SafeDeviceCopy; -} - -impl BorrowFromRust for T { - #[inline] - unsafe fn with_borrow_from_rust) -> O>( - cuda_repr: DeviceConstRef::CudaRepresentation>>, - inner: F, - ) -> O { - // rust_repr must never be dropped as we do NOT own any of the - // heap memory it might reference - let rust_repr = ShallowCopy::new(CudaAsRust::as_rust(cuda_repr.as_ref())); - - inner(&rust_repr) - } - - #[inline] - unsafe fn with_borrow_from_rust_mut) -> O>( - mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O { - // rust_repr must never be dropped as we do NOT own any of the - // heap memory it might reference - let mut rust_repr_mut = ShallowCopy::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut())); - - inner(&mut rust_repr_mut) - } - - #[inline] - unsafe fn with_moved_from_rust O>( - mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O - where - Self: Sized + SafeDeviceCopy, - ::CudaRepresentation: SafeDeviceCopy, - { - inner(CudaAsRust::as_rust(cuda_repr_mut.as_mut())) - } -} - -#[repr(transparent)] -#[derive(Debug)] -pub struct ShallowCopy(ManuallyDrop); - -impl ShallowCopy { - fn new(value: T) -> Self { - Self(ManuallyDrop::new(value)) - } -} - -impl Deref for ShallowCopy { - type Target = T; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl DerefMut for ShallowCopy { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} diff --git a/src/device/thread.rs b/src/device/thread.rs new file mode 100644 index 000000000..bb5599cda --- /dev/null +++ b/src/device/thread.rs @@ -0,0 +1,155 @@ +#[cfg(all(feature = "device", not(doc)))] +use core::arch::nvptx; + 
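The new `src/device/alloc.rs` above wraps CUDA's device-side `malloc`/`free` in a `GlobalAlloc`; a minimal sketch of how a `no_std` kernel crate targeting `nvptx64-nvidia-cuda` might register it (the surrounding crate setup is assumed, not part of this diff):

```rust
#![no_std]

extern crate alloc;

use rust_cuda::device::alloc::PTXAllocator;

// Route every heap allocation made inside the kernel through CUDA's
// device-side malloc/free.
#[global_allocator]
static ALLOCATOR: PTXAllocator = PTXAllocator;

// With the allocator registered, `alloc` types become usable in kernels.
fn scratch(len: usize) -> alloc::vec::Vec<u8> {
    alloc::vec![0u8; len]
}
```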
+#[allow(clippy::module_name_repetitions)] +pub struct Thread { + _private: (), +} + +#[allow(clippy::module_name_repetitions)] +pub struct ThreadBlock { + _private: (), +} + +#[allow(clippy::module_name_repetitions)] +pub struct ThreadBlockGrid { + _private: (), +} + +impl Thread { + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub const fn this() -> Self { + Self { _private: () } + } + + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn index(&self) -> usize { + let block = self.block(); + let grid = block.grid(); + + let block_id = block.idx().as_id(&grid.dim()); + let thread_id = self.idx().as_id(&block.dim()); + + block_id * block.dim().size() + thread_id + } + + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn idx(&self) -> Idx3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Idx3 { + x: nvptx::_thread_idx_x() as u32, + y: nvptx::_thread_idx_y() as u32, + z: nvptx::_thread_idx_z() as u32, + } + } + } + + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub const fn block(&self) -> ThreadBlock { + ThreadBlock { _private: () } + } +} + +impl ThreadBlock { + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn dim(&self) -> Dim3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Dim3 { + x: nvptx::_block_dim_x() as u32, + y: nvptx::_block_dim_y() as u32, + z: nvptx::_block_dim_z() as u32, + } + } + } + + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn idx(&self) -> Idx3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Idx3 { + x: nvptx::_block_idx_x() as u32, + y: nvptx::_block_idx_y() as u32, + z: nvptx::_block_idx_z() as u32, + } + } + } + + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub const fn grid(&self) -> ThreadBlockGrid { + ThreadBlockGrid { _private: () } + } + + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn synchronize(&self) { + unsafe { nvptx::_syncthreads() } + } +} + +impl ThreadBlockGrid { + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub fn dim(&self) -> Dim3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Dim3 { + x: nvptx::_grid_dim_x() as u32, + y: nvptx::_grid_dim_y() as u32, + z: nvptx::_grid_dim_z() as u32, + } + } + } +} + +/// Dimension specified in kernel launching +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct Dim3 { + pub x: u32, + pub y: u32, + pub z: u32, +} + +/// Indices that the kernel code is running on +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct Idx3 { + pub x: u32, + pub y: u32, + pub z: u32, +} + +impl Dim3 { + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub const fn size(&self) -> usize { + (self.x as usize) * (self.y as usize) * (self.z as usize) + } +} + +impl Idx3 { + #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] + pub const fn as_id(&self, dim: &Dim3) -> usize { + (self.x as usize) + + (self.y as usize) * (dim.x as usize) + + (self.z as usize) * (dim.x as usize) * (dim.y as usize) + } +} diff --git a/src/device/utils.rs b/src/device/utils.rs index a45ff9c71..8447c5235 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -1,220 +1,214 @@ -use alloc::alloc::{GlobalAlloc, Layout}; -#[cfg(target_os = "cuda")] -use core::arch::nvptx; - -/// Memory allocator using CUDA malloc/free -pub struct PTXAllocator; - -unsafe impl GlobalAlloc for PTXAllocator { - unsafe fn alloc(&self, layout: Layout) -> *mut u8 { - nvptx::malloc(layout.size()).cast() - } - - unsafe 
fn dealloc(&self, ptr: *mut u8, _layout: Layout) { - nvptx::free(ptr.cast()); - } -} - -// Based on https://github.com/popzxc/stdext-rs/blob/master/src/macros.rs -#[macro_export] -#[doc(hidden)] -macro_rules! function { - () => {{ - // Hack to get the name of the enclosing function - fn f() {} - fn type_name_of(_: T) -> &'static str { - core::any::type_name::() - } - let name = type_name_of(f); - - // Remove the `::f` suffix - &name[..name.len() - 3] - }}; -} - -/// Alternative of [`std::print!`](https://doc.rust-lang.org/std/macro.print.html) using CUDA `vprintf` system-call -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! print { +use crate::deps::alloc::{fmt, string::String}; + +/// Abort the CUDA kernel using the `trap` system call. +/// +/// [`abort`] poisons the CUDA context and no more work can be performed in it. +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn abort() -> ! { + unsafe { ::core::arch::nvptx::trap() } +} + +/// Exit the CUDA kernel using the `exit` instruction. +/// +/// # Safety +/// +/// [`exit`] quits the kernel early and any mutable data accessible outside this +/// kernel launch (by the host or a subsequent kernel launch) may be in an +/// inconsistent state. Therefore, kernel failure must be communicated back to +/// host and handled in some other manner. +/// +/// Safely return from the main kernel function instead. +#[allow(clippy::inline_always)] +#[inline(always)] +pub unsafe fn exit() -> ! { + unsafe { ::core::arch::asm!("exit;", options(noreturn)) } +} + +/// Prints to the CUDA kernel's standard output using the `vprintf` system call. +/// +/// Replacement for the [`std::print!`] macro, which now forwards to the +/// [`print()`] function. +pub macro print($($arg:tt)*) { + self::print(::core::format_args!($($arg)*)) +} + +/// Prints to the CUDA kernel's standard output using the `vprintf` system call. +/// +/// Replacement for the [`std::println!`] macro, which now forwards to the +/// [`print()`] function. +pub macro println { + () => { + self::print(::core::format_args!("\n")) + }, ($($arg:tt)*) => { - let msg = $crate::alloc::format!($($arg)*); - - #[allow(unused_unsafe)] - unsafe { - ::core::arch::nvptx::vprintf(msg.as_ptr(), ::core::ptr::null_mut()); - } + self::print(::core::format_args!("{}\n", ::core::format_args!($($arg)*))) + }, +} + +/// The [`print()`] function takes an [`Arguments`](core::fmt::Arguments) struct +/// and formats and prints it to the CUDA kernel's standard output using the +/// `vprintf` system call. +/// +/// The [`Arguments`](core::fmt::Arguments) instance can be created with the +/// [`format_args!`](core::format_args) macro. +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn print(args: ::core::fmt::Arguments) { + #[repr(C)] + struct FormatArgs { + msg_len: u32, + msg_ptr: *const u8, } -} - -/// Alternative of [`std::println!`](https://doc.rust-lang.org/std/macro.println.html) using CUDA `vprintf` system-call -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! println { - () => ($crate::print!("\n")); - ($fmt:expr) => ($crate::print!(concat!($fmt, "\n"))); - ($fmt:expr, $($arg:tt)*) => ($crate::print!(concat!($fmt, "\n"), $($arg)*)); -} -/// Assertion in GPU kernel for one expression is true. -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! 
assert { - ($e:expr) => { - if !$e { - let msg = $crate::alloc::format!( - "\nassertion failed: {}\nexpression: {:?}", - stringify!($e), - $e, - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } + let msg; // place to store the dynamically expanded format string + #[allow(clippy::option_if_let_else)] + let msg = if let Some(msg) = args.as_str() { + msg + } else { + msg = fmt::format(args); + msg.as_str() }; -} -/// Assertion in GPU kernel for two expressions are equal. -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! assert_eq { - ($a:expr, $b:expr) => { - if $a != $b { - let msg = $crate::alloc::format!( - "\nassertion failed: ({} == {})\nleft : {:?}\nright: {:?}", - stringify!($a), - stringify!($b), - $a, - $b - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } - }; -} - -/// Assertion in GPU kernel for two expressions are not equal. -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! assert_ne { - ($a:expr, $b:expr) => { - if $a == $b { - let msg = $crate::alloc::format!( - "\nassertion failed: ({} != {})\nleft : {:?}\nright: {:?}", - stringify!($a), - stringify!($b), - $a, - $b - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } + let args = FormatArgs { + msg_len: u32::try_from(msg.len()).unwrap_or(u32::MAX), + msg_ptr: msg.as_ptr(), }; -} - -/// Dimension specified in kernel launching -#[derive(Debug)] -pub struct Dim3 { - pub x: u32, - pub y: u32, - pub z: u32, -} -/// Indices that the kernel code is running on -#[derive(Debug)] -pub struct Idx3 { - pub x: u32, - pub y: u32, - pub z: u32, -} - -#[must_use] -pub fn block_dim() -> Dim3 { - #[allow(clippy::cast_sign_loss)] unsafe { - Dim3 { - x: nvptx::_block_dim_x() as u32, - y: nvptx::_block_dim_y() as u32, - z: nvptx::_block_dim_z() as u32, - } + ::core::arch::nvptx::vprintf(c"%*s".as_ptr().cast(), ::core::ptr::from_ref(&args).cast()); } } -#[must_use] -pub fn block_idx() -> Idx3 { - #[allow(clippy::cast_sign_loss)] - unsafe { - Idx3 { - x: nvptx::_block_idx_x() as u32, - y: nvptx::_block_idx_y() as u32, - z: nvptx::_block_idx_z() as u32, - } +/// Helper function to efficiently pretty-print a [`core::panic::PanicInfo`] +/// using the `vprintf` system call. +/// +/// If `allow_dynamic_message` is set, +/// [`alloc::fmt::format`](crate::deps::alloc::fmt::format) is used to print +/// [`core::panic::PanicInfo::message`] message when +/// [`core::fmt::Arguments::as_str`] returns [`None`]. Note that this may pull +/// in a large amount of string formatting and dynamic allocation code. +/// If unset, a default placeholder panic message is printed instead. +/// +/// If `allow_dynamic_payload` is set, [`core::panic::PanicInfo::payload`] is +/// checked for [`&str`] and [`String`] to get a message to print if +/// [`core::panic::PanicInfo::message`] returns [`None`]. Note that this may +/// pull in some dynamic dispatch code. If unset, a default placeholder panic +/// message is printed instead. 
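A usage sketch for the printing API above, before the panic helper that the preceding comment introduces: kernel code uses the `print!`/`println!` replacements exactly like their `std` counterparts (the reporting function is hypothetical):

```rust
use rust_cuda::device::utils::{print, println};

fn report(thread_index: usize, value: f64) {
    // Expands to `print(format_args!(..))`, i.e. one `vprintf` system
    // call per invocation.
    println!("thread {thread_index}: value = {value}");

    // A string literal takes the `Arguments::as_str` fast path inside
    // `print`, so no dynamic formatting or allocation is involved.
    print!("done\n");
}
```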
+#[allow(clippy::inline_always)]
+#[inline(always)]
+pub fn pretty_print_panic_info(
+    info: &::core::panic::PanicInfo,
+    allow_dynamic_message: bool,
+    allow_dynamic_payload: bool,
+) {
+    #[repr(C)]
+    struct FormatArgs {
+        file_len: u32,
+        file_ptr: *const u8,
+        line: u32,
+        column: u32,
+        thread_idx_x: u32,
+        thread_idx_y: u32,
+        thread_idx_z: u32,
+        msg_len: u32,
+        msg_ptr: *const u8,
     }
-#[must_use]
-pub fn grid_dim() -> Dim3 {
-    #[allow(clippy::cast_sign_loss)]
-    unsafe {
-        Dim3 {
-            x: nvptx::_grid_dim_x() as u32,
-            y: nvptx::_grid_dim_y() as u32,
-            z: nvptx::_grid_dim_z() as u32,
+    let msg; // place to store the dynamically expanded format string
+    #[allow(clippy::option_if_let_else)]
+    let msg = if let Some(message) = info.message() {
+        if let Some(msg) = message.as_str() {
+            msg
+        } else if allow_dynamic_message {
+            msg = fmt::format(*message);
+            msg.as_str()
+        } else {
+            ""
         }
-    }
-}
+    } else if let Some(msg) = info.payload().downcast_ref::<&'static str>()
+        && allow_dynamic_payload
+    {
+        msg
+    } else if let Some(msg) = info.payload().downcast_ref::<String>()
+        && allow_dynamic_payload
+    {
+        msg.as_str()
+    } else {
+        ""
+    };
-#[must_use]
-pub fn thread_idx() -> Idx3 {
-    #[allow(clippy::cast_sign_loss)]
-    unsafe {
-        Idx3 {
-            x: nvptx::_thread_idx_x() as u32,
-            y: nvptx::_thread_idx_y() as u32,
-            z: nvptx::_thread_idx_z() as u32,
-        }
-    }
-}
+    let location_line = info.location().map_or(0, ::core::panic::Location::line);
+    let location_column = info.location().map_or(0, ::core::panic::Location::column);
+    let location_file = info
+        .location()
+        .map_or("", ::core::panic::Location::file);
+
+    let thread_idx = crate::device::thread::Thread::this().idx();
+
+    let args = FormatArgs {
+        file_len: u32::try_from(location_file.len()).unwrap_or(u32::MAX),
+        file_ptr: location_file.as_ptr(),
+        line: location_line,
+        column: location_column,
+        thread_idx_x: thread_idx.x,
+        thread_idx_y: thread_idx.y,
+        thread_idx_z: thread_idx.z,
+        msg_len: u32::try_from(msg.len()).unwrap_or(u32::MAX),
+        msg_ptr: msg.as_ptr(),
+    };
-impl Dim3 {
-    #[must_use]
-    pub fn size(&self) -> usize {
-        (self.x as usize) * (self.y as usize) * (self.z as usize)
+    unsafe {
+        ::core::arch::nvptx::vprintf(
+            c"panicked at %*s:%u:%u on thread (x=%u, y=%u, z=%u):\n%*s\n"
+                .as_ptr()
+                .cast(),
+            ::core::ptr::from_ref(&args).cast(),
+        );
     }
 }
-impl Idx3 {
-    #[must_use]
-    pub fn as_id(&self, dim: &Dim3) -> usize {
-        (self.x as usize)
-            + (self.y as usize) * (dim.x as usize)
-            + (self.z as usize) * (dim.x as usize) * (dim.y as usize)
+/// Helper function to efficiently pretty-print an error message (inside an
+/// allocation error handler) using the `vprintf` system call.
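A sketch of wiring the panic helper just defined into a device-side `#[panic_handler]`, before the allocation-error helper that the preceding comment introduces; the handler itself is assumed boilerplate for a `no_std` kernel crate:

```rust
use rust_cuda::device::utils::{abort, pretty_print_panic_info};

#[panic_handler]
fn panic(info: &core::panic::PanicInfo) -> ! {
    // Report file, line, column, and thread index via `vprintf`, with
    // both dynamic-message and dynamic-payload handling opted out so no
    // string formatting or dynamic dispatch is linked into the kernel.
    pretty_print_panic_info(info, false, false);

    // Poison the CUDA context; no further work can run in it.
    abort()
}
```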
+#[track_caller] +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn pretty_print_alloc_error(layout: ::core::alloc::Layout) { + #[repr(C)] + struct FormatArgs { + size: usize, + align: usize, + file_len: u32, + file_ptr: *const u8, + line: u32, + column: u32, + thread_idx_x: u32, + thread_idx_y: u32, + thread_idx_z: u32, } -} -#[must_use] -pub fn index() -> usize { - let block_id = block_idx().as_id(&grid_dim()); - let thread_id = thread_idx().as_id(&block_dim()); + let location = ::core::panic::Location::caller(); + let thread_idx = crate::device::thread::Thread::this().idx(); + + let args = FormatArgs { + size: layout.size(), + align: layout.align(), + file_len: u32::try_from(location.file().len()).unwrap_or(u32::MAX), + file_ptr: location.file().as_ptr(), + line: location.line(), + column: location.column(), + thread_idx_x: thread_idx.x, + thread_idx_y: thread_idx.y, + thread_idx_z: thread_idx.z, + }; - block_id * block_dim().size() + thread_id + unsafe { + ::core::arch::nvptx::vprintf( + c"memory allocation of %llu bytes with alignment %llu failed at \ + %*s:%u:%u on thread (x=%u, y=%u, z=%u)\n" + .as_ptr() + .cast(), + ::core::ptr::from_ref(&args).cast(), + ); + } } diff --git a/src/host.rs b/src/host.rs deleted file mode 100644 index 6c91a26bc..000000000 --- a/src/host.rs +++ /dev/null @@ -1,612 +0,0 @@ -use core::{ - marker::PhantomData, - mem::ManuallyDrop, - ops::{Deref, DerefMut}, -}; - -use rustacuda::{ - context::Context, - error::{CudaError, CudaResult}, - function::Function, - memory::{DeviceBox, DeviceBuffer, LockedBuffer}, - module::Module, - stream::Stream, -}; -use rustacuda_core::{DeviceCopy, DevicePointer}; - -#[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_call}; - -use crate::{ - common::{DeviceAccessible, DeviceConstRef, DeviceMutRef, RustToCuda}, - ptx_jit::{CudaKernel, PtxJITCompiler, PtxJITResult}, - safety::SafeDeviceCopy, -}; - -pub trait Launcher { - type KernelTraitObject: ?Sized; - type CompilationWatcher; - - fn get_launch_package(&mut self) -> LaunchPackage; - - /// # Errors - /// - /// Should only return a `CudaError` if some implementation-defined - /// critical kernel function configuration failed. - #[allow(unused_variables)] - fn on_compile(kernel: &Function, watcher: &mut Self::CompilationWatcher) -> CudaResult<()> { - Ok(()) - } -} - -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct LaunchConfig { - pub grid: rustacuda::function::GridSize, - pub block: rustacuda::function::BlockSize, - pub shared_memory_size: u32, - pub ptx_jit: bool, -} - -pub struct LaunchPackage<'l, L: ?Sized + Launcher> { - pub config: LaunchConfig, - - pub kernel: &'l mut TypedKernel, - pub stream: &'l mut Stream, - - pub watcher: &'l mut L::CompilationWatcher, -} - -pub enum KernelJITResult<'k> { - Cached(&'k Function<'k>), - Recompiled(&'k Function<'k>), -} - -pub struct TypedKernel { - compiler: PtxJITCompiler, - kernel: Option, - entry_point: alloc::boxed::Box, - marker: PhantomData, -} - -impl TypedKernel { - /// # Errors - /// - /// Returns a `CudaError` if `ptx` or `entry_point` contain nul bytes. 
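Before continuing with the removed `src/host.rs`: the allocation-error helper above is meant for the nightly `alloc_error_handler` hook. A minimal sketch, with the crate-level feature gate assumed:

```rust
#![feature(alloc_error_handler)]

use rust_cuda::device::utils::{abort, pretty_print_alloc_error};

#[alloc_error_handler]
fn alloc_error(layout: core::alloc::Layout) -> ! {
    // Report the failed layout (size and alignment), the caller
    // location, and the thread index, then trap.
    pretty_print_alloc_error(layout);
    abort()
}
```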
- pub fn new(ptx: &str, entry_point: &str) -> CudaResult { - let ptx_cstring = std::ffi::CString::new(ptx).map_err(|_| CudaError::InvalidPtx)?; - - let compiler = crate::ptx_jit::PtxJITCompiler::new(&ptx_cstring); - - let entry_point_cstring = - std::ffi::CString::new(entry_point).map_err(|_| CudaError::InvalidValue)?; - let entry_point = entry_point_cstring.into_boxed_c_str(); - - Ok(Self { - compiler, - kernel: None, - entry_point, - marker: PhantomData, - }) - } - - /// # Errors - /// - /// Returns a `CudaError` if `ptx` (from [`Self::new`]) is not a valid - /// PTX source, or it does not contain an entry point named `entry_point` - /// (from [`Self::new`]). - pub fn compile_with_ptx_jit_args( - &mut self, - arguments: Option<&[Option<&[u8]>]>, - ) -> CudaResult { - let ptx_jit = self.compiler.with_arguments(arguments); - - let kernel_jit = match (&mut self.kernel, ptx_jit) { - (Some(kernel), PtxJITResult::Cached(_)) => { - KernelJITResult::Cached(kernel.get_function()) - }, - (kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { - let recomputed_kernel = CudaKernel::new(ptx_cstr, &self.entry_point)?; - - // Replace the existing compiled kernel, drop the old one - let kernel = kernel.insert(recomputed_kernel); - - KernelJITResult::Recompiled(kernel.get_function()) - }, - }; - - Ok(kernel_jit) - } -} - -pub trait LendToCuda: RustToCuda { - /// Lends an immutable copy of `&self` to CUDA: - /// - code in the CUDA kernel can only access `&self` through the - /// `DeviceConstRef` inside the closure - /// - after the closure, `&self` will not have changed - /// - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA - fn lend_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceConstRef::CudaRepresentation>>, - ) -> Result, - >( - &self, - inner: F, - ) -> Result; - - /// Lends a mutable copy of `&mut self` to CUDA: - /// - code in the CUDA kernel can only access `&mut self` through the - /// `DeviceMutRef` inside the closure - /// - after the closure, `&mut self` might have changed in the following - /// ways: - /// - to avoid aliasing, each CUDA thread gets its own shallow copy of - /// `&mut self`, i.e. any shallow changes will NOT be reflected after - /// the closure - /// - each CUDA thread can access the same heap allocated storage, i.e. 
- /// any deep changes will be reflected after the closure - /// - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA - fn lend_to_cuda_mut< - O, - E: From, - F: FnOnce( - HostAndDeviceMutRef::CudaRepresentation>>, - ) -> Result, - >( - &mut self, - inner: F, - ) -> Result; - - /// Moves `self` to CUDA iff `self` is `SafeDeviceCopy` - /// - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA - fn move_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceOwned::CudaRepresentation>>, - ) -> Result, - >( - self, - inner: F, - ) -> Result - where - Self: Sized + SafeDeviceCopy, - ::CudaRepresentation: SafeDeviceCopy, - ::CudaAllocation: EmptyCudaAlloc; -} - -impl LendToCuda for T { - fn lend_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceConstRef::CudaRepresentation>>, - ) -> Result, - >( - &self, - inner: F, - ) -> Result { - let (cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; - - let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); - - core::mem::drop(cuda_repr); - core::mem::drop(alloc); - - result - } - - fn lend_to_cuda_mut< - O, - E: From, - F: FnOnce( - HostAndDeviceMutRef::CudaRepresentation>>, - ) -> Result, - >( - &mut self, - inner: F, - ) -> Result { - let (mut cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; - - let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, inner); - - core::mem::drop(cuda_repr); - - let _: NullCudaAlloc = unsafe { self.restore(alloc) }?; - - result - } - - fn move_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceOwned::CudaRepresentation>>, - ) -> Result, - >( - self, - inner: F, - ) -> Result - where - Self: Sized + SafeDeviceCopy, - ::CudaRepresentation: SafeDeviceCopy, - ::CudaAllocation: EmptyCudaAlloc, - { - let (cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; - - let result = HostAndDeviceOwned::with_new(cuda_repr, inner); - - core::mem::drop(alloc); - - result - } -} - -pub(crate) mod private { - pub mod alloc { - pub trait Sealed {} - } - - pub mod drop { - pub trait Sealed: Sized { - fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; - } - } - - pub mod empty { - pub trait Sealed {} - } -} - -pub trait EmptyCudaAlloc: private::empty::Sealed {} -impl EmptyCudaAlloc for T {} - -pub trait CudaAlloc: private::alloc::Sealed {} -impl CudaAlloc for T {} - -impl private::alloc::Sealed for Option {} - -pub struct NullCudaAlloc; -impl private::alloc::Sealed for NullCudaAlloc {} -impl private::empty::Sealed for NullCudaAlloc {} - -pub struct CombinedCudaAlloc(A, B); -impl private::alloc::Sealed for CombinedCudaAlloc {} -impl private::empty::Sealed - for CombinedCudaAlloc -{ -} -impl CombinedCudaAlloc { - pub fn new(front: A, tail: B) -> Self { - Self(front, tail) - } - - pub fn split(self) -> (A, B) { - (self.0, self.1) - } -} - -pub struct CudaDropWrapper(Option); -impl private::alloc::Sealed for CudaDropWrapper {} -impl From for CudaDropWrapper { - fn from(val: C) -> Self { - Self(Some(val)) - } -} -impl Drop for CudaDropWrapper { - fn drop(&mut self) { - if let Some(val) = self.0.take() { - if let Err((_err, val)) = C::drop(val) { - core::mem::forget(val); - } - } - } -} -impl Deref for CudaDropWrapper { - type Target = C; - - fn deref(&self) -> &Self::Target { - self.0.as_ref().unwrap() - } -} -impl DerefMut for CudaDropWrapper { - fn deref_mut(&mut self) -> &mut Self::Target { - self.0.as_mut().unwrap() - } -} - -macro_rules! 
impl_sealed_drop_collection { - ($type:ident) => { - impl private::drop::Sealed for $type { - fn drop(val: Self) -> Result<(), (CudaError, Self)> { - Self::drop(val) - } - } - }; -} - -impl_sealed_drop_collection!(DeviceBuffer); -impl_sealed_drop_collection!(DeviceBox); -impl_sealed_drop_collection!(LockedBuffer); - -macro_rules! impl_sealed_drop_value { - ($type:ident) => { - impl private::drop::Sealed for $type { - fn drop(val: Self) -> Result<(), (CudaError, Self)> { - Self::drop(val) - } - } - }; -} - -impl_sealed_drop_value!(Module); -impl_sealed_drop_value!(Stream); -impl_sealed_drop_value!(Context); - -#[repr(transparent)] -#[allow(clippy::module_name_repetitions)] -pub struct HostDeviceBox(DevicePointer); - -impl private::alloc::Sealed for HostDeviceBox {} - -impl HostDeviceBox { - /// # Errors - /// - /// Returns a `CudaError` iff copying from `value` into `self` failed. - pub fn copy_from(&mut self, value: &T) -> CudaResult<()> { - // Safety: pointer comes from `DeviceBox::into_device` - // i.e. this function completes the roundtrip - let mut device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - - rustacuda::memory::CopyDestination::copy_from(&mut *device_box, value) - } - - /// # Errors - /// - /// Returns a `CudaError` iff copying from `self` into `value` failed. - pub fn copy_to(&self, value: &mut T) -> CudaResult<()> { - // Safety: pointer comes from `DeviceBox::into_device` - // i.e. this function completes the roundtrip - let device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - - rustacuda::memory::CopyDestination::copy_to(&*device_box, value) - } -} - -impl From> for HostDeviceBox { - fn from(device_box: DeviceBox) -> Self { - Self(DeviceBox::into_device(device_box)) - } -} - -impl From> for DeviceBox { - fn from(host_device_box: HostDeviceBox) -> Self { - // Safety: pointer comes from `DeviceBox::into_device` - // i.e. this function completes the roundtrip - unsafe { DeviceBox::from_device(host_device_box.0) } - } -} - -impl Drop for HostDeviceBox { - fn drop(&mut self) { - // Safety: pointer comes from `DeviceBox::into_device` - // i.e. this function completes the roundtrip - let device_box = unsafe { DeviceBox::from_device(self.0) }; - - core::mem::drop(CudaDropWrapper::from(device_box)); - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRef<'a, T: DeviceCopy> { - device_box: &'a mut HostDeviceBox, - host_ref: &'a mut T, -} - -impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { - /// # Safety - /// - /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new(device_box: &'a mut HostDeviceBox, host_ref: &'a mut T) -> Self { - Self { - device_box, - host_ref, - } - } - - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved - /// to CUDA or an error occurs inside `inner`. 
- pub fn with_new< - O, - E: From, - F: for<'b> FnOnce(HostAndDeviceMutRef<'b, T>) -> Result, - >( - host_ref: &mut T, - inner: F, - ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(host_ref)?.into(); - - // Safety: `device_box` contains exactly the device copy of `host_ref` - let result = inner(HostAndDeviceMutRef { - device_box: &mut device_box, - host_ref, - }); - - // Copy back any changes made - device_box.copy_to(host_ref)?; - - core::mem::drop(device_box); - - result - } - - #[must_use] - pub fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> - where - 'a: 'b, - { - DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), - reference: PhantomData, - } - } - - #[must_use] - pub fn for_host<'b: 'a>(&'b self) -> &'a T { - self.host_ref - } - - #[must_use] - pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> - where - 'a: 'b, - { - // Safety: `device_box` contains EXACTLY the device copy of `host_ref` - // by construction of `HostAndDeviceMutRef` - unsafe { HostAndDeviceConstRef::new(self.device_box, self.host_ref) } - } - - #[must_use] - pub fn as_mut<'b>(&'b mut self) -> HostAndDeviceMutRef<'b, T> - where - 'a: 'b, - { - // Safety: `device_box` contains EXACTLY the device copy of `host_ref` - // by construction of `HostAndDeviceMutRef` - unsafe { HostAndDeviceMutRef::new(self.device_box, self.host_ref) } - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRef<'a, T: DeviceCopy> { - device_box: &'a HostDeviceBox, - host_ref: &'a T, -} - -impl<'a, T: DeviceCopy> Clone for HostAndDeviceConstRef<'a, T> { - fn clone(&self) -> Self { - *self - } -} - -impl<'a, T: DeviceCopy> Copy for HostAndDeviceConstRef<'a, T> {} - -impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { - /// # Safety - /// - /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new(device_box: &'a HostDeviceBox, host_ref: &'a T) -> Self { - Self { - device_box, - host_ref, - } - } - - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved - /// to CUDA or an error occurs inside `inner`. - pub fn with_new< - O, - E: From, - F: for<'b> FnOnce(HostAndDeviceConstRef<'b, T>) -> Result, - >( - host_ref: &T, - inner: F, - ) -> Result { - let device_box: HostDeviceBox<_> = DeviceBox::new(host_ref)?.into(); - - // Safety: `device_box` contains exactly the device copy of `host_ref` - let result = inner(HostAndDeviceConstRef { - device_box: &device_box, - host_ref, - }); - - core::mem::drop(device_box); - - result - } - - #[must_use] - pub fn for_device<'b>(&'b self) -> DeviceConstRef<'a, T> - where - 'a: 'b, - { - DeviceConstRef { - pointer: self.device_box.0.as_raw(), - reference: PhantomData, - } - } - - #[must_use] - pub fn for_host(&'a self) -> &'a T { - self.host_ref - } - - #[must_use] - pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> - where - 'a: 'b, - { - *self - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwned<'a, T: SafeDeviceCopy + DeviceCopy> { - device_box: &'a mut HostDeviceBox, - host_val: &'a mut T, -} - -impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { - /// # Errors - /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved - /// to CUDA or an error occurs inside `inner`. 
- pub fn with_new< - O, - E: From, - F: for<'b> FnOnce(HostAndDeviceOwned<'b, T>) -> Result, - >( - mut value: T, - inner: F, - ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); - - // Safety: `device_box` contains exactly the device copy of `value` - let result = inner(HostAndDeviceOwned { - device_box: &mut device_box, - host_val: &mut value, - }); - - core::mem::drop(device_box); - core::mem::drop(value); - - result - } - - #[must_use] - pub fn for_device(self) -> DeviceMutRef<'a, T> { - DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), - reference: PhantomData, - } - } - - #[must_use] - pub fn for_host(&'a mut self) -> &'a T { - self.host_val - } -} diff --git a/src/host/mod.rs b/src/host/mod.rs new file mode 100644 index 000000000..589556560 --- /dev/null +++ b/src/host/mod.rs @@ -0,0 +1,412 @@ +use std::{ + marker::PhantomData, + mem::ManuallyDrop, + ops::{Deref, DerefMut}, +}; + +use const_type_layout::TypeGraphLayout; +use rustacuda::{ + context::Context, + error::CudaError, + event::Event, + memory::{CopyDestination, DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, + module::Module, +}; + +use crate::{ + safety::PortableBitSemantics, + utils::{ + adapter::DeviceCopyWithPortableBitSemantics, + ffi::{ + DeviceConstPointer, DeviceConstRef, DeviceMutPointer, DeviceMutRef, DeviceOwnedPointer, + DeviceOwnedRef, + }, + r#async::{Async, NoCompletion}, + }, +}; + +type InvariantLifetime<'brand> = PhantomData &'brand ()>; + +#[derive(Copy, Clone)] +#[repr(transparent)] +pub struct Stream<'stream> { + stream: &'stream rustacuda::stream::Stream, + _brand: InvariantLifetime<'stream>, +} + +impl<'stream> Deref for Stream<'stream> { + type Target = rustacuda::stream::Stream; + + fn deref(&self) -> &Self::Target { + self.stream + } +} + +impl<'stream> Stream<'stream> { + #[allow(clippy::needless_pass_by_ref_mut)] + /// Create a new uniquely branded [`Stream`], which can bind async + /// operations to the [`Stream`] that they are computed on. + /// + /// The uniqueness guarantees are provided by using branded types, + /// as inspired by the Ghost Cell paper by Yanovski, J., Dang, H.-H., + /// Jung, R., and Dreyer, D.: . + /// + /// # Examples + /// + /// The following example shows that two [`Stream`]'s with different + /// `'stream` lifetime brands cannot be used interchangeably. 
+ /// + /// ```rust, compile_fail + /// use rust_cuda::host::Stream; + /// + /// fn check_same<'stream>(_stream_a: Stream<'stream>, _stream_b: Stream<'stream>) {} + /// + /// fn two_streams<'stream_a, 'stream_b>(stream_a: Stream<'stream_a>, stream_b: Stream<'stream_b>) { + /// check_same(stream_a, stream_b); + /// } + /// ``` + pub fn with( + stream: &mut rustacuda::stream::Stream, + inner: impl for<'new_stream> FnOnce(Stream<'new_stream>) -> O, + ) -> O { + inner(Stream { + stream, + _brand: InvariantLifetime::default(), + }) + } +} + +pub trait CudaDroppable: Sized { + #[allow(clippy::missing_errors_doc)] + fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; +} + +#[repr(transparent)] +pub struct CudaDropWrapper(ManuallyDrop); +impl crate::alloc::CudaAlloc for CudaDropWrapper {} +impl crate::alloc::sealed::alloc::Sealed for CudaDropWrapper {} +impl From for CudaDropWrapper { + fn from(val: C) -> Self { + Self(ManuallyDrop::new(val)) + } +} +impl Drop for CudaDropWrapper { + fn drop(&mut self) { + // Safety: drop is only ever called once + let val = unsafe { ManuallyDrop::take(&mut self.0) }; + + if let Err((_err, val)) = C::drop(val) { + core::mem::forget(val); + } + } +} +impl Deref for CudaDropWrapper { + type Target = C; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} +impl DerefMut for CudaDropWrapper { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl CudaDroppable for DeviceBox { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +impl CudaDroppable for DeviceBuffer { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +impl CudaDroppable for LockedBox { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +impl CudaDroppable for LockedBuffer { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +macro_rules! impl_sealed_drop_value { + ($type:ty) => { + impl CudaDroppable for $type { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } + } + }; +} + +impl_sealed_drop_value!(Module); +impl_sealed_drop_value!(rustacuda::stream::Stream); +impl_sealed_drop_value!(Context); +impl_sealed_drop_value!(Event); + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceMutRef<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, + host_ref: &'a mut T, +} + +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { + /// # Errors + /// + /// Returns a [`CudaError`] iff `value` cannot be moved + /// to CUDA or an error occurs inside `inner`. 
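A usage sketch for the branded `Stream` introduced above, before the `with_new` constructor that the preceding comment documents: `Stream::with` hands the closure a stream tagged with a fresh invariant lifetime, so it cannot be unified with any other stream. A live CUDA context is assumed:

```rust
use rust_cuda::host::Stream;
use rustacuda::error::CudaResult;
use rustacuda::stream::StreamFlags;

fn with_branded_stream() -> CudaResult<()> {
    let mut raw = rustacuda::stream::Stream::new(StreamFlags::NON_BLOCKING, None)?;

    // The closure is generic over the fresh `'new_stream` brand; the
    // `Stream<'_>` it receives cannot escape or be mixed with another.
    Stream::with(&mut raw, |stream| {
        // launch kernels or move `Async` values on `stream` here
        let _ = stream;
        Ok(())
    })
}
```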
+ pub fn with_new< + O, + E: From, + F: for<'b> FnOnce(HostAndDeviceMutRef<'b, T>) -> Result, + >( + host_ref: &mut T, + inner: F, + ) -> Result { + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(host_ref), + )?); + + // Safety: `device_box` contains exactly the device copy of `host_ref` + let result = inner(HostAndDeviceMutRef { + device_box: &mut device_box, + host_ref, + }); + + // Copy back any changes made + device_box.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(host_ref))?; + + core::mem::drop(device_box); + + result + } + + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub(crate) unsafe fn new_unchecked( + device_box: &'a mut DeviceBox>, + host_ref: &'a mut T, + ) -> Self { + Self { + device_box, + host_ref, + } + } + + #[must_use] + pub(crate) fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> + where + 'a: 'b, + { + DeviceMutRef { + pointer: DeviceMutPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), + reference: PhantomData, + } + } + + #[must_use] + pub(crate) fn for_host<'b: 'a>(&'b self) -> &'a T { + self.host_ref + } + + #[must_use] + pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> + where + 'a: 'b, + { + HostAndDeviceConstRef { + device_box: self.device_box, + host_ref: self.host_ref, + } + } + + #[must_use] + pub(crate) unsafe fn as_mut<'b>(&'b mut self) -> HostAndDeviceMutRef<'b, T> + where + 'a: 'b, + { + HostAndDeviceMutRef { + device_box: self.device_box, + host_ref: self.host_ref, + } + } + + #[must_use] + pub fn into_mut<'b>(self) -> HostAndDeviceMutRef<'b, T> + where + 'a: 'b, + { + HostAndDeviceMutRef { + device_box: self.device_box, + host_ref: self.host_ref, + } + } + + #[must_use] + pub fn into_async<'b, 'stream>( + self, + stream: Stream<'stream>, + ) -> Async<'b, 'stream, HostAndDeviceMutRef<'b, T>, NoCompletion> + where + 'a: 'b, + { + Async::ready(self.into_mut(), stream) + } +} + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceConstRef<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a DeviceBox>, + host_ref: &'a T, +} + +impl<'a, T: PortableBitSemantics + TypeGraphLayout> Clone for HostAndDeviceConstRef<'a, T> { + fn clone(&self) -> Self { + *self + } +} + +impl<'a, T: PortableBitSemantics + TypeGraphLayout> Copy for HostAndDeviceConstRef<'a, T> {} + +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> { + /// # Errors + /// + /// Returns a [`CudaError`] iff `value` cannot be moved + /// to CUDA or an error occurs inside `inner`. 
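A quick sketch of the scoped-lending pattern above, before the immutable variant that the preceding comment introduces: `HostAndDeviceMutRef::with_new` uploads the value, runs the closure against the paired host/device reference, and copies device-side changes back on exit. A live CUDA context is assumed, and `u64` is assumed to satisfy `PortableBitSemantics + TypeGraphLayout`:

```rust
use rust_cuda::host::HostAndDeviceMutRef;
use rustacuda::error::CudaResult;

fn lend_mut(value: &mut u64) -> CudaResult<()> {
    HostAndDeviceMutRef::with_new(value, |host_and_device| {
        // A kernel launch would consume the device-side reference here;
        // any writes it makes are copied back into `value` afterwards.
        let _ = host_and_device;
        Ok(())
    })
}
```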
+ pub fn with_new< + O, + E: From, + F: for<'b> FnOnce(HostAndDeviceConstRef<'b, T>) -> Result, + >( + host_ref: &T, + inner: F, + ) -> Result { + let device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(host_ref), + )?); + + // Safety: `device_box` contains exactly the device copy of `host_ref` + let result = inner(HostAndDeviceConstRef { + device_box: &device_box, + host_ref, + }); + + core::mem::drop(device_box); + + result + } + + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub(crate) const unsafe fn new_unchecked( + device_box: &'a DeviceBox>, + host_ref: &'a T, + ) -> Self { + Self { + device_box, + host_ref, + } + } + + #[must_use] + pub(crate) fn for_device<'b>(&'b self) -> DeviceConstRef<'a, T> + where + 'a: 'b, + { + let mut hack = ManuallyDrop::new(unsafe { std::ptr::read(self.device_box) }); + + DeviceConstRef { + pointer: DeviceConstPointer(hack.as_device_ptr().as_raw().cast()), + reference: PhantomData, + } + } + + #[must_use] + pub(crate) const fn for_host(&'a self) -> &'a T { + self.host_ref + } + + #[must_use] + pub const fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> + where + 'a: 'b, + { + *self + } + + #[must_use] + pub const fn as_async<'b, 'stream>( + &'b self, + stream: Stream<'stream>, + ) -> Async<'b, 'stream, HostAndDeviceConstRef<'b, T>, NoCompletion> + where + 'a: 'b, + { + Async::ready( + HostAndDeviceConstRef { + device_box: self.device_box, + host_ref: self.host_ref, + }, + stream, + ) + } +} + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceOwned<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, + host_val: &'a mut T, +} + +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { + /// # Errors + /// + /// Returns a [`CudaError`] iff `value` cannot be moved + /// to CUDA or an error occurs inside `inner`. 
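One more sketch before the owned variant's `with_new` below: `HostAndDeviceConstRef` is deliberately `Copy`, so a single device upload can back any number of immutable borrows within the closure (same assumptions as the previous sketch):

```rust
use rust_cuda::host::HostAndDeviceConstRef;
use rustacuda::error::CudaResult;

fn lend_const(value: &u64) -> CudaResult<()> {
    HostAndDeviceConstRef::with_new(value, |const_ref| {
        // `const_ref` is `Copy`: both bindings refer to the same device
        // box, without re-uploading the value.
        let (first, second) = (const_ref, const_ref);
        let _ = (first.as_ref(), second.as_ref());
        Ok(())
    })
}
```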
+ pub fn with_new, F: FnOnce(HostAndDeviceOwned) -> Result>( + mut value: T, + inner: F, + ) -> Result { + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&value), + )?); + + // Safety: `device_box` contains exactly the device copy of `value` + inner(HostAndDeviceOwned { + device_box: &mut device_box, + host_val: &mut value, + }) + } + + #[must_use] + pub(crate) fn for_device(self) -> DeviceOwnedRef<'a, T> { + DeviceOwnedRef { + pointer: DeviceOwnedPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), + marker: PhantomData::, + reference: PhantomData::<&'a mut ()>, + } + } + + #[must_use] + pub(crate) fn for_host(&self) -> &T { + self.host_val + } + + #[must_use] + pub const fn into_async<'stream>( + self, + stream: Stream<'stream>, + ) -> Async<'a, 'stream, Self, NoCompletion> { + Async::ready(self, stream) + } +} diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs new file mode 100644 index 000000000..3fc2b2e60 --- /dev/null +++ b/src/kernel/mod.rs @@ -0,0 +1,623 @@ +#[cfg(feature = "host")] +use std::{ + ffi::{CStr, CString}, + marker::PhantomData, + mem::ManuallyDrop, + ptr::NonNull, +}; + +#[cfg(feature = "host")] +use rustacuda::{ + error::{CudaError, CudaResult}, + function::Function, + module::Module, +}; + +#[cfg(feature = "kernel")] +pub use rust_cuda_kernel::kernel; + +#[doc(hidden)] +#[cfg(all(feature = "kernel", feature = "host"))] +#[allow(clippy::module_name_repetitions)] +pub use rust_cuda_kernel::{check_kernel, compile_kernel, specialise_kernel_entry_point}; + +#[cfg(feature = "host")] +mod ptx_jit; +#[cfg(feature = "host")] +use ptx_jit::{PtxJITCompiler, PtxJITResult}; + +#[cfg(feature = "host")] +use crate::host::Stream; +use crate::safety::PortableBitSemantics; + +pub mod param; + +mod sealed { + #[doc(hidden)] + pub trait Sealed {} + + #[cfg(feature = "host")] + pub struct Token; +} + +#[cfg(all(feature = "host", not(doc)))] +#[doc(hidden)] +pub trait WithNewAsync< + 'stream, + P: ?Sized + CudaKernelParameter, + O, + E: From, +> +{ + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result + where + P: 'b; +} + +#[cfg(all(feature = "host", not(doc)))] +impl< + 'stream, + P: ?Sized + CudaKernelParameter, + O, + E: From, + F: for<'b> FnOnce(P::AsyncHostType<'stream, 'b>) -> Result, + > WithNewAsync<'stream, P, O, E> for F +{ + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result + where + P: 'b, + { + (self)(param) + } +} + +#[cfg(feature = "device")] +#[doc(hidden)] +pub trait WithFfiAsDevice { + fn with<'b>(self, param: P::DeviceType<'b>) -> O + where + P: 'b; +} + +#[cfg(feature = "device")] +impl FnOnce(P::DeviceType<'b>) -> O> + WithFfiAsDevice for F +{ + fn with<'b>(self, param: P::DeviceType<'b>) -> O + where + P: 'b, + { + (self)(param) + } +} + +pub trait CudaKernelParameter: sealed::Sealed { + #[cfg(feature = "host")] + type SyncHostType; + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> + where + Self: 'b; + #[doc(hidden)] + type FfiType<'stream, 'b>: PortableBitSemantics + where + Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> + where + Self: 'b; + + #[cfg(feature = "host")] + #[allow(clippy::missing_errors_doc)] // FIXME + fn with_new_async<'stream, 'b, O, E: From>( + param: Self::SyncHostType, + stream: crate::host::Stream<'stream>, + #[cfg(not(doc))] inner: impl WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result + where + Self: 'b; + + #[doc(hidden)] + 
#[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + param: &Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O + where + Self: 'b; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + param: &Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> std::alloc::Layout + where + Self: 'b; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> Result, E> + where + Self: 'b; + + #[doc(hidden)] + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl WithFfiAsDevice, + ) -> O + where + Self: 'short; +} + +#[cfg(feature = "host")] +pub struct Launcher<'stream, 'kernel, Kernel> { + pub stream: Stream<'stream>, + pub kernel: &'kernel mut TypedPtxKernel, + pub config: LaunchConfig, +} + +#[cfg(feature = "host")] +macro_rules! impl_launcher_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'a, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'a mut self, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'a mut Self, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + #[allow(unused_variables)] + let stream = self.stream; + + impl_launcher_launch! { impl with_new_async ($($arg: $T),*) + (stream) { + inner(self, $($arg),*) + } } + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult> + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) + } + }; + (impl $func:ident () + ($($other:expr),*) $inner:block) => { + $inner + }; + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { + $T0::$func($arg0 $(, $other)*, |$arg0: <$T0 as CudaKernelParameter>::AsyncHostType<'stream, '_>| { + impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } + }) + }; +} + +#[cfg(feature = "host")] +impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { + impl_launcher_launch! { launch0() => with0_async => launch0_async } + + impl_launcher_launch! { launch1( + arg1: A + ) => with1_async => launch1_async } + + impl_launcher_launch! { launch2( + arg1: A, arg2: B + ) => with2_async => launch2_async } + + impl_launcher_launch! { launch3( + arg1: A, arg2: B, arg3: C + ) => with3_async => launch3_async } + + impl_launcher_launch! { launch4( + arg1: A, arg2: B, arg3: C, arg4: D + ) => with4_async => launch4_async } + + impl_launcher_launch! 
{ launch5( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E + ) => with5_async => launch5_async } + + impl_launcher_launch! { launch6( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F + ) => with6_async => launch6_async } + + impl_launcher_launch! { launch7( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G + ) => with7_async => launch7_async } + + impl_launcher_launch! { launch8( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H + ) => with8_async => launch8_async } + + impl_launcher_launch! { launch9( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I + ) => with9_async => launch9_async } + + impl_launcher_launch! { launch10( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J + ) => with10_async => launch10_async } + + impl_launcher_launch! { launch11( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K + ) => with11_async => launch11_async } + + impl_launcher_launch! { launch12( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K, arg12: L + ) => with12_async => launch12_async } +} + +#[cfg(feature = "host")] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct LaunchConfig { + pub grid: rustacuda::function::GridSize, + pub block: rustacuda::function::BlockSize, + pub ptx_jit: bool, +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub struct RawPtxKernel { + module: ManuallyDrop>, + function: ManuallyDrop>, +} + +#[cfg(feature = "host")] +impl RawPtxKernel { + /// # Errors + /// + /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does + /// not contain an entry point named `entry_point`. + pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { + let module: Box = Box::new(Module::load_from_string(ptx)?); + + let function = unsafe { &*std::ptr::from_ref(module.as_ref()) }.get_function(entry_point); + + let function = match function { + Ok(function) => function, + Err(err) => { + if let Err((_err, module)) = Module::drop(*module) { + std::mem::forget(module); + } + + return Err(err); + }, + }; + + Ok(Self { + function: ManuallyDrop::new(function), + module: ManuallyDrop::new(module), + }) + } + + #[must_use] + pub fn get_function(&self) -> &Function { + &self.function + } +} + +#[cfg(feature = "host")] +impl Drop for RawPtxKernel { + fn drop(&mut self) { + { + // Ensure that self.function is dropped before self.module as + // it borrows data from the module and must not outlive it + let _function = unsafe { ManuallyDrop::take(&mut self.function) }; + } + + if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) + { + std::mem::forget(module); + } + } +} + +#[cfg(feature = "host")] +pub type PtxKernelConfigure = dyn FnMut(&Function) -> CudaResult<()>; + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub struct TypedPtxKernel { + compiler: PtxJITCompiler, + ptx_kernel: Option, + entry_point: Box, + configure: Option>, + marker: PhantomData, +} + +#[cfg(feature = "host")] +macro_rules! 
impl_typed_kernel_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<'kernel, 'stream, $($T: CudaKernelParameter),*>( + &'kernel mut self, + stream: Stream<'stream>, + config: &LaunchConfig, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + self.$with_async::<(), CudaError, $($T),*>( + stream, + config, + $($arg,)* + |kernel, stream, config, $($arg),*| { + let r#async = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*)?; + + // important: always synchronise here, this function is sync! + r#async.synchronize() + }, + ) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'kernel, + 'stream, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'kernel mut self, + stream: Stream<'stream>, + config: &LaunchConfig, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'kernel mut Self, + Stream<'stream>, + &LaunchConfig, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + impl_typed_kernel_launch! { impl with_new_async ($($arg: $T),*) + (stream) { + inner(self, stream, config, $($arg),*) + } } + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::needless_lifetimes)] // 'stream is unused for zero args + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<'kernel, 'stream, $($T: CudaKernelParameter),*>( + &'kernel mut self, + stream: Stream<'stream>, + config: &LaunchConfig, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult> + // launch_async does not need to capture its parameters until kernel completion: + // - moved parameters are moved and cannot be used again, deallocation will sync + // - immutably borrowed parameters can be shared across multiple kernel launches + // - mutably borrowed parameters are more tricky: + // - Rust's borrowing rules ensure that a single mutable reference cannot be + // passed into multiple parameters of the kernel (no mutable aliasing) + // - CUDA guarantees that kernels launched on the same stream are executed + // sequentially, so even immediate resubmissions for the same mutable data + // will not have temporally overlapping mutation on the same stream + // - however, we have to guarantee that mutable data cannot be used on several + // different streams at the same time + // - Async::move_to_stream always adds a synchronisation barrier between the + // old and the new stream to ensure that all uses on the old stream happen + // strictly before all uses on the new stream + // - async launches take AsyncProj<&mut HostAndDeviceMutRef<..>>, which either + // captures an Async, which must be moved to a different stream explicitly, + // or contains data that cannot async move to a different stream without + // - any use of a mutable borrow in an async kernel launch adds a sync barrier + // on the launch stream s.t. the borrow is only complete once the kernel has + // completed + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + let function = if config.ptx_jit { + impl_typed_kernel_launch! 
+                impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + (sealed::Token) {
+                    self.compile_with_ptx_jit_args(Some(&[$($arg),*]))
+                } }?
+            } else {
+                self.compile_with_ptx_jit_args(None)?
+            };
+
+            #[allow(unused_mut)]
+            let mut shared_memory_size = crate::utils::shared::SharedMemorySize::new();
+            $(
+                shared_memory_size.add($T::shared_layout_for_async(&$arg, sealed::Token));
+            )*
+            let Ok(shared_memory_size) = u32::try_from(shared_memory_size.total()) else {
+                // FIXME: this should really be InvalidConfiguration = 9
+                return Err(CudaError::LaunchOutOfResources)
+            };
+
+            unsafe { stream.launch(
+                function,
+                config.grid.clone(),
+                config.block.clone(),
+                shared_memory_size,
+                &[
+                    $(core::ptr::from_mut(
+                        &mut $T::async_to_ffi($arg, sealed::Token)?
+                    ).cast::<core::ffi::c_void>()),*
+                ],
+            ) }?;
+
+            crate::utils::r#async::Async::pending(
+                (), stream, crate::utils::r#async::NoCompletion,
+            )
+        }
+    };
+    (impl $func:ident () + ($($other:expr),*) $inner:block) => {
+        $inner
+    };
+    (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => {
+        // continuation-passing fold: wrap the first parameter, then recurse over the rest
+        $T0::$func($arg0 $(, $other)*, |$arg0: <$T0 as CudaKernelParameter>::AsyncHostType<'stream, '_>| {
+            impl_typed_kernel_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner }
+        })
+    };
+    (impl $func:ident ref () + ($($other:expr),*) $inner:block) => {
+        $inner
+    };
+    (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => {
+        $T0::$func(&$arg0 $(, $other)*, |$arg0| {
+            impl_typed_kernel_launch! { impl $func ref ($($arg: $T),*) + ($($other),*) $inner }
+        })
+    };
+}
+
+#[cfg(feature = "host")]
+impl<Kernel> TypedPtxKernel<Kernel> {
+    #[must_use]
+    pub fn new<T: CompiledKernelPtx<Kernel>>(configure: Option<Box<PtxKernelConfigure>>) -> Self {
+        let compiler = PtxJITCompiler::new(T::get_ptx());
+        let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str();
+
+        Self {
+            compiler,
+            ptx_kernel: None,
+            entry_point,
+            configure,
+            marker: PhantomData::<Kernel>,
+        }
+    }
+}
+
+#[cfg(feature = "host")]
+impl<Kernel> TypedPtxKernel<Kernel> {
+    impl_typed_kernel_launch! { launch0() => with0_async => launch0_async }
+
+    impl_typed_kernel_launch! { launch1(
+        arg1: A
+    ) => with1_async => launch1_async }
+
+    impl_typed_kernel_launch! { launch2(
+        arg1: A, arg2: B
+    ) => with2_async => launch2_async }
+
+    impl_typed_kernel_launch! { launch3(
+        arg1: A, arg2: B, arg3: C
+    ) => with3_async => launch3_async }
+
+    impl_typed_kernel_launch! { launch4(
+        arg1: A, arg2: B, arg3: C, arg4: D
+    ) => with4_async => launch4_async }
+
+    impl_typed_kernel_launch! { launch5(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E
+    ) => with5_async => launch5_async }
+
+    impl_typed_kernel_launch! { launch6(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F
+    ) => with6_async => launch6_async }
+
+    impl_typed_kernel_launch! { launch7(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G
+    ) => with7_async => launch7_async }
+
+    impl_typed_kernel_launch! { launch8(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H
+    ) => with8_async => launch8_async }
+
+    impl_typed_kernel_launch! { launch9(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I
+    ) => with9_async => launch9_async }
+
+    impl_typed_kernel_launch! { launch10(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J
+    ) => with10_async => launch10_async }
+
+    impl_typed_kernel_launch! { launch11(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J,
+        arg11: K
+    ) => with11_async => launch11_async }
+
+    impl_typed_kernel_launch! { launch12(
+        arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J,
+        arg11: K, arg12: L
+    ) => with12_async => launch12_async }
+
+    /// # Errors
+    ///
+    /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to
+    /// [`Self::new`] is not a valid PTX source or does not contain the
+    /// entry point it declares.
+    fn compile_with_ptx_jit_args(
+        &mut self,
+        arguments: Option<&[Option<&NonNull<[u8]>>]>,
+    ) -> CudaResult<&Function> {
+        let ptx_jit = self.compiler.with_arguments(arguments);
+
+        let kernel_jit = match (&mut self.ptx_kernel, ptx_jit) {
+            (Some(ptx_kernel), PtxJITResult::Cached(_)) => ptx_kernel.get_function(),
+            (ptx_kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => {
+                let recomputed_ptx_kernel = RawPtxKernel::new(ptx_cstr, &self.entry_point)?;
+
+                // Replace the existing compiled kernel, drop the old one
+                let ptx_kernel = ptx_kernel.insert(recomputed_ptx_kernel);
+
+                let function = ptx_kernel.get_function();
+
+                if let Some(configure) = self.configure.as_mut() {
+                    configure(function)?;
+                }
+
+                function
+            },
+        };
+
+        Ok(kernel_jit)
+    }
+}
+
+#[cfg(feature = "host")]
+/// # Safety
+///
+/// The PTX string returned by [`CompiledKernelPtx::get_ptx`] must correspond
+/// to the compiled kernel code for the `Kernel` function and contain a kernel
+/// entry point whose name is returned by
+/// [`CompiledKernelPtx::get_entry_point`].
+///
+/// This trait should not be implemented manually – use the
+/// [`kernel`] macro instead.
+pub unsafe trait CompiledKernelPtx<Kernel> {
+    fn get_ptx() -> &'static CStr;
+    fn get_entry_point() -> &'static CStr;
+}
diff --git a/src/kernel/param.rs b/src/kernel/param.rs
new file mode 100644
index 000000000..c87148c7a
--- /dev/null
+++ b/src/kernel/param.rs
@@ -0,0 +1,1224 @@
+#[cfg(feature = "device")]
+use core::convert::AsRef;
+use core::{
+    marker::PhantomData,
+    ops::{Deref, DerefMut},
+};
+
+#[cfg(feature = "host")]
+use std::{alloc::Layout, ptr::NonNull};
+
+use const_type_layout::TypeGraphLayout;
+
+use crate::{
+    alloc::EmptyCudaAlloc,
+    kernel::{sealed, CudaKernelParameter},
+    lend::RustToCuda,
+    safety::{PortableBitSemantics, SafeMutableAliasing},
+    utils::ffi::{DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef},
+};
+
+pub struct PtxJit<T> {
+    never: !,
+    _marker: PhantomData<T>,
+}
+
+impl<T> Deref for PtxJit<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.never
+    }
+}
+
+impl<T> DerefMut for PtxJit<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.never
+    }
+}
+
+pub struct PerThreadShallowCopy<
+    T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
+> {
+    never: !,
+    _marker: PhantomData<T>,
+}
+
+impl<T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout> Deref
+    for PerThreadShallowCopy<T>
+{
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.never
+    }
+}
+
+impl<T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout> DerefMut
+    for PerThreadShallowCopy<T>
+{
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.never
+    }
+}
+
+impl<
+        T: Copy
+            + Send
+            + crate::safety::StackOnly
+            + crate::safety::PortableBitSemantics
+            + TypeGraphLayout,
+    > CudaKernelParameter for PerThreadShallowCopy<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = T where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = T where Self: 'b;
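+    // `FfiType` is the FFI-safe form in which this parameter actually crosses
+    // the host/device boundary; for a by-value parameter this is just `T`
+    // wrapped in an adapter that asserts its bit-for-bit portability.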
+    type FfiType<'stream, 'b> =
+        crate::utils::adapter::RustToCudaWithPortableBitCopySemantics<T> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        _stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        inner.with(param)
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        Ok(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param))
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        let param = param.into_inner();
+
+        inner.with(param)
+    }
+}
+impl<
+        T: Copy
+            + Send
+            + crate::safety::StackOnly
+            + crate::safety::PortableBitSemantics
+            + TypeGraphLayout,
+    > sealed::Sealed for PerThreadShallowCopy<T>
+{
+}
+
+impl<
+        'a,
+        T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
+    > CudaKernelParameter for &'a PerThreadShallowCopy<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj<
+        'b,
+        'stream,
+        crate::host::HostAndDeviceConstRef<'b, T>,
+    > where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b T where Self: 'b;
+    type FfiType<'stream, 'b> = DeviceConstRef<'b, T> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = &'a T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        let _ = stream;
+        crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| {
+            inner.with(unsafe { crate::utils::r#async::AsyncProj::new(const_ref, None) })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        let param = unsafe { param.unwrap_unchecked() };
+        Ok(param.for_device())
+    }
+
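+    // On the device, the FFI reference is turned back into the `&T` that the
+    // kernel body sees; no data is copied here, the pointer already refers to
+    // memory that is accessible from the device.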
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        let param = param.as_ref();
+
+        inner.with(param)
+    }
+}
+impl<
+        'a,
+        T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
+    > sealed::Sealed for &'a PerThreadShallowCopy<T>
+{
+}
+
+impl<
+        'a,
+        T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
+    > CudaKernelParameter for &'a PtxJit<PerThreadShallowCopy<T>>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> =
+        <&'a PerThreadShallowCopy<T> as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = <&'a PerThreadShallowCopy<T> as CudaKernelParameter>::DeviceType<'b> where Self: 'b;
+    type FfiType<'stream, 'b> =
+        <&'a PerThreadShallowCopy<T> as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = <&'a PerThreadShallowCopy<T> as CudaKernelParameter>::SyncHostType;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        let _ = stream;
+        // FIXME: forward impl
+        crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| {
+            inner.with(unsafe { crate::utils::r#async::AsyncProj::new(const_ref, None) })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        let param_ref = param.proj_ref();
+        let param = unsafe { param_ref.unwrap_ref_unchecked() };
+        inner(Some(&param_as_raw_bytes(param.for_host())))
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        <&'a PerThreadShallowCopy<T> as CudaKernelParameter>::async_to_ffi(param, token)
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref());
+
+        // FIXME: forward impl
+        let param = param.as_ref();
+
+        inner.with(param)
+    }
+}
+impl<
+        'a,
+        T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
+    > sealed::Sealed for &'a PtxJit<PerThreadShallowCopy<T>>
+{
+}
+
+pub struct ShallowInteriorMutable<
+    T: Sync
+        + crate::safety::StackOnly
+        + crate::safety::PortableBitSemantics
+        + TypeGraphLayout
+        + InteriorMutableSync,
+> {
+    never: !,
+    _marker: PhantomData<T>,
+}
+
+impl<
+        T: Sync
+            + crate::safety::StackOnly
+            + crate::safety::PortableBitSemantics
+            + TypeGraphLayout
+            + InteriorMutableSync,
+    > Deref for ShallowInteriorMutable<T>
+{
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.never
+    }
+}
+
+impl<
+        'a,
+        T: Sync
+            + crate::safety::StackOnly
+            + crate::safety::PortableBitSemantics
+            + TypeGraphLayout
+            + InteriorMutableSync,
+    > CudaKernelParameter for &'a ShallowInteriorMutable<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj<
+        'b,
+        'stream,
+        crate::host::HostAndDeviceConstRef<'b, T>
+    > where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b T where Self: 'b;
+    type FfiType<'stream, 'b> = DeviceConstRef<'b, T> where Self: 'b;
+    #[cfg(feature = "host")]
+    /// The kernel takes a mutable borrow of the interior mutable data to ensure
+    /// the interior mutability is limited to just this kernel invocation.
+    type SyncHostType = &'a mut T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        let _ = stream;
+        crate::host::HostAndDeviceMutRef::with_new(param, |mut_ref| {
+            inner.with(unsafe { crate::utils::r#async::AsyncProj::new(mut_ref.as_ref(), None) })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        let param = unsafe { param.unwrap_unchecked() };
+        Ok(param.for_device())
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        let param = param.as_ref();
+
+        inner.with(param)
+    }
+}
+impl<
+        'a,
+        T: crate::safety::StackOnly
+            + Sync
+            + crate::safety::PortableBitSemantics
+            + TypeGraphLayout
+            + InteriorMutableSync,
+    > sealed::Sealed for &'a ShallowInteriorMutable<T>
+{
+}
+
+pub trait InteriorMutableSync: Sync + sealed::Sealed {}
+
+macro_rules! impl_atomic_interior_mutable {
+    ($atomic:ident($interior:ty)) => {
+        impl InteriorMutableSync for core::sync::atomic::$atomic {}
+        impl sealed::Sealed for core::sync::atomic::$atomic {}
+    };
+    ($($atomic:ident($interior:ty)),*) => {
+        $(impl_atomic_interior_mutable! { $atomic($interior) })*
+    }
+}
+
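+// The invocation below marks the `core::sync::atomic` types as usable behind
+// `ShallowInteriorMutable`: they are `Sync` and may be mutated through a
+// shared reference by every thread of the kernel. A hedged device-side
+// sketch (the `kernel` attribute and the argument name are illustrative):
+//
+//     #[kernel]
+//     fn count(counter: &ShallowInteriorMutable<AtomicU32>) {
+//         counter.fetch_add(1, core::sync::atomic::Ordering::Relaxed);
+//     }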
+impl_atomic_interior_mutable! {
+    AtomicBool(bool),
+    AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize),
+    AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize)
+}
+
+impl<T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout> InteriorMutableSync
+    for core::cell::SyncUnsafeCell<T>
+{
+}
+impl<T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout> sealed::Sealed
+    for core::cell::SyncUnsafeCell<T>
+{
+}
+
+pub struct DeepPerThreadBorrow<T: RustToCuda> {
+    never: !,
+    _marker: PhantomData<T>,
+}
+
+impl<T: RustToCuda> Deref for DeepPerThreadBorrow<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.never
+    }
+}
+
+impl<
+        T: Send
+            + Clone
+            + RustToCuda<CudaAllocation: EmptyCudaAlloc>,
+    > CudaKernelParameter for DeepPerThreadBorrow<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async<
+        'b,
+        'stream,
+        crate::host::HostAndDeviceOwned<
+            'b,
+            DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+        >,
+        crate::utils::r#async::NoCompletion,
+    > where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = T where Self: 'b;
+    type FfiType<'stream, 'b> =
+        DeviceOwnedRef<'b, DeviceAccessible<<T as RustToCuda>::CudaRepresentation>> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream)))
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        let (param, _completion): (_, Option<crate::utils::r#async::NoCompletion>) =
+            unsafe { param.unwrap_unchecked()? };
+        Ok(param.for_device())
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        unsafe {
+            crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param))
+        }
+    }
+}
+impl<
+        T: Send
+            + Clone
+            + RustToCuda<CudaAllocation: EmptyCudaAlloc>,
+    > sealed::Sealed for DeepPerThreadBorrow<T>
+{
+}
+
+impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow<T> {
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj<
+        'b,
+        'stream,
+        crate::host::HostAndDeviceConstRef<
+            'b,
+            DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+        >,
+    > where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b T where Self: 'b;
+    type FfiType<'stream, 'b> =
+        DeviceConstRef<'b, DeviceAccessible<<T as RustToCuda>::CudaRepresentation>> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = &'a T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        let _ = stream;
+        crate::lend::LendToCuda::lend_to_cuda(param, |param| {
+            inner.with(unsafe { crate::utils::r#async::AsyncProj::new(param, None) })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        let param = unsafe { param.unwrap_unchecked() };
+        Ok(param.for_device())
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        unsafe {
+            crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param))
+        }
+    }
+}
+impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow<T> {}
+
+impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
+    for &'a mut DeepPerThreadBorrow<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj<
+        'b,
+        'stream,
+        crate::host::HostAndDeviceMutRef<
+            'b,
+            DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+        >,
+    > where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b mut T where Self: 'b;
+    type FfiType<'stream, 'b> =
+        DeviceMutRef<'b, DeviceAccessible<<T as RustToCuda>::CudaRepresentation>> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = &'a mut T;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
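+        // A mutable borrow is projected onto the launch stream here and is
+        // only released again once `lend_to_cuda_mut` returns, i.e. after the
+        // kernel has completed on this stream; this prevents the same mutable
+        // data from being in flight on two streams at once.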
+        crate::lend::LendToCuda::lend_to_cuda_mut(param, |param| {
+            // FIXME: express the same with param.into_async(stream).as_mut()
+            let _ = stream;
+            inner.with({
+                // Safety: this projection cannot be moved to a different stream
+                //         without first exiting lend_to_cuda_mut and synchronizing
+                unsafe { crate::utils::r#async::AsyncProj::new(param.into_mut(), None) }
+            })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        mut param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        param.record_mut_use()?;
+        let mut param = unsafe { param.unwrap_unchecked() };
+        Ok(param.for_device())
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        unsafe {
+            crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param))
+        }
+    }
+}
+impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed
+    for &'a mut DeepPerThreadBorrow<T>
+{
+}
+
+impl<
+        T: Send
+            + Clone
+            + RustToCuda<CudaAllocation: EmptyCudaAlloc>,
+    > CudaKernelParameter for PtxJit<DeepPerThreadBorrow<T>>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> =
+        <DeepPerThreadBorrow<T> as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = <DeepPerThreadBorrow<T> as CudaKernelParameter>::DeviceType<'b> where Self: 'b;
+    type FfiType<'stream, 'b> =
+        <DeepPerThreadBorrow<T> as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = <DeepPerThreadBorrow<T> as CudaKernelParameter>::SyncHostType;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        // FIXME: forward impl
+        crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream)))
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        let param = unsafe { param.as_ref().unwrap_unchecked() };
+        inner(Some(&param_as_raw_bytes(param.for_host())))
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        <DeepPerThreadBorrow<T> as CudaKernelParameter>::async_to_ffi(param, token)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
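+        // Emit the PTX JIT marker for this parameter before any real use so
+        // that the host-side compiler can later recognise the parameter's
+        // loads in the generated PTX and specialise them to constants.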
+        emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref());
+
+        // FIXME: forward impl
+        unsafe {
+            crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param))
+        }
+    }
+}
+impl<
+        T: Send
+            + Clone
+            + RustToCuda<CudaAllocation: EmptyCudaAlloc>,
+    > sealed::Sealed for PtxJit<DeepPerThreadBorrow<T>>
+{
+}
+
+impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit<DeepPerThreadBorrow<T>> {
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> =
+        <&'a DeepPerThreadBorrow<T> as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = <&'a DeepPerThreadBorrow<T> as CudaKernelParameter>::DeviceType<'b> where Self: 'b;
+    type FfiType<'stream, 'b> =
+        <&'a DeepPerThreadBorrow<T> as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = <&'a DeepPerThreadBorrow<T> as CudaKernelParameter>::SyncHostType;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        // FIXME: forward impl
+        let _ = stream;
+        crate::lend::LendToCuda::lend_to_cuda(param, |param| {
+            inner.with(unsafe { crate::utils::r#async::AsyncProj::new(param, None) })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        let param_ref = param.proj_ref();
+        let param = unsafe { param_ref.unwrap_unchecked() };
+        inner(Some(&param_as_raw_bytes(param.for_host())))
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        <&'a DeepPerThreadBorrow<T> as CudaKernelParameter>::async_to_ffi(param, token)
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref());
+
+        // FIXME: forward impl
+        unsafe {
+            crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param))
+        }
+    }
+}
+impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a PtxJit<DeepPerThreadBorrow<T>> {}
+
+impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
+    for &'a mut PtxJit<DeepPerThreadBorrow<T>>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> =
+        <&'a mut DeepPerThreadBorrow<T> as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = <&'a mut DeepPerThreadBorrow<T> as CudaKernelParameter>::DeviceType<'b> where Self: 'b;
+    type FfiType<'stream, 'b> =
+        <&'a mut DeepPerThreadBorrow<T> as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = <&'a mut DeepPerThreadBorrow<T> as CudaKernelParameter>::SyncHostType;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        // FIXME: forward impl
+        crate::lend::LendToCuda::lend_to_cuda_mut(param, |param| {
+            // FIXME: express the same with param.as_async(stream).as_mut()
+            let _ = stream;
+            inner.with({
+                // Safety: this projection cannot be moved to a different stream
+                //         without first exiting lend_to_cuda_mut and synchronizing
+                unsafe { crate::utils::r#async::AsyncProj::new(param.into_mut(), None) }
+            })
+        })
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        let param_ref = param.proj_ref();
+        let param = unsafe { param_ref.unwrap_unchecked() };
+        inner(Some(&param_as_raw_bytes(param.for_host())))
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        <&'a mut DeepPerThreadBorrow<T> as CudaKernelParameter>::async_to_ffi(param, token)
+    }
+
+    #[cfg(feature = "device")]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref());
+
+        // FIXME: forward impl
+        unsafe {
+            crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param))
+        }
+    }
+}
+impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed
+    for &'a mut PtxJit<DeepPerThreadBorrow<T>>
+{
+}
+
+#[cfg(feature = "host")]
+fn param_as_raw_bytes<T: ?Sized>(r: &T) -> NonNull<[u8]> {
+    NonNull::slice_from_raw_parts(NonNull::from(r).cast::<u8>(), core::mem::size_of_val(r))
+}
+
+#[cfg(feature = "device")]
+fn emit_param_ptx_jit_marker<T: ?Sized, const INDEX: usize>(param: &T) {
+    // a dummy 32-bit load of the parameter, tagged with a comment that the
+    // host-side PTX JIT compiler scrapes back out of the generated PTX
+    unsafe {
+        core::arch::asm!(
+            "// <rust-cuda-ptx-jit-const-load-{param_reg}-{param_index}> //",
+            param_reg = in(reg32) *(core::ptr::from_ref(param).cast::<u32>()),
+            param_index = const(INDEX),
+        );
+    }
+}
+
+mod private_shared {
+    use core::marker::PhantomData;
+
+    use const_type_layout::{TypeGraphLayout, TypeLayout};
+
+    use crate::safety::PortableBitSemantics;
+
+    #[doc(hidden)]
+    #[derive(TypeLayout)]
+    #[repr(C)]
+    pub struct ThreadBlockSharedFfi<T: 'static> {
+        pub(super) _dummy: [u8; 0],
+        pub(super) _marker: PhantomData<T>,
+    }
+
+    #[doc(hidden)]
+    #[derive(TypeLayout)]
+    #[repr(C)]
+    pub struct ThreadBlockSharedSliceFfi<T: 'static + PortableBitSemantics + TypeGraphLayout> {
+        pub(super) len: usize,
+        pub(super) _marker: [T; 0],
+    }
+}
+
+impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared<T> {
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared<T> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared<T> where Self: 'b;
+    type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi<T> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = Self;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        _stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        inner.with(param)
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        Layout::new::<()>()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        _param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        Ok(private_shared::ThreadBlockSharedFfi {
+            _dummy: [],
+            _marker: PhantomData::<T>,
+        })
+    }
+
+    #[cfg(feature = "device")]
+    #[allow(clippy::inline_always)]
+    #[inline(always)]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        _param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        let mut param = crate::utils::shared::ThreadBlockShared::new_uninit();
+
+        inner.with(&mut param)
+    }
+}
+impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared<T> {}
+
+impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter
+    for &'a mut crate::utils::shared::ThreadBlockSharedSlice<T>
+{
+    #[cfg(feature = "host")]
+    type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice<T> where Self: 'b;
+    #[cfg(any(feature = "device", doc))]
+    type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice<T> where Self: 'b;
+    type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi<T> where Self: 'b;
+    #[cfg(feature = "host")]
+    type SyncHostType = Self;
+
+    #[cfg(feature = "host")]
+    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+        param: Self::SyncHostType,
+        _stream: crate::host::Stream<'stream>,
+        #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
+        #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result<O, E>,
+    ) -> Result<O, E>
+    where
+        Self: 'b,
+    {
+        inner.with(param)
+    }
+
+    #[cfg(feature = "host")]
+    fn with_async_as_ptx_jit<'stream, 'b, O>(
+        _param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+        inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O,
+    ) -> O
+    where
+        Self: 'b,
+    {
+        inner(None)
+    }
+
+    #[cfg(feature = "host")]
+    fn shared_layout_for_async<'stream, 'b>(
+        param: &Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Layout
+    where
+        Self: 'b,
+    {
+        param.layout()
+    }
+
+    #[cfg(feature = "host")]
+    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+        param: Self::AsyncHostType<'stream, 'b>,
+        _token: sealed::Token,
+    ) -> Result<Self::FfiType<'stream, 'b>, E>
+    where
+        Self: 'b,
+    {
+        Ok(private_shared::ThreadBlockSharedSliceFfi {
+            len: param.len(),
+            _marker: [],
+        })
+    }
+
+    #[cfg(feature = "device")]
+    #[allow(clippy::inline_always)]
+    #[inline(always)]
+    unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>(
+        param: Self::FfiType<'static, 'short>,
+        inner: impl super::WithFfiAsDevice<Self, O>,
+    ) -> O
+    where
+        Self: 'short,
+    {
+        unsafe {
+            crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, |param| {
+                inner.with(param)
+            })
+        }
+    }
+}
+impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed
+    for &'a mut crate::utils::shared::ThreadBlockSharedSlice<T>
+{
+}
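+
+// `ThreadBlockShared<T>` and `ThreadBlockSharedSlice<T>` map to statically
+// and dynamically sized CUDA shared memory respectively; only the slice
+// variant contributes to the dynamic shared-memory size that a launch
+// reserves via `shared_layout_for_async`. A hedged device-side sketch (the
+// `kernel` attribute and argument name are illustrative):
+//
+//     #[kernel]
+//     fn scan(scratch: &mut ThreadBlockSharedSlice<u32>) {
+//         // each thread block sees one uninitialised `scratch` slice of the
+//         // length that was configured on the host at launch time
+//     }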
diff --git a/rust-cuda-ptx-jit/src/host/compiler/mod.rs b/src/kernel/ptx_jit/mod.rs
similarity index 92%
rename from rust-cuda-ptx-jit/src/host/compiler/mod.rs
rename to src/kernel/ptx_jit/mod.rs
index 156e8223c..43c555ab2 100644
--- a/rust-cuda-ptx-jit/src/host/compiler/mod.rs
+++ b/src/kernel/ptx_jit/mod.rs
@@ -6,7 +6,6 @@ mod replace;
 
 type ByteSliceOptionalArguments = Option<Box<[Option<Box<[u8]>>]>>;
 
-#[doc(cfg(feature = "host"))]
 #[allow(clippy::module_name_repetitions)]
 pub struct PtxJITCompiler {
     ptx_slices: Box<[PtxElement]>,
@@ -14,7 +13,6 @@ pub struct PtxJITCompiler {
     last_ptx: CString,
 }
 
-#[doc(cfg(feature = "host"))]
 pub enum PtxJITResult<'s> {
     Cached(&'s CStr),
     Recomputed(&'s CStr),
diff --git a/rust-cuda-ptx-jit/src/host/compiler/preprocess.rs b/src/kernel/ptx_jit/preprocess.rs
similarity index 93%
rename from rust-cuda-ptx-jit/src/host/compiler/preprocess.rs
rename to src/kernel/ptx_jit/preprocess.rs
index 0ee17733f..c22cf63e9 100644
--- a/rust-cuda-ptx-jit/src/host/compiler/preprocess.rs
+++ b/src/kernel/ptx_jit/preprocess.rs
@@ -5,7 +5,7 @@ use std::{
 
 use super::{
     regex::{
-        CONST_BASE_REGISTER_REGEX, CONST_LOAD_INSTRUCTION_REGEX, CONST_MARKER_REGEX, REGISTER_REGEX,
+        const_base_register_regex, const_load_instruction_regex, const_marker_regex, register_regex,
     },
     PtxElement, PtxJITCompiler, PtxLoadWidth,
 };
@@ -19,7 +19,7 @@ impl PtxJITCompiler {
         let mut const_markers: HashMap<&[u8], usize> = HashMap::new();
 
         // Find injected rust-cuda-const-markers which identify dummy register rxx
-        for const_marker in CONST_MARKER_REGEX.captures_iter(ptx) {
+        for const_marker in const_marker_regex().captures_iter(ptx) {
             if let Some(tmpreg) = const_marker.name("tmpreg").map(|s| s.as_bytes()) {
                 if let Some(param) = const_marker
                     .name("param")
@@ -36,7 +36,7 @@ impl PtxJITCompiler {
         let mut const_base_registers: HashMap<&[u8], usize> = HashMap::new();
 
         // Find base register ryy which was used in `ld.global.u32 rxx, [ryy];`
-        for const_base_register in CONST_BASE_REGISTER_REGEX.captures_iter(ptx) {
+        for const_base_register in const_base_register_regex().captures_iter(ptx) {
             if let Some(tmpreg) = const_base_register.name("tmpreg").map(|s| s.as_bytes()) {
                 if let Some(param) = const_markers.get(tmpreg) {
                     if let Some(basereg) = const_base_register.name("basereg").map(|s| s.as_bytes())
@@ -54,7 +54,7 @@ impl PtxJITCompiler {
         let mut ptx_slices: Vec<PtxElement> = Vec::new();
 
        // Iterate over all load from base register with offset instructions
-        for const_load_instruction in CONST_LOAD_INSTRUCTION_REGEX.captures_iter(ptx) {
+        for const_load_instruction in const_load_instruction_regex().captures_iter(ptx) {
            // Only consider instructions where the base register is ryy
            if let Some(basereg) = const_load_instruction.name("basereg").map(|s| s.as_bytes()) {
                if let Some(param) = const_base_registers.get(basereg) {
@@ -100,7 +100,7 @@ impl PtxJITCompiler {
                             parameter_index: *param,
                             byte_offset: loadoffset,
                             load_width: loadwidth,
-                            registers: REGISTER_REGEX
+                            registers: register_regex()
                                 .captures_iter(constreg)
                                 .filter_map(|m| {
                                     m.name("register").map(|s| {
diff --git a/src/kernel/ptx_jit/regex.rs b/src/kernel/ptx_jit/regex.rs
new file mode 100644
index 000000000..58406b01e
--- /dev/null
+++ b/src/kernel/ptx_jit/regex.rs
@@ -0,0 +1,58 @@
+use std::sync::OnceLock;
+
+use regex::bytes::Regex;
+
+#[allow(clippy::module_name_repetitions)]
+pub fn const_marker_regex() -> &'static Regex {
+    static CONST_MARKER_REGEX: OnceLock<Regex> = OnceLock::new();
+    CONST_MARKER_REGEX.get_or_init(|| {
+        Regex::new(r"(?-u)// <rust-cuda-ptx-jit-const-load-(?P<tmpreg>%r\d+)-(?P<param>\d+)> //")
+            .unwrap()
+    })
+}
+
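+// The markers emitted by `emit_param_ptx_jit_marker` show up in the compiled
+// PTX as comments of the shape `// <rust-cuda-ptx-jit-const-load-%rN-M> //`,
+// next to a dummy `ld.global.u32 %rN, [%rdK];` of the parameter pointer.
+// The regexes below scrape exactly these two shapes, plus the real loads
+// through the same base register, so that the JIT can replace those loads
+// with immediate constants.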
+#[allow(clippy::module_name_repetitions)]
+pub fn const_base_register_regex() -> &'static Regex {
+    static CONST_BASE_REGISTER_REGEX: OnceLock<Regex> = OnceLock::new();
+    CONST_BASE_REGISTER_REGEX.get_or_init(|| {
+        Regex::new(r"(?-u)ld\.global\.u32\s*(?P<tmpreg>%r\d+)\s*,\s*\[(?P<basereg>%r[ds]?\d+)]\s*;")
+            .unwrap()
+    })
+}
+
+#[allow(clippy::module_name_repetitions)]
+pub fn const_load_instruction_regex() -> &'static Regex {
+    static CONST_LOAD_INSTRUCTION_REGEX: OnceLock<Regex> = OnceLock::new();
+    CONST_LOAD_INSTRUCTION_REGEX.get_or_init(|| {
+        Regex::new(
+            r"(?x-u)(?P<instruction>
+                ld\.global
+                (?:\.(?P<vector>v[24]))?
+                \.
+                (?P<loadtype>[suf])
+                (?P<loadwidth>8|16|32|64)
+                \s*
+                (?P<constreg>
+                    (?:%[rf][sd]?\d+) |
+                    (?:\{(?:\s*%[rf][sd]?\d+,)*\s*%[rf][sd]?\d+\s*\})
+                )
+                ,\s*
+                \[
+                (?P<basereg>%r[ds]?\d+)
+                (?:
+                    \+
+                    (?P<loadoffset>\d+)
+                )?
+                \]
+                \s*;
+            )",
+        )
+        .unwrap()
+    })
+}
+
+#[allow(clippy::module_name_repetitions)]
+pub fn register_regex() -> &'static Regex {
+    static REGISTER_REGEX: OnceLock<Regex> = OnceLock::new();
+    REGISTER_REGEX.get_or_init(|| Regex::new(r"(?-u)(?P<register>%[rf][sd]?\d+)").unwrap())
+}
diff --git a/rust-cuda-ptx-jit/src/host/compiler/replace.rs b/src/kernel/ptx_jit/replace.rs
similarity index 96%
rename from rust-cuda-ptx-jit/src/host/compiler/replace.rs
rename to src/kernel/ptx_jit/replace.rs
index df4d270b8..97a592da9 100644
--- a/rust-cuda-ptx-jit/src/host/compiler/replace.rs
+++ b/src/kernel/ptx_jit/replace.rs
@@ -1,10 +1,10 @@
-use std::{ffi::CString, ops::Deref};
+use std::{ffi::CString, ops::Deref, ptr::NonNull};
 
 use super::{PtxElement, PtxJITCompiler, PtxJITResult, PtxLoadWidth};
 
 impl PtxJITCompiler {
     #[allow(clippy::too_many_lines)]
-    pub fn with_arguments(&mut self, arguments: Option<&[Option<&[u8]>]>) -> PtxJITResult {
+    pub fn with_arguments(&mut self, arguments: Option<&[Option<&NonNull<[u8]>>]>) -> PtxJITResult {
         // Check if the arguments, cast as byte slices, are the same as the last cached
         // ones
         #[allow(clippy::explicit_deref_methods)]
@@ -16,7 +16,7 @@ impl PtxJITCompiler {
                     .zip(last_arguments.iter())
                     .all(|(a, b)| match (a, b) {
                         (None, None) => false,
-                        (Some(a), Some(b)) => *a != b.deref(),
+                        (Some(a), Some(b)) => (unsafe { a.as_ref() }) != b.deref(),
                         _ => true,
                     })
             },
@@ -30,7 +30,9 @@ impl PtxJITCompiler {
         self.last_arguments = arguments.map(|arguments| {
             arguments
                 .iter()
-                .map(|arg| arg.map(|bytes| bytes.to_owned().into_boxed_slice()))
+                .map(|arg| {
+                    arg.map(|bytes| unsafe { bytes.as_ref() }.to_owned().into_boxed_slice())
+                })
                 .collect::<Vec<Option<Box<[u8]>>>>()
                 .into_boxed_slice()
         });
diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs
new file mode 100644
index 000000000..b4cec19cd
--- /dev/null
+++ b/src/lend/impls/box.rs
@@ -0,0 +1,173 @@
+#[cfg(feature = "host")]
+use std::mem::ManuallyDrop;
+
+use const_type_layout::{TypeGraphLayout, TypeLayout};
+
+#[cfg(feature = "host")]
+use rustacuda::{error::CudaResult, memory::DeviceBox, memory::LockedBox};
+
+use crate::{
+    deps::alloc::boxed::Box,
+    lend::{CudaAsRust, RustToCuda, RustToCudaAsync},
+    safety::PortableBitSemantics,
+    utils::ffi::DeviceOwnedPointer,
+};
+
+#[cfg(any(feature = "host", feature = "device"))]
+use crate::utils::ffi::DeviceAccessible;
+
+#[cfg(feature = "host")]
+use crate::{
+    alloc::{CombinedCudaAlloc, CudaAlloc},
+    host::CudaDropWrapper,
+    utils::adapter::DeviceCopyWithPortableBitSemantics,
+    utils::r#async::Async,
+    utils::r#async::CompletionFnMut,
+    utils::r#async::NoCompletion,
+};
+
+#[doc(hidden)]
+#[repr(transparent)]
+#[derive(TypeLayout)]
+#[allow(clippy::module_name_repetitions)]
+pub struct BoxCudaRepresentation<T: PortableBitSemantics + TypeGraphLayout>(DeviceOwnedPointer<T>);
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Box<T> {
+    #[cfg(all(feature = "host", not(doc)))]
+    type CudaAllocation = CudaDropWrapper<DeviceBox<DeviceCopyWithPortableBitSemantics<T>>>;
+    #[cfg(any(not(feature = "host"), doc))]
+    type CudaAllocation = crate::alloc::SomeCudaAlloc;
+    type CudaRepresentation = BoxCudaRepresentation<T>;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        CombinedCudaAlloc<Self::CudaAllocation, A>,
+    )> {
+        let mut device_box = CudaDropWrapper::from(DeviceBox::new(
+            DeviceCopyWithPortableBitSemantics::from_ref(&**self),
+        )?);
+
+        Ok((
+            DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer(
+                device_box.as_device_ptr().as_raw_mut().cast(),
+            ))),
+            CombinedCudaAlloc::new(device_box, alloc),
+        ))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore<A: CudaAlloc>(
+        &mut self,
+        alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
+    ) -> CudaResult<A> {
+        use rustacuda::memory::CopyDestination;
+
+        let (alloc_front, alloc_tail) = alloc.split();
+
+        alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(&mut **self))?;
+
+        core::mem::drop(alloc_front);
+
+        Ok(alloc_tail)
+    }
+}
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Box<T> {
+    #[cfg(all(feature = "host", not(doc)))]
+    type CudaAllocationAsync = CombinedCudaAlloc<
+        CudaDropWrapper<LockedBox<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>>,
+        CudaDropWrapper<DeviceBox<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>>,
+    >;
+    #[cfg(any(not(feature = "host"), doc))]
+    type CudaAllocationAsync = crate::alloc::SomeCudaAlloc;
+
+    #[cfg(feature = "host")]
+    unsafe fn borrow_async<'stream, A: CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
+        CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+    )> {
+        use rustacuda::memory::AsyncCopyDestination;
+
+        let locked_box = unsafe {
+            let mut uninit = CudaDropWrapper::from(LockedBox::<
+                DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
+            >::uninitialized()?);
+            std::ptr::copy_nonoverlapping(
+                std::ptr::from_ref::<T>(&**self)
+                    .cast::<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>(),
+                uninit.as_mut_ptr(),
+                1,
+            );
+            uninit
+        };
+
+        let mut device_box = CudaDropWrapper::from(DeviceBox::<
+            DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
+        >::uninitialized()?);
+        device_box.async_copy_from(&*locked_box, &stream)?;
+
+        Ok((
+            Async::pending(
+                DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer(
+                    device_box.as_device_ptr().as_raw_mut().cast(),
+                ))),
+                stream,
+                NoCompletion,
+            )?,
+            CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_box, device_box), alloc),
+        ))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>(
+        this: owning_ref::BoxRefMut<'a, O, Self>,
+        alloc: CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+        stream: crate::host::Stream<'stream>,
+    ) -> CudaResult<(
+        Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
+        A,
+    )> {
+        use rustacuda::memory::AsyncCopyDestination;
+
+        let (alloc_front, alloc_tail) = alloc.split();
+        let (mut locked_box, device_box) = alloc_front.split();
+
+        device_box.async_copy_to(&mut *locked_box, &stream)?;
+
+        let r#async = crate::utils::r#async::Async::<_, CompletionFnMut<'a, Self>>::pending(
+            this,
+            stream,
+            Box::new(move |this: &mut Self| {
+                let data: &mut T = &mut *this;
+                std::mem::drop(device_box);
+                // Safety: equivalent to *data = *locked_box since
+                //         LockedBox<ManuallyDrop<T>> doesn't drop T
+                unsafe {
+                    std::ptr::copy_nonoverlapping(locked_box.as_ptr().cast::<T>(), data, 1);
+                }
+                std::mem::drop(locked_box);
+                Ok(())
+            }),
+        )?;
+
+        Ok((r#async, alloc_tail))
+    }
+}
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for BoxCudaRepresentation<T> {
+    type RustRepresentation = Box<T>;
+
+    #[cfg(feature = "device")]
+    unsafe fn as_rust(this: &DeviceAccessible<Self>) -> Self::RustRepresentation {
+        crate::deps::alloc::boxed::Box::from_raw(this.0 .0)
+    }
+}
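+
+// Design note: `borrow_async` stages the value through a page-locked
+// `LockedBox` so that the host-to-device copy can run truly asynchronously;
+// `restore_async` mirrors this and only commits the bytes back into the
+// `Box` inside its completion closure, once the device-to-host copy has
+// finished on the stream.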
diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs
new file mode 100644
index 000000000..5215d2acf
--- /dev/null
+++ b/src/lend/impls/boxed_slice.rs
@@ -0,0 +1,191 @@
+use core::marker::PhantomData;
+#[cfg(feature = "host")]
+use std::mem::ManuallyDrop;
+
+use crate::{deps::alloc::boxed::Box, lend::RustToCudaAsync, utils::ffi::DeviceOwnedPointer};
+
+use const_type_layout::{TypeGraphLayout, TypeLayout};
+
+#[cfg(feature = "host")]
+use rustacuda::{error::CudaResult, memory::DeviceBuffer, memory::LockedBuffer};
+
+use crate::{
+    lend::{CudaAsRust, RustToCuda},
+    safety::PortableBitSemantics,
+};
+
+#[cfg(any(feature = "host", feature = "device"))]
+use crate::utils::ffi::DeviceAccessible;
+
+#[cfg(feature = "host")]
+use crate::{
+    alloc::{CombinedCudaAlloc, CudaAlloc},
+    host::CudaDropWrapper,
+    utils::adapter::DeviceCopyWithPortableBitSemantics,
+    utils::r#async::{Async, CompletionFnMut, NoCompletion},
+};
+
+#[doc(hidden)]
+#[allow(clippy::module_name_repetitions)]
+#[derive(TypeLayout)]
+#[repr(C)]
+pub struct BoxedSliceCudaRepresentation<T: PortableBitSemantics + TypeGraphLayout> {
+    data: DeviceOwnedPointer<T>,
+    len: usize,
+    _marker: PhantomData<T>,
+}
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Box<[T]> {
+    #[cfg(all(feature = "host", not(doc)))]
+    type CudaAllocation =
+        crate::host::CudaDropWrapper<DeviceBuffer<DeviceCopyWithPortableBitSemantics<T>>>;
+    #[cfg(any(not(feature = "host"), doc))]
+    type CudaAllocation = crate::alloc::SomeCudaAlloc;
+    type CudaRepresentation = BoxedSliceCudaRepresentation<T>;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        CombinedCudaAlloc<Self::CudaAllocation, A>,
+    )> {
+        let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice(
+            DeviceCopyWithPortableBitSemantics::from_slice(self),
+        )?);
+
+        Ok((
+            DeviceAccessible::from(BoxedSliceCudaRepresentation {
+                data: DeviceOwnedPointer(device_buffer.as_mut_ptr().cast()),
+                len: device_buffer.len(),
+                _marker: PhantomData::<T>,
+            }),
+            CombinedCudaAlloc::new(device_buffer, alloc),
+        ))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore<A: CudaAlloc>(
+        &mut self,
+        alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
+    ) -> CudaResult<A> {
+        use rustacuda::memory::CopyDestination;
+
+        let (alloc_front, alloc_tail) = alloc.split();
+
+        alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut_slice(self))?;
+
+        core::mem::drop(alloc_front);
+
+        Ok(alloc_tail)
+    }
+}
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Box<[T]> {
+    #[cfg(all(feature = "host", not(doc)))]
+    type CudaAllocationAsync = CombinedCudaAlloc<
+        CudaDropWrapper<LockedBuffer<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>>,
+        CudaDropWrapper<DeviceBuffer<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>>,
+    >;
+    #[cfg(any(not(feature = "host"), doc))]
+    type CudaAllocationAsync = crate::alloc::SomeCudaAlloc;
+
+    #[cfg(feature = "host")]
+    unsafe fn borrow_async<'stream, A: CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
+        CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+    )> {
+        use rustacuda::memory::AsyncCopyDestination;
+
+        let locked_buffer = unsafe {
+            let mut uninit = CudaDropWrapper::from(LockedBuffer::<
+                DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
+            >::uninitialized(self.len())?);
+            std::ptr::copy_nonoverlapping(
+                self.as_ref()
+                    .as_ptr()
+                    .cast::<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>(),
+                uninit.as_mut_ptr(),
+                self.len(),
+            );
+            uninit
+        };
+
+        let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::<
+            DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
+        >::uninitialized(self.len())?);
+        device_buffer.async_copy_from(&*locked_buffer, &stream)?;
+
+        Ok((
+            Async::pending(
+                DeviceAccessible::from(BoxedSliceCudaRepresentation {
+                    data: DeviceOwnedPointer(device_buffer.as_mut_ptr().cast()),
+                    len: device_buffer.len(),
+                    _marker: PhantomData::<T>,
+                }),
+                stream,
+                NoCompletion,
+            )?,
+            CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_buffer, device_buffer), alloc),
+        ))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>(
+        this: owning_ref::BoxRefMut<'a, O, Self>,
+        alloc: CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+        stream: crate::host::Stream<'stream>,
+    ) -> CudaResult<(
+        Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
+        A,
+    )> {
+        use rustacuda::memory::AsyncCopyDestination;
+
+        let (alloc_front, alloc_tail) = alloc.split();
+        let (mut locked_buffer, device_buffer) = alloc_front.split();
+
+        device_buffer.async_copy_to(&mut *locked_buffer, &stream)?;
+
+        let r#async = crate::utils::r#async::Async::<_, CompletionFnMut<'a, Self>>::pending(
+            this,
+            stream,
+            Box::new(move |this: &mut Self| {
+                let data: &mut [T] = &mut *this;
+                std::mem::drop(device_buffer);
+                // Safety: equivalent to data.copy_from_slice(&*locked_buffer)
+                //         since LockedBuffer<ManuallyDrop<T>> doesn't drop T
+                unsafe {
+                    std::ptr::copy_nonoverlapping(
+                        locked_buffer.as_ptr().cast::<T>(),
+                        data.as_mut_ptr(),
+                        data.len(),
+                    );
+                }
+                std::mem::drop(locked_buffer);
+                Ok(())
+            }),
+        )?;
+
+        Ok((r#async, alloc_tail))
+    }
+}
+
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> CudaAsRust
+    for BoxedSliceCudaRepresentation<T>
+{
+    type RustRepresentation = Box<[T]>;
+
+    #[cfg(feature = "device")]
+    unsafe fn as_rust(this: &DeviceAccessible<Self>) -> Self::RustRepresentation {
+        crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(
+            this.data.0,
+            this.len,
+        ))
+    }
+}
diff --git a/src/lend/impls/final.rs b/src/lend/impls/final.rs
new file mode 100644
index 000000000..5799a77eb
--- /dev/null
+++ b/src/lend/impls/final.rs
@@ -0,0 +1,102 @@
+use r#final::Final;
+
+use crate::{
+    lend::{CudaAsRust, RustToCuda, RustToCudaAsync},
+    utils::ffi::DeviceAccessible,
+};
+
+#[doc(hidden)]
+#[allow(clippy::module_name_repetitions)]
+#[derive(const_type_layout::TypeLayout)]
+#[repr(transparent)]
+pub struct FinalCudaRepresentation<T: CudaAsRust>(DeviceAccessible<T>);
+
+unsafe impl<T: RustToCuda> RustToCuda for Final<T> {
+    type CudaAllocation = T::CudaAllocation;
+    type CudaRepresentation = FinalCudaRepresentation<T::CudaRepresentation>;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: crate::alloc::CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> rustacuda::error::CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        crate::alloc::CombinedCudaAlloc<Self::CudaAllocation, A>,
+    )> {
+        let (cuda_repr, alloc) = (**self).borrow(alloc)?;
+
+        Ok((
+            DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)),
+            alloc,
+        ))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore<A: crate::alloc::CudaAlloc>(
+        &mut self,
+        alloc: crate::alloc::CombinedCudaAlloc<Self::CudaAllocation, A>,
+    ) -> rustacuda::error::CudaResult<A> {
+        let (_alloc_front, alloc_tail) = alloc.split();
+        Ok(alloc_tail)
+    }
+}
+
+unsafe impl<T: RustToCudaAsync> RustToCudaAsync for Final<T> {
+    type CudaAllocationAsync = T::CudaAllocationAsync;
+
+    #[cfg(feature = "host")]
+    unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        crate::utils::r#async::Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
+        crate::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+    )> {
+        let (cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?;
+        let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? };
+
+        let final_cuda_repr = DeviceAccessible::from(FinalCudaRepresentation(cuda_repr));
+
+        let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) {
+            crate::utils::r#async::Async::pending(
+                final_cuda_repr,
+                stream,
+                crate::utils::r#async::NoCompletion,
+            )?
+        } else {
+            crate::utils::r#async::Async::ready(final_cuda_repr, stream)
+        };
+
+        Ok((r#async, alloc))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>(
+        this: owning_ref::BoxRefMut<'a, O, Self>,
+        alloc: crate::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        crate::utils::r#async::Async<
+            'a,
+            'stream,
+            owning_ref::BoxRefMut<'a, O, Self>,
+            crate::utils::r#async::CompletionFnMut<'a, Self>,
+        >,
+        A,
+    )> {
+        let (_alloc_front, alloc_tail) = alloc.split();
+        let r#async = crate::utils::r#async::Async::ready(this, stream);
+        Ok((r#async, alloc_tail))
+    }
+}
+
+unsafe impl<T: CudaAsRust> CudaAsRust for FinalCudaRepresentation<T> {
+    type RustRepresentation = Final<T::RustRepresentation>;
+
+    #[cfg(feature = "device")]
+    unsafe fn as_rust(this: &DeviceAccessible<Self>) -> Self::RustRepresentation {
+        Final::new(CudaAsRust::as_rust(&this.0))
+    }
+}
diff --git a/src/lend/impls/mod.rs b/src/lend/impls/mod.rs
new file mode 100644
index 000000000..e0360671c
--- /dev/null
+++ b/src/lend/impls/mod.rs
@@ -0,0 +1,9 @@
+mod r#box;
+mod boxed_slice;
+#[cfg(feature = "final")]
+mod r#final;
+mod option;
+mod r#ref;
+mod ref_mut;
+mod slice_ref;
+mod slice_ref_mut;
diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs
new file mode 100644
index 000000000..3f1d1e160
--- /dev/null
+++ b/src/lend/impls/option.rs
@@ -0,0 +1,214 @@
+use core::mem::MaybeUninit;
+
+use const_type_layout::{TypeGraphLayout, TypeLayout};
+
+#[cfg(feature = "host")]
+use rustacuda::error::CudaResult;
+
+use crate::{
+    lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaProxy},
+    safety::PortableBitSemantics,
+    utils::{adapter::RustToCudaWithPortableBitCopySemantics, ffi::DeviceAccessible},
+};
+
+#[cfg(feature = "host")]
+use crate::{
+    alloc::{CombinedCudaAlloc, CudaAlloc},
+    utils::r#async::{Async, CompletionFnMut, NoCompletion},
+};
+
+#[doc(hidden)]
+#[allow(clippy::module_name_repetitions)]
+#[derive(TypeLayout)]
+#[repr(C)]
+pub struct OptionCudaRepresentation<T: CudaAsRust> {
+    maybe: MaybeUninit<DeviceAccessible<T>>,
+    present: bool,
+}
+
+unsafe impl<T: RustToCuda> RustToCuda for Option<T> {
+    type CudaAllocation = Option<<T as RustToCuda>::CudaAllocation>;
+    type CudaRepresentation = OptionCudaRepresentation<<T as RustToCuda>::CudaRepresentation>;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        CombinedCudaAlloc<Self::CudaAllocation, A>,
+    )> {
+        let (cuda_repr, alloc) = match self {
+            None => (
+                OptionCudaRepresentation {
+                    maybe: MaybeUninit::uninit(),
+                    present: false,
+                },
+                CombinedCudaAlloc::new(None, alloc),
+            ),
+            Some(value) => {
+                let (cuda_repr, alloc) = value.borrow(alloc)?;
+
+                let (alloc_front, alloc_tail) = alloc.split();
+
+                (
+                    OptionCudaRepresentation {
+                        maybe: MaybeUninit::new(cuda_repr),
+                        present: true,
+                    },
+                    CombinedCudaAlloc::new(Some(alloc_front), alloc_tail),
+                )
+            },
+        };
+
+        Ok((DeviceAccessible::from(cuda_repr), alloc))
+    }
+
+    #[cfg(feature = "host")]
+    unsafe fn restore<A: CudaAlloc>(
+        &mut self,
+        alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
+    ) -> CudaResult<A> {
+        let (alloc_front, alloc_tail) = alloc.split();
+
+        match (self, alloc_front) {
+            (Some(value), Some(alloc_front)) => {
+                value.restore(CombinedCudaAlloc::new(alloc_front, alloc_tail))
+            },
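+            // either no value was borrowed or no allocation was made for it,
+            // so there is nothing to copy back and the tail allocator is
+            // passed through unchanged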
Ok(alloc_tail), + } + } +} + +unsafe impl RustToCudaAsync for Option { + type CudaAllocationAsync = Option<::CudaAllocationAsync>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> CudaResult<( + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = match self { + None => ( + Async::ready( + DeviceAccessible::from(OptionCudaRepresentation { + maybe: MaybeUninit::uninit(), + present: false, + }), + stream, + ), + CombinedCudaAlloc::new(None, alloc), + ), + Some(value) => { + let (cuda_repr, alloc) = value.borrow_async(alloc, stream)?; + + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let (alloc_front, alloc_tail) = alloc.split(); + let alloc = CombinedCudaAlloc::new(Some(alloc_front), alloc_tail); + + let option_cuda_repr = DeviceAccessible::from(OptionCudaRepresentation { + maybe: MaybeUninit::new(cuda_repr), + present: true, + }); + + let r#async = if matches!(completion, Some(NoCompletion)) { + Async::pending(option_cuda_repr, stream, NoCompletion)? + } else { + Async::ready(option_cuda_repr, stream) + }; + + (r#async, alloc) + }, + }; + + Ok((cuda_repr, alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + mut this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> CudaResult<( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, + A, + )> { + let (alloc_front, alloc_tail) = alloc.split(); + + if let (Some(_), Some(alloc_front)) = (&mut *this, alloc_front) { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + #[allow(clippy::option_if_let_else)] + let (r#async, alloc_tail) = RustToCudaAsync::restore_async( + // Safety: we have already established value is Some above + this.map_mut(|value| unsafe { value.as_mut().unwrap_unchecked() }), + CombinedCudaAlloc::new(alloc_front, alloc_tail), + stream, + )?; + + let (value, on_completion) = unsafe { r#async.unwrap_unchecked()? 
}; + + std::mem::forget(value); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = Async::<_, CompletionFnMut<'a, Self>>::pending( + this, + stream, + Box::new(|this: &mut Self| { + if let Some(value) = this { + on_completion(value)?; + } + + Ok(()) + }), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } + } else { + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } + } +} + +unsafe impl CudaAsRust for OptionCudaRepresentation { + type RustRepresentation = Option<::RustRepresentation>; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + if this.present { + Some(CudaAsRust::as_rust(this.maybe.assume_init_ref())) + } else { + None + } + } +} + +impl RustToCudaProxy> + for Option> +{ + fn from_ref(val: &Option) -> &Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + unsafe { &*core::ptr::from_ref(val).cast() } + } + + fn from_mut(val: &mut Option) -> &mut Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + unsafe { &mut *core::ptr::from_mut(val).cast() } + } + + fn into(self) -> Option { + self.map(RustToCudaWithPortableBitCopySemantics::into_inner) + } +} diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs new file mode 100644 index 000000000..4233d1423 --- /dev/null +++ b/src/lend/impls/ref.rs @@ -0,0 +1,150 @@ +use core::marker::PhantomData; +#[cfg(feature = "host")] +use std::mem::ManuallyDrop; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBox, memory::LockedBox}; + +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + safety::PortableBitSemantics, + utils::ffi::DeviceConstPointer, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, +}; + +#[doc(hidden)] +#[repr(transparent)] +#[derive(TypeLayout)] +#[allow(clippy::module_name_repetitions)] +pub struct RefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceConstPointer, + _marker: PhantomData<&'a T>, +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::alloc::SomeCudaAlloc; + type CudaRepresentation = RefCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&**self), + )?); + + Ok(( + DeviceAccessible::from(RefCudaRepresentation { + data: DeviceConstPointer(device_box.as_device_ptr().as_raw().cast()), + _marker: PhantomData::<&'a T>, + }), + CombinedCudaAlloc::new(device_box, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} + +unsafe impl<'a, T: 
PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &'a T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let locked_box = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + std::ptr::copy_nonoverlapping( + std::ptr::from_ref::(&**self) + .cast::>>(), + uninit.as_mut_ptr(), + 1, + ); + uninit + }; + + let mut device_box = CudaDropWrapper::from(DeviceBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + device_box.async_copy_from(&*locked_box, &stream)?; + + Ok(( + Async::pending( + DeviceAccessible::from(RefCudaRepresentation { + data: DeviceConstPointer(device_box.as_device_ptr().as_raw().cast()), + _marker: PhantomData::<&T>, + }), + stream, + NoCompletion, + )?, + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_box, device_box), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'b, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> CudaResult<( + Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, + A, + )> { + let (_alloc_front, alloc_tail) = alloc.split(); + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust + for RefCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a T; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + &*this.data.0 + } +} diff --git a/src/lend/impls/ref_mut.rs b/src/lend/impls/ref_mut.rs new file mode 100644 index 000000000..cab1ea8df --- /dev/null +++ b/src/lend/impls/ref_mut.rs @@ -0,0 +1,92 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBox}; + +use crate::{ + lend::{CudaAsRust, RustToCuda}, + safety::PortableBitSemantics, + utils::ffi::DeviceMutPointer, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, +}; + +#[doc(hidden)] +#[repr(transparent)] +#[derive(TypeLayout)] +#[allow(clippy::module_name_repetitions)] +pub struct RefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceMutPointer, + _marker: PhantomData<&'a mut T>, +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::alloc::SomeCudaAlloc; + type CudaRepresentation = RefMutCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + 
CombinedCudaAlloc, + )> { + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&**self), + )?); + + Ok(( + DeviceAccessible::from(RefMutCudaRepresentation { + data: DeviceMutPointer(device_box.as_device_ptr().as_raw_mut().cast()), + _marker: PhantomData::<&'a mut T>, + }), + CombinedCudaAlloc::new(device_box, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + use rustacuda::memory::CopyDestination; + + let (alloc_front, alloc_tail) = alloc.split(); + + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(&mut **self))?; + + core::mem::drop(alloc_front); + + Ok(alloc_tail) + } +} + +// &mut T cannot implement RustToCudaAsync since the reference, potentially +// with garbage data, would remain accessible after failing a mutable restore + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust + for RefMutCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a mut T; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let data: *mut T = this.data.0; + &mut *data + } +} diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs new file mode 100644 index 000000000..bd74dea64 --- /dev/null +++ b/src/lend/impls/slice_ref.rs @@ -0,0 +1,155 @@ +use core::marker::PhantomData; +#[cfg(feature = "host")] +use std::mem::ManuallyDrop; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBuffer, memory::LockedBuffer}; + +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + safety::PortableBitSemantics, + utils::ffi::DeviceConstPointer, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, +}; + +#[doc(hidden)] +#[allow(clippy::module_name_repetitions)] +#[derive(TypeLayout)] +#[repr(C)] +pub struct SliceRefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceConstPointer, + len: usize, + _marker: PhantomData<&'a [T]>, +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T] { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = + crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::alloc::SomeCudaAlloc; + type CudaRepresentation = SliceRefCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( + DeviceCopyWithPortableBitSemantics::from_slice(self), + )?); + + Ok(( + DeviceAccessible::from(SliceRefCudaRepresentation { + data: DeviceConstPointer(device_buffer.as_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::<&'a [T]>, + }), + CombinedCudaAlloc::new(device_buffer, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &'a [T] { + 
#[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let locked_buffer = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + std::ptr::copy_nonoverlapping( + self.as_ref() + .as_ptr() + .cast::>>(), + uninit.as_mut_ptr(), + self.len(), + ); + uninit + }; + + let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + device_buffer.async_copy_from(&*locked_buffer, &stream)?; + + Ok(( + Async::pending( + DeviceAccessible::from(SliceRefCudaRepresentation { + data: DeviceConstPointer(device_buffer.as_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::<&'a [T]>, + }), + stream, + NoCompletion, + )?, + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_buffer, device_buffer), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'b, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> CudaResult<( + Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, + A, + )> { + let (_alloc_front, alloc_tail) = alloc.split(); + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust + for SliceRefCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a [T]; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + core::slice::from_raw_parts(this.data.0, this.len) + } +} diff --git a/src/lend/impls/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs new file mode 100644 index 000000000..5c766dd24 --- /dev/null +++ b/src/lend/impls/slice_ref_mut.rs @@ -0,0 +1,94 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBuffer}; + +use crate::{ + lend::{CudaAsRust, RustToCuda}, + safety::PortableBitSemantics, + utils::ffi::DeviceMutPointer, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, +}; + +#[doc(hidden)] +#[allow(clippy::module_name_repetitions)] +#[derive(TypeLayout)] +#[repr(C)] +pub struct SliceRefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceMutPointer, + len: usize, + _marker: PhantomData<&'a mut [T]>, +} + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut [T] { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = + crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::alloc::SomeCudaAlloc; + type CudaRepresentation = SliceRefMutCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + 
#[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( + DeviceCopyWithPortableBitSemantics::from_slice(self), + )?); + + Ok(( + DeviceAccessible::from(SliceRefMutCudaRepresentation { + data: DeviceMutPointer(device_buffer.as_mut_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::<&'a mut [T]>, + }), + CombinedCudaAlloc::new(device_buffer, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + use rustacuda::memory::CopyDestination; + + let (alloc_front, alloc_tail) = alloc.split(); + + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut_slice(self))?; + + core::mem::drop(alloc_front); + + Ok(alloc_tail) + } +} + +// &mut [T] cannot implement RustToCudaAsync since the slice, potentially with +// garbage data, would remain accessible after failing a mutable restore + +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust + for SliceRefMutCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a mut [T]; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + core::slice::from_raw_parts_mut(this.data.0, this.len) + } +} diff --git a/src/lend/mod.rs b/src/lend/mod.rs new file mode 100644 index 000000000..e05237768 --- /dev/null +++ b/src/lend/mod.rs @@ -0,0 +1,609 @@ +use const_type_layout::TypeGraphLayout; +#[cfg(feature = "host")] +use rustacuda::error::CudaError; + +#[cfg(feature = "derive")] +#[allow(clippy::module_name_repetitions)] +pub use rust_cuda_derive::LendRustToCuda; + +#[cfg(any(feature = "host", feature = "device", doc))] +use crate::safety::{SafeMutableAliasing, StackOnly}; +#[cfg(feature = "device")] +use crate::utils::ffi::{DeviceConstRef, DeviceMutRef, DeviceOwnedRef}; +use crate::{alloc::CudaAlloc, safety::PortableBitSemantics}; +#[cfg(any(feature = "host", feature = "device"))] +use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, NoCudaAlloc}, + host::{HostAndDeviceConstRef, HostAndDeviceMutRef, HostAndDeviceOwned}, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, +}; + +mod impls; + +/// # Safety +/// +/// This is an internal trait and should ONLY be derived automatically using +/// `#[derive(LendRustToCuda)]` +pub unsafe trait RustToCuda { + type CudaAllocation: CudaAlloc; + type CudaRepresentation: CudaAsRust; + + #[doc(hidden)] + #[cfg(feature = "host")] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + /// The returned [`Self::CudaRepresentation`] must NEVER be accessed on the + /// CPU as it contains a GPU-resident copy of `self`. 
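+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch (not a doctest) of how the safe
+    /// [`LendToCuda::lend_to_cuda_mut`] wrapper below drives `borrow` and
+    /// `restore` internally; `value` stands for any hypothetical
+    /// [`RustToCuda`] value:
+    ///
+    /// ```rust,ignore
+    /// let (cuda_repr, alloc) = unsafe { value.borrow(NoCudaAlloc) }?;
+    /// // ... launch a kernel on a device reference to `cuda_repr` ...
+    /// let _: NoCudaAlloc = unsafe { value.restore(alloc) }?;
+    /// ```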
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> rustacuda::error::CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        CombinedCudaAlloc<Self::CudaAllocation, A>,
+    )>;
+
+    #[doc(hidden)]
+    #[cfg(feature = "host")]
+    /// # Errors
+    ///
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    ///
+    /// # Safety
+    ///
+    /// This is an internal function and should NEVER be called manually.
+    #[allow(clippy::type_complexity)]
+    unsafe fn restore<A: CudaAlloc>(
+        &mut self,
+        alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
+    ) -> rustacuda::error::CudaResult<A>;
+}
+
+/// # Safety
+///
+/// This is an internal trait and should ONLY be derived automatically using
+/// `#[derive(LendRustToCuda)]`
+pub unsafe trait RustToCudaAsync: RustToCuda {
+    type CudaAllocationAsync: CudaAlloc;
+
+    #[doc(hidden)]
+    #[cfg(feature = "host")]
+    /// # Errors
+    ///
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    ///
+    /// # Safety
+    ///
+    /// This is an internal function and should NEVER be called manually.
+    ///
+    /// The returned
+    /// [`Self::CudaRepresentation`](RustToCuda::CudaRepresentation) must NEVER
+    /// be accessed on the CPU as it contains a GPU-resident copy of
+    /// `self`.
+    ///
+    /// Since this method may perform asynchronous computation but returns its
+    /// result immediately, this result must only be used to construct compound
+    /// asynchronous computations before it has been synchronized on.
+    ///
+    /// Similarly, `&self` should remain borrowed until synchronisation has
+    /// been performed.
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow_async<'stream, A: CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
+        CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
+    )>;
+
+    #[doc(hidden)]
+    #[cfg(feature = "host")]
+    /// # Errors
+    ///
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    ///
+    /// # Safety
+    ///
+    /// This is an internal function and should NEVER be called manually.
+    ///
+    /// Since this method may perform asynchronous computation but returns
+    /// immediately, `&mut self` must not be used until it has been
+    /// synchronized on.
+    ///
+    /// Therefore, `&mut self` should remain mutably borrowed until
+    /// synchronisation has been performed.
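+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch (not a doctest), mirroring
+    /// [`LendToCudaAsync::lend_to_cuda_mut_async`]; `this` is an assumed
+    /// `owning_ref::BoxRefMut` over a [`RustToCudaAsync`] value:
+    ///
+    /// ```rust,ignore
+    /// let (r#async, _): (_, NoCudaAlloc) =
+    ///     unsafe { Self::restore_async(this, alloc, stream) }?;
+    /// // `r#async` must be synchronised on before the mutable borrow
+    /// // that it wraps may be accessed again
+    /// ```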
+ #[allow(clippy::type_complexity)] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, + A, + )>; +} + +/// # Safety +/// +/// This is an internal trait and should NEVER be implemented manually +pub unsafe trait CudaAsRust: PortableBitSemantics + TypeGraphLayout { + type RustRepresentation: RustToCuda; + + #[doc(hidden)] + #[cfg(feature = "device")] + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation; +} + +pub trait RustToCudaProxy: RustToCuda { + fn from_ref(val: &T) -> &Self; + fn from_mut(val: &mut T) -> &mut Self; + + fn into(self) -> T; +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub trait LendToCuda: RustToCuda { + /// Lends an immutable borrow of `&self` to CUDA: + /// - code in the CUDA kernel can only access `&self` through the + /// [`DeviceConstRef`] inside the closure + /// - after the closure, `&self` will not have changed + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn lend_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceConstRef::CudaRepresentation>>, + ) -> Result, + >( + &self, + inner: F, + ) -> Result + where + Self: Sync; + + /// Lends a mutable borrow of `&mut self` to CUDA iff `Self` is + /// [`SafeMutableAliasing`]: + /// - code in the CUDA kernel can only access `&mut self` through the + /// `DeviceMutRef` inside the closure + /// - after the closure, `&mut self` will reflect the changes from the + /// kernel execution + /// + /// # Errors + /// + /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + fn lend_to_cuda_mut< + O, + E: From, + F: FnOnce( + HostAndDeviceMutRef::CudaRepresentation>>, + ) -> Result, + >( + &mut self, + inner: F, + ) -> Result + where + Self: Sync + SafeMutableAliasing; + + /// Moves `self` to CUDA iff `Self` is [`StackOnly`]. 
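+    ///
+    /// For illustration (not a doctest), assuming a hypothetical stack-only
+    /// `Data` type that implements [`RustToCuda`]:
+    ///
+    /// ```rust,ignore
+    /// data.move_to_cuda(|device_owned| {
+    ///     // launch a kernel that consumes `device_owned` here
+    ///     Ok(())
+    /// })?;
+    /// ```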
+ /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn move_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceOwned::CudaRepresentation>>, + ) -> Result, + >( + self, + inner: F, + ) -> Result + where + Self: Send + RustToCuda; +} + +#[cfg(feature = "host")] +impl LendToCuda for T { + fn lend_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceConstRef::CudaRepresentation>>, + ) -> Result, + >( + &self, + inner: F, + ) -> Result + where + Self: Sync, + { + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); + + core::mem::drop(cuda_repr); + core::mem::drop(alloc); + + result + } + + fn lend_to_cuda_mut< + O, + E: From, + F: FnOnce( + HostAndDeviceMutRef::CudaRepresentation>>, + ) -> Result, + >( + &mut self, + inner: F, + ) -> Result + where + Self: Sync + SafeMutableAliasing, + { + let (mut cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, inner); + + core::mem::drop(cuda_repr); + + let _: NoCudaAlloc = unsafe { self.restore(alloc) }?; + + result + } + + fn move_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceOwned::CudaRepresentation>>, + ) -> Result, + >( + self, + inner: F, + ) -> Result + where + Self: Send + RustToCuda, + { + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceOwned::with_new(cuda_repr, inner); + + core::mem::drop(alloc); + + result + } +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub trait LendToCudaAsync: RustToCudaAsync { + /// Lends an immutable copy of `&self` to CUDA: + /// - code in the CUDA kernel can only access `&self` through the + /// [`DeviceConstRef`] inside the closure + /// - after the closure, `&self` will not have changed, i.e. interior + /// mutability is not handled by this method + /// + /// Since the [`HostAndDeviceConstRef`] is wrapped in an [`Async`] with + /// [`NoCompletion`], this [`Async`] can be safely dropped or forgotten + /// without changing any behaviour. Therefore, this [`Async`] does *not* + /// need to be returned from the `inner` closure. + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn lend_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + '_, + 'stream, + HostAndDeviceConstRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + >( + &self, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result + where + Self: Sync; + + #[allow(clippy::type_complexity)] + /// Lends a mutable borrow of `&mut self` to CUDA iff `Self` is + /// [`SafeMutableAliasing`]: + /// - code in the CUDA kernel can only access `&mut self` through the + /// `DeviceMutRef` inside the closure + /// - after the closure, `&mut self` will reflect the changes from the + /// kernel execution + /// + /// # Errors + /// + /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + fn lend_to_cuda_mut_async< + 'a, + 'stream, + O, + E: From, + F: for<'b> FnOnce( + Async< + 'b, + 'stream, + HostAndDeviceMutRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + T: 'a, + >( + this: owning_ref::BoxRefMut<'a, T, Self>, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result< + ( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, T, Self>, CompletionFnMut<'a, Self>>, + O, + ), + E, + > + where + Self: Sync + SafeMutableAliasing; + + /// Moves `self` to CUDA iff `self` is [`StackOnly`]. 
+ /// + /// Since the [`HostAndDeviceOwned`] is wrapped in an [`Async`] with + /// [`NoCompletion`], this [`Async`] can be safely dropped or forgotten + /// without changing any behaviour. Therefore, this [`Async`] does *not* + /// need to be returned from the `inner` closure. + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn move_to_cuda_async< + 'stream, + O, + E: From, + F: for<'a> FnOnce( + Async< + 'a, + 'stream, + HostAndDeviceOwned::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + >( + self, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result + where + Self: Send + RustToCuda; +} + +#[cfg(feature = "host")] +impl LendToCudaAsync for T { + fn lend_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + '_, + 'stream, + HostAndDeviceConstRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + >( + &self, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result + where + Self: Sync, + { + let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { + let r#async = if matches!(completion, Some(NoCompletion)) { + Async::pending(const_ref, stream, NoCompletion)? + } else { + Async::ready(const_ref, stream) + }; + + inner(r#async) + }); + + core::mem::drop(cuda_repr); + core::mem::drop(alloc); + + result + } + + fn lend_to_cuda_mut_async< + 'a, + 'stream, + O, + E: From, + F: for<'b> FnOnce( + Async< + 'b, + 'stream, + HostAndDeviceMutRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + S: 'a, + >( + this: owning_ref::BoxRefMut<'a, S, Self>, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result< + ( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, S, Self>, CompletionFnMut<'a, Self>>, + O, + ), + E, + > + where + Self: Sync + SafeMutableAliasing, + { + let (cuda_repr, alloc) = unsafe { this.borrow_async(NoCudaAlloc, stream) }?; + + let (mut cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, |mut_ref| { + let r#async = if matches!(completion, Some(NoCompletion)) { + Async::pending(mut_ref, stream, NoCompletion)? + } else { + Async::ready(mut_ref, stream) + }; + + inner(r#async) + }); + + core::mem::drop(cuda_repr); + + let (r#async, _): (_, NoCudaAlloc) = unsafe { Self::restore_async(this, alloc, stream) }?; + + result.map(|ok| (r#async, ok)) + } + + fn move_to_cuda_async< + 'stream, + O, + E: From, + F: for<'a> FnOnce( + Async< + 'a, + 'stream, + HostAndDeviceOwned::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + >( + self, + stream: crate::host::Stream<'stream>, + inner: F, + ) -> Result + where + Self: Send + RustToCuda, + { + let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { + if matches!(completion, Some(NoCompletion)) { + inner(Async::pending(owned_ref, stream, NoCompletion)?) + } else { + inner(Async::ready(owned_ref, stream)) + } + }); + + core::mem::drop(alloc); + + result + } +} + +#[cfg(feature = "device")] +pub trait BorrowFromRust: RustToCuda { + /// # Safety + /// + /// This function is only safe to call iff `cuda_repr` is the + /// [`DeviceConstRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda`]. 
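+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch (not a doctest) of a device-side kernel body;
+    /// the shim that receives `cuda_repr` from the host is assumed to be
+    /// generated by a kernel macro:
+    ///
+    /// ```rust,ignore
+    /// unsafe fn kernel<T: BorrowFromRust>(
+    ///     cuda_repr: DeviceConstRef<DeviceAccessible<<T as RustToCuda>::CudaRepresentation>>,
+    /// ) {
+    ///     T::with_borrow_from_rust(cuda_repr, |value: &T| {
+    ///         // use the reconstructed `&T` here; it must not escape the closure
+    ///     });
+    /// }
+    /// ```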
+    unsafe fn with_borrow_from_rust<O, F: FnOnce(&Self) -> O>(
+        cuda_repr: DeviceConstRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O;
+
+    /// # Safety
+    ///
+    /// This function is only safe to call iff `cuda_repr_mut` is the
+    /// [`DeviceMutRef`] borrowed on the CPU using the corresponding
+    /// [`LendToCuda::lend_to_cuda_mut`].
+    unsafe fn with_borrow_from_rust_mut<O, F: FnOnce(&mut Self) -> O>(
+        cuda_repr_mut: DeviceMutRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O
+    where
+        Self: SafeMutableAliasing;
+
+    /// # Safety
+    ///
+    /// This function is only safe to call iff `cuda_repr` is the
+    /// [`DeviceOwnedRef`] borrowed on the CPU using the corresponding
+    /// [`LendToCuda::move_to_cuda`].
+    unsafe fn with_moved_from_rust<O, F: FnOnce(Self) -> O>(
+        cuda_repr: DeviceOwnedRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O
+    where
+        Self: Sized + RustToCuda<CudaAllocation: EmptyCudaAlloc>;
+}
+
+#[cfg(feature = "device")]
+impl<T: RustToCuda> BorrowFromRust for T {
+    #[inline]
+    unsafe fn with_borrow_from_rust<O, F: FnOnce(&Self) -> O>(
+        cuda_repr: DeviceConstRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O {
+        // `rust_repr` must never be dropped as we do NOT own any of the
+        // heap memory it might reference
+        let rust_repr = core::mem::ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr.as_ref()));
+
+        inner(&rust_repr)
+    }
+
+    #[inline]
+    unsafe fn with_borrow_from_rust_mut<O, F: FnOnce(&mut Self) -> O>(
+        mut cuda_repr_mut: DeviceMutRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O
+    where
+        Self: SafeMutableAliasing,
+    {
+        // `rust_repr` must never be dropped as we do NOT own any of the
+        // heap memory it might reference
+        let mut rust_repr_mut =
+            core::mem::ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut()));
+
+        inner(&mut rust_repr_mut)
+    }
+
+    #[inline]
+    unsafe fn with_moved_from_rust<O, F: FnOnce(Self) -> O>(
+        mut cuda_repr: DeviceOwnedRef<DeviceAccessible<<Self as RustToCuda>::CudaRepresentation>>,
+        inner: F,
+    ) -> O
+    where
+        Self: RustToCuda<CudaAllocation: EmptyCudaAlloc>,
+    {
+        inner(CudaAsRust::as_rust(cuda_repr.as_mut()))
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 3c176e4a2..35e11ed1b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,51 +1,84 @@
-#![deny(clippy::pedantic)]
-#![allow(clippy::useless_attribute)]
-#![cfg_attr(not(feature = "host"), no_std)]
+//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License
+//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod
+//! Ready-to-Code]][gitpod]
+//!
+//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main
+//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain
+//!
+//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange
+//! [repo]: https://github.com/juntyr/rust-cuda
+//!
+//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue
+//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda/
+//!
+//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield
+//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield
+//!
+//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx
+//! [codecov]: https://codecov.io/gh/juntyr/rust-cuda
+//!
+//! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod
+//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda
+
+#![deny(clippy::complexity)]
+#![deny(clippy::correctness)]
+#![warn(clippy::nursery)]
+#![warn(clippy::pedantic)]
+#![deny(clippy::perf)]
+#![deny(clippy::style)]
+#![deny(clippy::suspicious)]
+// #![warn(clippy::multiple_unsafe_ops_per_block)] // FIXME
+// #![warn(clippy::undocumented_unsafe_blocks)] // FIXME
+#![deny(unused_unsafe)]
+// #![warn(missing_docs)] // FIXME
+#![cfg_attr(all(any(feature = "device", target_os = "cuda"), not(doc)), no_std)]
 #![feature(associated_type_bounds)]
 #![feature(auto_traits)]
 #![feature(negative_impls)]
-#![cfg_attr(
-    any(all(not(feature = "host"), target_os = "cuda"), doc),
-    feature(stdsimd)
-)]
-#![cfg_attr(any(feature = "alloc", doc), feature(allocator_api))]
+#![cfg_attr(feature = "device", feature(stdsimd))]
+#![cfg_attr(feature = "device", feature(asm_experimental_arch))]
+#![cfg_attr(feature = "device", feature(asm_const))]
 #![feature(doc_auto_cfg)]
 #![feature(doc_cfg)]
 #![feature(marker_trait_attr)]
 #![feature(const_type_name)]
-#![feature(offset_of)]
 #![feature(adt_const_params)]
+#![feature(impl_trait_in_assoc_type)]
+#![feature(ptr_metadata)]
+#![feature(decl_macro)]
+#![feature(panic_info_message)]
+#![feature(let_chains)]
+#![feature(inline_const)]
+#![feature(sync_unsafe_cell)]
+#![feature(never_type)]
+#![feature(layout_for_ptr)]
+#![feature(cfg_version)]
+#![cfg_attr(feature = "device", feature(slice_ptr_get))]
 #![allow(incomplete_features)]
 #![feature(generic_const_exprs)]
+#![allow(internal_features)]
+#![feature(core_intrinsics)]
+#![feature(const_intrinsic_compare_bytes)]
 #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")]
 
-#[doc(hidden)]
-pub extern crate alloc;
-
-pub extern crate rust_cuda_ptx_jit as ptx_jit;
-pub extern crate rustacuda_core;
+#[cfg(all(feature = "host", feature = "device", not(doc)))]
+core::compile_error!("cannot enable the `host` and `device` features at the same time");
 
-#[doc(hidden)]
-#[macro_use]
-pub extern crate const_type_layout;
+#[cfg(all(feature = "host", target_os = "cuda", not(doc)))]
+core::compile_error!("cannot enable the `host` feature on a target with `target_os=\"cuda\"`");
 
-#[cfg(feature = "derive")]
-#[doc(cfg(feature = "derive"))]
-pub extern crate rustacuda_derive;
+#[cfg(all(feature = "device", not(target_os = "cuda"), not(doc)))]
+core::compile_error!("cannot enable the `device` feature on a target without `target_os=\"cuda\"`");
 
-pub mod common;
+pub mod alloc;
+pub mod deps;
+pub mod kernel;
+pub mod lend;
+pub mod safety;
+pub mod utils;
 
 #[cfg(feature = "host")]
-#[doc(cfg(feature = "host"))]
 pub mod host;
 
-#[cfg(feature = "host")]
-#[doc(cfg(feature = "host"))]
-pub extern crate rustacuda;
-
-#[cfg(any(all(not(feature = "host"), target_os = "cuda"), doc))]
-#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))]
+#[cfg(feature = "device")]
 pub mod device;
-
-pub mod utils;
-
-pub mod safety;
diff --git a/src/safety/aliasing.rs b/src/safety/aliasing.rs
new file mode 100644
index 000000000..7add5775c
--- /dev/null
+++ b/src/safety/aliasing.rs
@@ -0,0 +1,89 @@
+#[allow(clippy::module_name_repetitions)]
+/// Types for which mutable references can be safely shared with each CUDA
+/// thread without breaking Rust's no-mutable-aliasing memory safety
+/// guarantees.
+///
+/// # Safety
+///
+/// A type may only implement [`SafeMutableAliasing`] if and only if all of
+/// the safety conditions below hold:
+///
+/// * Calling [`std::mem::replace`] on a mutable reference of the type does
+/// *not* return a value which owns memory which it must deallocate on drop.
+/// For instance, `&mut [T]` satisfies this criterion, but `Box<T>` does not.
+///
+/// * No safe aliasing mutable access is provided to the same memory locations
+/// across multiple CUDA threads. You can use the
+/// [`SplitSliceOverCudaThreadsConstStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride)
+/// and
+/// [`SplitSliceOverCudaThreadsDynamicStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride)
+/// wrapper types to ensure that each thread is only given access to its
+/// own sub-slice partition so that aliasing is avoided.
+///
+/// * A mutable reference of the type must not provide mutable access to any
+/// shallow inner state (in contrast to deep state, i.e. values behind
+/// references) that the API user expects to be mutably shared between all
+/// threads, as that would violate the second condition. For instance,
+/// `Vec<T>` violates this third condition: code with access to `&mut Vec<T>`
+/// can also mutate the length of the vector. This length is shallow state
+/// that the caller of a function sharing this vector expects to be
+/// propagated back (it is also tied to the deep contents of the vector via a
+/// safety invariant), so the caller might assume that mutations of the
+/// length are either shared across threads or shared back with the host
+/// after the kernel has completed, neither of which is possible. In
+/// contrast, `&mut [T]` satisfies this condition, as it is well known that
+/// modifying the shallow length of a slice (by assigning a sub-slice) inside
+/// a function does not alter the length of the slice that the caller passed
+/// in.
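+///
+/// # Example
+///
+/// An illustrative sketch (not a doctest); the `new` constructor shown for
+/// the wrapper is assumed:
+///
+/// ```rust,ignore
+/// use rust_cuda::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride;
+///
+/// let mut data = vec![0.0_f32; 1024];
+/// // Each CUDA thread gets a disjoint sub-slice of length 64, so no two
+/// // threads can mutably alias the same elements; the wrapper thus
+/// // implements `SafeMutableAliasing` and can be lent via
+/// // `lend_to_cuda_mut`.
+/// let split = SplitSliceOverCudaThreadsDynamicStride::new(&mut data[..], 64);
+/// ```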
+pub unsafe trait SafeMutableAliasing {} + +unsafe impl< + 'a, + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const STRIDE: usize, + > SafeMutableAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride<&'a mut [T], STRIDE> +{ +} + +unsafe impl< + 'a, + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + > SafeMutableAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride<&'a mut [T]> +{ +} + +#[cfg(any(feature = "host", feature = "device"))] +unsafe impl< + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const M2D: bool, + const M2H: bool, + const STRIDE: usize, + > SafeMutableAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride< + crate::utils::exchange::buffer::CudaExchangeBuffer, + STRIDE, + > +{ +} + +#[cfg(any(feature = "host", feature = "device"))] +unsafe impl< + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const M2D: bool, + const M2H: bool, + > SafeMutableAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride< + crate::utils::exchange::buffer::CudaExchangeBuffer, + > +{ +} diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs deleted file mode 100644 index c5de73430..000000000 --- a/src/safety/device_copy.rs +++ /dev/null @@ -1,22 +0,0 @@ -#[allow(clippy::module_name_repetitions)] -pub trait SafeDeviceCopy: sealed::SafeDeviceCopy {} - -impl SafeDeviceCopy for T {} - -mod sealed { - #[marker] - pub trait SafeDeviceCopy {} - - impl SafeDeviceCopy for T {} - #[cfg(any(feature = "alloc", doc))] - impl SafeDeviceCopy for T {} - - impl SafeDeviceCopy - for crate::common::DeviceAccessible - { - } - impl SafeDeviceCopy - for crate::utils::device_copy::SafeDeviceCopyWrapper - { - } -} diff --git a/src/safety/kernel_signature.rs b/src/safety/kernel_signature.rs deleted file mode 100644 index 4a82ec1d0..000000000 --- a/src/safety/kernel_signature.rs +++ /dev/null @@ -1,29 +0,0 @@ -#[derive(PartialEq, Eq, core::marker::ConstParamTy)] -pub enum CpuAndGpuKernelSignatures { - Match, - Mismatch, -} - -pub struct Assert; - -#[must_use] -pub const fn check(haystack: &[u8], needle: &[u8]) -> CpuAndGpuKernelSignatures { - let mut i = 0; - let mut j = 0; - - while i < needle.len() { - if j >= haystack.len() { - return CpuAndGpuKernelSignatures::Mismatch; - } - - if needle[i] == haystack[j] { - i += 1; - j += 1; - } else { - j = j + 1 - i; - i = 0; - } - } - - CpuAndGpuKernelSignatures::Match -} diff --git a/src/safety/mod.rs b/src/safety/mod.rs index cf7a8f718..7e078e34e 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -1,19 +1,13 @@ +mod aliasing; mod arch; -mod device_copy; -mod no_aliasing; -mod register_fit; +mod portable; mod stack_only; -#[cfg(any(feature = "alloc", doc))] -mod unified_heap; #[doc(hidden)] -pub mod kernel_signature; +pub mod ptx_entry_point; #[doc(hidden)] -pub mod type_layout; +pub mod ptx_kernel_signature; -pub use device_copy::SafeDeviceCopy; -pub use no_aliasing::NoAliasing; -pub use register_fit::FitsIntoDeviceRegister; +pub use aliasing::SafeMutableAliasing; +pub use portable::PortableBitSemantics; pub use stack_only::StackOnly; -#[cfg(any(feature = "alloc", doc))] -pub use unified_heap::UnifiedHeapOnly; diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs deleted file mode 100644 index 
22488efb8..000000000 --- a/src/safety/no_aliasing.rs +++ /dev/null @@ -1,25 +0,0 @@ -#[allow(clippy::module_name_repetitions)] -pub trait NoAliasing: private::NoAliasing {} -impl NoAliasing for T {} - -mod private { - pub auto trait NoAliasing {} - - impl !NoAliasing for *const T {} - impl !NoAliasing for *mut T {} - impl !NoAliasing for &mut T {} - - impl NoAliasing for core::marker::PhantomData {} - - impl NoAliasing for r#final::Final {} - impl NoAliasing - for crate::utils::aliasing::FinalCudaRepresentation - { - } - - impl NoAliasing - for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride - { - } - impl NoAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride {} -} diff --git a/src/safety/portable.rs b/src/safety/portable.rs new file mode 100644 index 000000000..5b438e2f7 --- /dev/null +++ b/src/safety/portable.rs @@ -0,0 +1,63 @@ +macro_rules! portable_bit_semantics_docs { + ($item:item) => { + /// Types whose in-memory bit representation on the CPU host is safe to copy + /// to and read back on the GPU device while maintaining the same semantics, + /// iff the type layout on the CPU matches the type layout on the GPU. + /// + /// For a type to implement [`PortableBitSemantics`], it + /// + /// * should have the same memory layout on both the CPU and GPU, and + /// + /// * must not contain any references to data that are exposed as safely + /// accessible on both ends but actually inaccessible on one. + /// + /// For instance, a reference `&u8` to host memory has the same well-defined + /// layout on both CPU and GPU (if their pointer sizes and alignments + /// match), but it is not portable since the host memory is generally + /// not accessible from the GPU. + /// + /// This trait is automatically implemented when the compiler determines + /// it's appropriate. + /// + /// Note that this trait is *sealed*, i.e. you cannot implement it on your + /// own custom types. + /// + /// Trait bounds usually combine [`PortableBitSemantics`] with + /// [`TypeGraphLayout`](const_type_layout::TypeGraphLayout) to check that + /// the type layout is indeed the same on both the host CPU and the GPU + /// device. + /// + /// Types that implement [`StackOnly`](crate::safety::StackOnly) and + /// [`TypeGraphLayout`](const_type_layout::TypeGraphLayout) satisfy both + /// of the above criteria and thus also implement [`PortableBitSemantics`]. + $item + }; +} + +#[cfg(not(doc))] +portable_bit_semantics_docs! { + #[allow(clippy::module_name_repetitions)] + pub trait PortableBitSemantics: sealed::PortableBitSemantics {} +} +#[cfg(doc)] +portable_bit_semantics_docs! 
{ + pub use sealed::PortableBitSemantics; +} + +#[cfg(not(doc))] +impl PortableBitSemantics for T {} + +mod sealed { + pub auto trait PortableBitSemantics {} + + impl !PortableBitSemantics for &T {} + impl !PortableBitSemantics for &mut T {} + impl !PortableBitSemantics for *const T {} + impl !PortableBitSemantics for *mut T {} + + impl PortableBitSemantics for core::marker::PhantomData {} + + impl PortableBitSemantics for crate::utils::ffi::DeviceConstPointer {} + impl PortableBitSemantics for crate::utils::ffi::DeviceMutPointer {} + impl PortableBitSemantics for crate::utils::ffi::DeviceOwnedPointer {} +} diff --git a/src/safety/ptx_entry_point.rs b/src/safety/ptx_entry_point.rs new file mode 100644 index 000000000..ab06a13d9 --- /dev/null +++ b/src/safety/ptx_entry_point.rs @@ -0,0 +1,62 @@ +#[derive(PartialEq, Eq, core::marker::ConstParamTy)] +pub enum HostAndDeviceKernelEntryPoint { + Match, + Mismatch, +} + +pub struct Assert; + +#[must_use] +pub const fn check(ptx: &[u8], entry_point: &[u8]) -> HostAndDeviceKernelEntryPoint { + const PTX_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; + const KERNEL_TYPE: &[u8] = b".visible .entry "; + + // Short-circuit to avoid extra errors when PTX compilation fails + if ptx.len() == PTX_ERROR_MESSAGE.len() && starts_with(ptx, PTX_ERROR_MESSAGE, 0) { + return HostAndDeviceKernelEntryPoint::Match; + } + + let mut j = 0; + + while j < ptx.len() { + let Some(j2) = find(ptx, KERNEL_TYPE, j) else { + return HostAndDeviceKernelEntryPoint::Mismatch; + }; + + if starts_with(ptx, entry_point, j2) { + return HostAndDeviceKernelEntryPoint::Match; + } + + j += 1; + } + + HostAndDeviceKernelEntryPoint::Mismatch +} + +const fn find(haystack: &[u8], needle: &[u8], from: usize) -> Option { + let mut i = 0; + let mut j = from; + + while i < needle.len() { + if j >= haystack.len() { + return None; + } + + if needle[i] == haystack[j] { + i += 1; + j += 1; + } else { + j = j + 1 - i; + i = 0; + } + } + + Some(j) +} + +const fn starts_with(haystack: &[u8], needle: &[u8], from: usize) -> bool { + let haystack_len = haystack.len() - from; + let check_len = if needle.len() < haystack_len { needle.len() } else { haystack_len }; + + unsafe { core::intrinsics::compare_bytes(haystack.as_ptr().add(from), needle.as_ptr(), check_len) == 0 } +} diff --git a/src/safety/ptx_kernel_signature.rs b/src/safety/ptx_kernel_signature.rs new file mode 100644 index 000000000..a8b298691 --- /dev/null +++ b/src/safety/ptx_kernel_signature.rs @@ -0,0 +1,41 @@ +use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; + +#[allow(clippy::module_name_repetitions)] +#[derive(PartialEq, Eq, core::marker::ConstParamTy)] +pub enum HostAndDeviceKernelSignatureTypeLayout { + Match, + Mismatch, +} + +pub struct Assert; + +#[must_use] +pub const fn check( + device: &'static [u8], +) -> HostAndDeviceKernelSignatureTypeLayout +where + [u8; serialised_type_graph_len::()]:, +{ + const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; + + // Short-circuit to avoid extra errors when PTX compilation fails + if equals(device, SIGNATURE_ERROR_MESSAGE) { + return HostAndDeviceKernelSignatureTypeLayout::Match; + } + + let host = serialise_type_graph::(); + + if equals(device, &host) { + HostAndDeviceKernelSignatureTypeLayout::Match + } else { + HostAndDeviceKernelSignatureTypeLayout::Mismatch + } +} + +const fn equals(device: &[u8], host: &[u8]) -> bool { + if device.len() != host.len() { + return false; + } + + unsafe { 
core::intrinsics::compare_bytes(device.as_ptr(), host.as_ptr(), device.len()) == 0 } +} diff --git a/src/safety/register_fit.rs b/src/safety/register_fit.rs deleted file mode 100644 index 1ddf33849..000000000 --- a/src/safety/register_fit.rs +++ /dev/null @@ -1,43 +0,0 @@ -pub trait FitsIntoDeviceRegister: private::FitsIntoDeviceRegister {} -impl FitsIntoDeviceRegister for T {} - -mod private { - pub trait FitsIntoDeviceRegister {} - impl FitsIntoDeviceRegister for T where - AssertTypeFitsInto64Bits<{ TypeSize::check::() }>: FitsInto64Bits - { - } - - #[derive(PartialEq, Eq, core::marker::ConstParamTy)] - pub enum TypeSize { - TypeFitsInto64Bits, - // FIXME: ConstParamTy variant with str ICEs in rustdoc - #[cfg(not(doc))] - TypeExeceeds64Bits(&'static str), - #[cfg(doc)] - TypeExeceeds64Bits, - } - - impl TypeSize { - pub const fn check() -> Self { - if core::mem::size_of::() <= core::mem::size_of::() { - Self::TypeFitsInto64Bits - } else { - #[cfg(not(doc))] - { - Self::TypeExeceeds64Bits(core::any::type_name::()) - } - #[cfg(doc)] - { - Self::TypeExeceeds64Bits - } - } - } - } - - pub enum AssertTypeFitsInto64Bits {} - - pub trait FitsInto64Bits {} - - impl FitsInto64Bits for AssertTypeFitsInto64Bits<{ TypeSize::TypeFitsInto64Bits }> {} -} diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index e96f48993..eac7f9456 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -1,40 +1,86 @@ -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// fn assert_stackonly(_x: impl StackOnly) {} -/// ``` -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly(42); -/// ``` -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly([42; 42]); -/// ``` -/// ```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly(vec![42]); -/// ``` -/// ```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly(&42); -/// ``` -#[allow(clippy::module_name_repetitions)] -pub trait StackOnly: sealed::StackOnly {} +macro_rules! stack_only_docs { + ($item:item) => { + /// Types which contain no pointers or references and can thus live entirely + /// on the stack. + /// + /// This trait is automatically implemented when the compiler determines + /// it's appropriate. + /// + /// Note that this trait is *sealed*, i.e. you cannot implement it on your + /// own custom types. + /// + /// Primitive types like [`u8`] and structs, tuples, and enums made only + /// from them implement [`StackOnly`]. + /// + /// In contrast, `&T`, `&mut T`, `*const T`, `*mut T`, and any type + /// containing a reference or a pointer do *not* implement [`StackOnly`]. 
+ /// + /// # Examples + /// + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// fn assert_stackonly(_x: impl StackOnly) {} + /// ``` + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(42); // ok + /// ``` + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly([42; 42]); // ok + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(vec![42]); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(&42); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// # use crate::utils::shared::r#static::ThreadBlockShared; + /// assert_stackonly(ThreadBlockShared::new_uninit()); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// # use crate::utils::shared::slice::ThreadBlockSharedSlice; + /// assert_stackonly(ThreadBlockSharedSlice::new_uninit_with_len(0)); // error + /// ``` + $item + }; +} + +#[cfg(not(doc))] +stack_only_docs! { + #[allow(clippy::module_name_repetitions)] + pub trait StackOnly: sealed::StackOnly {} +} +#[cfg(doc)] +stack_only_docs! { + pub use sealed::StackOnly; +} + +#[cfg(not(doc))] impl StackOnly for T {} mod sealed { pub auto trait StackOnly {} - impl !StackOnly for *const T {} - impl !StackOnly for *mut T {} - impl !StackOnly for &T {} - impl !StackOnly for &mut T {} + impl !StackOnly for &T {} + impl !StackOnly for &mut T {} + impl !StackOnly for *const T {} + impl !StackOnly for *mut T {} impl StackOnly for core::marker::PhantomData {} } diff --git a/src/safety/type_layout.rs b/src/safety/type_layout.rs deleted file mode 100644 index f225f0055..000000000 --- a/src/safety/type_layout.rs +++ /dev/null @@ -1,33 +0,0 @@ -use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; - -#[derive(PartialEq, Eq, core::marker::ConstParamTy)] -pub enum CpuAndGpuTypeLayouts { - Match, - Mismatch, -} - -pub struct Assert; - -#[must_use] -pub const fn check(device: &'static [u8]) -> CpuAndGpuTypeLayouts -where - [u8; serialised_type_graph_len::()]:, -{ - let host = serialise_type_graph::(); - - if host.len() != device.len() { - return CpuAndGpuTypeLayouts::Mismatch; - } - - let mut i = 0; - - while i < host.len() { - if host[i] != device[i] { - return CpuAndGpuTypeLayouts::Mismatch; - } - - i += 1; - } - - CpuAndGpuTypeLayouts::Match -} diff --git a/src/safety/unified_heap.rs b/src/safety/unified_heap.rs deleted file mode 100644 index 9eda2d550..000000000 --- a/src/safety/unified_heap.rs +++ /dev/null @@ -1,46 +0,0 @@ -#[doc(cfg(feature = "alloc"))] -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// ``` -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(42); -/// ``` -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only([42; 42]); -/// ``` -/// 
```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(vec![42]); -/// ``` -/// ```rust,compile_fail -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(&42); -/// ``` -#[allow(clippy::module_name_repetitions)] -pub trait UnifiedHeapOnly: sealed::UnifiedHeapOnly {} -impl UnifiedHeapOnly for T {} - -mod sealed { - use crate::utils::alloc::UnifiedAllocator; - - pub auto trait UnifiedHeapOnly {} - - impl !UnifiedHeapOnly for *const T {} - impl !UnifiedHeapOnly for *mut T {} - impl !UnifiedHeapOnly for &T {} - impl !UnifiedHeapOnly for &mut T {} - - impl UnifiedHeapOnly for core::marker::PhantomData {} - - impl UnifiedHeapOnly for alloc::boxed::Box {} - impl UnifiedHeapOnly for alloc::vec::Vec {} - impl UnifiedHeapOnly for hashbrown::HashMap {} -} diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs new file mode 100644 index 000000000..fa023cc66 --- /dev/null +++ b/src/utils/adapter.rs @@ -0,0 +1,490 @@ +#![allow(clippy::trait_duplication_in_bounds)] + +use core::ops::{Deref, DerefMut}; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +use crate::{ + alloc::NoCudaAlloc, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + safety::PortableBitSemantics, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; + +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct RustToCudaWithPortableBitCopySemantics( + T, +); + +impl From + for RustToCudaWithPortableBitCopySemantics +{ + fn from(value: T) -> Self { + Self(value) + } +} + +impl Deref + for RustToCudaWithPortableBitCopySemantics +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut + for RustToCudaWithPortableBitCopySemantics +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl RustToCudaWithPortableBitCopySemantics { + #[must_use] + pub const fn from_copy(value: &T) -> Self { + Self(*value) + } + + #[must_use] + pub const fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub const fn into_ref(&self) -> &T { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around 
`T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} + +unsafe impl RustToCuda + for RustToCudaWithPortableBitCopySemantics +{ + type CudaAllocation = NoCudaAlloc; + type CudaRepresentation = Self; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok((DeviceAccessible::from(*self), alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + +unsafe impl RustToCudaAsync + for RustToCudaWithPortableBitCopySemantics +{ + type CudaAllocationAsync = NoCudaAlloc; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok(( + crate::utils::r#async::Async::ready(DeviceAccessible::from(*self), stream), + alloc, + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending(this, stream, Box::new(|_this| Ok(())))?; + + Ok((r#async, alloc_tail)) + } +} + +unsafe impl CudaAsRust + for RustToCudaWithPortableBitCopySemantics +{ + type RustRepresentation = Self; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let mut uninit = core::mem::MaybeUninit::uninit(); + core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); + uninit.assume_init() + } +} + +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct RustToCudaWithPortableBitCloneSemantics< + T: Clone + PortableBitSemantics + TypeGraphLayout, +>(T); + +impl From + for RustToCudaWithPortableBitCloneSemantics +{ + fn from(value: T) -> Self { + Self(value) + } +} + +impl Deref + for RustToCudaWithPortableBitCloneSemantics +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut + for RustToCudaWithPortableBitCloneSemantics +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl RustToCudaWithPortableBitCloneSemantics { + #[must_use] + pub fn from_clone(value: &T) -> Self 
{ + Self(value.clone()) + } + + #[must_use] + pub fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub const fn into_ref(&self) -> &T { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} + +unsafe impl RustToCuda + for RustToCudaWithPortableBitCloneSemantics +{ + type CudaAllocation = NoCudaAlloc; + type CudaRepresentation = Self; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok((DeviceAccessible::from(self.clone()), alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + +unsafe impl RustToCudaAsync + for RustToCudaWithPortableBitCloneSemantics +{ + type CudaAllocationAsync = NoCudaAlloc; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok(( + crate::utils::r#async::Async::ready(DeviceAccessible::from(self.clone()), stream), + alloc, + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, 
O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending(this, stream, Box::new(|_this| Ok(())))?; + + Ok((r#async, alloc_tail)) + } +} + +unsafe impl CudaAsRust + for RustToCudaWithPortableBitCloneSemantics +{ + type RustRepresentation = Self; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let mut uninit = core::mem::MaybeUninit::uninit(); + core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); + uninit.assume_init() + } +} + +#[allow(clippy::module_name_repetitions)] +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct DeviceCopyWithPortableBitSemantics(T); + +unsafe impl rustacuda_core::DeviceCopy + for DeviceCopyWithPortableBitSemantics +{ +} + +impl From for DeviceCopyWithPortableBitSemantics { + fn from(value: T) -> Self { + Self(value) + } +} + +impl Deref for DeviceCopyWithPortableBitSemantics { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DeviceCopyWithPortableBitSemantics { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl DeviceCopyWithPortableBitSemantics { + #[must_use] + pub fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub const fn into_ref(&self) -> &T { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 361151ac2..097b4c0f4 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -1,47 +1,68 @@ +#[cfg(any(feature = "host", feature = "device"))] use core::{ borrow::{Borrow, BorrowMut}, 
convert::{AsMut, AsRef}, ops::{Deref, DerefMut}, }; -use rustacuda_core::DeviceCopy; +use const_type_layout::TypeLayout; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; #[repr(transparent)] -#[derive(Clone, TypeLayout)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, TypeLayout)] pub struct SplitSliceOverCudaThreadsConstStride(T); impl SplitSliceOverCudaThreadsConstStride { + #[cfg(feature = "host")] #[must_use] - pub fn new(inner: T) -> Self { + pub const fn new(inner: T) -> Self { Self(inner) } } -// Safety: If `T` is `DeviceCopy`, then the newtype struct also is `DeviceCopy` -unsafe impl DeviceCopy - for SplitSliceOverCudaThreadsConstStride -{ -} - -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_const_stride(slice: &[E]) -> &[E] { - let offset: usize = crate::device::utils::index() * STRIDE; + let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_const_stride_mut(slice: &mut [E]) -> &mut [E] { - let offset: usize = crate::device::utils::index() * STRIDE; + let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] +impl SplitSliceOverCudaThreadsConstStride { + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub const unsafe fn alias_unchecked(&self) -> &T { + &self.0 + } + + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. 
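+    ///
+    /// For illustration only, a hypothetical device-side sketch (not one of
+    /// this crate's doctests; the boxed `u32` slice payload is an assumption):
+    ///
+    /// ```rust,ignore
+    /// fn kernel(split: &mut SplitSliceOverCudaThreadsConstStride<Box<[u32]>, 2>) {
+    ///     // safe: `DerefMut` only exposes this thread's disjoint 2-element chunk
+    ///     for x in split.iter_mut() {
+    ///         *x = 42;
+    ///     }
+    ///     // Safety: only sound if no other thread accesses *any* element
+    ///     let whole: &mut Box<[u32]> = unsafe { split.alias_mut_unchecked() };
+    /// }
+    /// ```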
+ pub unsafe fn alias_mut_unchecked(&mut self) -> &mut T { + &mut self.0 + } +} + +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> Deref for SplitSliceOverCudaThreadsConstStride { @@ -52,7 +73,8 @@ impl, const STRIDE: usize> Deref } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> DerefMut for SplitSliceOverCudaThreadsConstStride { @@ -61,7 +83,8 @@ impl, const STRIDE: usize> DerefMut } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> AsRef<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -70,7 +93,8 @@ impl, const STRIDE: usize> AsRef<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> AsMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -79,7 +103,8 @@ impl, const STRIDE: usize> AsMut<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> Borrow<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -88,7 +113,8 @@ impl, const STRIDE: usize> Borrow<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> BorrowMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -97,7 +123,7 @@ impl, const STRIDE: usize> BorrowMut<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> Deref for SplitSliceOverCudaThreadsConstStride { @@ -108,7 +134,7 @@ impl, const STRIDE: usize> Deref } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> DerefMut for SplitSliceOverCudaThreadsConstStride { @@ -117,7 +143,7 @@ impl, const STRIDE: usize> DerefMut } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> AsRef<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -126,7 +152,7 @@ impl, const STRIDE: usize> AsRef<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> AsMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -135,7 +161,7 @@ impl, const STRIDE: usize> AsMut<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> Borrow<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -144,7 +170,7 @@ impl, const STRIDE: usize> Borrow<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> BorrowMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -156,21 +182,18 @@ impl, const STRIDE: usize> BorrowMut<[E]> unsafe impl RustToCuda for SplitSliceOverCudaThreadsConstStride { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] type CudaAllocation = T::CudaAllocation; type CudaRepresentation = SplitSliceOverCudaThreadsConstStride, STRIDE>; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow( 
+ unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow(alloc)?; @@ -181,23 +204,96 @@ unsafe impl RustToCuda } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.0.restore(alloc) } } +unsafe impl RustToCudaAsync + for SplitSliceOverCudaThreadsConstStride +{ + type CudaAllocationAsync = T::CudaAllocationAsync; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + crate::alloc::CombinedCudaAlloc, + )> { + let (r#async, alloc) = self.0.borrow_async(alloc, stream)?; + let (cuda_repr, completion) = unsafe { r#async.unwrap_unchecked()? }; + + let cuda_repr = + DeviceAccessible::from(SplitSliceOverCudaThreadsConstStride::new(cuda_repr)); + + let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) { + crate::utils::r#async::Async::pending( + cuda_repr, + stream, + crate::utils::r#async::NoCompletion, + )? + } else { + crate::utils::r#async::Async::ready(cuda_repr, stream) + }; + + Ok((r#async, alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: crate::alloc::CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + let (r#async, alloc_tail) = + T::restore_async(this.map_mut(|this| &mut this.0), alloc, stream)?; + + let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? 
}; + + std::mem::forget(inner); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending( + this, + stream, + Box::new(|this: &mut Self| on_completion(&mut this.0)), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = crate::utils::r#async::Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } + } +} + unsafe impl CudaAsRust for SplitSliceOverCudaThreadsConstStride, STRIDE> { type RustRepresentation = SplitSliceOverCudaThreadsConstStride; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - SplitSliceOverCudaThreadsConstStride::new(CudaAsRust::as_rust(&this.0)) + SplitSliceOverCudaThreadsConstStride(CudaAsRust::as_rust(&this.0)) } } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 8b0446e08..3928c87d1 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -1,47 +1,71 @@ +#[cfg(any(feature = "host", feature = "device"))] use core::{ borrow::{Borrow, BorrowMut}, convert::{AsMut, AsRef}, ops::{Deref, DerefMut}, }; -use rustacuda_core::DeviceCopy; +use const_type_layout::TypeLayout; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; #[repr(C)] -#[derive(Clone, TypeLayout)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, TypeLayout)] pub struct SplitSliceOverCudaThreadsDynamicStride { stride: usize, inner: T, } impl SplitSliceOverCudaThreadsDynamicStride { + #[cfg(feature = "host")] #[must_use] - pub fn new(inner: T, stride: usize) -> Self { + pub const fn new(inner: T, stride: usize) -> Self { Self { stride, inner } } } -// Safety: If `T` is `DeviceCopy`, then the newtype struct also is `DeviceCopy` -unsafe impl DeviceCopy for SplitSliceOverCudaThreadsDynamicStride {} - -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { - let offset: usize = crate::device::utils::index() * stride; + let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = slice.len().min(offset + stride).saturating_sub(offset); unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_dynamic_stride_mut(slice: &mut [E], stride: usize) -> &mut [E] { - let offset: usize = crate::device::utils::index() * stride; + let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = slice.len().min(offset + stride).saturating_sub(offset); unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] +impl SplitSliceOverCudaThreadsDynamicStride { + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub const unsafe fn alias_unchecked(&self) -> &T { + &self.inner + } + + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. 
+ /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub unsafe fn alias_mut_unchecked(&mut self) -> &mut T { + &mut self.inner + } +} + +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> Deref for SplitSliceOverCudaThreadsDynamicStride { type Target = [E]; @@ -50,42 +74,47 @@ impl> Deref for SplitSliceOverCudaThreadsDynamicStride } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> DerefMut for SplitSliceOverCudaThreadsDynamicStride { fn deref_mut(&mut self) -> &mut Self::Target { split_slice_dynamic_stride_mut(&mut self.inner, self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> AsRef<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_ref(&self) -> &[E] { split_slice_dynamic_stride(self.inner.as_ref(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> AsMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_mut(&mut self) -> &mut [E] { split_slice_dynamic_stride_mut(self.inner.as_mut(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> Borrow<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow(&self) -> &[E] { split_slice_dynamic_stride(self.inner.borrow(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow_mut(&mut self) -> &mut [E] { split_slice_dynamic_stride_mut(self.inner.borrow_mut(), self.stride) } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> Deref for SplitSliceOverCudaThreadsDynamicStride { type Target = [E]; @@ -94,35 +123,35 @@ impl> Deref for SplitSliceOverCudaThreadsDynamicStride } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> DerefMut for SplitSliceOverCudaThreadsDynamicStride { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.inner } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> AsRef<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_ref(&self) -> &[E] { self.inner.as_ref() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> AsMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_mut(&mut self) -> &mut [E] { self.inner.as_mut() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> Borrow<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow(&self) -> &[E] { self.inner.borrow() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow_mut(&mut self) -> &mut [E] { self.inner.borrow_mut() @@ -130,21 +159,18 @@ impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicSt } unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride { - #[cfg(feature = "host")] - 
#[doc(cfg(feature = "host"))] type CudaAllocation = T::CudaAllocation; type CudaRepresentation = SplitSliceOverCudaThreadsDynamicStride>; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.inner.borrow(alloc)?; @@ -158,23 +184,99 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.inner.restore(alloc) } } +unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDynamicStride { + type CudaAllocationAsync = T::CudaAllocationAsync; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( + &self, + alloc: A, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + crate::alloc::CombinedCudaAlloc, + )> { + let (r#async, alloc) = self.inner.borrow_async(alloc, stream)?; + let (cuda_repr, completion) = unsafe { r#async.unwrap_unchecked()? }; + + let cuda_repr = DeviceAccessible::from(SplitSliceOverCudaThreadsDynamicStride::new( + cuda_repr, + self.stride, + )); + + let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) { + crate::utils::r#async::Async::pending( + cuda_repr, + stream, + crate::utils::r#async::NoCompletion, + )? + } else { + crate::utils::r#async::Async::ready(cuda_repr, stream) + }; + + Ok((r#async, alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: crate::alloc::CombinedCudaAlloc, + stream: crate::host::Stream<'stream>, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + let (r#async, alloc_tail) = + T::restore_async(this.map_mut(|this| &mut this.inner), alloc, stream)?; + + let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? 
}; + + std::mem::forget(inner); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending( + this, + stream, + Box::new(|this: &mut Self| on_completion(&mut this.inner)), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = crate::utils::r#async::Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } + } +} + unsafe impl CudaAsRust for SplitSliceOverCudaThreadsDynamicStride> { type RustRepresentation = SplitSliceOverCudaThreadsDynamicStride; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - SplitSliceOverCudaThreadsDynamicStride::new(CudaAsRust::as_rust(&this.inner), this.stride) + SplitSliceOverCudaThreadsDynamicStride { + stride: this.stride, + inner: CudaAsRust::as_rust(&this.inner), + } } } diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs deleted file mode 100644 index f8d96d5e2..000000000 --- a/src/utils/aliasing/final.rs +++ /dev/null @@ -1,59 +0,0 @@ -use r#final::Final; - -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; - -#[doc(hidden)] -#[repr(transparent)] -#[derive(TypeLayout)] -#[allow(clippy::module_name_repetitions)] -pub struct FinalCudaRepresentation(DeviceAccessible); - -// Safety: If `T` is `CudaAsRust`, then the newtype struct is `DeviceCopy` -unsafe impl rustacuda_core::DeviceCopy for FinalCudaRepresentation {} - -unsafe impl RustToCuda for Final { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = T::CudaAllocation; - type CudaRepresentation = FinalCudaRepresentation; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::host::CombinedCudaAlloc, - )> { - let (cuda_repr, alloc) = (**self).borrow(alloc)?; - - Ok(( - DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)), - alloc, - )) - } - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( - &mut self, - alloc: crate::host::CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult { - // Safety: Final is a repr(transparent) newtype wrapper around T - let inner: &mut T = &mut *(self as *mut Self).cast(); - - inner.restore(alloc) - } -} - -unsafe impl CudaAsRust for FinalCudaRepresentation { - type RustRepresentation = Final; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - Final::new(CudaAsRust::as_rust(&this.0)) - } -} diff --git a/src/utils/aliasing/mod.rs b/src/utils/aliasing/mod.rs index de7c58e05..e7753cf92 100644 --- a/src/utils/aliasing/mod.rs +++ b/src/utils/aliasing/mod.rs @@ -1,8 +1,5 @@ mod r#const; mod dynamic; -mod r#final; pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; pub use r#const::SplitSliceOverCudaThreadsConstStride; - -pub(crate) use self::r#final::FinalCudaRepresentation; diff --git a/src/utils/alloc.rs b/src/utils/alloc.rs deleted file mode 100644 index 3bbcf225b..000000000 --- a/src/utils/alloc.rs +++ /dev/null @@ -1,67 +0,0 @@ -use alloc::alloc::{AllocError, Allocator, Layout}; -use core::ptr::NonNull; - -#[allow(clippy::module_name_repetitions)] -pub struct UnifiedAllocator; - -unsafe impl Allocator for UnifiedAllocator { 
- #[cfg(feature = "host")] - fn allocate(&self, layout: Layout) -> Result, AllocError> { - if layout.size() == 0 { - return Ok(NonNull::<[u8; 0]>::dangling()); - } - - match layout.align() { - 1 => alloc_unified_aligned::(layout.size()), - 2 => alloc_unified_aligned::(layout.size() >> 1), - 4 => alloc_unified_aligned::(layout.size() >> 2), - 8 => alloc_unified_aligned::(layout.size() >> 3), - _ => Err(AllocError), - } - } - - #[cfg(not(feature = "host"))] - fn allocate(&self, _layout: Layout) -> Result, AllocError> { - Err(AllocError) - } - - #[cfg(feature = "host")] - unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { - use rustacuda::{ - error::CudaResult, - memory::{cuda_free_unified, UnifiedPointer}, - }; - - if layout.size() == 0 { - return; - } - - let _: CudaResult<()> = cuda_free_unified(UnifiedPointer::wrap(ptr.as_ptr())); - } - - #[cfg(not(feature = "host"))] - unsafe fn deallocate(&self, _ptr: NonNull, _layout: Layout) { - // no-op - } -} - -#[cfg(feature = "host")] -fn alloc_unified_aligned( - size: usize, -) -> Result, AllocError> { - use rustacuda::memory::cuda_malloc_unified; - - match unsafe { cuda_malloc_unified::(size) } { - Ok(mut ptr) => { - let bytes: &mut [u8] = unsafe { - core::slice::from_raw_parts_mut( - ptr.as_raw_mut().cast(), - size * core::mem::align_of::(), - ) - }; - - NonNull::new(bytes).ok_or(AllocError) - }, - Err(_) => Err(AllocError), - } -} diff --git a/src/utils/async.rs b/src/utils/async.rs new file mode 100644 index 000000000..be4e2458c --- /dev/null +++ b/src/utils/async.rs @@ -0,0 +1,735 @@ +#[cfg(feature = "host")] +use std::{borrow::BorrowMut, future::Future, future::IntoFuture, marker::PhantomData, task::Poll}; + +#[cfg(feature = "host")] +use rustacuda::{ + error::CudaError, error::CudaResult, event::Event, event::EventFlags, + stream::StreamWaitEventFlags, +}; + +#[cfg(feature = "host")] +use crate::host::{CudaDropWrapper, Stream}; + +#[cfg(feature = "host")] +pub struct NoCompletion; +#[cfg(feature = "host")] +pub type CompletionFnMut<'a, T> = Box CudaResult<()> + 'a>; + +#[cfg(feature = "host")] +pub trait Completion>: sealed::Sealed { + type Completed: ?Sized; + + fn no_op() -> Self; + + #[doc(hidden)] + fn synchronize_on_drop(&self) -> bool; + + #[allow(clippy::missing_errors_doc)] // FIXME + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()>; +} +#[cfg(feature = "host")] +mod sealed { + pub trait Sealed {} +} + +#[cfg(feature = "host")] +impl Completion for NoCompletion { + type Completed = T; + + #[inline] + fn no_op() -> Self { + Self + } + + #[inline] + fn synchronize_on_drop(&self) -> bool { + false + } + + #[inline] + fn complete(self, _completed: &mut Self::Completed) -> CudaResult<()> { + Ok(()) + } +} +#[cfg(feature = "host")] +impl sealed::Sealed for NoCompletion {} + +#[cfg(feature = "host")] +impl<'a, T: ?Sized + BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { + type Completed = B; + + #[inline] + fn no_op() -> Self { + Box::new(|_value| Ok(())) + } + + #[inline] + fn synchronize_on_drop(&self) -> bool { + true + } + + #[inline] + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { + (self)(completed) + } +} +#[cfg(feature = "host")] +impl<'a, T: ?Sized> sealed::Sealed for CompletionFnMut<'a, T> {} + +#[cfg(feature = "host")] +impl, C: Completion> Completion for Option { + type Completed = C::Completed; + + #[inline] + fn no_op() -> Self { + None + } + + #[inline] + fn synchronize_on_drop(&self) -> bool { + self.as_ref().map_or(false, Completion::synchronize_on_drop) + 
} + + #[inline] + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { + self.map_or(Ok(()), |completion| completion.complete(completed)) + } +} +#[cfg(feature = "host")] +impl sealed::Sealed for Option {} + +#[cfg(feature = "host")] +pub struct Async<'a, 'stream, T: BorrowMut, C: Completion = NoCompletion> { + stream: Stream<'stream>, + value: T, + status: AsyncStatus<'a, T, C>, + _capture: PhantomData<&'a ()>, +} + +#[cfg(feature = "host")] +enum AsyncStatus<'a, T: BorrowMut, C: Completion> { + #[allow(clippy::type_complexity)] + Processing { + receiver: oneshot::Receiver>, + completion: C, + event: Option>, + _capture: PhantomData<&'a T>, + }, + Completed { + result: CudaResult<()>, + }, +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'stream, T, C> { + /// Wraps a `value` which is ready on `stream`. + #[must_use] + pub const fn ready(value: T, stream: Stream<'stream>) -> Self { + Self { + stream, + value, + status: AsyncStatus::Completed { result: Ok(()) }, + _capture: PhantomData::<&'a ()>, + } + } + + /// Wraps a still-pending `value` which is being computed on `stream` + /// such that its computation can be synchronised on. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + pub fn pending(value: T, stream: Stream<'stream>, completion: C) -> CudaResult { + let (sender, receiver) = oneshot::channel(); + stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + + Ok(Self { + stream, + value, + status: AsyncStatus::Processing { + receiver, + completion, + event: None, + _capture: PhantomData::<&'a T>, + }, + _capture: PhantomData::<&'a ()>, + }) + } + + /// Synchronises on this computation to block until it has completed and + /// the inner value can be safely returned and again be used in synchronous + /// operations. + /// + /// Calling `synchronize` after the computation has completed, e.g. after + /// calling [`rustacuda::stream::Stream::synchronize`], should be very + /// cheap. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + pub fn synchronize(self) -> CudaResult { + let (_stream, mut value, status) = self.destructure_into_parts(); + + let (receiver, completion) = match status { + AsyncStatus::Completed { result } => return result.map(|()| value), + AsyncStatus::Processing { + receiver, + completion, + event: _, + _capture, + } => (receiver, completion), + }; + + match receiver.recv() { + Ok(Ok(())) => (), + Ok(Err(err)) => return Err(err), + Err(oneshot::RecvError) => return Err(CudaError::AlreadyAcquired), + } + + completion.complete(value.borrow_mut())?; + + Ok(value) + } + + /// Moves the asynchronous data move to a different [`Stream`]. + /// + /// This method always adds a synchronisation barrier between the old and + /// and the new [`Stream`] to ensure that any usages of this [`Async`] + /// computations on the old [`Stream`] have completed before they can be + /// used on the new one. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. 
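+    ///
+    /// A hypothetical usage sketch (the two streams and `value` are
+    /// assumptions for illustration, not part of this diff):
+    ///
+    /// ```rust,ignore
+    /// // work on `value` was previously submitted on `stream_a`
+    /// let r#async = Async::pending(value, stream_a, NoCompletion)?;
+    /// // `stream_b` now waits on an event recorded on `stream_a` ...
+    /// let r#async = r#async.move_to_stream(stream_b)?;
+    /// // ... so synchronising the moved value also covers the old work
+    /// let value = r#async.synchronize()?;
+    /// ```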
+ pub fn move_to_stream<'stream_new>( + self, + stream: Stream<'stream_new>, + ) -> CudaResult> { + let (old_stream, mut value, status) = self.destructure_into_parts(); + + let completion = match status { + AsyncStatus::Completed { result } => { + result?; + C::no_op() + }, + AsyncStatus::Processing { + receiver, + completion, + event: _, + _capture, + } => match receiver.try_recv() { + Ok(Ok(())) => { + completion.complete(value.borrow_mut())?; + C::no_op() + }, + Ok(Err(err)) => return Err(err), + Err(oneshot::TryRecvError::Empty) => completion, + Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), + }, + }; + + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + event.record(&old_stream)?; + stream.wait_event(&event, StreamWaitEventFlags::DEFAULT)?; + + let (sender, receiver) = oneshot::channel(); + stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + + Ok(Async { + stream, + value, + status: AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData::<&'a T>, + }, + _capture: PhantomData::<&'a ()>, + }) + } + + #[allow(clippy::missing_errors_doc)] // FIXME + /// # Safety + /// + /// The returned inner value of type `T` may not yet have completed its + /// asynchronous work and may thus be in an inconsistent state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub unsafe fn unwrap_unchecked(self) -> CudaResult<(T, Option)> { + let (_stream, value, status) = self.destructure_into_parts(); + + match status { + AsyncStatus::Completed { result: Ok(()) } => Ok((value, None)), + AsyncStatus::Completed { result: Err(err) } => Err(err), + AsyncStatus::Processing { + receiver: _, + completion, + event: _, + _capture, + } => Ok((value, Some(completion))), + } + } + + pub const fn as_ref(&self) -> AsyncProj<'_, 'stream, &T> { + // Safety: this projection captures this async + unsafe { AsyncProj::new(&self.value, None) } + } + + pub fn as_mut(&mut self) -> AsyncProj<'_, 'stream, &mut T> { + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + &mut self.value, + Some(Box::new(|| { + let completion = match &mut self.status { + AsyncStatus::Completed { result } => { + (*result)?; + C::no_op() + }, + AsyncStatus::Processing { + receiver: _, + completion, + event: _, + _capture, + } => std::mem::replace(completion, C::no_op()), + }; + + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + + let (sender, receiver) = oneshot::channel(); + + self.stream + .add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + event.record(&self.stream)?; + + self.status = AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData::<&'a T>, + }; + + Ok(()) + })), + ) + } + } + + #[must_use] + fn destructure_into_parts(self) -> (Stream<'stream>, T, AsyncStatus<'a, T, C>) { + let this = std::mem::ManuallyDrop::new(self); + + // Safety: we destructure self into its droppable components, + // value and status, without dropping self itself + unsafe { + ( + this.stream, + std::ptr::read(&this.value), + (std::ptr::read(&this.status)), + ) + } + } +} + +#[cfg(feature = "host")] +impl< + 'a, + 'stream, + T: crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, + C: Completion>, + > Async<'a, 'stream, crate::host::HostAndDeviceConstRef<'a, T>, C> +where + 
crate::host::HostAndDeviceConstRef<'a, T>: BorrowMut, +{ + pub const fn extract_ref( + &self, + ) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceConstRef<'_, T>> { + // Safety: this projection captures this async + unsafe { AsyncProj::new(self.value.as_ref(), None) } + } +} + +#[cfg(feature = "host")] +impl< + 'a, + 'stream, + T: crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, + C: Completion>, + > Async<'a, 'stream, crate::host::HostAndDeviceMutRef<'a, T>, C> +where + crate::host::HostAndDeviceMutRef<'a, T>: BorrowMut, +{ + pub fn extract_ref(&self) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceConstRef<'_, T>> { + // Safety: this projection captures this async + unsafe { AsyncProj::new(self.value.as_ref(), None) } + } + + pub fn extract_mut( + &mut self, + ) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceMutRef<'_, T>> { + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + self.value.as_mut(), + Some(Box::new(|| { + let completion = match &mut self.status { + AsyncStatus::Completed { result } => { + (*result)?; + C::no_op() + }, + AsyncStatus::Processing { + receiver: _, + completion, + event: _, + _capture, + } => std::mem::replace(completion, C::no_op()), + }; + + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + + let (sender, receiver) = oneshot::channel(); + + self.stream + .add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + event.record(&self.stream)?; + + self.status = AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData, + }; + + Ok(()) + })), + ) + } + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Drop for Async<'a, 'stream, T, C> { + fn drop(&mut self) { + let AsyncStatus::Processing { + receiver, + completion, + event: _, + _capture, + } = std::mem::replace(&mut self.status, AsyncStatus::Completed { result: Ok(()) }) + else { + return; + }; + + if completion.synchronize_on_drop() && receiver.recv() == Ok(Ok(())) { + let _ = completion.complete(self.value.borrow_mut()); + } + } +} + +#[cfg(feature = "host")] +struct AsyncFuture<'a, 'stream, T: BorrowMut, C: Completion> { + _stream: PhantomData>, + value: Option, + completion: Option, + status: AsyncStatus<'a, T, NoCompletion>, +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Future + for AsyncFuture<'a, 'stream, T, C> +{ + type Output = CudaResult; + + fn poll( + self: core::pin::Pin<&mut Self>, + cx: &mut core::task::Context<'_>, + ) -> Poll { + // Safety: this function does not move out of `this` + let this = unsafe { self.get_unchecked_mut() }; + + match &mut this.status { + AsyncStatus::Processing { + receiver, + completion: _, + event: _, + _capture, + } => match std::pin::Pin::new(receiver).poll(cx) { + Poll::Ready(Ok(Ok(()))) => (), + Poll::Ready(Ok(Err(err))) => return Poll::Ready(Err(err)), + Poll::Ready(Err(oneshot::RecvError)) => { + return Poll::Ready(Err(CudaError::AlreadyAcquired)) + }, + Poll::Pending => return Poll::Pending, + }, + AsyncStatus::Completed { result: Ok(()) } => (), + AsyncStatus::Completed { result: Err(err) } => return Poll::Ready(Err(*err)), + } + + let Some(mut value) = this.value.take() else { + return Poll::Ready(Err(CudaError::AlreadyAcquired)); + }; + + if let Some(completion) = this.completion.take() { + completion.complete(value.borrow_mut())?; + } + + Poll::Ready(Ok(value)) + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> 
IntoFuture + for Async<'a, 'stream, T, C> +{ + type Output = CudaResult; + + type IntoFuture = impl Future; + + fn into_future(self) -> Self::IntoFuture { + let (_stream, value, status) = self.destructure_into_parts(); + + let (completion, status): (Option, AsyncStatus<'a, T, NoCompletion>) = match status { + AsyncStatus::Completed { result } => { + (None, AsyncStatus::Completed:: { result }) + }, + AsyncStatus::Processing { + receiver, + completion, + event, + _capture, + } => ( + Some(completion), + AsyncStatus::Processing:: { + receiver, + completion: NoCompletion, + event, + _capture: PhantomData::<&'a T>, + }, + ), + }; + + AsyncFuture { + _stream: PhantomData::>, + value: Some(value), + completion, + status, + } + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Drop + for AsyncFuture<'a, 'stream, T, C> +{ + fn drop(&mut self) { + let Some(mut value) = self.value.take() else { + return; + }; + + let AsyncStatus::Processing { + receiver, + completion: NoCompletion, + event: _, + _capture, + } = std::mem::replace(&mut self.status, AsyncStatus::Completed { result: Ok(()) }) + else { + return; + }; + + let Some(completion) = self.completion.take() else { + return; + }; + + if completion.synchronize_on_drop() && receiver.recv() == Ok(Ok(())) { + let _ = completion.complete(value.borrow_mut()); + } + } +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub struct AsyncProj<'a, 'stream, T: 'a> { + _capture: PhantomData<&'a ()>, + _stream: PhantomData>, + value: T, + use_callback: Option CudaResult<()> + 'a>>, +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { + #[must_use] + /// # Safety + /// + /// This projection must either capture an existing [`Async`] or come from + /// a source that ensures that the projected value can never (async) move + /// to a different [`Stream`]. + pub(crate) const unsafe fn new( + value: T, + use_callback: Option CudaResult<()> + 'a>>, + ) -> Self { + Self { + _capture: PhantomData::<&'a ()>, + _stream: PhantomData::>, + value, + use_callback, + } + } + + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub(crate) unsafe fn unwrap_unchecked(self) -> T { + self.value + } + + #[allow(clippy::type_complexity)] + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
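+    ///
+    /// Sketch of the intended crate-internal composition pattern (the
+    /// caller-side names are hypothetical):
+    ///
+    /// ```rust,ignore
+    /// // Safety: all further work on `value` stays on the same stream
+    /// let (value, use_callback) = unsafe { proj.unwrap_unchecked_with_use() };
+    /// /* enqueue more asynchronous work that mutates `value` */
+    /// if let Some(mut record_use) = use_callback {
+    ///     record_use()?; // report the mutable use back to the owning `Async`
+    /// }
+    /// ```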
+ pub(crate) unsafe fn unwrap_unchecked_with_use( + self, + ) -> (T, Option CudaResult<()> + 'a>>) { + (self.value, self.use_callback) + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { + #[must_use] + pub const fn proj_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::>, + value: &self.value, + use_callback: None, + } + } + + #[must_use] + pub fn proj_mut<'b>(&'b mut self) -> AsyncProj<'b, 'stream, &'b mut T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::>, + value: &mut self.value, + use_callback: self.use_callback.as_mut().map(|use_callback| { + let use_callback: Box CudaResult<()>> = Box::new(use_callback); + use_callback + }), + } + } + + pub(crate) fn record_mut_use(&mut self) -> CudaResult<()> { + self.use_callback + .as_mut() + .map_or(Ok(()), |use_callback| use_callback()) + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a T> { + #[must_use] + pub const fn as_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::>, + value: self.value, + use_callback: None, + } + } + + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub(crate) const unsafe fn unwrap_ref_unchecked(&self) -> &T { + self.value + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { + #[must_use] + pub fn as_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::>, + value: self.value, + use_callback: None, + } + } + + #[must_use] + pub fn as_mut<'b>(&'b mut self) -> AsyncProj<'b, 'stream, &'b mut T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::>, + value: self.value, + use_callback: self.use_callback.as_mut().map(|use_callback| { + let use_callback: Box CudaResult<()>> = Box::new(use_callback); + use_callback + }), + } + } + + #[allow(dead_code)] // FIXME + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub(crate) unsafe fn unwrap_ref_unchecked(&self) -> &T { + self.value + } + + #[allow(dead_code)] // FIXME + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
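+    ///
+    /// For example (hypothetical), a combinator might briefly reborrow the
+    /// projected value to enqueue one more operation on its stream:
+    ///
+    /// ```rust,ignore
+    /// // Safety: `inner` is only used for work submitted on the same stream
+    /// let inner: &mut T = unsafe { proj.unwrap_mut_unchecked() };
+    /// ```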
+ pub(crate) unsafe fn unwrap_mut_unchecked(&mut self) -> &mut T { + self.value + } +} diff --git a/src/utils/box.rs b/src/utils/box.rs deleted file mode 100644 index e3381f022..000000000 --- a/src/utils/box.rs +++ /dev/null @@ -1,83 +0,0 @@ -use alloc::boxed::Box; - -use const_type_layout::TypeGraphLayout; - -use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, - safety::SafeDeviceCopy, -}; - -#[cfg(feature = "host")] -use crate::{ - host::CombinedCudaAlloc, host::CudaAlloc, host::CudaDropWrapper, rustacuda::error::CudaResult, - rustacuda::memory::DeviceBox, utils::device_copy::SafeDeviceCopyWrapper, -}; - -#[doc(hidden)] -#[repr(transparent)] -#[derive(TypeLayout)] -#[allow(clippy::module_name_repetitions)] -pub struct BoxCudaRepresentation(*mut T) -where - T: SafeDeviceCopy + TypeGraphLayout; - -// Safety: This repr(C) struct only contains a device-owned pointer -unsafe impl rustacuda_core::DeviceCopy - for BoxCudaRepresentation -{ -} - -unsafe impl RustToCuda for Box { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = CudaDropWrapper>>; - type CudaRepresentation = BoxCudaRepresentation; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )> { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); - - Ok(( - DeviceAccessible::from(BoxCudaRepresentation( - device_box.as_device_ptr().as_raw_mut().cast(), - )), - CombinedCudaAlloc::new(device_box, alloc), - )) - } - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( - &mut self, - alloc: CombinedCudaAlloc, - ) -> CudaResult { - use rustacuda::memory::CopyDestination; - - let (alloc_front, alloc_tail) = alloc.split(); - - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut(&mut **self))?; - - core::mem::drop(alloc_front); - - Ok(alloc_tail) - } -} - -unsafe impl CudaAsRust for BoxCudaRepresentation { - type RustRepresentation = Box; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - alloc::boxed::Box::from_raw(this.0) - } -} diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs deleted file mode 100644 index 5ed008801..000000000 --- a/src/utils/boxed_slice.rs +++ /dev/null @@ -1,85 +0,0 @@ -use alloc::boxed::Box; - -use const_type_layout::TypeGraphLayout; - -use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, - safety::SafeDeviceCopy, -}; - -#[cfg(feature = "host")] -use crate::{ - host::CombinedCudaAlloc, host::CudaAlloc, host::CudaDropWrapper, rustacuda::error::CudaResult, - rustacuda::memory::DeviceBuffer, utils::device_copy::SafeDeviceCopyWrapper, -}; - -#[doc(hidden)] -#[allow(clippy::module_name_repetitions)] -#[derive(Debug, TypeLayout)] -#[repr(C)] -pub struct BoxedSliceCudaRepresentation(*mut T, usize) -where - T: SafeDeviceCopy + TypeGraphLayout; - -// Safety: This repr(C) struct only contains a device-owned pointer -unsafe impl rustacuda_core::DeviceCopy - for BoxedSliceCudaRepresentation -{ -} - -unsafe impl RustToCuda for Box<[T]> { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = CudaDropWrapper>>; - type CudaRepresentation = BoxedSliceCudaRepresentation; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, 
- ) -> CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )> { - let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( - SafeDeviceCopyWrapper::from_slice(self), - )?); - - Ok(( - DeviceAccessible::from(BoxedSliceCudaRepresentation( - device_buffer.as_mut_ptr().cast(), - device_buffer.len(), - )), - CombinedCudaAlloc::new(device_buffer, alloc), - )) - } - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( - &mut self, - alloc: CombinedCudaAlloc, - ) -> CudaResult { - use rustacuda::memory::CopyDestination; - - let (alloc_front, alloc_tail) = alloc.split(); - - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut_slice(self))?; - - core::mem::drop(alloc_front); - - Ok(alloc_tail) - } -} - -unsafe impl CudaAsRust for BoxedSliceCudaRepresentation { - type RustRepresentation = Box<[T]>; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) - } -} diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs deleted file mode 100644 index 289ef9969..000000000 --- a/src/utils/device_copy.rs +++ /dev/null @@ -1,112 +0,0 @@ -#![allow(clippy::trait_duplication_in_bounds)] - -use const_type_layout::TypeGraphLayout; - -use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, - safety::SafeDeviceCopy, -}; - -#[derive(Copy, Clone, Debug, TypeLayout)] -#[repr(transparent)] -pub struct SafeDeviceCopyWrapper(T) -where - T: SafeDeviceCopy + TypeGraphLayout; - -unsafe impl rustacuda_core::DeviceCopy - for SafeDeviceCopyWrapper -{ -} - -impl From for SafeDeviceCopyWrapper { - fn from(value: T) -> Self { - Self(value) - } -} - -impl SafeDeviceCopyWrapper { - pub fn into_inner(self) -> T { - self.0 - } - - pub fn from_ref(reference: &T) -> &Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { &*(reference as *const T).cast() } - } - - pub fn into_ref(&self) -> &T { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { &*(self as *const Self).cast() } - } - - pub fn from_mut(reference: &mut T) -> &mut Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { &mut *(reference as *mut T).cast() } - } - - pub fn into_mut(&mut self) -> &mut T { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { &mut *(self as *mut Self).cast() } - } - - pub fn from_slice(slice: &[T]) -> &[Self] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } - } - - pub fn into_slice(slice: &[Self]) -> &[T] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } - } - - pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } - } - - pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` - unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } - } -} - -unsafe impl RustToCuda for SafeDeviceCopyWrapper { - #[cfg(feature = "host")] - type CudaAllocation = crate::host::NullCudaAlloc; - type CudaRepresentation = Self; - - 
diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs
deleted file mode 100644
index 289ef9969..000000000
--- a/src/utils/device_copy.rs
+++ /dev/null
@@ -1,112 +0,0 @@
-#![allow(clippy::trait_duplication_in_bounds)]
-
-use const_type_layout::TypeGraphLayout;
-
-use crate::{
-    common::{CudaAsRust, DeviceAccessible, RustToCuda},
-    safety::SafeDeviceCopy,
-};
-
-#[derive(Copy, Clone, Debug, TypeLayout)]
-#[repr(transparent)]
-pub struct SafeDeviceCopyWrapper<T>(T)
-where
-    T: SafeDeviceCopy + TypeGraphLayout;
-
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy
-    for SafeDeviceCopyWrapper<T>
-{
-}
-
-impl<T: SafeDeviceCopy + TypeGraphLayout> From<T> for SafeDeviceCopyWrapper<T> {
-    fn from(value: T) -> Self {
-        Self(value)
-    }
-}
-
-impl<T: SafeDeviceCopy + TypeGraphLayout> SafeDeviceCopyWrapper<T> {
-    pub fn into_inner(self) -> T {
-        self.0
-    }
-
-    pub fn from_ref(reference: &T) -> &Self {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { &*(reference as *const T).cast() }
-    }
-
-    pub fn into_ref(&self) -> &T {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { &*(self as *const Self).cast() }
-    }
-
-    pub fn from_mut(reference: &mut T) -> &mut Self {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { &mut *(reference as *mut T).cast() }
-    }
-
-    pub fn into_mut(&mut self) -> &mut T {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { &mut *(self as *mut Self).cast() }
-    }
-
-    pub fn from_slice(slice: &[T]) -> &[Self] {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) }
-    }
-
-    pub fn into_slice(slice: &[Self]) -> &[T] {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) }
-    }
-
-    pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) }
-    }
-
-    pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T`
-        unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) }
-    }
-}
-
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for SafeDeviceCopyWrapper<T> {
-    #[cfg(feature = "host")]
-    type CudaAllocation = crate::host::NullCudaAlloc;
-    type CudaRepresentation = Self;
-
-    #[cfg(feature = "host")]
-    #[allow(clippy::type_complexity)]
-    unsafe fn borrow<A: crate::host::CudaAlloc>(
-        &self,
-        alloc: A,
-    ) -> rustacuda::error::CudaResult<(
-        DeviceAccessible<Self::CudaRepresentation>,
-        crate::host::CombinedCudaAlloc<A, Self::CudaAllocation>,
-    )> {
-        let alloc = crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc);
-        Ok((DeviceAccessible::from(&self.0), alloc))
-    }
-
-    #[cfg(feature = "host")]
-    #[doc(cfg(feature = "host"))]
-    unsafe fn restore<A: crate::host::CudaAlloc>(
-        &mut self,
-        alloc: crate::host::CombinedCudaAlloc<A, Self::CudaAllocation>,
-    ) -> rustacuda::error::CudaResult<A> {
-        let (_alloc_front, alloc_tail): (crate::host::NullCudaAlloc, A) = alloc.split();
-
-        Ok(alloc_tail)
-    }
-}
-
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust for SafeDeviceCopyWrapper<T> {
-    type RustRepresentation = Self;
-
-    #[cfg(any(not(feature = "host"), doc))]
-    unsafe fn as_rust(this: &DeviceAccessible<Self>) -> Self::RustRepresentation {
-        let mut uninit = core::mem::MaybeUninit::uninit();
-        core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1);
-        uninit.assume_init()
-    }
-}
diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs
index a153da4d0..079dba419 100644
--- a/src/utils/exchange/buffer/common.rs
+++ b/src/utils/exchange/buffer/common.rs
@@ -1,7 +1,10 @@
-use const_type_layout::TypeGraphLayout;
-use rustacuda_core::DeviceCopy;
+use const_type_layout::{TypeGraphLayout, TypeLayout};
 
-use crate::{common::CudaAsRust, safety::SafeDeviceCopy};
+use crate::{
+    lend::CudaAsRust,
+    safety::{PortableBitSemantics, StackOnly},
+    utils::ffi::DeviceMutPointer,
+};
 
 use super::{CudaExchangeBuffer, CudaExchangeItem};
 
@@ -9,30 +12,30 @@ use super::{CudaExchangeBuffer, CudaExchangeItem};
 #[doc(hidden)]
 #[derive(TypeLayout)]
 #[repr(C)]
-pub struct CudaExchangeBufferCudaRepresentation<T, const M2D: bool, const M2H: bool>(
-    pub(super) *mut CudaExchangeItem<T, M2D, M2H>,
+pub struct CudaExchangeBufferCudaRepresentation<
+    T: StackOnly + PortableBitSemantics + TypeGraphLayout,
+    const M2D: bool,
+    const M2H: bool,
+>(
+    pub(super) DeviceMutPointer<CudaExchangeItem<T, M2D, M2H>>,
     pub(super) usize,
-)
-where
-    T: SafeDeviceCopy + TypeGraphLayout;
+);
 
-// Safety: `CudaExchangeBufferCudaRepresentation` is `DeviceCopy`
-// iff `T` is `SafeDeviceCopy`
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> DeviceCopy
-    for CudaExchangeBufferCudaRepresentation<T, M2D, M2H>
-{
-}
-
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> CudaAsRust
-    for CudaExchangeBufferCudaRepresentation<T, M2D, M2H>
+unsafe impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    CudaAsRust for CudaExchangeBufferCudaRepresentation<T, M2D, M2H>
 {
     type RustRepresentation = CudaExchangeBuffer<T, M2D, M2H>;
 
-    #[cfg(any(not(feature = "host"), doc))]
-    #[doc(cfg(not(feature = "host")))]
-    unsafe fn as_rust(this: &crate::common::DeviceAccessible<Self>) -> Self::RustRepresentation {
-        CudaExchangeBuffer(core::mem::ManuallyDrop::new(alloc::boxed::Box::from_raw(
-            core::slice::from_raw_parts_mut(this.0, this.1),
-        )))
+    #[cfg(feature = "device")]
+    unsafe fn as_rust(
+        this: &crate::utils::ffi::DeviceAccessible<Self>,
+    ) -> Self::RustRepresentation {
+        CudaExchangeBuffer {
+            inner: super::device::CudaExchangeBufferDevice(core::mem::ManuallyDrop::new(
+                crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(
+                    this.0 .0, this.1,
+                )),
+            )),
+        }
+    }
 }
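A std-only sketch of the `repr(transparent)` reference casts that the deleted `SafeDeviceCopyWrapper` (and the adapters that replace it) rely on: a transparent newtype shares its inner type's layout, so references and slices can be reinterpreted in place.

#[repr(transparent)]
struct Wrapper(u64);

fn from_ref(reference: &u64) -> &Wrapper {
    // Safety: `Wrapper` is a transparent newtype around `u64`, so both types
    // have identical size and alignment
    unsafe { &*core::ptr::from_ref(reference).cast() }
}

fn from_slice(slice: &[u64]) -> &[Wrapper] {
    // Safety: as above, the element layouts are identical
    unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) }
}

fn main() {
    let values = [1_u64, 2, 3];
    assert_eq!(from_ref(&values[0]).0, 1);
    assert_eq!(from_slice(&values).len(), 3);
}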
diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs
index d284e1193..5083263b3 100644
--- a/src/utils/exchange/buffer/device.rs
+++ b/src/utils/exchange/buffer/device.rs
@@ -2,23 +2,21 @@ use core::ops::{Deref, DerefMut};
 
 use const_type_layout::TypeGraphLayout;
 
-use crate::{common::RustToCuda, safety::SafeDeviceCopy};
+use crate::{
+    deps::alloc::boxed::Box,
+    safety::{PortableBitSemantics, StackOnly},
+};
 
-use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem};
+use super::CudaExchangeItem;
 
 #[allow(clippy::module_name_repetitions)]
-#[doc(cfg(not(feature = "host")))]
-/// When the `host` feature is set,
-/// [`CudaExchangeBuffer`](super::CudaExchangeBuffer)
-/// refers to
-/// [`CudaExchangeBufferHost`](super::CudaExchangeBufferHost)
-/// instead.
-/// [`CudaExchangeBufferDevice`](Self) is never exposed directly.
-pub struct CudaExchangeBufferDevice<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool>(
-    pub(super) core::mem::ManuallyDrop<alloc::boxed::Box<[CudaExchangeItem<T, M2D, M2H>]>>,
-);
+pub struct CudaExchangeBufferDevice<
+    T: StackOnly + PortableBitSemantics + TypeGraphLayout,
+    const M2D: bool,
+    const M2H: bool,
+>(pub(super) core::mem::ManuallyDrop<Box<[CudaExchangeItem<T, M2D, M2H>]>>);
 
-impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> Deref
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool> Deref
     for CudaExchangeBufferDevice<T, M2D, M2H>
 {
     type Target = [CudaExchangeItem<T, M2D, M2H>];
@@ -28,17 +26,10 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, co
     }
 }
 
-impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> DerefMut
-    for CudaExchangeBufferDevice<T, M2D, M2H>
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    DerefMut for CudaExchangeBufferDevice<T, M2D, M2H>
 {
     fn deref_mut(&mut self) -> &mut Self::Target {
         &mut self.0
     }
 }
-
-#[cfg(not(all(doc, feature = "host")))]
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> RustToCuda
-    for CudaExchangeBufferDevice<T, M2D, M2H>
-{
-    type CudaRepresentation = CudaExchangeBufferCudaRepresentation<T, M2D, M2H>;
-}
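A std-only sketch of why the device-side buffer above lives inside a `ManuallyDrop`: the `Box` is conjured from a device pointer that the host still owns, so its destructor must never run in the kernel.

use core::mem::ManuallyDrop;

fn main() {
    let mut suppressed: ManuallyDrop<Box<[u8]>> =
        ManuallyDrop::new(vec![0_u8; 4].into_boxed_slice());
    // The contents stay fully usable through Deref/DerefMut...
    suppressed[0] = 42;
    assert_eq!(suppressed[0], 42);
    // ...but the drop is suppressed; here we free explicitly to avoid a leak,
    // which the real device code deliberately never does
    unsafe { ManuallyDrop::drop(&mut suppressed) };
}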
diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs
index ad522629f..e252d0ce7 100644
--- a/src/utils/exchange/buffer/host.rs
+++ b/src/utils/exchange/buffer/host.rs
@@ -1,5 +1,4 @@
-use alloc::vec::Vec;
-use core::{
+use std::{
     cell::UnsafeCell,
     ops::{Deref, DerefMut},
 };
@@ -11,40 +10,51 @@ use rustacuda::{
 };
 
 use crate::{
-    common::{DeviceAccessible, RustToCuda},
-    host::{CombinedCudaAlloc, CudaAlloc, CudaDropWrapper, NullCudaAlloc},
-    safety::SafeDeviceCopy,
+    alloc::{CombinedCudaAlloc, CudaAlloc, NoCudaAlloc},
+    host::CudaDropWrapper,
+    safety::{PortableBitSemantics, StackOnly},
+    utils::{
+        adapter::DeviceCopyWithPortableBitSemantics,
+        ffi::{DeviceAccessible, DeviceMutPointer},
+        r#async::{Async, CompletionFnMut, NoCompletion},
+    },
 };
 
 use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem};
 
 #[allow(clippy::module_name_repetitions)]
-#[doc(cfg(feature = "host"))]
-/// When the `host` feature is **not** set,
-/// [`CudaExchangeBuffer`](super::CudaExchangeBuffer)
-/// refers to
-/// [`CudaExchangeBufferDevice`](super::CudaExchangeBufferDevice)
-/// instead.
-/// [`CudaExchangeBufferHost`](Self) is never exposed directly.
 pub struct CudaExchangeBufferHost<
-    T: SafeDeviceCopy + TypeGraphLayout,
+    T: StackOnly + PortableBitSemantics + TypeGraphLayout,
     const M2D: bool,
    const M2H: bool,
 > {
-    host_buffer: CudaDropWrapper<LockedBuffer<CudaExchangeItem<T, M2D, M2H>>>,
-    device_buffer: UnsafeCell<CudaDropWrapper<DeviceBuffer<CudaExchangeItem<T, M2D, M2H>>>>,
+    host_buffer: CudaDropWrapper<
+        LockedBuffer<DeviceCopyWithPortableBitSemantics<CudaExchangeItem<T, M2D, M2H>>>,
+    >,
+    device_buffer: UnsafeCell<
+        CudaDropWrapper<
+            DeviceBuffer<DeviceCopyWithPortableBitSemantics<CudaExchangeItem<T, M2D, M2H>>>,
+        >,
+    >,
 }
 
-impl<T: Clone + SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool>
-    CudaExchangeBufferHost<T, M2D, M2H>
+impl<
+        T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout,
+        const M2D: bool,
+        const M2H: bool,
+    > CudaExchangeBufferHost<T, M2D, M2H>
 {
     /// # Errors
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
     pub fn new(elem: &T, capacity: usize) -> CudaResult<Self> {
         // Safety: CudaExchangeItem is a `repr(transparent)` wrapper around T
-        let elem: &CudaExchangeItem<T, M2D, M2H> = unsafe { &*(elem as *const T).cast() };
+        let elem: &CudaExchangeItem<T, M2D, M2H> = unsafe { &*std::ptr::from_ref(elem).cast() };
 
-        let host_buffer = CudaDropWrapper::from(LockedBuffer::new(elem, capacity)?);
+        let host_buffer = CudaDropWrapper::from(LockedBuffer::new(
+            DeviceCopyWithPortableBitSemantics::from_ref(elem),
+            capacity,
+        )?);
         let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice(
             host_buffer.as_slice(),
         )?));
@@ -56,20 +66,30 @@ impl<T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D:
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
     CudaExchangeBufferHost<T, M2D, M2H>
 {
     /// # Errors
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
     pub fn from_vec(vec: Vec<T>) -> CudaResult<Self> {
-        let mut host_buffer_uninit =
-            CudaDropWrapper::from(unsafe { LockedBuffer::uninitialized(vec.len())? });
+        let host_buffer = unsafe {
+            let mut uninit: CudaDropWrapper<
+                LockedBuffer<DeviceCopyWithPortableBitSemantics<CudaExchangeItem<T, M2D, M2H>>>,
+            > = CudaDropWrapper::from(LockedBuffer::uninitialized(vec.len())?);
 
-        for (src, dst) in vec.into_iter().zip(host_buffer_uninit.iter_mut()) {
-            *dst = CudaExchangeItem(src);
-        }
+            let uninit_ptr: *mut DeviceCopyWithPortableBitSemantics<CudaExchangeItem<T, M2D, M2H>> =
+                uninit.as_mut_ptr();
+
+            for (i, src) in vec.into_iter().enumerate() {
+                uninit_ptr
+                    .add(i)
+                    .write(DeviceCopyWithPortableBitSemantics::from(CudaExchangeItem(
+                        src,
+                    )));
+            }
 
-        let host_buffer = host_buffer_uninit;
+            uninit
+        };
 
         let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice(
             host_buffer.as_slice(),
@@ -82,37 +102,34 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, co
     }
 }
 
-impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> Deref
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool> Deref
     for CudaExchangeBufferHost<T, M2D, M2H>
 {
     type Target = [CudaExchangeItem<T, M2D, M2H>];
 
     fn deref(&self) -> &Self::Target {
-        self.host_buffer.as_slice()
+        DeviceCopyWithPortableBitSemantics::into_slice(self.host_buffer.as_slice())
     }
 }
 
-impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> DerefMut
-    for CudaExchangeBufferHost<T, M2D, M2H>
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
    DerefMut for CudaExchangeBufferHost<T, M2D, M2H>
 {
     fn deref_mut(&mut self) -> &mut Self::Target {
-        self.host_buffer.as_mut_slice()
+        DeviceCopyWithPortableBitSemantics::into_mut_slice(self.host_buffer.as_mut_slice())
     }
 }
 
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool> RustToCuda
-    for CudaExchangeBufferHost<T, M2D, M2H>
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    CudaExchangeBufferHost<T, M2D, M2H>
 {
-    type CudaAllocation = NullCudaAlloc;
-    type CudaRepresentation = CudaExchangeBufferCudaRepresentation<T, M2D, M2H>;
-
     #[allow(clippy::type_complexity)]
-    unsafe fn borrow<A: CudaAlloc>(
+    pub unsafe fn borrow<A: CudaAlloc>(
         &self,
         alloc: A,
     ) -> rustacuda::error::CudaResult<(
-        DeviceAccessible<Self::CudaRepresentation>,
-        CombinedCudaAlloc<A, Self::CudaAllocation>,
+        DeviceAccessible<CudaExchangeBufferCudaRepresentation<T, M2D, M2H>>,
+        CombinedCudaAlloc<A, NoCudaAlloc>,
     )> {
         // Safety: device_buffer is inside an UnsafeCell
         // borrow checks must be satisfied through LendToCuda
@@ -129,17 +146,17 @@ unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bo
-    unsafe fn restore<A: CudaAlloc>(
+    pub unsafe fn restore<A: CudaAlloc>(
         &mut self,
-        alloc: CombinedCudaAlloc<A, Self::CudaAllocation>,
+        alloc: CombinedCudaAlloc<A, NoCudaAlloc>,
     ) -> rustacuda::error::CudaResult<A> {
         let (_alloc_front, alloc_tail) = alloc.split();
@@ -155,3 +172,76 @@ unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bo
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    CudaExchangeBufferHost<T, M2D, M2H>
+{
+    #[allow(clippy::type_complexity)]
+    pub unsafe fn borrow_async<'stream, A: CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'_, 'stream, DeviceAccessible<CudaExchangeBufferCudaRepresentation<T, M2D, M2H>>>,
+        CombinedCudaAlloc<A, NoCudaAlloc>,
+    )> {
+        // Safety: device_buffer is inside an UnsafeCell
+        // borrow checks must be satisfied through LendToCuda
+        let device_buffer = &mut *self.device_buffer.get();
+
+        if M2D {
+            // Only move the buffer contents to the device if needed
+
+            rustacuda::memory::AsyncCopyDestination::async_copy_from(
+                &mut ***device_buffer,
+                self.host_buffer.as_slice(),
+                &stream,
+            )?;
+        }
+
+        let cuda_repr = DeviceAccessible::from(CudaExchangeBufferCudaRepresentation(
+            DeviceMutPointer(device_buffer.as_mut_ptr().cast()),
+            device_buffer.len(),
+        ));
+
+        let r#async = if M2D {
+            Async::pending(cuda_repr, stream, NoCompletion)?
+        } else {
+            Async::ready(cuda_repr, stream)
+        };
+
+        Ok((r#async, CombinedCudaAlloc::new(NoCudaAlloc, alloc)))
+    }
+
+    #[allow(clippy::type_complexity)]
+    pub unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>(
+        mut this: owning_ref::BoxRefMut<'a, O, Self>,
+        alloc: CombinedCudaAlloc<A, NoCudaAlloc>,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
+        A,
+    )> {
+        let (_alloc_front, alloc_tail) = alloc.split();
+
+        if M2H {
+            // Only move the buffer contents back to the host if needed
+
+            let this: &mut Self = &mut this;
+
+            rustacuda::memory::AsyncCopyDestination::async_copy_to(
+                &***this.device_buffer.get_mut(),
+                this.host_buffer.as_mut_slice(),
+                &stream,
+            )?;
+        }
+
+        let r#async = if M2H {
+            Async::<_, CompletionFnMut<'a, Self>>::pending(this, stream, Box::new(|_this| Ok(())))?
+        } else {
+            Async::ready(this, stream)
+        };
+
+        Ok((r#async, alloc_tail))
+    }
+}
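A hedged usage sketch of the host-side constructors above (this assumes the rust_cuda crate with the `host` feature and an already-initialised CUDA context; it is not runnable standalone).

use rust_cuda::utils::exchange::buffer::CudaExchangeBuffer;

fn demo() -> rustacuda::error::CudaResult<()> {
    // 1024 items cloned from one element, moved host -> device only
    let _input: CudaExchangeBuffer<f32, true, false> =
        CudaExchangeBuffer::new(&0.0_f32, 1024)?;

    // contents taken from a Vec, moved in both directions
    let _inout: CudaExchangeBuffer<u32, true, true> =
        CudaExchangeBuffer::from_vec(vec![0; 256])?;

    Ok(())
}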
diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs
index 3648f9d04..1736b30ea 100644
--- a/src/utils/exchange/buffer/mod.rs
+++ b/src/utils/exchange/buffer/mod.rs
@@ -1,62 +1,296 @@
+#[cfg(any(feature = "host", feature = "device"))]
+use core::{
+    mem::MaybeUninit,
+    ops::{Deref, DerefMut},
+};
+
+use const_type_layout::TypeLayout;
+
+use const_type_layout::TypeGraphLayout;
+
+use crate::safety::{PortableBitSemantics, StackOnly};
+
+#[cfg(any(feature = "host", feature = "device"))]
+use crate::{
+    alloc::NoCudaAlloc,
+    lend::{RustToCuda, RustToCudaAsync},
+};
+
+#[cfg(feature = "host")]
+use crate::{
+    alloc::{CombinedCudaAlloc, CudaAlloc},
+    utils::ffi::DeviceAccessible,
+    utils::r#async::{Async, CompletionFnMut},
+};
+
+#[cfg(any(feature = "host", feature = "device"))]
+use self::common::CudaExchangeBufferCudaRepresentation;
+
+#[cfg(any(feature = "host", feature = "device"))]
 mod common;
-#[cfg(any(not(feature = "host"), doc))]
+#[cfg(feature = "device")]
 mod device;
 #[cfg(feature = "host")]
 mod host;
 
-#[cfg(not(feature = "host"))]
+#[cfg(any(feature = "host", feature = "device"))]
 #[allow(clippy::module_name_repetitions)]
-pub use device::CudaExchangeBufferDevice as CudaExchangeBuffer;
+pub struct CudaExchangeBuffer<
+    T: StackOnly + PortableBitSemantics + TypeGraphLayout,
+    const M2D: bool,
+    const M2H: bool,
+> {
+    #[cfg(feature = "host")]
+    inner: host::CudaExchangeBufferHost<T, M2D, M2H>,
+    #[cfg(all(feature = "device", not(feature = "host")))]
+    inner: device::CudaExchangeBufferDevice<T, M2D, M2H>,
+}
+
+#[cfg(any(feature = "host", feature = "device"))]
+unsafe impl<
+        T: StackOnly + PortableBitSemantics + TypeGraphLayout + Sync,
+        const M2D: bool,
+        const M2H: bool,
+    > Sync for CudaExchangeBuffer<T, M2D, M2H>
+{
+}
+
 #[cfg(feature = "host")]
-#[allow(clippy::module_name_repetitions)]
-pub use host::CudaExchangeBufferHost as CudaExchangeBuffer;
+impl<
+        T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout,
+        const M2D: bool,
+        const M2H: bool,
+    > CudaExchangeBuffer<T, M2D, M2H>
+{
+    /// # Errors
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    pub fn new(elem: &T, capacity: usize) -> rustacuda::error::CudaResult<Self> {
+        Ok(Self {
+            inner: host::CudaExchangeBufferHost::new(elem, capacity)?,
+        })
+    }
+}
+
+#[cfg(feature = "host")]
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    CudaExchangeBuffer<T, M2D, M2H>
+{
+    /// # Errors
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    pub fn from_vec(vec: Vec<T>) -> rustacuda::error::CudaResult<Self> {
+        Ok(Self {
+            inner: host::CudaExchangeBufferHost::from_vec(vec)?,
+        })
+    }
+}
 
-#[cfg(doc)]
-pub use self::{device::CudaExchangeBufferDevice, host::CudaExchangeBufferHost};
+#[cfg(any(feature = "host", feature = "device"))]
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool> Deref
+    for CudaExchangeBuffer<T, M2D, M2H>
+{
+    type Target = [CudaExchangeItem<T, M2D, M2H>];
 
-use crate::safety::SafeDeviceCopy;
+    fn deref(&self) -> &Self::Target {
+        &self.inner
+    }
+}
 
-#[repr(transparent)]
-#[derive(Clone, Copy, TypeLayout)]
-pub struct CudaExchangeItem<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool>(T);
+#[cfg(any(feature = "host", feature = "device"))]
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    DerefMut for CudaExchangeBuffer<T, M2D, M2H>
+{
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.inner
+    }
+}
 
-// Safety: Transparent newtype wrapper around `SafeDeviceCopy`
-// is `DeviceCopy`
-unsafe impl<T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool>
-    rustacuda_core::DeviceCopy for CudaExchangeItem<T, M2D, M2H>
+#[cfg(any(feature = "host", feature = "device"))]
+unsafe impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    RustToCuda for CudaExchangeBuffer<T, M2D, M2H>
 {
+    type CudaAllocation = NoCudaAlloc;
+    type CudaRepresentation = CudaExchangeBufferCudaRepresentation<T, M2D, M2H>;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow<A: CudaAlloc>(
+        &self,
+        alloc: A,
+    ) -> rustacuda::error::CudaResult<(
+        DeviceAccessible<Self::CudaRepresentation>,
+        CombinedCudaAlloc<A, Self::CudaAllocation>,
+    )> {
+        self.inner.borrow(alloc)
+    }
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn restore<A: CudaAlloc>(
+        &mut self,
+        alloc: CombinedCudaAlloc<A, Self::CudaAllocation>,
+    ) -> rustacuda::error::CudaResult<A> {
+        self.inner.restore(alloc)
+    }
 }
 
+#[cfg(any(feature = "host", feature = "device"))]
+unsafe impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    RustToCudaAsync for CudaExchangeBuffer<T, M2D, M2H>
+{
+    type CudaAllocationAsync = NoCudaAlloc;
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn borrow_async<'stream, A: CudaAlloc>(
+        &self,
+        alloc: A,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
+        CombinedCudaAlloc<A, Self::CudaAllocationAsync>,
+    )> {
+        self.inner.borrow_async(alloc, stream)
+    }
+
+    #[cfg(feature = "host")]
+    #[allow(clippy::type_complexity)]
+    unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>(
+        this: owning_ref::BoxRefMut<'a, O, Self>,
+        alloc: CombinedCudaAlloc<A, Self::CudaAllocationAsync>,
+        stream: crate::host::Stream<'stream>,
+    ) -> rustacuda::error::CudaResult<(
+        Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
+        A,
+    )> {
+        let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) };
+
+        let (r#async, alloc_tail) = host::CudaExchangeBufferHost::restore_async(
+            this.map_mut(|this| &mut this.inner),
+            alloc,
+            stream,
+        )?;
+
+        let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? };
+
+        std::mem::forget(inner);
+        let this = std::mem::ManuallyDrop::into_inner(this_backup);
+
+        if let Some(on_completion) = on_completion {
+            let r#async = Async::<_, CompletionFnMut<'a, Self>>::pending(
+                this,
+                stream,
+                Box::new(|this: &mut Self| on_completion(&mut this.inner)),
+            )?;
+            Ok((r#async, alloc_tail))
+        } else {
+            let r#async = Async::ready(this, stream);
+            Ok((r#async, alloc_tail))
+        }
+    }
+}
+
+#[repr(transparent)]
+#[derive(Clone, Copy, TypeLayout)]
+pub struct CudaExchangeItem<
+    T: StackOnly + PortableBitSemantics + TypeGraphLayout,
+    const M2D: bool,
+    const M2H: bool,
+>(T);
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool>
+    CudaExchangeItem<T, M2D, true>
+{
+    #[cfg(feature = "host")]
+    pub const fn read(&self) -> &T {
         &self.0
     }
 
-    #[cfg(any(not(feature = "host"), doc))]
-    #[doc(cfg(not(feature = "host")))]
+    #[cfg(feature = "device")]
     pub fn write(&mut self, value: T) {
         self.0 = value;
     }
 }
 
-impl<T: SafeDeviceCopy + TypeGraphLayout, const M2H: bool> CudaExchangeItem<T, true, M2H> {
-    #[cfg(any(not(feature = "host"), doc))]
-    #[doc(cfg(not(feature = "host")))]
-    pub fn read(&self) -> &T {
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2H: bool>
+    CudaExchangeItem<T, true, M2H>
+{
+    #[cfg(feature = "device")]
+    pub const fn read(&self) -> &T {
         &self.0
     }
 
-    #[cfg(any(feature = "host", doc))]
-    #[doc(cfg(feature = "host"))]
+    #[cfg(feature = "host")]
     pub fn write(&mut self, value: T) {
         self.0 = value;
     }
 }
 
-impl<T: SafeDeviceCopy + TypeGraphLayout> AsMut<T> for CudaExchangeItem<T, true, true> {
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout> AsMut<T>
+    for CudaExchangeItem<T, true, true>
+{
     fn as_mut(&mut self) -> &mut T {
         &mut self.0
     }
 }
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2H: bool>
+    CudaExchangeItem<T, false, M2H>
+{
+    #[cfg(feature = "host")]
+    pub const fn as_scratch(&self) -> &T {
+        &self.0
+    }
+
+    #[cfg(feature = "host")]
+    pub fn as_scratch_mut(&mut self) -> &mut T {
+        &mut self.0
+    }
+}
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool>
+    CudaExchangeItem<T, M2D, false>
+{
+    #[cfg(feature = "device")]
+    pub const fn as_scratch(&self) -> &T {
+        &self.0
+    }
+
+    #[cfg(feature = "device")]
+    pub fn as_scratch_mut(&mut self) -> &mut T {
+        &mut self.0
+    }
+}
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout> CudaExchangeItem<T, true, false> {
+    #[cfg(feature = "host")]
+    pub const fn as_uninit(&self) -> &MaybeUninit<T> {
+        // Safety:
+        // - MaybeUninit is a transparent newtype union
+        // - CudaExchangeItem is a transparent newtype
+        unsafe { &*core::ptr::from_ref(self).cast() }
+    }
+
+    #[cfg(feature = "host")]
+    pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit<T> {
+        // Safety:
+        // - MaybeUninit is a transparent newtype union
+        // - CudaExchangeItem is a transparent newtype
+        unsafe { &mut *core::ptr::from_mut(self).cast() }
+    }
+}
+
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout> CudaExchangeItem<T, false, true> {
+    #[cfg(feature = "device")]
+    pub const fn as_uninit(&self) -> &MaybeUninit<T> {
+        // Safety:
+        // - MaybeUninit is a transparent newtype union
+        // - CudaExchangeItem is a transparent newtype
+        unsafe { &*core::ptr::from_ref(self).cast() }
+    }
+
+    #[cfg(feature = "device")]
+    pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit<T> {
+        // Safety:
+        // - MaybeUninit is a transparent newtype union
+        // - CudaExchangeItem is a transparent newtype
+        unsafe { &mut *core::ptr::from_mut(self).cast() }
+    }
+}
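A hedged sketch of the const-generic direction gating above (assumes the rust_cuda crate with the `host` feature; the device half is shown as comments because those methods only exist under the `device` feature).

use rust_cuda::utils::exchange::buffer::CudaExchangeItem;

fn host_side(
    input: &mut CudaExchangeItem<u32, true, false>,
    output: &CudaExchangeItem<u32, false, true>,
) {
    input.write(42); // host may write: this value moves host -> device
    let _result = output.read(); // host may read: this value moved device -> host

    // On the device the capabilities are mirrored:
    //   input.read()    -- M2D items are readable in the kernel
    //   output.write(v) -- M2H items are writable in the kernel
}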
diff --git a/src/utils/exchange/mod.rs b/src/utils/exchange/mod.rs
index ffca4bbf3..722e02559 100644
--- a/src/utils/exchange/mod.rs
+++ b/src/utils/exchange/mod.rs
@@ -1,5 +1,4 @@
 pub mod buffer;
 
 #[cfg(feature = "host")]
-#[doc(cfg(feature = "host"))]
 pub mod wrapper;
diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs
index 26958f491..bb137a4af 100644
--- a/src/utils/exchange/wrapper.rs
+++ b/src/utils/exchange/wrapper.rs
@@ -1,56 +1,152 @@
-use core::ops::{Deref, DerefMut};
+use std::ops::{Deref, DerefMut};
 
-use rustacuda::{error::CudaResult, memory::DeviceBox};
+use rustacuda::{
+    error::CudaResult,
+    memory::{AsyncCopyDestination, CopyDestination, DeviceBox, LockedBox},
+};
 
 use crate::{
-    common::{DeviceAccessible, RustToCuda},
-    host::{
-        CombinedCudaAlloc, EmptyCudaAlloc, HostAndDeviceConstRef, HostAndDeviceMutRef,
-        HostDeviceBox, NullCudaAlloc,
+    alloc::{EmptyCudaAlloc, NoCudaAlloc},
+    host::{CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceMutRef, Stream},
+    lend::{RustToCuda, RustToCudaAsync},
+    safety::SafeMutableAliasing,
+    utils::{
+        adapter::DeviceCopyWithPortableBitSemantics,
+        ffi::DeviceAccessible,
+        r#async::{Async, AsyncProj, CompletionFnMut, NoCompletion},
     },
 };
 
 #[allow(clippy::module_name_repetitions)]
 pub struct ExchangeWrapperOnHost<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> {
-    value: T,
-    device_box: HostDeviceBox<DeviceAccessible<<T as RustToCuda>::CudaRepresentation>>,
+    value: Box<T>,
+    device_box: CudaDropWrapper<
+        DeviceBox<
+            DeviceCopyWithPortableBitSemantics<
+                DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+            >,
+        >,
+    >,
+    locked_cuda_repr: CudaDropWrapper<
+        LockedBox<
+            DeviceCopyWithPortableBitSemantics<
+                DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+            >,
+        >,
+    >,
 }
 
 #[allow(clippy::module_name_repetitions)]
 pub struct ExchangeWrapperOnDevice<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> {
-    value: T,
-    device_box: HostDeviceBox<DeviceAccessible<<T as RustToCuda>::CudaRepresentation>>,
-    cuda_repr: DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
-    null_alloc: CombinedCudaAlloc<<T as RustToCuda>::CudaAllocation, NullCudaAlloc>,
+    value: Box<T>,
+    device_box: CudaDropWrapper<
+        DeviceBox<
+            DeviceCopyWithPortableBitSemantics<
+                DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+            >,
+        >,
+    >,
+    locked_cuda_repr: CudaDropWrapper<
+        LockedBox<
+            DeviceCopyWithPortableBitSemantics<
+                DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+            >,
+        >,
+    >,
 }
 
 impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> ExchangeWrapperOnHost<T> {
     /// # Errors
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
     pub fn new(value: T) -> CudaResult<Self> {
-        let (cuda_repr, _null_alloc) = unsafe { value.borrow(NullCudaAlloc) }?;
+        // Safety: The uninitialised memory is never exposed
+        // To access the device memory, [`Self::move_to_device`] has to
+        // be called first, which initialises the memory.
+        let device_box = CudaDropWrapper::from(unsafe { DeviceBox::uninitialized() }?);
 
-        let device_box = DeviceBox::new(&cuda_repr)?.into();
+        let (cuda_repr, _null_alloc) = unsafe { value.borrow(NoCudaAlloc) }?;
+        let locked_cuda_repr = unsafe {
+            let mut uninit = CudaDropWrapper::from(LockedBox::<
+                DeviceCopyWithPortableBitSemantics<
+                    DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
+                >,
+            >::uninitialized()?);
+            uninit
+                .as_mut_ptr()
+                .write(DeviceCopyWithPortableBitSemantics::from(cuda_repr));
+            uninit
+        };
 
-        Ok(Self { value, device_box })
+        Ok(Self {
+            value: Box::new(value),
+            device_box,
+            locked_cuda_repr,
+        })
     }
 
+    /// Moves the data synchronously to the CUDA device, where it can then be
+    /// lent out immutably via [`ExchangeWrapperOnDevice::as_ref`], or mutably
+    /// via [`ExchangeWrapperOnDevice::as_mut_async`](Async::as_mut_async).
+    ///
     /// # Errors
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
     pub fn move_to_device(mut self) -> CudaResult<ExchangeWrapperOnDevice<T>> {
-        let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NullCudaAlloc) }?;
+        let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NoCudaAlloc) }?;
+        **self.locked_cuda_repr = DeviceCopyWithPortableBitSemantics::from(cuda_repr);
+
+        self.device_box.copy_from(&**self.locked_cuda_repr)?;
 
-        self.device_box.copy_from(&cuda_repr)?;
+        let _: NoCudaAlloc = null_alloc.into();
 
         Ok(ExchangeWrapperOnDevice {
             value: self.value,
             device_box: self.device_box,
-            cuda_repr,
-            null_alloc,
+            locked_cuda_repr: self.locked_cuda_repr,
         })
     }
 }
 
+impl<T: RustToCudaAsync<CudaAllocationAsync: EmptyCudaAlloc, CudaAllocation: EmptyCudaAlloc>>
+    ExchangeWrapperOnHost<T>
+{
+    #[allow(clippy::needless_lifetimes)] // keep 'stream explicit
+    /// Moves the data asynchronously to the CUDA device.
+    ///
+    /// # Errors
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    pub fn move_to_device_async<'stream>(
+        mut self,
+        stream: Stream<'stream>,
+    ) -> CudaResult<Async<'static, 'stream, ExchangeWrapperOnDevice<T>, NoCompletion>> {
+        let (cuda_repr, _null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?;
+        let (cuda_repr, _completion): (_, Option<NoCompletion>) =
+            unsafe { cuda_repr.unwrap_unchecked()? };
+
+        **self.locked_cuda_repr = DeviceCopyWithPortableBitSemantics::from(cuda_repr);
+
+        // Safety: The device value is not safely exposed until either
+        // - the passed-in [`Stream`] is synchronised
+        // - the kernel is launched on the passed-in [`Stream`]
+        unsafe {
+            self.device_box
+                .async_copy_from(&*self.locked_cuda_repr, &stream)
+        }?;
+
+        Async::pending(
+            ExchangeWrapperOnDevice {
+                value: self.value,
+                device_box: self.device_box,
+                locked_cuda_repr: self.locked_cuda_repr,
+            },
+            stream,
+            NoCompletion,
+        )
+    }
+}
+
 impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> Deref for ExchangeWrapperOnHost<T> {
     type Target = T;
 
@@ -66,28 +162,200 @@ impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> DerefMut for ExchangeWrapper
 }
 
 impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> ExchangeWrapperOnDevice<T> {
+    /// Moves the data synchronously back to the host CPU.
+    ///
     /// # Errors
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
     pub fn move_to_host(mut self) -> CudaResult<ExchangeWrapperOnHost<T>> {
-        let _null_alloc: NullCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?;
+        let null_alloc = NoCudaAlloc.into();
+
+        // Reflect deep changes back to the CPU
+        let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?;
+
+        // Note: Shallow changes are not reflected back to the CPU
 
         Ok(ExchangeWrapperOnHost {
             value: self.value,
             device_box: self.device_box,
+            locked_cuda_repr: self.locked_cuda_repr,
         })
     }
 
+    #[must_use]
     pub fn as_ref(
         &self,
     ) -> HostAndDeviceConstRef<DeviceAccessible<<T as RustToCuda>::CudaRepresentation>> {
-        // Safety: `device_box` contains exactly the device copy of `cuda_repr`
-        unsafe { HostAndDeviceConstRef::new(&self.device_box, &self.cuda_repr) }
+        // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr`
+        unsafe {
+            HostAndDeviceConstRef::new_unchecked(
+                &self.device_box,
+                (**self.locked_cuda_repr).into_ref(),
+            )
+        }
     }
+}
+
+impl<T: RustToCudaAsync<CudaAllocationAsync: EmptyCudaAlloc, CudaAllocation: EmptyCudaAlloc>>
+    ExchangeWrapperOnDevice<T>
+{
+    #[allow(clippy::needless_lifetimes)] // keep 'stream explicit
+    /// Moves the data asynchronously back to the host CPU.
+    ///
+    /// # Errors
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    pub fn move_to_host_async<'stream>(
+        self,
+        stream: Stream<'stream>,
+    ) -> CudaResult<
+        Async<
+            'static,
+            'stream,
+            ExchangeWrapperOnHost<T>,
+            CompletionFnMut<'static, ExchangeWrapperOnHost<T>>,
+        >,
+    > {
+        let null_alloc = NoCudaAlloc.into();
+
+        let value = owning_ref::BoxRefMut::new(self.value);
+
+        // Reflect deep changes back to the CPU
+        let (r#async, _null_alloc): (_, NoCudaAlloc) =
+            unsafe { RustToCudaAsync::restore_async(value, null_alloc, stream) }?;
+        let (value, on_complete) = unsafe { r#async.unwrap_unchecked()? };
+
+        let value = value.into_owner();
+
+        // Note: Shallow changes are not reflected back to the CPU
+
+        if let Some(on_complete) = on_complete {
+            Async::<_, CompletionFnMut<'static, ExchangeWrapperOnHost<T>>>::pending(
+                ExchangeWrapperOnHost {
+                    value,
+                    device_box: self.device_box,
+                    locked_cuda_repr: self.locked_cuda_repr,
+                },
+                stream,
+                Box::new(|on_host: &mut ExchangeWrapperOnHost<T>| on_complete(&mut on_host.value)),
+            )
+        } else {
+            Ok(Async::ready(
+                ExchangeWrapperOnHost {
+                    value,
+                    device_box: self.device_box,
+                    locked_cuda_repr: self.locked_cuda_repr,
+                },
+                stream,
+            ))
+        }
+    }
+}
+
+impl<
+        'a,
+        'stream,
+        T: RustToCudaAsync<CudaAllocationAsync: EmptyCudaAlloc, CudaAllocation: EmptyCudaAlloc>,
+    > Async<'a, 'stream, ExchangeWrapperOnDevice<T>, NoCompletion>
+{
+    /// Moves the data asynchronously back to the host CPU.
+    ///
+    /// # Errors
+    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// CUDA
+    pub fn move_to_host_async(
+        self,
+        stream: Stream<'stream>,
+    ) -> CudaResult<
+        Async<
+            'static,
+            'stream,
+            ExchangeWrapperOnHost<T>,
+            CompletionFnMut<'static, ExchangeWrapperOnHost<T>>,
+        >,
+    > {
+        let (this, completion): (_, Option<NoCompletion>) = unsafe { self.unwrap_unchecked()? };
+
+        let null_alloc = NoCudaAlloc.into();
+
+        let value = owning_ref::BoxRefMut::new(this.value);
+
+        // Reflect deep changes back to the CPU
+        let (r#async, _null_alloc): (_, NoCudaAlloc) =
+            unsafe { RustToCudaAsync::restore_async(value, null_alloc, stream) }?;
+        let (value, on_complete) = unsafe { r#async.unwrap_unchecked()? };
+
+        let value = value.into_owner();
+
+        // Note: Shallow changes are not reflected back to the CPU
+
+        let on_host = ExchangeWrapperOnHost {
+            value,
+            device_box: this.device_box,
+            locked_cuda_repr: this.locked_cuda_repr,
+        };
+
+        if let Some(on_complete) = on_complete {
+            Async::<_, CompletionFnMut<'static, ExchangeWrapperOnHost<T>>>::pending(
+                on_host,
+                stream,
+                Box::new(|on_host: &mut ExchangeWrapperOnHost<T>| on_complete(&mut on_host.value)),
+            )
+        } else if matches!(completion, Some(NoCompletion)) {
+            Async::<_, CompletionFnMut<'static, ExchangeWrapperOnHost<T>>>::pending(
+                on_host,
+                stream,
+                Box::new(|_on_host: &mut ExchangeWrapperOnHost<T>| Ok(())),
+            )
+        } else {
+            Ok(Async::ready(on_host, stream))
+        }
+    }
+
+    #[must_use]
+    pub fn as_ref_async(
+        &self,
+    ) -> AsyncProj<
+        '_,
+        'stream,
+        HostAndDeviceConstRef<'_, DeviceAccessible<<T as RustToCuda>::CudaRepresentation>>,
+    > {
+        let this = unsafe { self.as_ref().unwrap_unchecked() };
+
+        // Safety: this projection captures this async
+        unsafe {
+            AsyncProj::new(
+                HostAndDeviceConstRef::new_unchecked(
+                    &*(this.device_box),
+                    (**(this.locked_cuda_repr)).into_ref(),
+                ),
+                None,
+            )
+        }
+    }
 
-    pub fn as_mut(
+    #[must_use]
+    pub fn as_mut_async(
         &mut self,
-    ) -> HostAndDeviceMutRef<DeviceAccessible<<T as RustToCuda>::CudaRepresentation>> {
-        // Safety: `device_box` contains exactly the device copy of `cuda_repr`
-        unsafe { HostAndDeviceMutRef::new(&mut self.device_box, &mut self.cuda_repr) }
+    ) -> AsyncProj<
+        '_,
+        'stream,
+        HostAndDeviceMutRef<'_, DeviceAccessible<<T as RustToCuda>::CudaRepresentation>>,
+    >
+    where
+        T: SafeMutableAliasing,
+    {
+        let (this, use_callback) = unsafe { self.as_mut().unwrap_unchecked_with_use() };
+
+        // Safety: this projection captures this async
+        unsafe {
+            AsyncProj::new(
+                HostAndDeviceMutRef::new_unchecked(
+                    &mut *(this.device_box),
+                    (**(this.locked_cuda_repr)).into_mut(),
+                ),
+                use_callback,
+            )
+        }
     }
 }
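A hedged end-to-end sketch of the exchange wrapper above (assumes the rust_cuda crate, an initialised CUDA context, and a hypothetical `Data` type that implements `RustToCuda` with an empty allocation; not runnable standalone).

use rust_cuda::utils::exchange::wrapper::ExchangeWrapperOnHost;

fn round_trip(data: Data) -> rustacuda::error::CudaResult<()> {
    let on_host = ExchangeWrapperOnHost::new(data)?;

    // one explicit host -> device copy
    let on_device = on_host.move_to_device()?;

    // the device copy can now be lent to a kernel via `on_device.as_ref()`

    // one explicit device -> host copy: deep changes are restored,
    // shallow changes are not
    let on_host = on_device.move_to_host()?;
    let _data: &Data = &on_host; // the wrapper derefs to `Data` again

    Ok(())
}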
diff --git a/src/utils/ffi.rs b/src/utils/ffi.rs
new file mode 100644
index 000000000..52d7f691d
--- /dev/null
+++ b/src/utils/ffi.rs
@@ -0,0 +1,208 @@
+use core::marker::PhantomData;
+#[cfg(feature = "device")]
+use core::{
+    convert::{AsMut, AsRef},
+    ops::{Deref, DerefMut},
+};
+#[cfg(feature = "host")]
+use std::fmt;
+
+use const_type_layout::{TypeGraphLayout, TypeLayout};
+
+use crate::safety::PortableBitSemantics;
+#[cfg(feature = "host")]
+use crate::{lend::CudaAsRust, utils::adapter::RustToCudaWithPortableBitCopySemantics};
+
+#[cfg_attr(any(feature = "device", doc), derive(Debug))]
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceAccessible<T: PortableBitSemantics>(T);
+
+#[cfg(feature = "host")]
+impl<T: PortableBitSemantics> From<T> for DeviceAccessible<T> {
+    fn from(value: T) -> Self {
+        Self(value)
+    }
+}
+
+#[cfg(feature = "host")]
+impl<T: Copy + PortableBitSemantics + TypeGraphLayout> From<&T>
+    for DeviceAccessible<RustToCudaWithPortableBitCopySemantics<T>>
+{
+    fn from(value: &T) -> Self {
+        Self(RustToCudaWithPortableBitCopySemantics::from_copy(value))
+    }
+}
+
+#[cfg(all(feature = "host", not(doc)))]
+impl<T: CudaAsRust> fmt::Debug for DeviceAccessible<T> {
+    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+        fmt.debug_struct(stringify!(DeviceAccessible))
+            .finish_non_exhaustive()
+    }
+}
+
+#[cfg(feature = "device")]
+impl<T: PortableBitSemantics> Deref for DeviceAccessible<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+#[cfg(feature = "device")]
+impl<T: PortableBitSemantics> DerefMut for DeviceAccessible<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceConstRef<'r, T: PortableBitSemantics + 'r> {
+    #[cfg_attr(feature = "host", allow(dead_code))]
+    pub(crate) pointer: DeviceConstPointer<T>,
+    pub(crate) reference: PhantomData<&'r T>,
+}
+
+impl<'r, T: PortableBitSemantics> Copy for DeviceConstRef<'r, T> {}
+
+impl<'r, T: PortableBitSemantics> Clone for DeviceConstRef<'r, T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+#[cfg(feature = "device")]
+impl<'r, T: PortableBitSemantics> AsRef<T> for DeviceConstRef<'r, T> {
+    fn as_ref(&self) -> &T {
+        unsafe { &*self.pointer.0 }
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceMutRef<'r, T: PortableBitSemantics + 'r> {
+    #[cfg_attr(feature = "host", allow(dead_code))]
+    pub(crate) pointer: DeviceMutPointer<T>,
+    pub(crate) reference: PhantomData<&'r mut T>,
+}
+
+#[cfg(feature = "device")]
+impl<'r, T: PortableBitSemantics> AsRef<T> for DeviceMutRef<'r, T> {
+    fn as_ref(&self) -> &T {
+        unsafe { &*self.pointer.0 }
+    }
+}
+
+#[cfg(feature = "device")]
+impl<'r, T: PortableBitSemantics> AsMut<T> for DeviceMutRef<'r, T> {
+    fn as_mut(&mut self) -> &mut T {
+        unsafe { &mut *self.pointer.0 }
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceOwnedRef<'r, T: PortableBitSemantics> {
+    #[cfg_attr(feature = "host", allow(dead_code))]
+    pub(crate) pointer: DeviceOwnedPointer<T>,
+    pub(crate) reference: PhantomData<&'r mut ()>,
+    pub(crate) marker: PhantomData<T>,
+}
+
+#[cfg(feature = "device")]
+impl<'r, T: PortableBitSemantics> AsRef<T> for DeviceOwnedRef<'r, T> {
+    fn as_ref(&self) -> &T {
+        unsafe { &*self.pointer.0 }
+    }
+}
+
+#[cfg(feature = "device")]
+impl<'r, T: PortableBitSemantics> AsMut<T> for DeviceOwnedRef<'r, T> {
+    fn as_mut(&mut self) -> &mut T {
+        unsafe { &mut *self.pointer.0 }
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceConstPointer<T: ?Sized>(pub(crate) *const T);
+
+impl<T: ?Sized> Copy for DeviceConstPointer<T> {}
+
+impl<T: ?Sized> Clone for DeviceConstPointer<T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T> DeviceConstPointer<[T]> {
+    #[must_use]
+    pub fn into_raw_parts(self) -> (DeviceConstPointer<T>, usize) {
+        let (data, len) = self.0.to_raw_parts();
+        (DeviceConstPointer(data.cast()), len)
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceMutPointer<T: ?Sized>(pub(crate) *mut T);
+
+impl<T: ?Sized> Copy for DeviceMutPointer<T> {}
+
+impl<T: ?Sized> Clone for DeviceMutPointer<T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T: ?Sized> DeviceMutPointer<T> {
+    #[must_use]
+    pub const fn as_const(self) -> DeviceConstPointer<T> {
+        DeviceConstPointer(self.0.cast_const())
+    }
+}
+
+impl<T> DeviceMutPointer<[T]> {
+    #[must_use]
+    pub fn into_raw_parts(self) -> (DeviceMutPointer<T>, usize) {
+        let (data, len) = self.0.to_raw_parts();
+        (DeviceMutPointer(data.cast()), len)
+    }
+}
+
+#[derive(TypeLayout)]
+#[repr(transparent)]
+pub struct DeviceOwnedPointer<T: ?Sized>(pub(crate) *mut T);
+
+impl<T: ?Sized> Copy for DeviceOwnedPointer<T> {}
+
+impl<T: ?Sized> Clone for DeviceOwnedPointer<T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T: ?Sized> DeviceOwnedPointer<T> {
+    #[must_use]
+    pub const fn as_const(self) -> DeviceConstPointer<T> {
+        DeviceConstPointer(self.0.cast_const())
+    }
+
+    #[must_use]
+    pub const fn as_mut(self) -> DeviceMutPointer<T> {
+        DeviceMutPointer(self.0)
+    }
+}
+
+impl<T> DeviceOwnedPointer<[T]> {
+    #[must_use]
+    pub fn into_raw_parts(self) -> (DeviceOwnedPointer<T>, usize) {
+        let (data, len) = self.0.to_raw_parts();
+        (DeviceOwnedPointer(data.cast()), len)
+    }
+}
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
index 303e96262..e41a3c4ee 100644
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@@ -1,10 +1,6 @@
+pub mod adapter;
 pub mod aliasing;
-#[cfg(any(feature = "alloc", doc))]
-#[doc(cfg(feature = "alloc"))]
-pub mod alloc;
-pub mod device_copy;
+pub mod r#async;
 pub mod exchange;
-
-mod r#box;
-mod boxed_slice;
-mod option;
+pub mod ffi;
+pub mod shared;
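A std-only sketch of the design behind the pointer wrappers above: a `repr(transparent)` newtype around a raw pointer keeps the exact ABI of the pointer itself, so it can cross the host/device FFI boundary unchanged (`MutPtr` here is a hypothetical stand-in for `DeviceMutPointer`).

#[repr(transparent)]
struct MutPtr<T>(*mut T);

impl<T> MutPtr<T> {
    // Mirrors the `as_const` conversion above
    const fn as_const(self) -> *const T {
        self.0.cast_const()
    }
}

fn main() {
    let mut value = 7_u32;
    let ptr = MutPtr(&mut value as *mut u32);
    assert_eq!(unsafe { *ptr.as_const() }, 7);
    // The wrapper is exactly pointer-sized, i.e. FFI-compatible
    assert_eq!(
        core::mem::size_of::<MutPtr<u32>>(),
        core::mem::size_of::<*mut u32>()
    );
}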
diff --git a/src/utils/option.rs b/src/utils/option.rs
deleted file mode 100644
index 7ef601137..000000000
--- a/src/utils/option.rs
+++ /dev/null
@@ -1,116 +0,0 @@
-use core::mem::MaybeUninit;
-
-use const_type_layout::TypeGraphLayout;
-
-use crate::{
-    common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaProxy},
-    safety::SafeDeviceCopy,
-    utils::device_copy::SafeDeviceCopyWrapper,
-};
-
-#[cfg(feature = "host")]
-use crate::{host::CombinedCudaAlloc, host::CudaAlloc, rustacuda::error::CudaResult};
-
-#[doc(hidden)]
-#[allow(clippy::module_name_repetitions)]
-#[derive(TypeLayout)]
-#[repr(C)]
-pub struct OptionCudaRepresentation<T: CudaAsRust> {
-    maybe: MaybeUninit<DeviceAccessible<T>>,
-    present: bool,
-}
-
-// Safety: Since the CUDA representation of T is DeviceCopy,
-// the full enum is also DeviceCopy
-unsafe impl<T: CudaAsRust> rustacuda_core::DeviceCopy for OptionCudaRepresentation<T> {}
-
-unsafe impl<T: RustToCuda> RustToCuda for Option<T> {
-    #[cfg(feature = "host")]
-    #[doc(cfg(feature = "host"))]
-    type CudaAllocation = Option<<T as RustToCuda>::CudaAllocation>;
-    type CudaRepresentation = OptionCudaRepresentation<<T as RustToCuda>::CudaRepresentation>;
-
-    #[cfg(feature = "host")]
-    #[doc(cfg(feature = "host"))]
-    #[allow(clippy::type_complexity)]
-    unsafe fn borrow<A: CudaAlloc>(
-        &self,
-        alloc: A,
-    ) -> CudaResult<(
-        DeviceAccessible<Self::CudaRepresentation>,
-        CombinedCudaAlloc<A, Self::CudaAllocation>,
-    )> {
-        let (cuda_repr, alloc) = match self {
-            None => (
-                OptionCudaRepresentation {
-                    maybe: MaybeUninit::uninit(),
-                    present: false,
-                },
-                CombinedCudaAlloc::new(None, alloc),
-            ),
-            Some(value) => {
-                let (cuda_repr, alloc) = value.borrow(alloc)?;
-
-                let (alloc_front, alloc_tail) = alloc.split();
-
-                (
-                    OptionCudaRepresentation {
-                        maybe: MaybeUninit::new(cuda_repr),
-                        present: true,
-                    },
-                    CombinedCudaAlloc::new(Some(alloc_front), alloc_tail),
-                )
-            },
-        };
-
-        Ok((DeviceAccessible::from(cuda_repr), alloc))
-    }
-
-    #[cfg(feature = "host")]
-    #[doc(cfg(feature = "host"))]
-    unsafe fn restore<A: CudaAlloc>(
-        &mut self,
-        alloc: CombinedCudaAlloc<A, Self::CudaAllocation>,
-    ) -> CudaResult<A> {
-        let (alloc_front, alloc_tail) = alloc.split();
-
-        match (self, alloc_front) {
-            (Some(value), Some(alloc_front)) => {
-                value.restore(CombinedCudaAlloc::new(alloc_front, alloc_tail))
-            },
-            _ => Ok(alloc_tail),
-        }
-    }
-}
-
-unsafe impl<T: CudaAsRust> CudaAsRust for OptionCudaRepresentation<T> {
-    type RustRepresentation = Option<<T as CudaAsRust>::RustRepresentation>;
-
-    #[cfg(any(not(feature = "host"), doc))]
-    #[doc(cfg(not(feature = "host")))]
-    unsafe fn as_rust(this: &DeviceAccessible<Self>) -> Self::RustRepresentation {
-        if this.present {
-            Some(CudaAsRust::as_rust(this.maybe.assume_init_ref()))
-        } else {
-            None
-        }
-    }
-}
-
-impl<T: SafeDeviceCopy + TypeGraphLayout> RustToCudaProxy<Option<T>>
-    for Option<SafeDeviceCopyWrapper<T>>
-{
-    fn from_ref(val: &Option<T>) -> &Self {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype
-        unsafe { &*(val as *const Option<T>).cast() }
-    }
-
-    fn from_mut(val: &mut Option<T>) -> &mut Self {
-        // Safety: `SafeDeviceCopyWrapper` is a transparent newtype
-        unsafe { &mut *(val as *mut Option<T>).cast() }
-    }
-
-    fn into(self) -> Option<T> {
-        self.map(SafeDeviceCopyWrapper::into_inner)
-    }
-}
diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs
new file mode 100644
index 000000000..b01dda26d
--- /dev/null
+++ b/src/utils/shared/mod.rs
@@ -0,0 +1,14 @@
+mod slice;
+mod r#static;
+
+pub use slice::ThreadBlockSharedSlice;
+
+#[allow(clippy::module_name_repetitions)]
+pub use r#static::ThreadBlockShared;
+
+#[doc(hidden)]
+#[cfg(feature = "device")]
+pub use slice::init;
+
+#[cfg(feature = "host")]
+pub(crate) use slice::SharedMemorySize;
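A std-only sketch of the `Option` representation deleted a few hunks above: a `present` flag plus a `MaybeUninit` payload replaces Rust's niche-optimised `Option` layout, so the bytes are well-defined across the FFI boundary.

use core::mem::MaybeUninit;

#[repr(C)]
struct OptionRepr<T> {
    maybe: MaybeUninit<T>,
    present: bool,
}

fn lower<T>(value: Option<T>) -> OptionRepr<T> {
    match value {
        None => OptionRepr { maybe: MaybeUninit::uninit(), present: false },
        Some(value) => OptionRepr { maybe: MaybeUninit::new(value), present: true },
    }
}

fn raise<T>(repr: OptionRepr<T>) -> Option<T> {
    if repr.present {
        // Safety: `maybe` was initialised exactly when `present` was set
        Some(unsafe { repr.maybe.assume_init() })
    } else {
        None
    }
}

fn main() {
    assert_eq!(raise(lower(Some(3))), Some(3));
    assert_eq!(raise(lower::<u8>(None)), None);
}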
diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs
new file mode 100644
index 000000000..72ed7fde1
--- /dev/null
+++ b/src/utils/shared/slice.rs
@@ -0,0 +1,169 @@
+use core::alloc::Layout;
+
+use const_type_layout::TypeGraphLayout;
+
+#[allow(clippy::module_name_repetitions)]
+#[repr(transparent)]
+pub struct ThreadBlockSharedSlice<T: 'static + TypeGraphLayout> {
+    shared: *mut [T],
+}
+
+impl<T: 'static + TypeGraphLayout> ThreadBlockSharedSlice<T> {
+    #[cfg(feature = "host")]
+    #[must_use]
+    pub fn new_uninit_with_len(len: usize) -> Self {
+        Self {
+            shared: Self::dangling_slice_with_len(len),
+        }
+    }
+
+    #[cfg(feature = "host")]
+    #[must_use]
+    pub fn with_len(mut self, len: usize) -> Self {
+        self.shared = Self::dangling_slice_with_len(len);
+        self
+    }
+
+    #[cfg(feature = "host")]
+    #[must_use]
+    pub fn with_len_mut(&mut self, len: usize) -> &mut Self {
+        self.shared = Self::dangling_slice_with_len(len);
+        self
+    }
+
+    #[cfg(feature = "host")]
+    fn dangling_slice_with_len(len: usize) -> *mut [T] {
+        core::ptr::slice_from_raw_parts_mut(core::ptr::NonNull::dangling().as_ptr(), len)
+    }
+
+    #[must_use]
+    pub fn len(&self) -> usize {
+        core::ptr::metadata(self.shared)
+    }
+
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    #[must_use]
+    pub fn layout(&self) -> Layout {
+        // Safety: the length of self.shared is always initialised
+        unsafe { Layout::for_value_raw(self.shared) }
+    }
+
+    #[cfg(feature = "device")]
+    #[must_use]
+    pub const fn as_mut_ptr(&self) -> *mut T {
+        self.shared.cast()
+    }
+
+    #[cfg(feature = "device")]
+    #[must_use]
+    pub const fn as_mut_slice_ptr(&self) -> *mut [T] {
+        self.shared
+    }
+
+    #[cfg(feature = "device")]
+    /// # Safety
+    ///
+    /// The provided `index` must not be out of bounds.
+    #[inline]
+    #[must_use]
+    pub unsafe fn index_mut_unchecked<I: core::slice::SliceIndex<[T]>>(
+        &self,
+        index: I,
+    ) -> *mut <I as core::slice::SliceIndex<[T]>>::Output {
+        self.shared.get_unchecked_mut(index)
+    }
+}
+
+#[cfg(feature = "device")]
+impl<T: 'static + TypeGraphLayout> ThreadBlockSharedSlice<T> {
+    /// # Safety
+    ///
+    /// Exposing the [`ThreadBlockSharedSlice`] must be preceded by exactly one
+    /// call to [`init`].
+    pub(crate) unsafe fn with_uninit_for_len<F: FnOnce(&mut Self) -> Q, Q>(
+        len: usize,
+        inner: F,
+    ) -> Q {
+        let base: *mut u8;
+
+        unsafe {
+            core::arch::asm!(
+                "mov.u64 {base}, %rust_cuda_dynamic_shared;",
+                base = out(reg64) base,
+            );
+        }
+
+        let aligned_base = base.byte_add(base.align_offset(core::mem::align_of::<T>()));
+
+        let data: *mut T = aligned_base.cast();
+
+        let new_base = data.add(len).cast::<u8>();
+
+        unsafe {
+            core::arch::asm!(
+                "mov.u64 %rust_cuda_dynamic_shared, {new_base};",
+                new_base = in(reg64) new_base,
+            );
+        }
+
+        let shared = core::ptr::slice_from_raw_parts_mut(data, len);
+
+        inner(&mut Self { shared })
+    }
+}
+
+#[cfg(feature = "device")]
+/// # Safety
+///
+/// The thread-block shared dynamic memory must be initialised once and
+/// only once per kernel.
+pub unsafe fn init() {
+    unsafe {
+        core::arch::asm!(".reg .u64 %rust_cuda_dynamic_shared;");
+        core::arch::asm!(
+            "cvta.shared.u64 %rust_cuda_dynamic_shared, rust_cuda_dynamic_shared_base;",
+        );
+    }
+}
+
+#[cfg(feature = "device")]
+core::arch::global_asm!(".extern .shared .align 8 .b8 rust_cuda_dynamic_shared_base[];");
+
+#[cfg(feature = "host")]
+pub struct SharedMemorySize {
+    last_align: usize,
+    total_size: usize,
+}
+
+#[cfg(feature = "host")]
+impl SharedMemorySize {
+    #[must_use]
+    pub const fn new() -> Self {
+        Self {
+            // we allocate the shared memory with an alignment of 8
+            last_align: 8,
+            total_size: 0,
+        }
+    }
+
+    pub fn add(&mut self, layout: core::alloc::Layout) {
+        if layout.align() > self.last_align {
+            // in the worst case, we are one element of the smaller alignment
+            // into the larger alignment, so we need to pad the entire rest
+            let pessimistic_padding = layout.align() - self.last_align;
+
+            self.total_size += pessimistic_padding;
+        }
+
+        self.last_align = layout.align();
+        self.total_size += layout.size();
+    }
+
+    pub const fn total(self) -> usize {
+        self.total_size
+    }
+}
diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs
new file mode 100644
index 000000000..62c3a0c49
--- /dev/null
+++ b/src/utils/shared/static.rs
@@ -0,0 +1,58 @@
+#[repr(transparent)]
+pub struct ThreadBlockShared<T: 'static> {
+    #[cfg_attr(not(feature = "device"), allow(dead_code))]
+    shared: *mut T,
+}
+
+impl<T: 'static> ThreadBlockShared<T> {
+    #[cfg(any(feature = "host", feature = "device"))]
+    #[must_use]
+    #[allow(clippy::inline_always, clippy::missing_const_for_fn)]
+    #[inline(always)]
+    pub fn new_uninit() -> Self {
+        #[cfg(feature = "host")]
+        {
+            Self {
+                shared: core::ptr::NonNull::dangling().as_ptr(),
+            }
+        }
+
+        #[cfg(feature = "device")]
+        {
+            let shared: *mut T;
+
+            unsafe {
+                core::arch::asm!(
+                    ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];",
+                    "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;",
+                    reg = out(reg64) shared,
+                    align = const(core::mem::align_of::<T>()),
+                    size = const(core::mem::size_of::<T>()),
+                );
+            }
+
+            Self { shared }
+        }
+    }
+
+    #[cfg(feature = "device")]
+    #[must_use]
+    pub const fn as_mut_ptr(&self) -> *mut T {
+        self.shared
+    }
+}
+
+impl<T, const N: usize> ThreadBlockShared<[T; N]> {
+    #[cfg(feature = "device")]
+    /// # Safety
+    ///
+    /// The provided `index` must not be out of bounds.
+    #[inline]
+    #[must_use]
+    pub unsafe fn index_mut_unchecked<I: core::slice::SliceIndex<[T]>>(
+        &self,
+        index: I,
+    ) -> *mut <I as core::slice::SliceIndex<[T]>>::Output {
+        core::ptr::slice_from_raw_parts_mut(self.shared.cast::<T>(), N).get_unchecked_mut(index)
+    }
+}
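A hedged device-side sketch of the two shared-memory primitives added above (assumes the rust_cuda crate with the `device` feature, inside a kernel that has already performed the one-time dynamic shared-memory `init`; not compilable on its own).

use rust_cuda::utils::shared::{ThreadBlockShared, ThreadBlockSharedSlice};

fn kernel_body(tile: &ThreadBlockSharedSlice<f32>, flag: &ThreadBlockShared<u32>) {
    // Raw pointers, because every thread in the block aliases this memory;
    // synchronising writes between threads remains the caller's job
    unsafe { *tile.index_mut_unchecked(0) = 1.0 };
    unsafe { *flag.as_mut_ptr() = 1 };
}

On the host side, `SharedMemorySize` accumulates the worst-case dynamic shared-memory footprint: starting from the 8-byte base alignment, adding a 1-byte `u8` layout and then an 8-byte `u64` layout totals 1 + 7 (pessimistic padding) + 8 = 16 bytes.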