diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 0d5d1b7b..8f40b639 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -59,7 +59,7 @@ jobs: run: cargo fmt --all -- --check - name: Build - run: cargo build --workspace --exclude "optix" --exclude "optix_sys" --exclude "path_tracer" --exclude "denoiser" --exclude "add" + run: cargo build --workspace --exclude "optix" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*" # Don't currently test because many tests rely on the system having a CUDA GPU # - name: Test @@ -69,9 +69,9 @@ jobs: if: contains(matrix.os, 'ubuntu') env: RUSTFLAGS: -Dwarnings - run: cargo clippy --workspace --exclude "optix" --exclude "optix_sys" --exclude "path_tracer" --exclude "denoiser" --exclude "add" + run: cargo clippy --workspace --exclude "optix" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*" - name: Check documentation env: RUSTDOCFLAGS: -Dwarnings - run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix" --exclude "optix_sys" --exclude "path_tracer" --exclude "denoiser" --exclude "add" \ No newline at end of file + run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*" \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 15772aa0..b41d7b0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,8 @@ [workspace] members = [ "crates/*", - + "crates/optix/examples/ex*", + "crates/optix/examples/ex*/device", "xtask", "examples/optix/*", @@ -10,5 +11,9 @@ members = [ ] +exclude = [ + "crates/optix/examples/common" +] + [profile.dev.package.rustc_codegen_nvvm] opt-level = 3 diff --git a/crates/blastoff/Cargo.toml b/crates/blastoff/Cargo.toml index 06a66f5d..382deb07 100644 --- a/crates/blastoff/Cargo.toml +++ b/crates/blastoff/Cargo.toml @@ -8,7 +8,7 @@ repository = "https://github.com/Rust-GPU/Rust-CUDA" [dependencies] bitflags = "1.3.2" cublas_sys = { version = "0.1", path = "../cublas_sys" } -cust = { version = "0.2", path = "../cust", features = ["num-complex"] } +cust = { version = "0.2", path = "../cust", features = ["impl_num_complex"] } num-complex = "0.4.0" [package.metadata.docs.rs] diff --git a/crates/blastoff/src/level1.rs b/crates/blastoff/src/level1.rs index c3416d12..5401d40c 100644 --- a/crates/blastoff/src/level1.rs +++ b/crates/blastoff/src/level1.rs @@ -46,9 +46,9 @@ impl CublasContext { Ok(T::amin( ctx.raw, n as i32, - x.as_device_ptr().as_raw(), + x.as_device_ptr().as_ptr(), stride.unwrap_or(1) as i32, - result.as_device_ptr().as_raw_mut(), + result.as_device_ptr().as_mut_ptr(), ) .to_result()?) }) @@ -108,9 +108,9 @@ impl CublasContext { Ok(T::amax( ctx.raw, n as i32, - x.as_device_ptr().as_raw(), + x.as_device_ptr().as_ptr(), stride.unwrap_or(1) as i32, - result.as_device_ptr().as_raw_mut(), + result.as_device_ptr().as_mut_ptr(), ) .to_result()?) }) @@ -172,10 +172,10 @@ impl CublasContext { Ok(T::axpy( ctx.raw, n as i32, - alpha.as_device_ptr().as_raw(), - x.as_device_ptr().as_raw(), + alpha.as_device_ptr().as_ptr(), + x.as_device_ptr().as_ptr(), x_stride.unwrap_or(1) as i32, - y.as_device_ptr().as_raw_mut(), + y.as_device_ptr().as_mut_ptr(), y_stride.unwrap_or(1) as i32, ) .to_result()?) 
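Note on the accessor rename applied throughout these call sites: `DevicePointer::as_raw`/`as_raw_mut` give way to `as_ptr`/`as_mut_ptr` (see the cust changelog later in this patch). A minimal usage sketch, not part of the patch, assuming the cust 0.3 API introduced here:

```rust
// Sketch only: `as_ptr`/`as_mut_ptr` return `*const T`/`*mut T` for FFI
// arguments, while `as_raw` returns the numeric `CUdeviceptr` handle.
use cust::memory::DeviceBuffer;

fn accessors() -> cust::error::CudaResult<()> {
    let _ctx = cust::quick_init()?;
    let buf = DeviceBuffer::from_slice(&[1.0f32, 2.0, 3.0])?;
    let dptr = buf.as_device_ptr();
    let _input: *const f32 = dptr.as_ptr(); // read-only FFI parameter
    let _output: *mut f32 = dptr.as_mut_ptr(); // writable FFI parameter
    let _handle = dptr.as_raw(); // CUdeviceptr, for raw driver-API calls
    Ok(())
}
```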
@@ -245,9 +245,9 @@ impl CublasContext { Ok(T::copy( ctx.raw, n as i32, - x.as_device_ptr().as_raw(), + x.as_device_ptr().as_ptr(), x_stride.unwrap_or(1) as i32, - y.as_device_ptr().as_raw_mut(), + y.as_device_ptr().as_mut_ptr(), y_stride.unwrap_or(1) as i32, ) .to_result()?) @@ -314,11 +314,11 @@ impl CublasContext { Ok(T::dot( ctx.raw, n as i32, - x.as_device_ptr().as_raw(), + x.as_device_ptr().as_ptr(), x_stride.unwrap_or(1) as i32, - y.as_device_ptr().as_raw(), + y.as_device_ptr().as_ptr(), y_stride.unwrap_or(1) as i32, - result.as_device_ptr().as_raw_mut(), + result.as_device_ptr().as_mut_ptr(), ) .to_result()?) }) @@ -390,11 +390,11 @@ impl CublasContext { Ok(T::dotu( ctx.raw, n as i32, - x.as_device_ptr().as_raw(), + x.as_device_ptr().as_ptr(), x_stride.unwrap_or(1) as i32, - y.as_device_ptr().as_raw(), + y.as_device_ptr().as_ptr(), y_stride.unwrap_or(1) as i32, - result.as_device_ptr().as_raw_mut(), + result.as_device_ptr().as_mut_ptr(), ) .to_result()?) }) @@ -438,11 +438,11 @@ impl CublasContext { Ok(T::dotc( ctx.raw, n as i32, - x.as_device_ptr().as_raw(), + x.as_device_ptr().as_ptr(), x_stride.unwrap_or(1) as i32, - y.as_device_ptr().as_raw(), + y.as_device_ptr().as_ptr(), y_stride.unwrap_or(1) as i32, - result.as_device_ptr().as_raw_mut(), + result.as_device_ptr().as_mut_ptr(), ) .to_result()?) }) @@ -483,9 +483,9 @@ impl CublasContext { Ok(T::nrm2( ctx.raw, n as i32, - x.as_device_ptr().as_raw(), + x.as_device_ptr().as_ptr(), x_stride.unwrap_or(1) as i32, - result.as_device_ptr().as_raw_mut(), + result.as_device_ptr().as_mut_ptr(), ) .to_result()?) }) @@ -559,12 +559,12 @@ impl CublasContext { Ok(T::rot( ctx.raw, n as i32, - x.as_device_ptr().as_raw_mut(), + x.as_device_ptr().as_mut_ptr(), x_stride.unwrap_or(1) as i32, - y.as_device_ptr().as_raw_mut(), + y.as_device_ptr().as_mut_ptr(), y_stride.unwrap_or(1) as i32, - c.as_device_ptr().as_raw(), - s.as_device_ptr().as_raw(), + c.as_device_ptr().as_ptr(), + s.as_device_ptr().as_ptr(), ) .to_result()?) }) diff --git a/crates/cudnn/src/context.rs b/crates/cudnn/src/context.rs index 49a72bb7..b917fc9a 100644 --- a/crates/cudnn/src/context.rs +++ b/crates/cudnn/src/context.rs @@ -397,9 +397,9 @@ impl CudnnContext { let x_data = x.data().as_device_ptr().as_raw(); let y_desc = y.descriptor(); - let y_data = y.data().as_device_ptr().as_raw_mut(); + let y_data = y.data().as_device_ptr().as_ptr(); - let reserve_space_ptr = reserve_space.as_device_ptr().as_raw_mut(); + let reserve_space_ptr = reserve_space.as_device_ptr().as_ptr(); unsafe { sys::cudnnDropoutForward( @@ -454,9 +454,9 @@ impl CudnnContext { let dy_data = dy.data().as_device_ptr().as_raw(); let dx_desc = dx.descriptor(); - let dx_data = dx.data().as_device_ptr().as_raw_mut(); + let dx_data = dx.data().as_device_ptr().as_ptr(); - let reserve_space_ptr = reserve_space.as_device_ptr().as_raw_mut(); + let reserve_space_ptr = reserve_space.as_device_ptr().as_ptr(); unsafe { sys::cudnnDropoutBackward( @@ -528,7 +528,7 @@ impl CudnnContext { raw, self.raw, dropout, - states.as_device_ptr().as_raw_mut() as *mut std::ffi::c_void, + states.as_device_ptr().as_ptr() as *mut std::ffi::c_void, states.len(), seed, ) @@ -1185,14 +1185,14 @@ impl CudnnContext { let w_data = w.data().as_device_ptr().as_raw(); let w_desc = w.descriptor(); - let y_data = y.data().as_device_ptr().as_raw_mut(); + let y_data = y.data().as_device_ptr().as_ptr(); let y_desc = y.descriptor(); // If the _ size is 0 then the algorithm can work in-place and cuDNN expects a null // pointer. 
let (work_space_ptr, work_space_size): (*mut u8, usize) = { work_space.map_or((std::ptr::null_mut(), 0), |work_space| { - (work_space.as_device_ptr().as_raw_mut(), work_space.len()) + (work_space.as_device_ptr().as_mut_ptr(), work_space.len()) }) }; @@ -1287,12 +1287,12 @@ impl CudnnContext { let dy_data = dy.data().as_device_ptr().as_raw(); let dy_desc = dy.descriptor(); - let dx_data = dx.data().as_device_ptr().as_raw_mut(); + let dx_data = dx.data().as_device_ptr().as_ptr(); let dx_desc = dx.descriptor(); let (work_space_ptr, work_space_size): (*mut u8, usize) = { work_space.map_or((std::ptr::null_mut(), 0), |work_space| { - (work_space.as_device_ptr().as_raw_mut(), work_space.len()) + (work_space.as_device_ptr().as_mut_ptr(), work_space.len()) }) }; @@ -1388,12 +1388,12 @@ impl CudnnContext { let dy_data = dy.data().as_device_ptr().as_raw(); let dy_desc = dy.descriptor(); - let dw_data = dw.data().as_device_ptr().as_raw_mut(); + let dw_data = dw.data().as_device_ptr().as_ptr(); let dw_desc = dw.descriptor(); let (work_space_ptr, work_space_size): (*mut u8, usize) = { work_space.map_or((std::ptr::null_mut(), 0), |work_space| { - (work_space.as_device_ptr().as_raw_mut(), work_space.len()) + (work_space.as_device_ptr().as_mut_ptr(), work_space.len()) }) }; @@ -1615,28 +1615,28 @@ impl CudnnContext { L: RnnDataLayout, NCHW: SupportedType, { - let device_sequence_lengths_ptr = device_seq_lengths.as_device_ptr().as_raw(); + let device_sequence_lengths_ptr = device_seq_lengths.as_device_ptr().as_ptr(); let x_ptr = x.as_device_ptr().as_raw(); - let y_ptr = y.as_device_ptr().as_raw_mut(); + let y_ptr = y.as_device_ptr().as_ptr(); - let hx_ptr = hx.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_raw()); + let hx_ptr = hx.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_ptr()); let hy_ptr = hy.map_or(std::ptr::null_mut(), |buff| { - buff.as_device_ptr().as_raw_mut() + buff.as_device_ptr().as_mut_ptr() }); let c_desc = c_desc.map_or(std::ptr::null_mut(), |desc| desc.raw); - let cx_ptr = cx.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_raw()); + let cx_ptr = cx.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_ptr()); let cy_ptr = cy.map_or(std::ptr::null_mut(), |buff| { - buff.as_device_ptr().as_raw_mut() + buff.as_device_ptr().as_mut_ptr() }); - let weight_space_ptr = weight_space.as_device_ptr().as_raw_mut(); - let work_space_ptr = work_space.as_device_ptr().as_raw_mut(); + let weight_space_ptr = weight_space.as_device_ptr().as_ptr(); + let work_space_ptr = work_space.as_device_ptr().as_ptr(); let (reserve_space_ptr, reserve_space_size) = reserve_space .map_or((std::ptr::null_mut(), 0), |buff| { - (buff.as_device_ptr().as_raw_mut(), buff.len()) + (buff.as_device_ptr().as_mut_ptr(), buff.len()) }); unsafe { @@ -1814,32 +1814,32 @@ impl CudnnContext { L: RnnDataLayout, NCHW: SupportedType, { - let device_sequence_lengths_ptr = device_seq_lengths.as_device_ptr().as_raw(); + let device_sequence_lengths_ptr = device_seq_lengths.as_device_ptr().as_ptr(); let y_ptr = y.as_device_ptr().as_raw(); let dy_ptr = dy.as_device_ptr().as_raw(); - let dx_ptr = dx.as_device_ptr().as_raw_mut(); + let dx_ptr = dx.as_device_ptr().as_ptr(); let h_desc = h_desc.map_or(std::ptr::null_mut(), |desc| desc.raw); - let hx_ptr = hx.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_raw()); - let dhy_ptr = dhy.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_raw()); + let hx_ptr = hx.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_ptr()); + let dhy_ptr = 
dhy.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_ptr());

         let dhx_ptr = dhx.map_or(std::ptr::null_mut(), |buff| {
-            buff.as_device_ptr().as_raw_mut()
+            buff.as_device_ptr().as_mut_ptr()
         });

         let c_desc = c_desc.map_or(std::ptr::null_mut(), |desc| desc.raw);

-        let cx_ptr = cx.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_raw());
-        let dcy_ptr = dcy.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_raw());
+        let cx_ptr = cx.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_ptr());
+        let dcy_ptr = dcy.map_or(std::ptr::null(), |buff| buff.as_device_ptr().as_ptr());

         let dcx_ptr = dcx.map_or(std::ptr::null_mut(), |buff| {
-            buff.as_device_ptr().as_raw_mut()
+            buff.as_device_ptr().as_mut_ptr()
         });

-        let weight_space_ptr = weight_space.as_device_ptr().as_raw_mut();
-        let work_space_ptr = work_space.as_device_ptr().as_raw_mut();
-        let reserve_space_ptr = reserve_space.as_device_ptr().as_raw_mut();
+        let weight_space_ptr = weight_space.as_device_ptr().as_ptr();
+        let work_space_ptr = work_space.as_device_ptr().as_ptr();
+        let reserve_space_ptr = reserve_space.as_device_ptr().as_ptr();

         unsafe {
             sys::cudnnRNNBackwardData_v8(
@@ -1947,15 +1947,15 @@ impl CudnnContext {
         L: RnnDataLayout,
         NCHW: SupportedType,
     {
-        let device_sequence_lengths_ptr = device_seq_lengths.as_device_ptr().as_raw();
+        let device_sequence_lengths_ptr = device_seq_lengths.as_device_ptr().as_mut_ptr();

         let x_ptr = x.as_device_ptr().as_raw();
         let hx_ptr = x.as_device_ptr().as_raw();
         let y_ptr = y.as_device_ptr().as_raw();

-        let dweight_space_ptr = dweight_space.as_device_ptr().as_raw_mut();
-        let work_space_ptr = work_space.as_device_ptr().as_raw_mut();
-        let reserve_space_ptr = reserve_space.as_device_ptr().as_raw_mut();
+        let dweight_space_ptr = dweight_space.as_device_ptr().as_mut_ptr();
+        let work_space_ptr = work_space.as_device_ptr().as_mut_ptr();
+        let reserve_space_ptr = reserve_space.as_device_ptr().as_mut_ptr();

         unsafe {
             sys::cudnnRNNBackwardWeights_v8(
diff --git a/crates/cust/CHANGELOG.md b/crates/cust/CHANGELOG.md
index 25432074..437f53ff 100644
--- a/crates/cust/CHANGELOG.md
+++ b/crates/cust/CHANGELOG.md
@@ -30,6 +30,50 @@ any breaking changes, the API is the same.
 - `Linker::complete` now only returns the built cubin, and not the cubin and a duration.
 - `Stream`, `Module`, `Linker`, `Function`, `Event`, `UnifiedBox`, `ArrayObject`, `LockedBuffer`, `LockedBox`, `DeviceSlice`, `DeviceBuffer`, and `DeviceBox` all now impl `Send` and `Sync`; this makes it much easier to write multi-GPU code. The CUDA API is fully thread-safe except for graph objects.
+- Feature flags such as `vek` for implementing `DeviceCopy` are now named `impl_cratename`, e.g. `impl_vek`, `impl_half`, etc.
+- `DevicePointer::as_raw` now returns a `CUdeviceptr` instead of a `*const T`.
+- Added `DevicePointer::as_ptr` and `DevicePointer::as_mut_ptr` for returning `*const T` or `*mut T`.
+- Added mint integration behind `impl_mint`.
+- Added half integration behind `impl_half`.
+- Added glam integration behind `impl_glam`.
+- `num-complex` integration is now behind `impl_num_complex`, not `num-complex`.
+- Added experimental Linux external memory import APIs through `cust::external::ExternalMemory`.
+- `vek` is no longer re-exported.
+- `DeviceBox` now requires `T: DeviceCopy` (previously it didn't, but almost all of its methods did).
+- `DeviceBox::from_raw` now takes a `CUdeviceptr` instead of a `*mut T`.
+- `DeviceBox::as_device_ptr` now requires `&self` instead of `&mut self`.
+- Deleted `DeviceBox::wrap`, use `DeviceBox::from_raw`.
+- `DeviceBuffer` now requires `T: DeviceCopy`.
+- `DeviceBuffer` is now `repr(C)` and is represented by a `DevicePointer` and a `usize`.
+- Added `DeviceBuffer::as_slice`.
+- `DeviceSlice` now requires `T: DeviceCopy`.
+- `DeviceSlice` is now represented as a `DevicePointer` and a `usize` (and is `repr(C)`) instead of `[T]`, which was unsound.
+- Deleted `DeviceSlice::as_ptr` and `DeviceSlice::as_mut_ptr`. Use `DeviceSlice::as_device_ptr`, then `DevicePointer::as_ptr` or `DevicePointer::as_mut_ptr`.
+- Deleted `DeviceSlice::chunks` and consequently `DeviceChunks`.
+- Deleted `DeviceSlice::chunks_mut` and consequently `DeviceChunksMut`.
+- Deleted `DeviceSlice::from_slice` and `DeviceSlice::from_slice_mut` because they were unsound.
+- `DeviceSlice` no longer implements `Index` and `IndexMut`; switching away from `[T]` made this impossible to implement.
+Instead you can now use `DeviceSlice::index`, which behaves the same.
+- `DeviceSlice` is now `Clone` and `Copy`.
+- Added `DeviceVariable`, a simple wrapper around a `DeviceBox` and a `T` which allows easy management of a CPU and GPU version of a type.
+- Added `DeviceMemory`, a trait describing any region of GPU memory that can be described with a pointer and a length.
+- Added `memcpy_htod`, a wrapper around `cuMemcpyHtoD_v2`.
+- Added `mem_get_info` to query the amount of free and total memory.
+- Deleted `DevicePointer::as_raw_mut` (use `DevicePointer::as_mut_ptr`).
+- Added `DevicePointer::from_raw`, a safe function for converting a `CUdeviceptr` into a `DevicePointer`.
+- Deleted `DevicePointer::wrap` (use `DevicePointer::from_raw`).
+- Added dependency on `cust_core` for `DeviceCopy`.
+- Added dependency on `goblin` for verifying cubins and fatbins (impossible to implement safe module loading without it).
+- Deprecated `Module::from_str`, use `Module::from_ptx` and pass `&[]` for options.
+- Added `ModuleJitOption`, `JitFallback`, `JitTarget`, and `OptLevel` for specifying options when loading a module. Note that
+`ModuleJitOption::MaxRegisters` does not seem to work currently, but NVIDIA is looking into it.
+- Added `Module::from_fatbin` and `Module::from_fatbin_unchecked`.
+- Added `Module::from_cubin` and `Module::from_cubin_unchecked`.
+- Added `Module::from_ptx` and `Module::from_ptx_cstr`.
+- Deprecated `Module::load_from_string`, use `Module::from_ptx_cstr`.
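As a usage note for the module-loading entries above, a hedged sketch; the variant names (`ModuleJitOption::OptLevel`, `OptLevel::O3`) are assumptions based on the types listed here, not confirmed signatures:

```rust
use cust::module::{Module, ModuleJitOption, OptLevel};

fn load(ptx: &str) -> cust::error::CudaResult<Module> {
    // Pass `&[]` for default JIT options, as described in the changelog above.
    let _default = Module::from_ptx(ptx, &[])?;
    // Or request a specific JIT optimization level explicitly (assumed variant names).
    Module::from_ptx(ptx, &[ModuleJitOption::OptLevel(OptLevel::O3)])
}
```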
 ## 0.2.2 - 12/5/21
diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml
index bf801210..30ea3dc2 100644
--- a/crates/cust/Cargo.toml
+++ b/crates/cust/Cargo.toml
@@ -13,16 +13,24 @@ repository = "https://github.com/Rust-GPU/Rust-CUDA"
 readme = "../../README.md"

 [dependencies]
-cust_raw = { path = "../cust_raw", version = "0.11.2" }
+cust_core = { path = "../cust_core", version = "0.1.0" }
+cust_raw = { path = "../cust_raw", version = "0.11.2" }
 bitflags = "1.2"
 cust_derive = { path = "../cust_derive", version = "0.1" }
+glam = { version = "0.20", features = ["cuda"], optional = true }
+mint = { version = "^0.5", optional = true }
 num-complex = { version = "0.4", optional = true }
 vek = { version = "0.15.1", optional = true, default-features = false }
 bytemuck = { version = "1.7.3", optional = true }
 goblin = { version = "0.4.3", default-features = false, features = ["elf32", "elf64", "std", "endian_fd"] }

 [features]
-default = ["bytemuck"]
+default = ["bytemuck"]
+impl_glam = ["cust_core/glam", "glam"]
+impl_mint = ["cust_core/mint", "mint"]
+impl_vek = ["cust_core/vek", "vek"]
+impl_half = ["cust_core/half"]
+impl_num_complex = ["cust_core/num-complex", "num-complex"]

 [build-dependencies]
 find_cuda_helper = { path = "../find_cuda_helper", version = "0.2" }
diff --git a/crates/cust/src/external.rs b/crates/cust/src/external.rs
new file mode 100644
index 00000000..c735842a
--- /dev/null
+++ b/crates/cust/src/external.rs
@@ -0,0 +1,66 @@
+//! External memory and synchronization resources
+
+use crate::error::{CudaResult, ToResult};
+use crate::memory::{DeviceCopy, DevicePointer};
+
+use cust_raw as sys;
+
+#[repr(transparent)]
+pub struct ExternalMemory(sys::CUexternalMemory);
+
+impl ExternalMemory {
+    // Import external memory referenced by `fd` with `size` in bytes.
+    #[allow(clippy::missing_safety_doc)]
+    pub unsafe fn import(fd: i32, size: usize) -> CudaResult<Self> {
+        let desc = sys::CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
+            type_: sys::CUexternalMemoryHandleType_enum::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
+            handle: sys::CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1 { fd },
+            size: size as u64,
+            flags: 0,
+            reserved: Default::default(),
+        };
+
+        let mut memory: sys::CUexternalMemory = std::ptr::null_mut();
+
+        sys::cuImportExternalMemory(&mut memory, &desc)
+            .to_result()
+            .map(|_| ExternalMemory(memory))
+    }
+
+    #[allow(clippy::missing_safety_doc)]
+    pub unsafe fn reimport(&mut self, fd: i32, size: usize) -> CudaResult<()> {
+        // Import the new memory; assigning drops and destroys the old memory.
+        *self = ExternalMemory::import(fd, size)?;
+
+        Ok(())
+    }
+
+    // Map a buffer from this memory with `size` and `offset` in bytes.
+    pub fn mapped_buffer<T: DeviceCopy>(
+        &self,
+        size_in_bytes: usize,
+        offset_in_bytes: usize,
+    ) -> CudaResult<DevicePointer<T>> {
+        let buffer_desc = sys::CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
+            flags: 0,
+            size: size_in_bytes as u64,
+            offset: offset_in_bytes as u64,
+            reserved: Default::default(),
+        };
+
+        let mut dptr = 0;
+        unsafe {
+            sys::cuExternalMemoryGetMappedBuffer(&mut dptr, self.0, &buffer_desc)
+                .to_result()
+                .map(|_| DevicePointer::from_raw(dptr))
+        }
+    }
+}
+
+impl Drop for ExternalMemory {
+    fn drop(&mut self) {
+        unsafe {
+            sys::cuDestroyExternalMemory(self.0).to_result().unwrap();
+        }
+    }
+}
diff --git a/crates/cust/src/lib.rs b/crates/cust/src/lib.rs
index 4b50b4dd..198702c9 100644
--- a/crates/cust/src/lib.rs
+++ b/crates/cust/src/lib.rs
@@ -59,6 +59,7 @@ pub mod device;
 pub mod error;
 pub mod event;
+pub mod external;
 pub mod function;

 // WIP
 pub mod context;
@@ -84,9
+85,6 @@ use crate::error::{CudaResult, ToResult}; use bitflags::bitflags; use sys::{cuDriverGetVersion, cuInit}; -#[cfg(feature = "vek")] -pub use vek; - bitflags! { /// Bit flags for initializing the CUDA driver. Currently, no flags are defined, /// so `CudaFlags::empty()` is the only valid value. diff --git a/crates/cust/src/memory/device/device_box.rs b/crates/cust/src/memory/device/device_box.rs index e1fb67ec..4d050a78 100644 --- a/crates/cust/src/memory/device/device_box.rs +++ b/crates/cust/src/memory/device/device_box.rs @@ -15,12 +15,12 @@ use std::os::raw::c_void; /// /// See the [`module-level documentation`](../memory/index.html) for more information on device memory. #[derive(Debug)] -pub struct DeviceBox { +pub struct DeviceBox { pub(crate) ptr: DevicePointer, } -unsafe impl Send for DeviceBox {} -unsafe impl Sync for DeviceBox {} +unsafe impl Send for DeviceBox {} +unsafe impl Sync for DeviceBox {} impl DeviceBox { /// Allocate device memory and place val into it. @@ -156,14 +156,10 @@ impl DeviceBox { #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))] pub fn zeroed() -> CudaResult { unsafe { - let mut new_box = DeviceBox::uninitialized()?; + let new_box = DeviceBox::uninitialized()?; if mem::size_of::() != 0 { - cuda::cuMemsetD8_v2( - new_box.as_device_ptr().as_raw_mut() as u64, - 0, - mem::size_of::(), - ) - .to_result()?; + cuda::cuMemsetD8_v2(new_box.as_device_ptr().as_raw(), 0, mem::size_of::()) + .to_result()?; } Ok(new_box) } @@ -200,10 +196,10 @@ impl DeviceBox { /// ``` #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))] pub unsafe fn zeroed_async(stream: &Stream) -> CudaResult { - let mut new_box = DeviceBox::uninitialized_async(stream)?; + let new_box = DeviceBox::uninitialized_async(stream)?; if mem::size_of::() != 0 { cuda::cuMemsetD8Async( - new_box.as_device_ptr().as_raw_mut() as u64, + new_box.as_device_ptr().as_raw(), 0, mem::size_of::(), stream.as_inner(), @@ -214,7 +210,7 @@ impl DeviceBox { } } -impl DeviceBox { +impl DeviceBox { /// Allocate device memory, but do not initialize it. /// /// This doesn't actually allocate if `T` is zero-sized. @@ -289,9 +285,9 @@ impl DeviceBox { /// let ptr = DeviceBox::into_device(x).as_raw_mut(); /// let x = unsafe { DeviceBox::from_raw(ptr) }; /// ``` - pub unsafe fn from_raw(ptr: *mut T) -> Self { + pub unsafe fn from_raw(ptr: cust_raw::CUdeviceptr) -> Self { DeviceBox { - ptr: DevicePointer::wrap(ptr), + ptr: DevicePointer::from_raw(ptr), } } @@ -360,7 +356,7 @@ impl DeviceBox { /// let ptr = x.as_device_ptr(); /// println!("{:p}", ptr); /// ``` - pub fn as_device_ptr(&mut self) -> DevicePointer { + pub fn as_device_ptr(&self) -> DevicePointer { self.ptr } @@ -400,7 +396,7 @@ impl DeviceBox { } } } -impl Drop for DeviceBox { +impl Drop for DeviceBox { fn drop(&mut self) { if self.ptr.is_null() { return; @@ -412,23 +408,22 @@ impl Drop for DeviceBox { } } } -impl Pointer for DeviceBox { + +impl Pointer for DeviceBox { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fmt::Pointer::fmt(&self.ptr, f) + let ptr = self.ptr.as_raw() as *const c_void; + fmt::Pointer::fmt(&ptr, f) } } -impl crate::private::Sealed for DeviceBox {} + +impl crate::private::Sealed for DeviceBox {} impl CopyDestination for DeviceBox { fn copy_from(&mut self, val: &T) -> CudaResult<()> { let size = mem::size_of::(); if size != 0 { unsafe { - cuda::cuMemcpyHtoD_v2( - self.ptr.as_raw_mut() as u64, - val as *const T as *const c_void, - size, - ) - .to_result()? 
+ cuda::cuMemcpyHtoD_v2(self.ptr.as_raw(), val as *const T as *const c_void, size) + .to_result()? } } Ok(()) @@ -453,10 +448,7 @@ impl CopyDestination> for DeviceBox { fn copy_from(&mut self, val: &DeviceBox) -> CudaResult<()> { let size = mem::size_of::(); if size != 0 { - unsafe { - cuda::cuMemcpyDtoD_v2(self.ptr.as_raw_mut() as u64, val.ptr.as_raw() as u64, size) - .to_result()? - } + unsafe { cuda::cuMemcpyDtoD_v2(self.ptr.as_raw(), val.ptr.as_raw(), size).to_result()? } } Ok(()) } @@ -464,10 +456,7 @@ impl CopyDestination> for DeviceBox { fn copy_to(&self, val: &mut DeviceBox) -> CudaResult<()> { let size = mem::size_of::(); if size != 0 { - unsafe { - cuda::cuMemcpyDtoD_v2(val.ptr.as_raw_mut() as u64, self.ptr.as_raw() as u64, size) - .to_result()? - } + unsafe { cuda::cuMemcpyDtoD_v2(val.ptr.as_raw(), self.ptr.as_raw(), size).to_result()? } } Ok(()) } @@ -477,7 +466,7 @@ impl AsyncCopyDestination for DeviceBox { let size = mem::size_of::(); if size != 0 { cuda::cuMemcpyHtoDAsync_v2( - self.ptr.as_raw_mut() as u64, + self.ptr.as_raw(), val as *const _ as *const c_void, size, stream.as_inner(), @@ -505,13 +494,8 @@ impl AsyncCopyDestination> for DeviceBox { unsafe fn async_copy_from(&mut self, val: &DeviceBox, stream: &Stream) -> CudaResult<()> { let size = mem::size_of::(); if size != 0 { - cuda::cuMemcpyDtoDAsync_v2( - self.ptr.as_raw_mut() as u64, - val.ptr.as_raw() as u64, - size, - stream.as_inner(), - ) - .to_result()? + cuda::cuMemcpyDtoDAsync_v2(self.ptr.as_raw(), val.ptr.as_raw(), size, stream.as_inner()) + .to_result()? } Ok(()) } @@ -519,13 +503,8 @@ impl AsyncCopyDestination> for DeviceBox { unsafe fn async_copy_to(&self, val: &mut DeviceBox, stream: &Stream) -> CudaResult<()> { let size = mem::size_of::(); if size != 0 { - cuda::cuMemcpyDtoDAsync_v2( - val.ptr.as_raw_mut() as u64, - self.ptr.as_raw() as u64, - size, - stream.as_inner(), - ) - .to_result()? + cuda::cuMemcpyDtoDAsync_v2(val.ptr.as_raw(), self.ptr.as_raw(), size, stream.as_inner()) + .to_result()? } Ok(()) } @@ -609,8 +588,8 @@ mod test_device_box { #[test] fn test_device_pointer_implements_traits_safely() { let _context = crate::quick_init().unwrap(); - let mut x = DeviceBox::new(&5u64).unwrap(); - let mut y = DeviceBox::new(&0u64).unwrap(); + let x = DeviceBox::new(&5u64).unwrap(); + let y = DeviceBox::new(&0u64).unwrap(); // If the impls dereference the pointer, this should segfault. let _ = Ord::cmp(&x.as_device_ptr(), &y.as_device_ptr()); diff --git a/crates/cust/src/memory/device/device_buffer.rs b/crates/cust/src/memory/device/device_buffer.rs index 6c090e12..856251e7 100644 --- a/crates/cust/src/memory/device/device_buffer.rs +++ b/crates/cust/src/memory/device/device_buffer.rs @@ -8,19 +8,18 @@ use crate::sys as cuda; use std::mem; use std::ops::{Deref, DerefMut}; -use std::ptr; - /// Fixed-size device-side buffer. Provides basic access to device memory. #[derive(Debug)] -pub struct DeviceBuffer { +#[repr(C)] +pub struct DeviceBuffer { buf: DevicePointer, capacity: usize, } -unsafe impl Send for DeviceBuffer {} -unsafe impl Sync for DeviceBuffer {} +unsafe impl Send for DeviceBuffer {} +unsafe impl Sync for DeviceBuffer {} -impl DeviceBuffer { +impl DeviceBuffer { /// Allocate a new device buffer large enough to hold `size` `T`'s, but without /// initializing the contents. /// @@ -46,7 +45,8 @@ impl DeviceBuffer { let ptr = if size > 0 && mem::size_of::() > 0 { cuda_malloc(size)? 
} else { - DevicePointer::wrap(ptr::NonNull::dangling().as_ptr() as *mut T) + // FIXME (AL): Do we /really/ want to allow creating an invalid buffer? + DevicePointer::null() }; Ok(DeviceBuffer { buf: ptr, @@ -80,12 +80,12 @@ impl DeviceBuffer { /// ``` pub unsafe fn zeroed(size: usize) -> CudaResult { let ptr = if size > 0 && mem::size_of::() > 0 { - let mut ptr = cuda_malloc(size)?; - cuda::cuMemsetD8_v2(ptr.as_raw_mut() as u64, 0, size * mem::size_of::()) - .to_result()?; + let ptr = cuda_malloc(size)?; + cuda::cuMemsetD8_v2(ptr.as_raw(), 0, size * mem::size_of::()).to_result()?; ptr } else { - DevicePointer::wrap(ptr::NonNull::dangling().as_ptr() as *mut T) + // FIXME (AL): Do we /really/ want to allow creating an invalid buffer? + DevicePointer::null() }; Ok(DeviceBuffer { buf: ptr, @@ -229,28 +229,28 @@ impl DeviceBuffer { uninit.async_copy_from(slice, stream)?; Ok(uninit) } + + /// Explicitly creates a [`DeviceSlice`] from this buffer. + pub fn as_slice(&self) -> &DeviceSlice { + self + } } -impl Deref for DeviceBuffer { + +impl Deref for DeviceBuffer { type Target = DeviceSlice; fn deref(&self) -> &DeviceSlice { - unsafe { - DeviceSlice::from_slice(::std::slice::from_raw_parts( - self.buf.as_raw(), - self.capacity, - )) - } + unsafe { &*(self as *const _ as *const DeviceSlice) } } } -impl DerefMut for DeviceBuffer { + +impl DerefMut for DeviceBuffer { fn deref_mut(&mut self) -> &mut DeviceSlice { - unsafe { - &mut *(::std::slice::from_raw_parts_mut(self.buf.as_raw_mut(), self.capacity) - as *mut [T] as *mut DeviceSlice) - } + unsafe { &mut *(self as *mut _ as *mut DeviceSlice) } } } -impl Drop for DeviceBuffer { + +impl Drop for DeviceBuffer { fn drop(&mut self) { if self.buf.is_null() { return; @@ -269,7 +269,6 @@ impl Drop for DeviceBuffer { #[cfg(test)] mod test_device_buffer { use super::*; - use crate::memory::device::DeviceBox; use crate::stream::{Stream, StreamFlags}; #[derive(Clone, Copy, Debug)] @@ -307,32 +306,6 @@ mod test_device_buffer { assert_eq!(start, end); } - #[test] - fn test_slice() { - let _context = crate::quick_init().unwrap(); - let start = [0u64, 1, 2, 3, 4, 5]; - let mut end = [0u64, 0]; - let mut buf = DeviceBuffer::from_slice(&[0u64, 0, 0, 0]).unwrap(); - buf.copy_from(&start[0..4]).unwrap(); - buf[0..2].copy_to(&mut end).unwrap(); - assert_eq!(start[0..2], end); - } - - #[test] - fn test_async_slice() { - let _context = crate::quick_init().unwrap(); - let stream = Stream::new(StreamFlags::NON_BLOCKING, None).unwrap(); - let start = [0u64, 1, 2, 3, 4, 5]; - let mut end = [0u64, 0]; - unsafe { - let mut buf = DeviceBuffer::from_slice_async(&[0u64, 0, 0, 0], &stream).unwrap(); - buf.async_copy_from(&start[0..4], &stream).unwrap(); - buf[0..2].async_copy_to(&mut end, &stream).unwrap(); - stream.synchronize().unwrap(); - assert_eq!(start[0..2], end); - } - } - #[test] #[should_panic] fn test_copy_to_d2h_wrong_size() { @@ -375,36 +348,6 @@ mod test_device_buffer { } } - #[test] - fn test_copy_device_slice_to_device() { - let _context = crate::quick_init().unwrap(); - let start = DeviceBuffer::from_slice(&[0u64, 1, 2, 3, 4, 5]).unwrap(); - let mut mid = DeviceBuffer::from_slice(&[0u64, 0, 0, 0]).unwrap(); - let mut end = DeviceBuffer::from_slice(&[0u64, 0]).unwrap(); - let mut host_end = [0u64, 0]; - start[1..5].copy_to(&mut mid).unwrap(); - end.copy_from(&mid[1..3]).unwrap(); - end.copy_to(&mut host_end).unwrap(); - assert_eq!([2u64, 3], host_end); - } - - #[test] - fn test_async_copy_device_slice_to_device() { - let _context = crate::quick_init().unwrap(); - 
let stream = Stream::new(StreamFlags::NON_BLOCKING, None).unwrap(); - unsafe { - let start = DeviceBuffer::from_slice_async(&[0u64, 1, 2, 3, 4, 5], &stream).unwrap(); - let mut mid = DeviceBuffer::from_slice_async(&[0u64, 0, 0, 0], &stream).unwrap(); - let mut end = DeviceBuffer::from_slice_async(&[0u64, 0], &stream).unwrap(); - let mut host_end = [0u64, 0]; - start[1..5].async_copy_to(&mut mid, &stream).unwrap(); - end.async_copy_from(&mid[1..3], &stream).unwrap(); - end.async_copy_to(&mut host_end, &stream).unwrap(); - stream.synchronize().unwrap(); - assert_eq!([2u64, 3], host_end); - } - } - #[test] #[should_panic] fn test_copy_to_d2d_wrong_size() { @@ -446,14 +389,4 @@ mod test_device_buffer { let _ = buf.async_copy_from(&start, &stream); } } - - #[test] - fn test_can_create_uninitialized_non_devicecopy_buffers() { - let _context = crate::quick_init().unwrap(); - unsafe { - let _box: DeviceBox> = DeviceBox::uninitialized().unwrap(); - let buffer: DeviceBuffer> = DeviceBuffer::uninitialized(10).unwrap(); - let _slice = &buffer[0..5]; - } - } } diff --git a/crates/cust/src/memory/device/device_slice.rs b/crates/cust/src/memory/device/device_slice.rs index 0e0b2e97..3e3ee8f6 100644 --- a/crates/cust/src/memory/device/device_slice.rs +++ b/crates/cust/src/memory/device/device_slice.rs @@ -5,22 +5,21 @@ use crate::memory::DeviceCopy; use crate::memory::DevicePointer; use crate::stream::Stream; use crate::sys as cuda; -use std::iter::{ExactSizeIterator, FusedIterator}; -use std::mem::{self}; -use std::ops::{ - Index, IndexMut, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive, -}; +use std::mem; +use std::ops::{Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive}; use std::os::raw::c_void; -use std::slice::{self, Chunks, ChunksMut}; /// Fixed-size device-side slice. -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] #[repr(C)] -pub struct DeviceSlice([T]); +pub struct DeviceSlice { + ptr: DevicePointer, + len: usize, +} -unsafe impl Send for DeviceSlice {} -unsafe impl Sync for DeviceSlice {} +unsafe impl Send for DeviceSlice {} +unsafe impl Sync for DeviceSlice {} impl DeviceSlice { pub fn as_host_vec(&self) -> CudaResult> { @@ -34,7 +33,7 @@ impl DeviceSlice { // I have no idea if this is safe or not. Probably not, though I can't imagine how the compiler // could possibly know that the pointer is not de-referenceable. I'm banking that we get proper // Dynamicaly-sized Types before the compiler authors break this assumption. -impl DeviceSlice { +impl DeviceSlice { /// Returns the number of elements in the slice. /// /// # Examples @@ -46,7 +45,7 @@ impl DeviceSlice { /// assert_eq!(a.len(), 3); /// ``` pub fn len(&self) -> usize { - self.0.len() + self.len } /// Returns `true` if the slice has a length of 0. @@ -60,7 +59,7 @@ impl DeviceSlice { /// assert!(a.is_empty()); /// ``` pub fn is_empty(&self) -> bool { - self.0.is_empty() + self.len == 0 } /// Return a raw device-pointer to the slice's buffer. @@ -77,28 +76,11 @@ impl DeviceSlice { /// let a = DeviceBuffer::from_slice(&[1, 2, 3]).unwrap(); /// println!("{:p}", a.as_ptr()); /// ``` - pub fn as_ptr(&self) -> *const T { - self.0.as_ptr() - } - - /// Returns an unsafe mutable device-pointer to the slice's buffer. - /// - /// The caller must ensure that the slice outlives the pointer this function returns, or else - /// it will end up pointing to garbage. The caller must also ensure that the pointer is not - /// dereferenced by the CPU. 
- /// - /// Examples: - /// - /// ``` - /// # let _context = cust::quick_init().unwrap(); - /// use cust::memory::*; - /// let mut a = DeviceBuffer::from_slice(&[1, 2, 3]).unwrap(); - /// println!("{:p}", a.as_mut_ptr()); - /// ``` - pub fn as_mut_ptr(&mut self) -> *mut T { - self.0.as_mut_ptr() + pub fn as_device_ptr(&self) -> DevicePointer { + self.ptr } + /* TODO (AL): keep these? /// Divides one DeviceSlice into two at a given index. /// /// The first will contain all indices from `[0, mid)` (excluding the index `mid` itself) and @@ -169,98 +151,7 @@ impl DeviceSlice { ) } } - - /// Returns an iterator over `chunk_size` elements of the slice at a time. The chunks are device - /// slices and do not overlap. If `chunk_size` does not divide the length of the slice, then the - /// last chunk will not have length `chunk_size`. - /// - /// See `exact_chunks` for a variant of this iterator that returns chunks of always exactly - /// `chunk_size` elements. - /// - /// # Panics - /// - /// Panics if `chunk_size` is 0. - /// - /// # Examples - /// - /// ``` - /// # let _context = cust::quick_init().unwrap(); - /// use cust::memory::*; - /// let slice = DeviceBuffer::from_slice(&[1u64, 2, 3, 4, 5]).unwrap(); - /// let mut iter = slice.chunks(2); - /// - /// assert_eq!(iter.next().unwrap().len(), 2); - /// - /// let mut host_buf = [0u64, 0]; - /// iter.next().unwrap().copy_to(&mut host_buf).unwrap(); - /// assert_eq!([3, 4], host_buf); - /// - /// assert_eq!(iter.next().unwrap().len(), 1); - /// - /// ``` - pub fn chunks(&self, chunk_size: usize) -> DeviceChunks { - DeviceChunks(self.0.chunks(chunk_size)) - } - - /// Returns an iterator over `chunk_size` elements of the slice at a time. The chunks are - /// mutable device slices and do not overlap. If `chunk_size` does not divide the length of the - /// slice, then the last chunk will not have length `chunk_size`. - /// - /// See `exact_chunks` for a variant of this iterator that returns chunks of always exactly - /// `chunk_size` elements. - /// - /// # Panics - /// - /// Panics if `chunk_size` is 0. - /// - /// # Examples - /// - /// ``` - /// # let _context = cust::quick_init().unwrap(); - /// use cust::memory::*; - /// let mut slice = DeviceBuffer::from_slice(&[0u64, 0, 0, 0, 0]).unwrap(); - /// { - /// let mut iter = slice.chunks_mut(2); - /// - /// assert_eq!(iter.next().unwrap().len(), 2); - /// - /// let host_buf = [2u64, 3]; - /// iter.next().unwrap().copy_from(&host_buf).unwrap(); - /// - /// assert_eq!(iter.next().unwrap().len(), 1); - /// } - /// - /// let mut host_buf = [0u64, 0, 0, 0, 0]; - /// slice.copy_to(&mut host_buf).unwrap(); - /// assert_eq!([0u64, 0, 2, 3, 0], host_buf); - /// ``` - pub fn chunks_mut(&mut self, chunk_size: usize) -> DeviceChunksMut { - DeviceChunksMut(self.0.chunks_mut(chunk_size)) - } - - /// Private function used to transmute a CPU slice (which must have the device pointer as it's - /// buffer pointer) to a DeviceSlice. Completely unsafe. - pub(super) unsafe fn from_slice(slice: &[T]) -> &DeviceSlice { - &*(slice as *const [T] as *const DeviceSlice) - } - - /// Private function used to transmute a mutable CPU slice (which must have the device pointer - /// as it's buffer pointer) to a mutable DeviceSlice. Completely unsafe. - pub(super) unsafe fn from_slice_mut(slice: &mut [T]) -> &mut DeviceSlice { - &mut *(slice as *mut [T] as *mut DeviceSlice) - } - - /// Returns a `DevicePointer` to the buffer. 
- /// - /// The caller must ensure that the buffer outlives the returned pointer, or it will end up - /// pointing to garbage. - /// - /// Modifying `DeviceBuffer` is guaranteed not to cause its buffer to be reallocated, so pointers - /// cannot be invalidated in that manner, but other types may be added in the future which can - /// reallocate. - pub fn as_device_ptr(&mut self) -> DevicePointer { - unsafe { DevicePointer::wrap(self.0.as_mut_ptr()) } - } + */ /// Forms a slice from a `DevicePointer` and a length. /// @@ -292,8 +183,8 @@ impl DeviceSlice { /// assert_eq!([1u64, 2], host_buf); /// ``` #[allow(clippy::needless_pass_by_value)] - pub unsafe fn from_raw_parts<'a>(data: DevicePointer, len: usize) -> &'a DeviceSlice { - DeviceSlice::from_slice(slice::from_raw_parts(data.as_raw(), len)) + pub unsafe fn from_raw_parts(ptr: DevicePointer, len: usize) -> DeviceSlice { + DeviceSlice { ptr, len } } /// Performs the same functionality as `from_raw_parts`, except that a @@ -311,144 +202,139 @@ impl DeviceSlice { /// slices as with `from_raw_parts`. /// /// See the documentation of `from_raw_parts` for more details. - pub unsafe fn from_raw_parts_mut<'a>( - mut data: DevicePointer, - len: usize, - ) -> &'a mut DeviceSlice { - DeviceSlice::from_slice_mut(slice::from_raw_parts_mut(data.as_raw_mut(), len)) + pub unsafe fn from_raw_parts_mut(ptr: DevicePointer, len: usize) -> DeviceSlice { + DeviceSlice { ptr, len } } } -/// An iterator over a [`DeviceSlice`](struct.DeviceSlice.html) in (non-overlapping) chunks -/// (`chunk_size` elements at a time). -/// -/// When the slice len is not evenly divided by the chunk size, the last slice of the iteration will -/// be the remainder. -/// -/// This struct is created by the `chunks` method on `DeviceSlices`. -#[derive(Debug, Clone)] -pub struct DeviceChunks<'a, T: 'a>(Chunks<'a, T>); -impl<'a, T> Iterator for DeviceChunks<'a, T> { - type Item = &'a DeviceSlice; +pub trait DeviceSliceIndex { + /// Indexes into this slice without checking if it is in-bounds. + /// + /// # Safety + /// + /// The range must be in-bounds of the slice. + unsafe fn get_unchecked(self, slice: &DeviceSlice) -> DeviceSlice; + fn index(self, slice: &DeviceSlice) -> DeviceSlice; +} - fn next(&mut self) -> Option<&'a DeviceSlice> { - self.0 - .next() - .map(|slice| unsafe { DeviceSlice::from_slice(slice) }) - } +#[inline(never)] +#[cold] +#[track_caller] +fn slice_start_index_len_fail(index: usize, len: usize) -> ! { + panic!( + "range start index {} out of range for slice of length {}", + index, len + ); +} - fn size_hint(&self) -> (usize, Option) { - self.0.size_hint() - } +#[inline(never)] +#[cold] +#[track_caller] +fn slice_end_index_len_fail(index: usize, len: usize) -> ! { + panic!( + "range end index {} out of range for slice of length {}", + index, len + ); +} - fn count(self) -> usize { - self.0.len() - } +#[inline(never)] +#[cold] +#[track_caller] +fn slice_index_order_fail(index: usize, end: usize) -> ! { + panic!("slice index starts at {} but ends at {}", index, end); +} - fn nth(&mut self, n: usize) -> Option { - self.0 - .nth(n) - .map(|slice| unsafe { DeviceSlice::from_slice(slice) }) - } +#[inline(never)] +#[cold] +#[track_caller] +fn slice_end_index_overflow_fail() -> ! 
{ + panic!("attempted to index slice up to maximum usize"); +} - #[inline] - fn last(self) -> Option { - self.0 - .last() - .map(|slice| unsafe { DeviceSlice::from_slice(slice) }) +impl DeviceSliceIndex for Range { + unsafe fn get_unchecked(self, slice: &DeviceSlice) -> DeviceSlice { + DeviceSlice::from_raw_parts(slice.as_device_ptr().add(self.start), self.end - self.start) } -} -impl<'a, T> DoubleEndedIterator for DeviceChunks<'a, T> { - #[inline] - fn next_back(&mut self) -> Option<&'a DeviceSlice> { - self.0 - .next_back() - .map(|slice| unsafe { DeviceSlice::from_slice(slice) }) + fn index(self, slice: &DeviceSlice) -> DeviceSlice { + if self.start > self.end { + slice_index_order_fail(self.start, self.end); + } else if self.end > slice.len() { + slice_end_index_len_fail(self.end, slice.len()); + } + // SAFETY: `self` is checked to be valid and in bounds above. + unsafe { self.get_unchecked(slice) } } } -impl<'a, T> ExactSizeIterator for DeviceChunks<'a, T> {} -impl<'a, T> FusedIterator for DeviceChunks<'a, T> {} - -/// An iterator over a [`DeviceSlice`](struct.DeviceSlice.html) in (non-overlapping) mutable chunks -/// (`chunk_size` elements at a time). -/// -/// When the slice len is not evenly divided by the chunk size, the last slice of the iteration will -/// be the remainder. -/// -/// This struct is created by the `chunks` method on `DeviceSlices`. -#[derive(Debug)] -pub struct DeviceChunksMut<'a, T: 'a>(ChunksMut<'a, T>); -impl<'a, T> Iterator for DeviceChunksMut<'a, T> { - type Item = &'a mut DeviceSlice; - fn next(&mut self) -> Option<&'a mut DeviceSlice> { - self.0 - .next() - .map(|slice| unsafe { DeviceSlice::from_slice_mut(slice) }) +impl DeviceSliceIndex for RangeTo { + unsafe fn get_unchecked(self, slice: &DeviceSlice) -> DeviceSlice { + (0..self.end).get_unchecked(slice) } - - fn size_hint(&self) -> (usize, Option) { - self.0.size_hint() + fn index(self, slice: &DeviceSlice) -> DeviceSlice { + (0..self.end).index(slice) } +} - fn count(self) -> usize { - self.0.len() +impl DeviceSliceIndex for RangeFrom { + unsafe fn get_unchecked(self, slice: &DeviceSlice) -> DeviceSlice { + (self.start..slice.len()).get_unchecked(slice) } - - fn nth(&mut self, n: usize) -> Option { - self.0 - .nth(n) - .map(|slice| unsafe { DeviceSlice::from_slice_mut(slice) }) + fn index(self, slice: &DeviceSlice) -> DeviceSlice { + if self.start > slice.len() { + slice_start_index_len_fail(self.start, slice.len()); + } + // SAFETY: `self` is checked to be valid and in bounds above. + unsafe { self.get_unchecked(slice) } } +} - #[inline] - fn last(self) -> Option { - self.0 - .last() - .map(|slice| unsafe { DeviceSlice::from_slice_mut(slice) }) +impl DeviceSliceIndex for RangeFull { + unsafe fn get_unchecked(self, slice: &DeviceSlice) -> DeviceSlice { + *slice } -} -impl<'a, T> DoubleEndedIterator for DeviceChunksMut<'a, T> { - #[inline] - fn next_back(&mut self) -> Option<&'a mut DeviceSlice> { - self.0 - .next_back() - .map(|slice| unsafe { DeviceSlice::from_slice_mut(slice) }) + fn index(self, slice: &DeviceSlice) -> DeviceSlice { + *slice } } -impl<'a, T> ExactSizeIterator for DeviceChunksMut<'a, T> {} -impl<'a, T> FusedIterator for DeviceChunksMut<'a, T> {} -macro_rules! 
impl_index { - ($($t:ty)*) => { - $( - impl Index<$t> for DeviceSlice - { - type Output = DeviceSlice; +fn into_slice_range(range: RangeInclusive) -> Range { + let exclusive_end = range.end() + 1; + let start = if range.is_empty() { + exclusive_end + } else { + *range.start() + }; + start..exclusive_end +} - fn index(&self, index: $t) -> &Self { - unsafe { DeviceSlice::from_slice(self.0.index(index)) } - } - } +impl DeviceSliceIndex for RangeInclusive { + unsafe fn get_unchecked(self, slice: &DeviceSlice) -> DeviceSlice { + into_slice_range(self).get_unchecked(slice) + } + fn index(self, slice: &DeviceSlice) -> DeviceSlice { + if *self.end() == usize::MAX { + slice_end_index_overflow_fail(); + } + into_slice_range(self).index(slice) + } +} - impl IndexMut<$t> for DeviceSlice - { - fn index_mut(&mut self, index: $t) -> &mut Self { - unsafe { DeviceSlice::from_slice_mut( self.0.index_mut(index)) } - } - } - )* +impl DeviceSliceIndex for RangeToInclusive { + unsafe fn get_unchecked(self, slice: &DeviceSlice) -> DeviceSlice { + (0..=self.end).get_unchecked(slice) + } + fn index(self, slice: &DeviceSlice) -> DeviceSlice { + (0..=self.end).index(slice) } } -impl_index! { - Range - RangeFull - RangeFrom - RangeInclusive - RangeTo - RangeToInclusive + +impl DeviceSlice { + pub fn index>(&self, idx: Idx) -> DeviceSlice { + idx.index(self) + } } -impl crate::private::Sealed for DeviceSlice {} + +impl crate::private::Sealed for DeviceSlice {} impl + AsMut<[T]> + ?Sized> CopyDestination for DeviceSlice { fn copy_from(&mut self, val: &I) -> CudaResult<()> { let val = val.as_ref(); @@ -459,12 +345,8 @@ impl + AsMut<[T]> + ?Sized> CopyDestination for let size = mem::size_of::() * self.len(); if size != 0 { unsafe { - cuda::cuMemcpyHtoD_v2( - self.0.as_mut_ptr() as u64, - val.as_ptr() as *const c_void, - size, - ) - .to_result()? + cuda::cuMemcpyHtoD_v2(self.ptr.as_raw(), val.as_ptr() as *const c_void, size) + .to_result()? } } Ok(()) @@ -479,8 +361,12 @@ impl + AsMut<[T]> + ?Sized> CopyDestination for let size = mem::size_of::() * self.len(); if size != 0 { unsafe { - cuda::cuMemcpyDtoH_v2(val.as_mut_ptr() as *mut c_void, self.as_ptr() as u64, size) - .to_result()? + cuda::cuMemcpyDtoH_v2( + val.as_mut_ptr() as *mut c_void, + self.as_device_ptr().as_raw(), + size, + ) + .to_result()? } } Ok(()) @@ -495,7 +381,7 @@ impl CopyDestination> for DeviceSlice { let size = mem::size_of::() * self.len(); if size != 0 { unsafe { - cuda::cuMemcpyDtoD_v2(self.0.as_mut_ptr() as u64, val.as_ptr() as u64, size) + cuda::cuMemcpyDtoD_v2(self.ptr.as_raw(), val.as_device_ptr().as_raw(), size) .to_result()? } } @@ -510,8 +396,12 @@ impl CopyDestination> for DeviceSlice { let size = mem::size_of::() * self.len(); if size != 0 { unsafe { - cuda::cuMemcpyDtoD_v2(val.as_mut_ptr() as u64, self.as_ptr() as u64, size) - .to_result()? + cuda::cuMemcpyDtoD_v2( + val.as_device_ptr().as_raw(), + self.as_device_ptr().as_raw(), + size, + ) + .to_result()? 
} } Ok(()) @@ -538,7 +428,7 @@ impl + AsMut<[T]> + ?Sized> AsyncCopyDestination let size = mem::size_of::() * self.len(); if size != 0 { cuda::cuMemcpyHtoDAsync_v2( - self.0.as_mut_ptr() as u64, + self.ptr.as_raw(), val.as_ptr() as *const c_void, size, stream.as_inner(), @@ -558,7 +448,7 @@ impl + AsMut<[T]> + ?Sized> AsyncCopyDestination if size != 0 { cuda::cuMemcpyDtoHAsync_v2( val.as_mut_ptr() as *mut c_void, - self.as_ptr() as u64, + self.as_device_ptr().as_raw(), size, stream.as_inner(), ) @@ -576,8 +466,8 @@ impl AsyncCopyDestination> for DeviceSlice { let size = mem::size_of::() * self.len(); if size != 0 { cuda::cuMemcpyDtoDAsync_v2( - self.0.as_mut_ptr() as u64, - val.as_ptr() as u64, + self.as_device_ptr().as_raw(), + val.as_device_ptr().as_raw(), size, stream.as_inner(), ) @@ -594,8 +484,8 @@ impl AsyncCopyDestination> for DeviceSlice { let size = mem::size_of::() * self.len(); if size != 0 { cuda::cuMemcpyDtoDAsync_v2( - val.as_mut_ptr() as u64, - self.as_ptr() as u64, + val.as_device_ptr().as_raw(), + self.as_device_ptr().as_raw(), size, stream.as_inner(), ) diff --git a/crates/cust/src/memory/device/device_variable.rs b/crates/cust/src/memory/device/device_variable.rs new file mode 100644 index 00000000..14eb52bf --- /dev/null +++ b/crates/cust/src/memory/device/device_variable.rs @@ -0,0 +1,51 @@ +use crate::error::CudaResult; +use crate::memory::device::CopyDestination; +use crate::memory::DeviceCopy; +use crate::memory::{DeviceBox, DevicePointer}; +use std::ops::{Deref, DerefMut}; + +/// Wrapper around a variable on the host and a [`DeviceBox`] holding the +/// variable on the device, allowing for easy synchronization and storage. +#[derive(Debug)] +pub struct DeviceVariable { + mem: DeviceBox, + var: T, +} + +impl DeviceVariable { + /// Create a new `DeviceVariable` wrapping `var`. + /// + /// Allocates storage on the device and copies `var` to the device. + pub fn new(var: T) -> CudaResult { + let mem = DeviceBox::new(&var)?; + Ok(Self { mem, var }) + } + + /// Copy the host copy of the variable to the device + pub fn copy_htod(&mut self) -> CudaResult<()> { + self.mem.copy_from(&self.var) + } + + /// Copy the device copy of the variable to the host + pub fn copy_dtoh(&mut self) -> CudaResult<()> { + self.mem.copy_to(&mut self.var) + } + + pub fn as_device_ptr(&self) -> DevicePointer { + self.mem.as_device_ptr() + } +} + +impl Deref for DeviceVariable { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.var + } +} + +impl DerefMut for DeviceVariable { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.var + } +} diff --git a/crates/cust/src/memory/device/mod.rs b/crates/cust/src/memory/device/mod.rs index 2bdbfaf3..e30704ac 100644 --- a/crates/cust/src/memory/device/mod.rs +++ b/crates/cust/src/memory/device/mod.rs @@ -4,10 +4,12 @@ use crate::stream::Stream; mod device_box; mod device_buffer; mod device_slice; +mod device_variable; pub use self::device_box::*; pub use self::device_buffer::*; pub use self::device_slice::*; +pub use self::device_variable::*; /// Sealed trait implemented by types which can be the source or destination when copying data /// to/from the device or from one device allocation to another. 
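A short usage sketch for the `DeviceVariable` added above, based only on the API shown in this file: mutate the host copy through `DerefMut`, then synchronize explicitly in either direction.

```rust
use cust::memory::DeviceVariable;

fn example() -> cust::error::CudaResult<()> {
    let _ctx = cust::quick_init()?;
    let mut var = DeviceVariable::new(0u32)?; // allocates a DeviceBox and copies 0 to it
    *var += 41; // DerefMut mutates only the host copy
    var.copy_htod()?; // push host -> device
    // ... a kernel could now read or write through var.as_device_ptr() ...
    var.copy_dtoh()?; // pull device -> host
    assert_eq!(*var, 41);
    Ok(())
}
```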
diff --git a/crates/cust/src/memory/malloc.rs b/crates/cust/src/memory/malloc.rs
index 4651e45a..fb86501a 100644
--- a/crates/cust/src/memory/malloc.rs
+++ b/crates/cust/src/memory/malloc.rs
@@ -39,16 +39,15 @@ use std::ptr;
 /// cuda_free(device_buffer).unwrap();
 /// }
 /// ```
-pub unsafe fn cuda_malloc<T>(count: usize) -> CudaResult<DevicePointer<T>> {
+pub unsafe fn cuda_malloc<T: DeviceCopy>(count: usize) -> CudaResult<DevicePointer<T>> {
     let size = count.checked_mul(mem::size_of::<T>()).unwrap_or(0);
     if size == 0 {
         return Err(CudaError::InvalidMemoryAllocation);
     }

-    let mut ptr: *mut c_void = ptr::null_mut();
-    cuda::cuMemAlloc_v2(&mut ptr as *mut *mut c_void as *mut u64, size).to_result()?;
-    let ptr = ptr as *mut T;
-    Ok(DevicePointer::wrap(ptr as *mut T))
+    let mut ptr = 0;
+    cuda::cuMemAlloc_v2(&mut ptr, size).to_result()?;
+    Ok(DevicePointer::from_raw(ptr))
 }

 /// Unsafe wrapper around `cuMemAllocAsync` which queues a memory allocation operation on a stream.
@@ -60,7 +59,10 @@ pub unsafe fn cuda_malloc<T>(count: usize) -> CudaResult<DevicePointer<T>> {
 ///
 /// The memory behind the returned pointer must not be used in any way until the
 /// allocation actually takes place in the stream.
-pub unsafe fn cuda_malloc_async<T>(stream: &Stream, count: usize) -> CudaResult<DevicePointer<T>> {
+pub unsafe fn cuda_malloc_async<T: DeviceCopy>(
+    stream: &Stream,
+    count: usize,
+) -> CudaResult<DevicePointer<T>> {
     let size = count.checked_mul(mem::size_of::<T>()).unwrap_or(0);
     if size == 0 {
         return Err(CudaError::InvalidMemoryAllocation);
     }
@@ -74,7 +76,7 @@ pub unsafe fn cuda_malloc_async<T>(stream: &Stream, count: usize) -> CudaResult<
     )
     .to_result()?;
     let ptr = ptr as *mut T;
-    Ok(DevicePointer::wrap(ptr as *mut T))
+    Ok(DevicePointer::from_raw(ptr as cuda::CUdeviceptr))
 }

 /// Unsafe wrapper around `cuMemFreeAsync` which queues a memory allocation free operation on a stream.
@@ -85,12 +87,15 @@ pub unsafe fn cuda_malloc_async<T>(stream: &Stream, count: usize) -> CudaResult<
 /// # Safety
 ///
 /// The pointer must be valid.
-pub unsafe fn cuda_free_async(stream: &Stream, mut p: DevicePointer) -> CudaResult<()> { +pub unsafe fn cuda_free_async( + stream: &Stream, + p: DevicePointer, +) -> CudaResult<()> { if mem::size_of::() == 0 { return Err(CudaError::InvalidMemoryAllocation); } - cuda::cuMemFreeAsync(p.as_raw_mut() as u64, stream.as_inner()).to_result() + cuda::cuMemFreeAsync(p.as_raw(), stream.as_inner()).to_result() } /// Unsafe wrapper around the `cuMemAllocManaged` function, which allocates some unified memory and @@ -166,13 +171,12 @@ pub unsafe fn cuda_malloc_unified(count: usize) -> CudaResult(mut p: DevicePointer) -> CudaResult<()> { - let ptr = p.as_raw_mut(); +pub unsafe fn cuda_free(ptr: DevicePointer) -> CudaResult<()> { if ptr.is_null() { return Err(CudaError::InvalidMemoryAllocation); } - cuda::cuMemFree_v2(ptr as u64).to_result()?; + cuda::cuMemFree_v2(ptr.as_raw()).to_result()?; Ok(()) } @@ -385,11 +389,10 @@ mod test { #[test] fn test_cuda_free_null() { let _context = crate::quick_init().unwrap(); - let null = ::std::ptr::null_mut::(); unsafe { assert_eq!( CudaError::InvalidMemoryAllocation, - cuda_free(DevicePointer::wrap(null)).unwrap_err() + cuda_free(DevicePointer::::null()).unwrap_err() ); } } diff --git a/crates/cust/src/memory/mod.rs b/crates/cust/src/memory/mod.rs index 7d38b504..1caf4b56 100644 --- a/crates/cust/src/memory/mod.rs +++ b/crates/cust/src/memory/mod.rs @@ -87,8 +87,11 @@ pub use self::malloc::*; pub use self::pointer::*; pub use self::unified::*; -use core::marker::PhantomData; -use core::num::*; +use crate::error::*; + +pub use cust_core::DeviceCopy; + +use std::ffi::c_void; /// A trait describing a generic buffer that can be accessed from the GPU. This could be either a [`UnifiedBuffer`] /// or a regular [`DeviceBuffer`]. @@ -100,7 +103,7 @@ pub trait GpuBuffer: private::Sealed { impl GpuBuffer for DeviceBuffer { fn as_device_ptr(&self) -> DevicePointer { - unsafe { DevicePointer::wrap((**self).as_ptr() as *mut _) } + self.as_slice().as_device_ptr() } fn len(&self) -> usize { @@ -110,7 +113,7 @@ impl GpuBuffer for DeviceBuffer { impl GpuBuffer for UnifiedBuffer { fn as_device_ptr(&self) -> DevicePointer { - unsafe { DevicePointer::wrap(self.as_ptr() as *mut _) } + DevicePointer::from_raw(self.as_ptr() as u64) } fn len(&self) -> usize { @@ -132,159 +135,92 @@ impl GpuBox for DeviceBox { impl GpuBox for UnifiedBox { fn as_device_ptr(&self) -> DevicePointer { - unsafe { DevicePointer::wrap(self.ptr.as_raw() as *mut _) } + DevicePointer::from_raw(self.ptr.as_raw() as u64) } } -mod private { - use super::{DeviceBox, DeviceBuffer, DeviceCopy, UnifiedBox, UnifiedBuffer}; +/// A trait describing a region of memory on the device with a base pointer and +/// a size, used to be generic over DeviceBox, DeviceBuffer, DeviceVariable etc. +pub trait DeviceMemory { + /// Get the raw cuda device pointer + fn as_raw_ptr(&self) -> cust_raw::CUdeviceptr; - pub trait Sealed {} - impl Sealed for UnifiedBuffer {} - impl Sealed for DeviceBuffer {} - impl Sealed for UnifiedBox {} - impl Sealed for DeviceBox {} + /// Get the size of the memory region in bytes + fn size_in_bytes(&self) -> usize; } -/// Marker trait for types which can safely be copied to or from a CUDA device. -/// -/// A type can be safely copied if its value can be duplicated simply by copying bits and if it does -/// not contain a reference to memory which is not accessible to the device. Additionally, the -/// DeviceCopy trait does not imply copy semantics as the Copy trait does. 
-/// -/// ## How can I implement DeviceCopy? -/// -/// There are two ways to implement DeviceCopy on your type. The simplest is to use `derive`: -/// -/// ``` -/// use cust::DeviceCopy; -/// -/// #[derive(Clone, Copy, DeviceCopy)] -/// struct MyStruct(u64); -/// -/// # fn main () {} -/// ``` -/// -/// This is safe because the `DeviceCopy` derive macro will check that all fields of the struct, -/// enum or union implement `DeviceCopy`. For example, this fails to compile, because `Vec` cannot -/// be copied to the device: -/// -/// ```compile_fail -/// use cust::DeviceCopy; -/// -/// #[derive(Clone, DeviceCopy)] -/// struct MyStruct(Vec); -/// # fn main () {} -/// ``` -/// -/// You can also implement `DeviceCopy` unsafely: -/// -/// ``` -/// use cust::memory::DeviceCopy; -/// -/// #[derive(Clone, Copy)] -/// struct MyStruct(u64); -/// -/// unsafe impl DeviceCopy for MyStruct { } -/// # fn main () {} -/// ``` -/// -/// ## What is the difference between `DeviceCopy` and `Copy`? -/// -/// `DeviceCopy` is stricter than `Copy`. `DeviceCopy` must only be implemented for types which -/// do not contain references or raw pointers to non-device-accessible memory. `DeviceCopy` also -/// does not imply copy semantics - that is, `DeviceCopy` values are not implicitly copied on -/// assignment the way that `Copy` values are. This is helpful, as it may be desirable to implement -/// `DeviceCopy` for large structures that would be inefficient to copy for every assignment. -/// -/// ## When can't my type be `DeviceCopy`? -/// -/// Some types cannot be safely copied to the device. For example, copying `&T` would create an -/// invalid reference on the device which would segfault if dereferenced. Generalizing this, any -/// type implementing `Drop` cannot be `DeviceCopy` since it is responsible for some resource that -/// would not be available on the device. -#[allow(clippy::missing_safety_doc)] // explained in the doc already -pub unsafe trait DeviceCopy: Copy {} +impl DeviceMemory for DeviceBox { + fn as_raw_ptr(&self) -> cust_raw::CUdeviceptr { + self.as_device_ptr().as_raw() + } -macro_rules! 
impl_device_copy { - ($($t:ty)*) => { - $( - unsafe impl DeviceCopy for $t {} - )* + fn size_in_bytes(&self) -> usize { + std::mem::size_of::() } } -impl_device_copy!( - usize u8 u16 u32 u64 u128 - isize i8 i16 i32 i64 i128 - f32 f64 - bool char + +impl DeviceMemory for DeviceVariable { + fn as_raw_ptr(&self) -> cust_raw::CUdeviceptr { + self.as_device_ptr().as_raw() + } - NonZeroU8 NonZeroU16 NonZeroU32 NonZeroU64 NonZeroU128 -); -unsafe impl DeviceCopy for Option {} -unsafe impl DeviceCopy for Result {} -unsafe impl DeviceCopy for PhantomData {} -unsafe impl DeviceCopy for Wrapping {} -unsafe impl DeviceCopy for [T; N] {} -unsafe impl DeviceCopy for () {} -unsafe impl DeviceCopy for (A, B) {} -unsafe impl DeviceCopy for (A, B, C) {} -unsafe impl DeviceCopy - for (A, B, C, D) -{ -} -unsafe impl DeviceCopy - for (A, B, C, D, E) -{ -} -unsafe impl - DeviceCopy for (A, B, C, D, E, F) -{ -} -unsafe impl< - A: DeviceCopy, - B: DeviceCopy, - C: DeviceCopy, - D: DeviceCopy, - E: DeviceCopy, - F: DeviceCopy, - G: DeviceCopy, - > DeviceCopy for (A, B, C, D, E, F, G) -{ + fn size_in_bytes(&self) -> usize { + std::mem::size_of::() + } } -unsafe impl< - A: DeviceCopy, - B: DeviceCopy, - C: DeviceCopy, - D: DeviceCopy, - E: DeviceCopy, - F: DeviceCopy, - G: DeviceCopy, - H: DeviceCopy, - > DeviceCopy for (A, B, C, D, E, F, G, H) -{ + +impl DeviceMemory for DeviceBuffer { + fn as_raw_ptr(&self) -> cust_raw::CUdeviceptr { + self.as_device_ptr().as_raw() + } + + fn size_in_bytes(&self) -> usize { + std::mem::size_of::() * self.len() + } } -#[cfg(feature = "vek")] -macro_rules! impl_device_copy_vek { - ($($strukt:ident),* $(,)?) => { - $( - unsafe impl DeviceCopy for $strukt {} - )* +impl DeviceMemory for DeviceSlice { + fn as_raw_ptr(&self) -> cust_raw::CUdeviceptr { + self.as_device_ptr().as_raw() + } + + fn size_in_bytes(&self) -> usize { + std::mem::size_of::() * self.len() } } -#[cfg(feature = "vek")] -use vek::*; +mod private { + use super::{DeviceBox, DeviceBuffer, DeviceCopy, UnifiedBox, UnifiedBuffer}; + + pub trait Sealed {} + impl Sealed for UnifiedBuffer {} + impl Sealed for DeviceBuffer {} + impl Sealed for UnifiedBox {} + impl Sealed for DeviceBox {} +} -#[cfg(feature = "vek")] -impl_device_copy_vek! { - Vec2, Vec3, Vec4, Extent2, Extent3, - Mat2, Mat3, Mat4, - CubicBezier2, CubicBezier3, - Quaternion, +/// Simple wrapper over cuMemcpyHtoD_v2 +#[allow(clippy::missing_safety_doc)] +pub unsafe fn memcpy_htod( + d_ptr: cust_raw::CUdeviceptr, + src_ptr: *const c_void, + size: usize, +) -> CudaResult<()> { + crate::sys::cuMemcpyHtoD_v2(d_ptr, src_ptr, size).to_result()?; + Ok(()) } -#[cfg(feature = "num-complex")] -unsafe impl DeviceCopy for num_complex::Complex {} +/// Get the current free and total memory. +/// +/// Returns in `.1` the total amount of memory available to the current context. +/// Returns in `.0` the amount of memory on the device that is free according to +/// the OS. CUDA is not guaranteed to be able to allocate all of the memory that +/// the OS reports as free.
+pub fn mem_get_info() -> CudaResult<(usize, usize)> { + let mut mem_free = 0; + let mut mem_total = 0; + unsafe { + crate::sys::cuMemGetInfo_v2(&mut mem_free, &mut mem_total).to_result()?; + } + Ok((mem_free, mem_total)) +} diff --git a/crates/cust/src/memory/pointer.rs b/crates/cust/src/memory/pointer.rs index e36762ca..9f14999b 100644 --- a/crates/cust/src/memory/pointer.rs +++ b/crates/cust/src/memory/pointer.rs @@ -1,60 +1,14 @@ use crate::memory::DeviceCopy; +use cust_raw::CUdeviceptr; use core::{ - cmp::Ordering, fmt::{self, Debug, Pointer}, - hash::{Hash, Hasher}, + hash::Hash, ptr, }; - -macro_rules! derive_traits { - ( $( $Ptr:ty )* ) => ($( - impl Debug for $Ptr { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - Debug::fmt(&self.0, f) - } - } - impl Pointer for $Ptr { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - Pointer::fmt(&self.0, f) - } - } - - impl Hash for $Ptr { - fn hash(&self, h: &mut H) { - Hash::hash(&self.0, h); - } - } - - impl PartialEq for $Ptr { - fn eq(&self, other: &$Ptr) -> bool { - PartialEq::eq(&self.0, &other.0) - } - } - - impl Eq for $Ptr {} - - impl PartialOrd for $Ptr { - fn partial_cmp(&self, other: &$Ptr) -> Option { - PartialOrd::partial_cmp(&self.0, &other.0) - } - } - - impl Ord for $Ptr { - fn cmp(&self, other: &$Ptr) -> Ordering { - Ord::cmp(&self.0, &other.0) - } - } - - impl Clone for $Ptr { - fn clone(&self) -> Self { - Self(self.0) - } - } - impl Copy for $Ptr {} - )*) -} -derive_traits!(DevicePointer UnifiedPointer); +use std::ffi::c_void; +use std::marker::PhantomData; +use std::mem::size_of; /// A pointer to device memory. /// @@ -68,68 +22,45 @@ derive_traits!(DevicePointer UnifiedPointer); /// the other side of that boundary does not attempt to dereference the pointer on the CPU. It is /// thus possible to pass a `DevicePointer` to a CUDA kernel written in C. #[repr(transparent)] -pub struct DevicePointer(*mut T); +#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct DevicePointer { + ptr: CUdeviceptr, + marker: PhantomData<*mut T>, +} -unsafe impl DeviceCopy for DevicePointer {} +unsafe impl DeviceCopy for DevicePointer {} -impl DevicePointer { - /// Wrap the given raw pointer in a DevicePointer. The given pointer is assumed to be a valid, - /// device pointer or null. - /// - /// # Safety - /// - /// The given pointer must have been allocated with [`cuda_malloc`](fn.cuda_malloc.html) or - /// be null. - /// - /// # Examples - /// - /// ``` - /// # let _context = cust::quick_init().unwrap(); - /// use cust::memory::*; - /// use std::ptr; - /// unsafe { - /// let null : *mut u64 = ptr::null_mut(); - /// assert!(DevicePointer::wrap(null).is_null()); - /// } - /// ``` - pub unsafe fn wrap(ptr: *mut T) -> Self { - DevicePointer(ptr) +impl Pointer for DevicePointer { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let ptr = self.ptr as *const c_void; + fmt::Pointer::fmt(&ptr, f) } +} - /// Returns the contained pointer as a raw pointer. The returned pointer is not valid on the CPU - /// and must not be dereferenced. - /// - /// # Examples - /// - /// ``` - /// # let _context = cust::quick_init().unwrap(); - /// use cust::memory::*; - /// unsafe { - /// let dev_ptr = cuda_malloc::(1).unwrap(); - /// let ptr: *const u64 = dev_ptr.as_raw(); - /// cuda_free(dev_ptr); - /// } - /// ``` - pub fn as_raw(self) -> *const T { - self.0 +impl DevicePointer { + /// Returns a rust [`pointer`] created from this pointer, meant for FFI purposes. 
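Stepping back from the pointer rework for a moment: the raw helpers added in memory/mod.rs above (`memcpy_htod` and `mem_get_info`) combine with the existing `cuda_malloc`/`cuda_free` like this. A rough sketch, assuming all of them are reachable through `cust::memory` as in this patch:

```rust
use std::ffi::c_void;
use cust::error::CudaResult;
use cust::memory::{cuda_free, cuda_malloc, mem_get_info, memcpy_htod};

fn upload(host: &[u32]) -> CudaResult<()> {
    // (free, total) in bytes; CUDA may not be able to hand out all of `free`.
    let (free, total) = mem_get_info()?;
    println!("{} of {} bytes reported free", free, total);

    unsafe {
        // cuda_malloc returns a typed DevicePointer<u32>; memcpy_htod wants
        // the raw CUdeviceptr, which as_raw() now yields directly.
        let dptr = cuda_malloc::<u32>(host.len())?;
        memcpy_htod(
            dptr.as_raw(),
            host.as_ptr() as *const c_void,
            std::mem::size_of_val(host),
        )?;
        cuda_free(dptr)?;
    }
    Ok(())
}
```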
+ /// **The pointer is not dereferenceable from the CPU!** + pub fn as_ptr(&self) -> *const T { + self.ptr as *const T } - /// Returns the contained pointer as a mutable raw pointer. The returned pointer is not valid on the CPU - /// and must not be dereferenced. - /// - /// # Examples - /// - /// ``` - /// # let _context = cust::quick_init().unwrap(); - /// use cust::memory::*; - /// unsafe { - /// let mut dev_ptr = cuda_malloc::(1).unwrap(); - /// let ptr: *mut u64 = dev_ptr.as_raw_mut(); - /// cuda_free(dev_ptr); - /// } - /// ``` - pub fn as_raw_mut(&mut self) -> *mut T { - self.0 + /// Returns a rust [`pointer`] created from this pointer, meant for FFI purposes. + /// **The pointer is not dereferenceable from the CPU!** + pub fn as_mut_ptr(&self) -> *mut T { + self.ptr as *mut T + } + + /// Returns the contained CUdeviceptr. + pub fn as_raw(&self) -> CUdeviceptr { + self.ptr + } + + /// Create a DevicePointer from a raw CUDA pointer + pub fn from_raw(ptr: CUdeviceptr) -> Self { + Self { + ptr, + marker: PhantomData, + } } /// Returns true if the pointer is null. @@ -145,24 +76,20 @@ impl DevicePointer { /// } /// ``` pub fn is_null(self) -> bool { - self.0.is_null() + self.ptr == 0 } /// Returns a null device pointer. /// - /// # Examples: - /// - /// ``` - /// # let _context = cust::quick_init().unwrap(); - /// use cust::memory::*; - /// let ptr : DevicePointer = DevicePointer::null(); - /// assert!(ptr.is_null()); - /// ``` + // TODO (AL): do we even want this? pub fn null() -> Self where T: Sized, { - unsafe { Self::wrap(ptr::null_mut()) } + Self { + ptr: 0, + marker: PhantomData, + } } /// Calculates the offset from a device pointer. @@ -202,7 +129,11 @@ impl DevicePointer { where T: Sized, { - Self::wrap(self.0.offset(count)) + let ptr = self.ptr + (count as usize * size_of::()) as u64; + Self { + ptr, + marker: PhantomData, + } } /// Calculates the offset from a device pointer using wrapping arithmetic. @@ -239,7 +170,13 @@ impl DevicePointer { where T: Sized, { - unsafe { Self::wrap(self.0.wrapping_offset(count)) } + let ptr = self + .ptr + .wrapping_add((count as usize * size_of::()) as u64); + Self { + ptr, + marker: PhantomData, + } } /// Calculates the offset from a pointer (convenience for `.offset(count as isize)`). @@ -400,11 +337,18 @@ impl DevicePointer { /// `UnifiedPointer` through an FFI boundary to C code expecting a `*mut T`. It is /// thus possible to pass a `UnifiedPointer` to a CUDA kernel written in C. #[repr(transparent)] -pub struct UnifiedPointer(*mut T); +#[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct UnifiedPointer(*mut T); unsafe impl DeviceCopy for UnifiedPointer {} -impl UnifiedPointer { +impl Pointer for UnifiedPointer { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Pointer::fmt(&self.0, f) + } +} + +impl UnifiedPointer { /// Wrap the given raw pointer in a UnifiedPointer. The given pointer is assumed to be a valid, /// unified-memory pointer or null. /// diff --git a/crates/cust/src/module.rs b/crates/cust/src/module.rs index cecd2847..dd696b98 100644 --- a/crates/cust/src/module.rs +++ b/crates/cust/src/module.rs @@ -481,12 +481,8 @@ impl<'a, T: DeviceCopy> CopyDestination for Symbol<'a, T> { let size = mem::size_of::(); if size != 0 { unsafe { - cuda::cuMemcpyHtoD_v2( - self.ptr.as_raw_mut() as u64, - val as *const T as *const c_void, - size, - ) - .to_result()? + cuda::cuMemcpyHtoD_v2(self.ptr.as_raw(), val as *const T as *const c_void, size) + .to_result()? 
} } Ok(()) diff --git a/crates/cust/src/nvtx.rs b/crates/cust/src/nvtx.rs index e69de29b..8b137891 100644 --- a/crates/cust/src/nvtx.rs +++ b/crates/cust/src/nvtx.rs @@ -0,0 +1 @@ + diff --git a/crates/cust/src/prelude.rs b/crates/cust/src/prelude.rs index 83579de2..559a55ba 100644 --- a/crates/cust/src/prelude.rs +++ b/crates/cust/src/prelude.rs @@ -5,8 +5,13 @@ pub use crate::context::{Context, ContextFlags}; pub use crate::device::Device; +pub use crate::event::{Event, EventFlags, EventStatus}; +pub use crate::external::*; +pub use crate::function::Function; pub use crate::launch; -pub use crate::memory::{CopyDestination, DeviceBuffer, UnifiedBuffer}; +pub use crate::memory::{ + CopyDestination, DeviceBuffer, DevicePointer, DeviceSlice, DeviceVariable, UnifiedBuffer, +}; pub use crate::module::Module; pub use crate::stream::{Stream, StreamFlags}; pub use crate::util::*; diff --git a/crates/cust_core/Cargo.toml b/crates/cust_core/Cargo.toml new file mode 100644 index 00000000..9de20e72 --- /dev/null +++ b/crates/cust_core/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "cust_core" +version = "0.1.0" +edition = "2021" + +[dependencies] +vek = { version = "0.15.1", default-features=false, features=["libm"], optional = true } +glam = { version = "0.20", features=["cuda", "libm"], default-features=false, optional=true } +mint = { version = "^0.5", optional = true } +half = { version = "1.8", optional = true } +num-complex = { version = "0.4", optional = true } + +[features] +default = ["vek", "glam", "mint"] diff --git a/crates/cust_core/src/lib.rs b/crates/cust_core/src/lib.rs new file mode 100644 index 00000000..c647c9c7 --- /dev/null +++ b/crates/cust_core/src/lib.rs @@ -0,0 +1,177 @@ +#![no_std] +use core::marker::PhantomData; +use core::num::*; + +/// Marker trait for types which can safely be copied to or from a CUDA device. +/// +/// A type can be safely copied if its value can be duplicated simply by copying bits and if it does +/// not contain a reference to memory which is not accessible to the device. Additionally, the +/// DeviceCopy trait does not imply copy semantics as the Copy trait does. +/// +/// ## How can I implement DeviceCopy? +/// +/// There are two ways to implement DeviceCopy on your type. The simplest is to use `derive`: +/// +/// ``` +/// use cust::DeviceCopy; +/// +/// #[derive(Clone, Copy, DeviceCopy)] +/// struct MyStruct(u64); +/// +/// # fn main () {} +/// ``` +/// +/// This is safe because the `DeviceCopy` derive macro will check that all fields of the struct, +/// enum or union implement `DeviceCopy`. For example, this fails to compile, because `Vec` cannot +/// be copied to the device: +/// +/// ```compile_fail +/// use cust::DeviceCopy; +/// +/// #[derive(Clone, DeviceCopy)] +/// struct MyStruct(Vec); +/// # fn main () {} +/// ``` +/// +/// You can also implement `DeviceCopy` unsafely: +/// +/// ``` +/// use cust::memory::DeviceCopy; +/// +/// #[derive(Clone, Copy)] +/// struct MyStruct(u64); +/// +/// unsafe impl DeviceCopy for MyStruct { } +/// # fn main () {} +/// ``` +/// +/// ## What is the difference between `DeviceCopy` and `Copy`? +/// +/// `DeviceCopy` is stricter than `Copy`. `DeviceCopy` must only be implemented for types which +/// do not contain references or raw pointers to non-device-accessible memory. `DeviceCopy` also +/// does not imply copy semantics - that is, `DeviceCopy` values are not implicitly copied on +/// assignment the way that `Copy` values are.
This is helpful, as it may be desirable to implement +/// `DeviceCopy` for large structures that would be inefficient to copy for every assignment. +/// +/// ## When can't my type be `DeviceCopy`? +/// +/// Some types cannot be safely copied to the device. For example, copying `&T` would create an +/// invalid reference on the device which would segfault if dereferenced. Generalizing this, any +/// type implementing `Drop` cannot be `DeviceCopy` since it is responsible for some resource that +/// would not be available on the device. +/// +/// # Safety +/// +/// The type being implemented must hold no references to CPU data. +pub unsafe trait DeviceCopy: Copy {} + +macro_rules! impl_device_copy { + ($($t:ty)*) => { + $( + unsafe impl DeviceCopy for $t {} + )* + } +} + +impl_device_copy!( + usize u8 u16 u32 u64 u128 + isize i8 i16 i32 i64 i128 + f32 f64 + bool char + + NonZeroU8 NonZeroU16 NonZeroU32 NonZeroU64 NonZeroU128 +); +unsafe impl DeviceCopy for Option {} +unsafe impl DeviceCopy for Result {} +unsafe impl DeviceCopy for PhantomData {} +// Allow DeviceCopy for lifetime constraint markers +unsafe impl DeviceCopy for PhantomData<&()> {} +unsafe impl DeviceCopy for Wrapping {} +unsafe impl DeviceCopy for [T; N] {} +unsafe impl DeviceCopy for () {} +unsafe impl DeviceCopy for (A, B) {} +unsafe impl DeviceCopy for (A, B, C) {} +unsafe impl DeviceCopy + for (A, B, C, D) +{ +} +unsafe impl DeviceCopy + for (A, B, C, D, E) +{ +} +unsafe impl + DeviceCopy for (A, B, C, D, E, F) +{ +} +unsafe impl< + A: DeviceCopy, + B: DeviceCopy, + C: DeviceCopy, + D: DeviceCopy, + E: DeviceCopy, + F: DeviceCopy, + G: DeviceCopy, + > DeviceCopy for (A, B, C, D, E, F, G) +{ +} +unsafe impl< + A: DeviceCopy, + B: DeviceCopy, + C: DeviceCopy, + D: DeviceCopy, + E: DeviceCopy, + F: DeviceCopy, + G: DeviceCopy, + H: DeviceCopy, + > DeviceCopy for (A, B, C, D, E, F, G, H) +{ +} + +macro_rules! impl_device_copy_generic { + ($($($strukt:ident)::+),* $(,)?) => { + $( + unsafe impl DeviceCopy for $($strukt)::+ {} + )* + } +} + +macro_rules! impl_device_copy { + ($($strukt:ty),* $(,)?) => { + $( + unsafe impl DeviceCopy for $strukt {} + )* + } +} + +#[cfg(feature = "vek")] +use vek::*; + +#[cfg(feature = "vek")] +impl_device_copy_generic! { + Vec2, Vec3, Vec4, Extent2, Extent3, + Mat2, Mat3, Mat4, + CubicBezier2, CubicBezier3, + Quaternion, +} + +#[cfg(feature = "glam")] +impl_device_copy! { + glam::Vec2, glam::Vec3, glam::Vec4, glam::IVec2, glam::IVec3, glam::IVec4, +} + +#[cfg(feature = "mint")] +impl_device_copy_generic! { + mint::Vector2, mint::Vector3, mint::Vector4, + mint::ColumnMatrix2, mint::ColumnMatrix3, mint::ColumnMatrix4, mint::ColumnMatrix3x4, + mint::RowMatrix2, mint::RowMatrix3, mint::RowMatrix4, mint::RowMatrix3x4, +} + +#[cfg(feature = "half")] +unsafe impl DeviceCopy for half::f16 {} +#[cfg(feature = "half")] +unsafe impl DeviceCopy for half::bf16 {} + +#[cfg(feature = "num-complex")] +impl_device_copy_generic! 
{ + num_complex::Complex +} diff --git a/crates/optix/Cargo.toml b/crates/optix/Cargo.toml index fa6dc07d..7c803c9e 100644 --- a/crates/optix/Cargo.toml +++ b/crates/optix/Cargo.toml @@ -5,7 +5,31 @@ edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/Rust-GPU/Rust-CUDA" readme = "../../README.md" +authors = ["Anders Langlands ", "Riccardo D'Ambrosio "] + +[features] +optix71 = [] +optix72 = [] +optix73 = [] +default=["optix73", "impl_glam"] +impl_glam=["cust/impl_glam", "glam"] +impl_half=["cust/impl_half", "half"] [dependencies] -optix_sys = { version = "0.1", path = "../optix_sys" } -cust = { version = "0.2", path = "../cust" } +cust = { version = "0.2", path = "../cust", features=["impl_mint"] } +cust_raw = { version = "0.11.2", path = "../cust_raw" } +cfg-if = "1.0.0" +bitflags = "1.3.2" +glam = { version = "0.20", features=["cuda", "libm"], default-features=false, optional=true } +half = { version = "^1.8", optional = true } +memoffset = "0.6.4" +mint = "0.5.8" +embed-doc-image = {version = "0.1.4"} + +[build-dependencies] +bindgen = "0.59" +cc = "1.0.71" +find_cuda_helper = { version = "0.2", path = "../find_cuda_helper" } + +[package.metadata.docs.rs] +rustdoc-args = [ "--html-in-header", "katex-header.html" ] diff --git a/crates/optix/build.rs b/crates/optix/build.rs new file mode 100644 index 00000000..c48dcdd2 --- /dev/null +++ b/crates/optix/build.rs @@ -0,0 +1,101 @@ +use find_cuda_helper::{find_cuda_root, find_optix_root}; +use std::env; +use std::path::{Path, PathBuf}; + +// OptiX is a bit exotic in how it provides its functions. It uses a function-table +// approach: a function-table struct holds function pointers to every OptiX function. The +// OptiX driver DLL is then loaded at runtime and the function table is filled in from it. +// OptiX provides this logic inside optix_stubs.h in the include dir, so we need to compile that +// to a lib and link it in so that we have the initialization and C function logic.
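To make the stub mechanism concrete: once `optix_stubs` is linked in, the Rust side only has to call `optixInit` once so the function table gets populated. A hedged sketch of that hand-off (the `extern` declaration here is simplified; the generated binding really returns the `OptixResult` newtype):

```rust
// optix_stubs.c (compiled and linked by the build script above) defines
// optixInit(), which loads the driver's OptiX library at runtime and fills
// in the function table that all other optix* entry points dispatch through.
extern "C" {
    fn optixInit() -> i32; // simplified: really OptixResult, where 0 == success
}

fn init_optix() -> Result<(), i32> {
    // Initialize the CUDA driver first; the OptiX runtime lives in the
    // display driver, so CUDA is conventionally brought up before optixInit.
    cust::init(cust::CudaFlags::empty()).map_err(|_| -1)?;
    match unsafe { optixInit() } {
        0 => Ok(()),
        code => Err(code),
    }
}
```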
+fn main() { + let out_dir = env::var("OUT_DIR").unwrap(); + let mut optix_include = find_optix_root().expect( + "Unable to find the OptiX SDK, make sure you installed it and + that OPTIX_ROOT or OPTIX_ROOT_DIR are set", + ); + optix_include = optix_include.join("include"); + + let mut cuda_include = find_cuda_root().expect( + "Unable to find the CUDA Toolkit, make sure you installed it and + that CUDA_ROOT, CUDA_PATH or CUDA_TOOLKIT_ROOT_DIR are set", + ); + cuda_include = cuda_include.join("include"); + + bindgen_optix(&optix_include, &cuda_include); + + cc::Build::new() + .file("./optix_stubs.c") + .include(optix_include) + .include(cuda_include) + .cpp(false) + .compile("optix_stubs"); + + println!("cargo:rustc-link-search=native={}", out_dir); + println!("cargo:rustc-link-lib=static=optix_stubs"); +} + +fn bindgen_optix(optix_include: &Path, cuda_include: &Path) { + let out_path = PathBuf::from(std::env::var("OUT_DIR").unwrap()).join("optix_wrapper.rs"); + + let header_path = PathBuf::from(std::env::var("CARGO_MANIFEST_DIR").unwrap()) + .join("src") + .join("optix_wrapper.h"); + + let this_path = + std::path::PathBuf::from(std::env::var("CARGO_MANIFEST_DIR").unwrap()).join("build.rs"); + + println!("cargo:rerun-if-changed={}", header_path.display()); + println!("cargo:rerun-if-changed={}", this_path.display()); + + let bindings = bindgen::Builder::default() + .header("src/optix_wrapper.h") + .clang_arg(format!("-I{}", optix_include.display())) + .clang_arg(format!("-I{}", cuda_include.display())) + .allowlist_recursively(false) + .allowlist_type("Optix.*") + .allowlist_type("RaygenRecord") + .allowlist_type("MissRecord") + .allowlist_type("HitgroupRecord") + .blocklist_type("OptixBuildInput") + .allowlist_function("optix.*") + .allowlist_var("OptixSbtRecordHeaderSize") + .allowlist_var("OptixSbtRecordAlignment") + .allowlist_var("OptixAccelBufferByteAlignment") + .allowlist_var("OptixInstanceByteAlignment") + .allowlist_var("OptixAabbBufferByteAlignment") + .allowlist_var("OptixGeometryTransformByteAlignment") + .allowlist_var("OptixTransformByteAlignment") + .allowlist_var("OptixVersion") + .allowlist_var("OptixBuildInputSize") + .allowlist_var("OptixShaderBindingTableSize") + .layout_tests(false) + .generate_comments(false) + .newtype_enum("OptixResult") + .constified_enum_module("OptixCompileOptimizationLevel") + .constified_enum_module("OptixCompileDebugLevel") + .constified_enum_module("OptixTraversableGraphFlags") + .constified_enum_module("OptixExceptionFlags") + .constified_enum_module("OptixProgramGroupKind") + .constified_enum_module("OptixDeviceProperty") + .constified_enum_module("OptixPixelFormat") + .constified_enum_module("OptixDenoiserModelKind") + .rustified_enum("GeometryFlags") + .rustified_enum("OptixGeometryFlags") + .constified_enum("OptixVertexFormat") + .constified_enum("OptixIndicesFormat") + .rust_target(bindgen::RustTarget::Nightly) + .derive_default(true) + .derive_partialeq(true) + .rustfmt_bindings(true) + .generate() + .expect("Unable to generate optix bindings"); + + let dbg_path = std::path::PathBuf::from(std::env::var("CARGO_MANIFEST_DIR").unwrap()); + bindings + .write_to_file(dbg_path.join("optix_wrapper.rs")) + .expect("Couldn't write bindings!"); + + bindings + .write_to_file(out_path) + .expect("Couldn't write bindings!"); +} diff --git a/crates/optix/examples/common/gdt/CMakeLists.txt b/crates/optix/examples/common/gdt/CMakeLists.txt new file mode 100644 index 00000000..bc14ffc0 --- /dev/null +++ b/crates/optix/examples/common/gdt/CMakeLists.txt 
@@ -0,0 +1,35 @@ +# ======================================================================== # +# Copyright 2018-2019 Ingo Wald # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +# ======================================================================== # + +project(GPU_Development_Tools) +cmake_minimum_required(VERSION 3.5) + +set(CMAKE_CXX_STANDARD 11) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +add_library(gdt + cmake/configure_build_type.cmake + cmake/configure_optix.cmake + cmake/FindOptiX.cmake + + gdt/gdt.h + gdt/math/LinearSpace.h + gdt/math/AffineSpace.h + + gdt/gdt.cpp + ) + diff --git a/crates/optix/examples/common/gdt/cmake/FindOptiX.cmake b/crates/optix/examples/common/gdt/cmake/FindOptiX.cmake new file mode 100644 index 00000000..17578042 --- /dev/null +++ b/crates/optix/examples/common/gdt/cmake/FindOptiX.cmake @@ -0,0 +1,189 @@ +# +# Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# Locate the OptiX distribution. Search relative to the SDK first, then look in the system. + +# Our initial guess will be within the SDK. 
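This vendored CMake module performs the same SDK hunt that the crate's own build script delegates to `find_cuda_helper::find_optix_root`. For orientation, a hypothetical, simplified Rust analogue of the search order (environment variables first, then a default Windows install path; the real helper's behavior may differ):

```rust
use std::env;
use std::path::PathBuf;

// Hypothetical simplification of find_cuda_helper::find_optix_root:
// honor OPTIX_ROOT / OPTIX_ROOT_DIR, then probe a default SDK location
// much like this CMake module does on Windows. The version in the path
// is illustrative only.
fn find_optix_root() -> Option<PathBuf> {
    env::var_os("OPTIX_ROOT")
        .or_else(|| env::var_os("OPTIX_ROOT_DIR"))
        .map(PathBuf::from)
        .or_else(|| {
            let default =
                PathBuf::from("C:/ProgramData/NVIDIA Corporation/OptiX SDK 7.3.0");
            default.join("include/optix.h").exists().then(|| default)
        })
}
```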
+ +if (WIN32) +# set(OptiX_INSTALL_DIR "C:/ProgramData/NVIDIA Corporation/OptiX SDK 5.1.0" CACHE PATH "Path to OptiX installed location.") + find_path(searched_OptiX_INSTALL_DIR + NAME include/optix.h + PATHS + "C:/ProgramData/NVIDIA Corporation/OptiX SDK 7.0.0" + "C:/ProgramData/NVIDIA Corporation/OptiX SDK 6.5.0" + "C:/ProgramData/NVIDIA Corporation/OptiX SDK 6.0.0" + "C:/ProgramData/NVIDIA Corporation/OptiX SDK 5.1.1" + "C:/ProgramData/NVIDIA Corporation/OptiX SDK 5.1.0" + "C:/ProgramData/NVIDIA Corporation/OptiX SDK 5.0.1" + "C:/ProgramData/NVIDIA Corporation/OptiX SDK 5.0.0" + "C:/ProgramData/NVIDIA Corporation/OptiX SDK *" + ) + mark_as_advanced(searched_OptiX_INSTALL_DIR) + set(OptiX_INSTALL_DIR ${searched_OptiX_INSTALL_DIR} CACHE PATH "Path to OptiX installed location.") +else() + set(OptiX_INSTALL_DIR $ENV{OptiX_INSTALL_DIR} CACHE PATH "Path to OptiX installed location.") +endif() +# The distribution contains both 32 and 64 bit libraries. Adjust the library +# search path based on the bit-ness of the build. (i.e. 64: bin64, lib64; 32: +# bin, lib). Note that on Mac, the OptiX library is a universal binary, so we +# only need to look in lib and not lib64 for 64 bit builds. +if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT APPLE) + set(bit_dest "64") +else() + set(bit_dest "") +endif() + +macro(OPTIX_find_api_library name version) + find_library(${name}_LIBRARY + NAMES ${name}.${version} ${name} + PATHS "${OptiX_INSTALL_DIR}/lib${bit_dest}" + NO_DEFAULT_PATH + ) + find_library(${name}_LIBRARY + NAMES ${name}.${version} ${name} + ) + if(WIN32) + find_file(${name}_DLL + NAMES ${name}.${version}.dll + PATHS "${OptiX_INSTALL_DIR}/bin${bit_dest}" + NO_DEFAULT_PATH + ) + find_file(${name}_DLL + NAMES ${name}.${version}.dll + ) + endif() +endmacro() + +#OPTIX_find_api_library(optix 7.0.0) +#OPTIX_find_api_library(optixu 7.0.0) +#OPTIX_find_api_library(optix_prime 7.0.0) + +# Include +find_path(OptiX_INCLUDE + NAMES optix.h + PATHS "${OptiX_INSTALL_DIR}/include" + NO_DEFAULT_PATH + ) +find_path(OptiX_INCLUDE + NAMES optix.h + ) + +# Check to make sure we found what we were looking for +function(OptiX_report_error error_message required) + if(OptiX_FIND_REQUIRED AND required) + message(FATAL_ERROR "${error_message}") + else() + if(NOT OptiX_FIND_QUIETLY) + message(STATUS "${error_message}") + endif(NOT OptiX_FIND_QUIETLY) + endif() +endfunction() + +#if(NOT optix_LIBRARY) +# OptiX_report_error("optix library not found. Please locate before proceeding." TRUE) +#endif() +if(NOT OptiX_INCLUDE) + OptiX_report_error("OptiX headers (optix.h and friends) not found. Please locate before proceeding." TRUE) +endif() +#if(NOT optix_prime_LIBRARY) +# OptiX_report_error("optix Prime library not found. Please locate before proceeding." FALSE) +#endif() + +# Macro for setting up dummy targets +function(OptiX_add_imported_library name lib_location dll_lib dependent_libs) + set(CMAKE_IMPORT_FILE_VERSION 1) + + # Create imported target + add_library(${name} SHARED IMPORTED) + + # Import target "optix" for configuration "Debug" + if(WIN32) + set_target_properties(${name} PROPERTIES + IMPORTED_IMPLIB "${lib_location}" + #IMPORTED_LINK_INTERFACE_LIBRARIES "glu32;opengl32" + IMPORTED_LOCATION "${dll_lib}" + IMPORTED_LINK_INTERFACE_LIBRARIES "${dependent_libs}" + ) + elseif(UNIX) + set_target_properties(${name} PROPERTIES + #IMPORTED_LINK_INTERFACE_LIBRARIES "glu32;opengl32" + IMPORTED_LOCATION "${lib_location}" + # We don't have versioned filenames for now, and it may not even matter. 
+ #IMPORTED_SONAME "${optix_soname}" + IMPORTED_LINK_INTERFACE_LIBRARIES "${dependent_libs}" + ) + else() + # Unknown system, but at least try and provide the minimum required + # information. + set_target_properties(${name} PROPERTIES + IMPORTED_LOCATION "${lib_location}" + IMPORTED_LINK_INTERFACE_LIBRARIES "${dependent_libs}" + ) + endif() + + # Commands beyond this point should not need to know the version. + set(CMAKE_IMPORT_FILE_VERSION) +endfunction() + +# Sets up a dummy target +#OptiX_add_imported_library(optix "${optix_LIBRARY}" "${optix_DLL}" "${OPENGL_LIBRARIES}") +#OptiX_add_imported_library(optixu "${optixu_LIBRARY}" "${optixu_DLL}" "") +#OptiX_add_imported_library(optix_prime "${optix_prime_LIBRARY}" "${optix_prime_DLL}" "") + +macro(OptiX_check_same_path libA libB) + if(_optix_path_to_${libA}) + if(NOT _optix_path_to_${libA} STREQUAL _optix_path_to_${libB}) + # ${libA} and ${libB} are in different paths. Make sure there isn't a ${libA} next + # to the ${libB}. + get_filename_component(_optix_name_of_${libA} "${${libA}_LIBRARY}" NAME) + if(EXISTS "${_optix_path_to_${libB}}/${_optix_name_of_${libA}}") + message(WARNING " ${libA} library found next to ${libB} library that is not being used. Due to the way we are using rpath, the copy of ${libA} next to ${libB} will be used during loading instead of the one you intended. Consider putting the libraries in the same directory or moving ${_optix_path_to_${libB}}/${_optix_name_of_${libA} out of the way.") + endif() + endif() + set( _${libA}_rpath "-Wl,-rpath,${_optix_path_to_${libA}}" ) + endif() +endmacro() + +# Since liboptix.1.dylib is built with an install name of @rpath, we need to +# compile our samples with the rpath set to where optix exists. +if(APPLE) + get_filename_component(_optix_path_to_optix "${optix_LIBRARY}" PATH) + if(_optix_path_to_optix) + set( _optix_rpath "-Wl,-rpath,${_optix_path_to_optix}" ) + endif() + get_filename_component(_optix_path_to_optixu "${optixu_LIBRARY}" PATH) + OptiX_check_same_path(optixu optix) + get_filename_component(_optix_path_to_optix_prime "${optix_prime_LIBRARY}" PATH) + OptiX_check_same_path(optix_prime optix) + OptiX_check_same_path(optix_prime optixu) + + set( optix_rpath ${_optix_rpath} ${_optixu_rpath} ${_optix_prime_rpath} ) + list(REMOVE_DUPLICATES optix_rpath) +endif() + diff --git a/crates/optix/examples/common/gdt/cmake/FindTBB.cmake b/crates/optix/examples/common/gdt/cmake/FindTBB.cmake new file mode 100644 index 00000000..a7c4f465 --- /dev/null +++ b/crates/optix/examples/common/gdt/cmake/FindTBB.cmake @@ -0,0 +1,154 @@ +## ======================================================================== ## +## Copyright 2009-2019 Intel Corporation ## +## ## +## Licensed under the Apache License, Version 2.0 (the "License"); ## +## you may not use this file except in compliance with the License. ## +## You may obtain a copy of the License at ## +## ## +## http://www.apache.org/licenses/LICENSE-2.0 ## +## ## +## Unless required by applicable law or agreed to in writing, software ## +## distributed under the License is distributed on an "AS IS" BASIS, ## +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ## +## See the License for the specific language governing permissions and ## +## limitations under the License. 
## +## ======================================================================== ## + +SET(TBB_VERSION_REQUIRED "3.0") + +IF (NOT TBB_ROOT_PATH) + SET(TBB_ROOT_PATH $ENV{TBB_ROOT_PATH}) +ENDIF() +IF (NOT TBB_ROOT_PATH) + SET(TBB_ROOT_PATH $ENV{TBBROOT}) +ENDIF() + +# detect changed TBB_ROOT_PATH +IF (NOT TBB_ROOT_PATH STREQUAL TBB_ROOT_PATH_LAST) + UNSET(TBB_INCLUDE_DIR CACHE) + UNSET(TBB_LIBRARY CACHE) + UNSET(TBB_LIBRARY_DEBUG CACHE) + UNSET(TBB_LIBRARY_MALLOC CACHE) + UNSET(TBB_LIBRARY_MALLOC_DEBUG CACHE) +ENDIF() + +IF (WIN32) + # workaround for parentheses in variable name / CMP0053 + SET(PROGRAMFILESx86 "PROGRAMFILES(x86)") + SET(PROGRAMFILES32 "$ENV{${PROGRAMFILESx86}}") + IF (NOT PROGRAMFILES32) + SET(PROGRAMFILES32 "$ENV{PROGRAMFILES}") + ENDIF() + IF (NOT PROGRAMFILES32) + SET(PROGRAMFILES32 "C:/Program Files (x86)") + ENDIF() + FIND_PATH(TBB_ROOT_PATH include/tbb/task_scheduler_init.h + DOC "Root of TBB installation" + HINTS ${TBB_ROOT_PATH} + PATHS + ${PROJECT_SOURCE_DIR}/tbb + ${PROJECT_SOURCE_DIR}/../tbb + "${PROGRAMFILES32}/IntelSWTools/compilers_and_libraries/windows/tbb" + "${PROGRAMFILES32}/Intel/Composer XE/tbb" + "${PROGRAMFILES32}/Intel/compilers_and_libraries/windows/tbb" + ) + + IF (CMAKE_SIZEOF_VOID_P EQUAL 8) + SET(TBB_ARCH intel64) + ELSE() + SET(TBB_ARCH ia32) + ENDIF() + + SET(TBB_LIBDIR ${TBB_ROOT_PATH}/lib) + + FIND_PATH(TBB_INCLUDE_DIR tbb/task_scheduler_init.h PATHS ${TBB_ROOT_PATH}/include NO_DEFAULT_PATH) + SET(TBB_LIB_HINTS + PATHS + ${TBB_LIBDIR}/${TBB_ARCH}/vc14 + ${TBB_LIBDIR} + NO_DEFAULT_PATH + ) + FIND_LIBRARY(TBB_LIBRARY tbb ${TBB_LIB_HINTS}) + FIND_LIBRARY(TBB_LIBRARY_DEBUG tbb_debug ${TBB_LIB_HINTS}) + FIND_LIBRARY(TBB_LIBRARY_MALLOC tbbmalloc ${TBB_LIB_HINTS}) + FIND_LIBRARY(TBB_LIBRARY_MALLOC_DEBUG tbbmalloc_debug ${TBB_LIB_HINTS}) + +ELSE () + + FIND_PATH(TBB_ROOT_PATH include/tbb/task_scheduler_init.h + DOC "Root of TBB installation" + HINTS ${TBB_ROOT_PATH} + PATHS + ${PROJECT_SOURCE_DIR}/tbb + /opt/intel/composerxe/tbb + /opt/intel/compilers_and_libraries/tbb + /opt/intel/tbb + ) + + IF (APPLE) + FIND_PATH(TBB_INCLUDE_DIR tbb/task_scheduler_init.h PATHS ${TBB_ROOT_PATH}/include NO_DEFAULT_PATH) + FIND_LIBRARY(TBB_LIBRARY tbb PATHS ${TBB_ROOT_PATH}/lib NO_DEFAULT_PATH) + FIND_LIBRARY(TBB_LIBRARY_DEBUG tbb_debug PATHS ${TBB_ROOT_PATH}/lib NO_DEFAULT_PATH) + FIND_LIBRARY(TBB_LIBRARY_MALLOC tbbmalloc PATHS ${TBB_ROOT_PATH}/lib NO_DEFAULT_PATH) + FIND_LIBRARY(TBB_LIBRARY_MALLOC_DEBUG tbbmalloc_debug PATHS ${TBB_ROOT_PATH}/lib NO_DEFAULT_PATH) + ELSE() + FIND_PATH(TBB_INCLUDE_DIR tbb/task_scheduler_init.h PATHS ${TBB_ROOT_PATH}/include NO_DEFAULT_PATH) + SET(TBB_HINTS HINTS ${TBB_ROOT_PATH}/lib/intel64/gcc4.7 ${TBB_ROOT_PATH}/lib/intel64/gcc4.4 ${TBB_ROOT_PATH}/lib ${TBB_ROOT_PATH}/lib64 PATHS /usr/libx86_64-linux-gnu/) + FIND_LIBRARY(TBB_LIBRARY libtbb.so.2 ${TBB_HINTS}) + FIND_LIBRARY(TBB_LIBRARY_DEBUG libtbb_debug.so.2 ${TBB_HINTS}) + FIND_LIBRARY(TBB_LIBRARY_MALLOC libtbbmalloc.so.2 ${TBB_HINTS}) + FIND_LIBRARY(TBB_LIBRARY_MALLOC_DEBUG libtbbmalloc_debug.so.2 ${TBB_HINTS}) + ENDIF() +ENDIF() + +SET(TBB_ROOT_PATH_LAST ${TBB_ROOT_PATH} CACHE INTERNAL "Last value of TBB_ROOT_PATH to detect changes") + +SET(TBB_ERROR_MESSAGE + "Threading Building Blocks (TBB) with minimum version ${TBB_VERSION_REQUIRED} not found. +OSPRay uses TBB as default tasking system. Please make sure you have the TBB headers installed as well (the package is typically named 'libtbb-dev' or 'tbb-devel') and/or hint the location of TBB in TBB_ROOT_PATH. 
+Alternatively, you can try to use OpenMP as tasking system by setting OSPRAY_TASKING_SYSTEM=OpenMP") + +INCLUDE(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(TBB + ${TBB_ERROR_MESSAGE} + TBB_INCLUDE_DIR TBB_LIBRARY TBB_LIBRARY_MALLOC +) + +# check version +IF (TBB_INCLUDE_DIR) + FILE(READ ${TBB_INCLUDE_DIR}/tbb/tbb_stddef.h TBB_STDDEF_H) + + STRING(REGEX MATCH "#define TBB_VERSION_MAJOR ([0-9]+)" DUMMY "${TBB_STDDEF_H}") + SET(TBB_VERSION_MAJOR ${CMAKE_MATCH_1}) + + STRING(REGEX MATCH "#define TBB_VERSION_MINOR ([0-9]+)" DUMMY "${TBB_STDDEF_H}") + SET(TBB_VERSION "${TBB_VERSION_MAJOR}.${CMAKE_MATCH_1}") + + IF (TBB_VERSION VERSION_LESS TBB_VERSION_REQUIRED) + MESSAGE(FATAL_ERROR ${TBB_ERROR_MESSAGE}) + ENDIF() + + SET(TBB_VERSION ${TBB_VERSION} CACHE STRING "TBB Version") + MARK_AS_ADVANCED(TBB_VERSION) +ENDIF() + +IF (TBB_FOUND) + SET(TBB_INCLUDE_DIRS ${TBB_INCLUDE_DIR}) + # NOTE(jda) - TBB found in CentOS 6/7 package manager does not have debug + # versions of the library...silently fall-back to using only the + # libraries which we actually found. + IF (NOT TBB_LIBRARY_DEBUG) + SET(TBB_LIBRARIES ${TBB_LIBRARY} ${TBB_LIBRARY_MALLOC}) + ELSE () + SET(TBB_LIBRARIES + optimized ${TBB_LIBRARY} optimized ${TBB_LIBRARY_MALLOC} + debug ${TBB_LIBRARY_DEBUG} debug ${TBB_LIBRARY_MALLOC_DEBUG} + ) + ENDIF() +ENDIF() + +MARK_AS_ADVANCED(TBB_INCLUDE_DIR) +MARK_AS_ADVANCED(TBB_LIBRARY) +MARK_AS_ADVANCED(TBB_LIBRARY_DEBUG) +MARK_AS_ADVANCED(TBB_LIBRARY_MALLOC) +MARK_AS_ADVANCED(TBB_LIBRARY_MALLOC_DEBUG) diff --git a/crates/optix/examples/common/gdt/cmake/configure_build_type.cmake b/crates/optix/examples/common/gdt/cmake/configure_build_type.cmake new file mode 100644 index 00000000..e34b964d --- /dev/null +++ b/crates/optix/examples/common/gdt/cmake/configure_build_type.cmake @@ -0,0 +1,41 @@ +# ======================================================================== # +# Copyright 2018-2020 Ingo Wald # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +# ======================================================================== # + +# This helper script sets up default build targets for Release/Debug, etc, +# something which each project I worked on seems to need, eventually, so +# having it in one place arguably makes sense. + +if(NOT SET_UP_CONFIGURATIONS_DONE) + set(SET_UP_CONFIGURATIONS_DONE 1) + + # No reason to set CMAKE_CONFIGURATION_TYPES if it's not a multiconfig generator + # Also no reason mess with CMAKE_BUILD_TYPE if it's a multiconfig generator. + if(CMAKE_CONFIGURATION_TYPES) # multiconfig generator? 
+ set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE) + else() + if(NOT CMAKE_BUILD_TYPE) +# message("Defaulting to release build.") + set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE) + endif() + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY HELPSTRING "Choose the type of build") + # set the valid options for cmake-gui drop-down list + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug;Release") + endif() +endif() + +SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) +SET(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) +SET(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) \ No newline at end of file diff --git a/crates/optix/examples/common/gdt/cmake/configure_glut.cmake b/crates/optix/examples/common/gdt/cmake/configure_glut.cmake new file mode 100644 index 00000000..8e3a64b4 --- /dev/null +++ b/crates/optix/examples/common/gdt/cmake/configure_glut.cmake @@ -0,0 +1,20 @@ +# helper script that finds GLUT, either from the system install (linux), or from the included, precompiled binaries (windows) +# Note we *intentionally* do not use the file name of "FindGLUT.cmake" because we want to call the system-provided FindGLUT later on; we just set up some paths where required + +# legacy gl vs glvnd/glx +if (POLICY CMP0072) + cmake_policy(SET CMP0072 NEW) +endif() + +if (WIN32) + # The default cmake-FindGLUT.cmake script will automatically search in + # - ${GLUT_ROOT_PATH}/Release (for the lib) + # - ${GLUT_ROOT_PATH}/include + # ... ie, setting this search path _should_ make the default script find the + # right stuff, and set the right variables + set(GLUT_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/../3rdParty/freeglut") +endif() + + +find_package(OpenGL REQUIRED) +find_package(GLUT REQUIRED) diff --git a/crates/optix/examples/common/gdt/cmake/configure_optix.cmake b/crates/optix/examples/common/gdt/cmake/configure_optix.cmake new file mode 100644 index 00000000..4023f30b --- /dev/null +++ b/crates/optix/examples/common/gdt/cmake/configure_optix.cmake @@ -0,0 +1,68 @@ +# ======================================================================== # +# Copyright 2018 Ingo Wald # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License.
# +# ======================================================================== # + +set(CMAKE_MODULE_PATH + "${PROJECT_SOURCE_DIR}/cmake" +# "${CMAKE_CURRENT_SOURCE_DIR}/../cmake" + ${CMAKE_MODULE_PATH} + ) + +find_package(CUDA REQUIRED) +find_package(OptiX REQUIRED VERSION 7.0) + +#include_directories(${CUDA_TOOLKIT_INCLUDE}) +if (CUDA_TOOLKIT_ROOT_DIR) + include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include) +endif() +include_directories(${OptiX_INCLUDE}) + +if (WIN32) + add_definitions(-DNOMINMAX) +endif() + +find_program(BIN2C bin2c + DOC "Path to the cuda-sdk bin2c executable.") + +# this macro defines cmake rules that execute the following four steps: +# 1) compile the given cuda file ${cuda_file} to an intermediary PTX file +# 2) use the 'bin2c' tool (that comes with CUDA) to +# create a second intermediary (.c-)file which defines a const string variable +# (named '${c_var_name}') whose (constant) value is the PTX output +# from the previous step. +# 3) compile the given .c file to an intermediary object file (which thus has +# that PTX string 'embedded' as a global constant). +# 4) assign the name of the intermediary .o file to the cmake variable +# 'output_var', which can then be added to cmake targets. +macro(cuda_compile_and_embed output_var cuda_file) + set(c_var_name ${output_var}) + cuda_compile_ptx(ptx_files ${cuda_file}) + list(GET ptx_files 0 ptx_file) + set(embedded_file ${ptx_file}_embedded.c) +# message("adding rule to compile and embed ${cuda_file} to \"const char ${var_name}[];\"") + add_custom_command( + OUTPUT ${embedded_file} + COMMAND ${BIN2C} -c --padd 0 --type char --name ${c_var_name} ${ptx_file} > ${embedded_file} + DEPENDS ${ptx_file} + COMMENT "compiling (and embedding ptx from) ${cuda_file}" + ) + set(${output_var} ${embedded_file}) +endmacro() + +include_directories(${OptiX_INCLUDE}) + +add_definitions(-D__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__=1) + + diff --git a/crates/optix/examples/common/gdt/cmake/configure_tbb.cmake b/crates/optix/examples/common/gdt/cmake/configure_tbb.cmake new file mode 100644 index 00000000..1aa45852 --- /dev/null +++ b/crates/optix/examples/common/gdt/cmake/configure_tbb.cmake @@ -0,0 +1,21 @@ +# ======================================================================== # +# Copyright 2018-2019 Ingo Wald # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +# ======================================================================== # + +find_package(TBB REQUIRED) +if (TBB_FOUND) + include_directories(${TBB_INCLUDE_DIR}) +endif() + diff --git a/crates/optix/examples/common/gdt/gdt/gdt.cpp b/crates/optix/examples/common/gdt/gdt/gdt.cpp new file mode 100644 index 00000000..0dcbffb4 --- /dev/null +++ b/crates/optix/examples/common/gdt/gdt/gdt.cpp @@ -0,0 +1,20 @@ +// ======================================================================== // +// Copyright 2018-2019 Ingo Wald // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with the License.
// +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. // +// ======================================================================== // + +#include "gdt.h" +#include "math/LinearSpace.h" +#include "math/AffineSpace.h" + diff --git a/crates/optix/examples/common/gdt/gdt/gdt.h b/crates/optix/examples/common/gdt/gdt/gdt.h new file mode 100644 index 00000000..b4eaa48d --- /dev/null +++ b/crates/optix/examples/common/gdt/gdt/gdt.h @@ -0,0 +1,233 @@ +// ======================================================================== // +// Copyright 2018 Ingo Wald // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with the License. // +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. // +// ======================================================================== // + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#ifdef __CUDA_ARCH__ +# include +#else +# include +#endif +#include +#ifdef __GNUC__ +# include +# include +#endif +#include + +#ifdef _WIN32 + #ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN + #endif + #include + #ifdef min + #undef min + #endif + #ifdef max + #undef max + #endif +#endif + + +#if defined(_MSC_VER) +# define GDT_DLL_EXPORT __declspec(dllexport) +# define GDT_DLL_IMPORT __declspec(dllimport) +#elif defined(__clang__) || defined(__GNUC__) +# define GDT_DLL_EXPORT __attribute__((visibility("default"))) +# define GDT_DLL_IMPORT __attribute__((visibility("default"))) +#else +# define GDT_DLL_EXPORT +# define GDT_DLL_IMPORT +#endif + +#if 1 +# define GDT_INTERFACE /* nothing */ +#else +//#if defined(GDT_DLL_INTERFACE) +# ifdef gdt_EXPORTS +# define GDT_INTERFACE GDT_DLL_EXPORT +# else +# define GDT_INTERFACE GDT_DLL_IMPORT +# endif +//#else +//# define GDT_INTERFACE /*static lib*/ +//#endif +#endif + +#ifndef PRINT +# define PRINT(var) std::cout << #var << "=" << var << std::endl; +# define PING std::cout << __FILE__ << "::" << __LINE__ << ": " << __FUNCTION__ << std::endl; +#endif + +#if defined(__CUDACC__) +# define __gdt_device __device__ +# define __gdt_host __host__ +#else +# define __gdt_device /* ignore */ +# define __gdt_host /* ignore */ +#endif + +# define __both__ __gdt_host __gdt_device + + +#ifdef __GNUC__ + #define MAYBE_UNUSED __attribute__((unused)) +#else + #define MAYBE_UNUSED +#endif + + + +#define GDT_NOTIMPLEMENTED throw std::runtime_error(std::string(__PRETTY_FUNCTION__)+" not implemented") + +#define GDT_TERMINAL_RED "\033[1;31m" +#define GDT_TERMINAL_GREEN "\033[1;32m" +#define GDT_TERMINAL_YELLOW "\033[1;33m" +#define GDT_TERMINAL_BLUE "\033[1;34m" +#define GDT_TERMINAL_RESET "\033[0m" +#define GDT_TERMINAL_DEFAULT GDT_TERMINAL_RESET +#define 
GDT_TERMINAL_BOLD "\033[1;1m" + + + + + +/*! \namespace gdt GPU Developer Toolbox */ +namespace gdt { + +#ifdef __CUDACC__ + using ::min; + using ::max; + // inline __both__ float abs(float f) { return fabsf(f); } + // inline __both__ double abs(double f) { return fabs(f); } + using std::abs; + // inline __both__ float sin(float f) { return ::sinf(f); } + // inline __both__ double sin(double f) { return ::sin(f); } + // inline __both__ float cos(float f) { return ::cosf(f); } + // inline __both__ double cos(double f) { return ::cos(f); } + + using ::saturate; +#else + using std::min; + using std::max; + using std::abs; + // inline __both__ double sin(double f) { return ::sin(f); } + inline __both__ float saturate(const float &f) { return min(1.f,max(0.f,f)); } +#endif + + // inline __both__ float abs(float f) { return fabsf(f); } + // inline __both__ double abs(double f) { return fabs(f); } + inline __both__ float rcp(float f) { return 1.f/f; } + inline __both__ double rcp(double d) { return 1./d; } + + inline __both__ int32_t divRoundUp(int32_t a, int32_t b) { return (a+b-1)/b; } + inline __both__ uint32_t divRoundUp(uint32_t a, uint32_t b) { return (a+b-1)/b; } + inline __both__ int64_t divRoundUp(int64_t a, int64_t b) { return (a+b-1)/b; } + inline __both__ uint64_t divRoundUp(uint64_t a, uint64_t b) { return (a+b-1)/b; } + +#ifdef __CUDACC__ + using ::sin; // this is the double version + // inline __both__ float sin(float f) { return ::sinf(f); } + using ::cos; // this is the double version + // inline __both__ float cos(float f) { return ::cosf(f); } +#else + using ::sin; // this is the double version + using ::cos; // this is the double version +#endif + +// #ifdef __CUDA_ARCH__ + // using ::sqrt; + // using ::sqrtf; +// #else + namespace overloaded { + /* move all those in a special namespace so they will never get + included - and thus, conflict with, the default namesapce */ + inline __both__ float sqrt(const float f) { return ::sqrtf(f); } + inline __both__ double sqrt(const double d) { return ::sqrt(d); } + } +// #endif +// inline __both__ float rsqrt(const float f) { return 1.f/sqrtf(f); } +// inline __both__ double rsqrt(const double d) { return 1./sqrt(d); } + +#ifdef __WIN32__ +# define osp_snprintf sprintf_s +#else +# define osp_snprintf snprintf +#endif + + /*! 
added pretty-print function for large numbers, printing 10000000 as "10M" instead */ + inline std::string prettyDouble(const double val) { + const double absVal = abs(val); + char result[1000]; + + if (absVal >= 1e+18f) osp_snprintf(result,1000,"%.1f%c",val/1e18f,'E'); + else if (absVal >= 1e+15f) osp_snprintf(result,1000,"%.1f%c",val/1e15f,'P'); + else if (absVal >= 1e+12f) osp_snprintf(result,1000,"%.1f%c",val/1e12f,'T'); + else if (absVal >= 1e+09f) osp_snprintf(result,1000,"%.1f%c",val/1e09f,'G'); + else if (absVal >= 1e+06f) osp_snprintf(result,1000,"%.1f%c",val/1e06f,'M'); + else if (absVal >= 1e+03f) osp_snprintf(result,1000,"%.1f%c",val/1e03f,'k'); + else if (absVal <= 1e-12f) osp_snprintf(result,1000,"%.1f%c",val*1e15f,'f'); + else if (absVal <= 1e-09f) osp_snprintf(result,1000,"%.1f%c",val*1e12f,'p'); + else if (absVal <= 1e-06f) osp_snprintf(result,1000,"%.1f%c",val*1e09f,'n'); + else if (absVal <= 1e-03f) osp_snprintf(result,1000,"%.1f%c",val*1e06f,'u'); + else if (absVal <= 1e-00f) osp_snprintf(result,1000,"%.1f%c",val*1e03f,'m'); + else osp_snprintf(result,1000,"%f",(float)val); + + return result; + } + + + + inline std::string prettyNumber(const size_t s) + { + char buf[1000]; + if (s >= (1024LL*1024LL*1024LL*1024LL)) { + osp_snprintf(buf, 1000,"%.2fT",s/(1024.f*1024.f*1024.f*1024.f)); + } else if (s >= (1024LL*1024LL*1024LL)) { + osp_snprintf(buf, 1000, "%.2fG",s/(1024.f*1024.f*1024.f)); + } else if (s >= (1024LL*1024LL)) { + osp_snprintf(buf, 1000, "%.2fM",s/(1024.f*1024.f)); + } else if (s >= (1024LL)) { + osp_snprintf(buf, 1000, "%.2fK",s/(1024.f)); + } else { + osp_snprintf(buf,1000,"%zi",s); + } + return buf; + } + + inline double getCurrentTime() + { +#ifdef _WIN32 + SYSTEMTIME tp; GetSystemTime(&tp); + return double(tp.wSecond) + double(tp.wMilliseconds) / 1E3; +#else + struct timeval tp; gettimeofday(&tp,nullptr); + return double(tp.tv_sec) + double(tp.tv_usec)/1E6; +#endif + } + + inline bool hasSuffix(const std::string &s, const std::string &suffix) + { + return s.substr(s.size()-suffix.size()) == suffix; + } +} diff --git a/crates/optix/examples/common/gdt/gdt/math/AffineSpace.h b/crates/optix/examples/common/gdt/gdt/math/AffineSpace.h new file mode 100644 index 00000000..80a306c5 --- /dev/null +++ b/crates/optix/examples/common/gdt/gdt/math/AffineSpace.h @@ -0,0 +1,183 @@ +// ======================================================================== // +// Copyright 2018-2019 Ingo Wald // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with the License. // +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. // +// ======================================================================== // + +/* originally taken (and adapted) from ospray, under following license */ + +// ======================================================================== // +// Copyright 2009-2018 Intel Corporation // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with the License. 
// +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. // +// ======================================================================== // + +#pragma once + +#include "LinearSpace.h" +#include "box.h" + +namespace gdt { + +#define VectorT typename L::vector_t +#define ScalarT typename L::vector_t::scalar_t + + //////////////////////////////////////////////////////////////////////////////// + // Affine Space + //////////////////////////////////////////////////////////////////////////////// + + template + struct GDT_INTERFACE AffineSpaceT + { + L l; /*< linear part of affine space */ + VectorT p; /*< affine part of affine space */ + + //////////////////////////////////////////////////////////////////////////////// + // Constructors, Assignment, Cast, Copy Operations + //////////////////////////////////////////////////////////////////////////////// + + inline AffineSpaceT ( ) = default; + inline AffineSpaceT ( const AffineSpaceT& other ) { l = other.l; p = other.p; } + inline AffineSpaceT ( const L & other ) { l = other ; p = VectorT(zero); } + inline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; } + + inline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {} + inline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {} + + template inline AffineSpaceT( const AffineSpaceT& s ) : l(s.l), p(s.p) {} + + //////////////////////////////////////////////////////////////////////////////// + // Constants + //////////////////////////////////////////////////////////////////////////////// + + inline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {} + inline AffineSpaceT( OneTy ) : l(one), p(zero) {} + + /*! return matrix for scaling */ + static inline AffineSpaceT scale(const VectorT& s) { return L::scale(s); } + + /*! return matrix for translation */ + static inline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); } + + /*! return matrix for rotation, only in 2D */ + static inline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); } + + /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */ + static inline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); } + + /*! return matrix for rotation around arbitrary axis and point, only in 3D */ + static inline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p); } + + /*! 
return matrix for looking at given point, only in 3D; right-handed coordinate system */ + static inline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) { + VectorT Z = normalize(point-eye); + VectorT U = normalize(cross(Z,up)); + VectorT V = cross(U,Z); + return AffineSpaceT(L(U,V,Z),eye); + } + + }; + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template inline AffineSpaceT operator -( const AffineSpaceT& a ) { return AffineSpaceT(-a.l,-a.p); } + template inline AffineSpaceT operator +( const AffineSpaceT& a ) { return AffineSpaceT(+a.l,+a.p); } + template inline AffineSpaceT rcp( const AffineSpaceT& a ) { L il = rcp(a.l); return AffineSpaceT(il,-(il*a.p)); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template inline AffineSpaceT operator +( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l+b.l,a.p+b.p); } + template inline AffineSpaceT operator -( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l-b.l,a.p-b.p); } + + template inline AffineSpaceT operator *( const ScalarT & a, const AffineSpaceT& b ) { return AffineSpaceT(a*b.l,a*b.p); } + template inline AffineSpaceT operator *( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l*b.l,a.l*b.p+a.p); } + template inline AffineSpaceT operator /( const AffineSpaceT& a, const AffineSpaceT& b ) { return a * rcp(b); } + template inline AffineSpaceT operator /( const AffineSpaceT& a, const ScalarT & b ) { return a * rcp(b); } + + template inline AffineSpaceT& operator *=( AffineSpaceT& a, const AffineSpaceT& b ) { return a = a * b; } + template inline AffineSpaceT& operator *=( AffineSpaceT& a, const ScalarT & b ) { return a = a * b; } + template inline AffineSpaceT& operator /=( AffineSpaceT& a, const AffineSpaceT& b ) { return a = a / b; } + template inline AffineSpaceT& operator /=( AffineSpaceT& a, const ScalarT & b ) { return a = a / b; } + + template inline __both__ const VectorT xfmPoint (const AffineSpaceT& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); } + template inline __both__ const VectorT xfmVector(const AffineSpaceT& m, const VectorT& v) { return xfmVector(m.l,v); } + template inline __both__ const VectorT xfmNormal(const AffineSpaceT& m, const VectorT& n) { return xfmNormal(m.l,n); } + + + // template + // inline const box_t + // xfmBounds(const AffineSpaceT>> &m, + // const box_t &b) + // { + // box_t dst = empty; + // const vec_t p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0)); + // const vec_t p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1)); + // const vec_t p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2)); + // const vec_t p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3)); + // const vec_t p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4)); + // const vec_t p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5)); + // const vec_t p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6)); + // const vec_t p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7)); + // return dst; + // } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators 
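The `rcp` defined just above relies on the identity (L, p)⁻¹ = (L⁻¹, −L⁻¹p): invert the linear part, and the offset must be the inverse-mapped translation, negated. A one-dimensional sketch makes the algebra easy to verify (names illustrative):

```cpp
#include <cstdio>

// Why rcp() is (il, -(il*p)) with il = rcp(l): in 1D, f(x) = l*x + p
// inverts to f^-1(x) = (1/l)*x - (1/l)*p.
struct Affine1 { double l, p; };
Affine1 rcp(Affine1 a) { double il = 1.0 / a.l; return { il, -(il * a.p) }; }
double apply(Affine1 a, double x) { return a.l * x + a.p; }

int main() {
    Affine1 a  = { 2.0, 3.0 };                      // x -> 2x + 3
    Affine1 ia = rcp(a);                            // x -> 0.5x - 1.5
    std::printf("%g\n", apply(ia, apply(a, 7.0)));  // 7
}
```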
+ //////////////////////////////////////////////////////////////////////////////// + + template inline bool operator ==( const AffineSpaceT& a, const AffineSpaceT& b ) { return a.l == b.l && a.p == b.p; } + template inline bool operator !=( const AffineSpaceT& a, const AffineSpaceT& b ) { return a.l != b.l || a.p != b.p; } + + //////////////////////////////////////////////////////////////////////////////// + // Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template inline std::ostream& operator<<(std::ostream& cout, const AffineSpaceT& m) { + return cout << "{ l = " << m.l << ", p = " << m.p << " }"; + } + + //////////////////////////////////////////////////////////////////////////////// + // Type Aliases + //////////////////////////////////////////////////////////////////////////////// + + using AffineSpace2f = AffineSpaceT; + using AffineSpace3f = AffineSpaceT; + using AffineSpace3fa = AffineSpaceT; + using OrthonormalSpace3f = AffineSpaceT; + + using affine2f = AffineSpace2f; + using affine3f = AffineSpace3f; + + //////////////////////////////////////////////////////////////////////////////// + /*! Template Specialization for 2D: return matrix for rotation around point (rotation around arbitrarty vector is not meaningful in 2D) */ + template<> inline AffineSpace2f AffineSpace2f::rotate(const vec2f& p, const float& r) + { return translate(+p) * AffineSpace2f(LinearSpace2f::rotate(r)) * translate(-p); } + +#undef VectorT +#undef ScalarT + +} // ::gdt diff --git a/crates/optix/examples/common/gdt/gdt/math/LinearSpace.h b/crates/optix/examples/common/gdt/gdt/math/LinearSpace.h new file mode 100644 index 00000000..50bd4066 --- /dev/null +++ b/crates/optix/examples/common/gdt/gdt/math/LinearSpace.h @@ -0,0 +1,341 @@ +// ======================================================================== // +// Copyright 2018-2019 Ingo Wald // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with the License. // +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. // +// ======================================================================== // + +/* originally taken (and adapted) from ospray, under following license */ + +// ======================================================================== // +// Copyright 2009-2018 Intel Corporation // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with the License. // +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. 
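The `rotate(p, u, r)` overload above composes `translate(+p) * rotate(r) * translate(-p)`: shift the pivot to the origin, rotate, shift back. A standalone 2D check (the `V2`/`rotateAbout` names are illustrative):

```cpp
#include <cstdio>
#include <cmath>

// Rotating about an arbitrary 2D pivot p: shift p to the origin,
// rotate, shift back -- i.e. translate(+p) * rotate(r) * translate(-p).
struct V2 { double x, y; };

V2 rotateAbout(V2 v, V2 p, double r) {
    double s = std::sin(r), c = std::cos(r);
    double dx = v.x - p.x, dy = v.y - p.y;   // translate(-p)
    return { p.x + c*dx - s*dy,              // rotate, then translate(+p)
             p.y + s*dx + c*dy };
}

int main() {
    const double kPi = 3.14159265358979323846;
    V2 q = rotateAbout({2,1}, {1,1}, kPi/2); // quarter turn about (1,1)
    std::printf("(%g, %g)\n", q.x, q.y);     // (1, 2)
}
```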
// +// ======================================================================== // + +#pragma once + +#include "vec.h" +#include "Quaternion.h" + +namespace gdt { + + //////////////////////////////////////////////////////////////////////////////// + /// 2D Linear Transform (2x2 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template struct GDT_INTERFACE LinearSpace2 + { + using vector_t = T; + // using Scalar = typename T::scalar_t; + // using vector_t = T; + using scalar_t = typename T::scalar_t; + + /*! default matrix constructor */ + inline LinearSpace2 ( ) = default; + inline LinearSpace2 ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; } + inline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; } + + template inline LinearSpace2( const LinearSpace2& s ) : vx(s.vx), vy(s.vy) {} + + /*! matrix construction from column vectors */ + inline __both__ LinearSpace2(const vector_t& vx, const vector_t& vy) + : vx(vx), vy(vy) {} + + /*! matrix construction from row mayor data */ + inline __both__ LinearSpace2(const scalar_t& m00, const scalar_t& m01, + const scalar_t& m10, const scalar_t& m11) + : vx(m00,m10), vy(m01,m11) {} + + /*! compute the determinant of the matrix */ + inline __both__ const scalar_t det() const { return vx.x*vy.y - vx.y*vy.x; } + + /*! compute adjoint matrix */ + inline __both__ const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); } + + /*! compute inverse matrix */ + inline __both__ const LinearSpace2 inverse() const { return adjoint()/det(); } + + /*! compute transposed matrix */ + inline __both__ const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); } + + /*! returns first row of matrix */ + inline const vector_t row0() const { return vector_t(vx.x,vy.x); } + + /*! returns second row of matrix */ + inline const vector_t row1() const { return vector_t(vx.y,vy.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + inline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {} + inline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {} + + /*! return matrix for scaling */ + static inline LinearSpace2 scale(const vector_t& s) { + return LinearSpace2(s.x, 0, + 0 , s.y); + } + + /*! return matrix for rotation */ + static inline LinearSpace2 rotate(const scalar_t& r) { + scalar_t s = sin(r), c = cos(r); + return LinearSpace2(c, -s, + s, c); + } + + /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */ + LinearSpace2 orthogonal() const { + LinearSpace2 m = *this; + + // mirrored? + scalar_t mirror(one); + if (m.det() < scalar_t(zero)) { + m.vx = -m.vx; + mirror = -mirror; + } + + // rotation + for (int i = 0; i < 99; i++) { + const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse()); + const LinearSpace2 d = m_next - m; + m = m_next; + // norm^2 of difference small enough? + if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8) + break; + } + + // rotation * mirror_x + return LinearSpace2(mirror*m.vx, m.vy); + } + + public: + + /*! 
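The `orthogonal()` method above is a Newton iteration for the polar decomposition: repeatedly averaging a matrix with its inverse transpose converges to the nearest orthogonal matrix (the fixed points are exactly the orthogonal ones). A standalone 2x2 sketch (`M2`/`invTranspose` are illustrative names):

```cpp
#include <cstdio>

// Newton iteration for the polar decomposition, as in orthogonal():
// m <- (m + inverse-transpose(m)) / 2 until it stops moving.
struct M2 { double a, b, c, d; }; // row-major: [a b; c d]

M2 invTranspose(const M2 &m) {
    double det = m.a*m.d - m.b*m.c;
    // inverse is [d -b; -c a]/det; its transpose is [d -c; -b a]/det
    return { m.d/det, -m.c/det, -m.b/det, m.a/det };
}

int main() {
    M2 m = { 0.9, -1.1, 1.0, 0.8 };           // rotation-ish, with scale/shear mixed in
    for (int i = 0; i < 50; ++i) {
        M2 t = invTranspose(m);
        m = { 0.5*(m.a+t.a), 0.5*(m.b+t.b), 0.5*(m.c+t.c), 0.5*(m.d+t.d) };
    }
    std::printf("det = %g, col0.col1 = %g\n",  // expect ~1 and ~0
                m.a*m.d - m.b*m.c, m.a*m.b + m.c*m.d);
}
```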
the column vectors of the matrix */ + vector_t vx,vy; + }; + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __both__ inline LinearSpace2 operator -( const LinearSpace2& a ) { return LinearSpace2(-a.vx,-a.vy); } + template __both__ inline LinearSpace2 operator +( const LinearSpace2& a ) { return LinearSpace2(+a.vx,+a.vy); } + template __both__ inline LinearSpace2 rcp ( const LinearSpace2& a ) { return a.inverse(); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template inline LinearSpace2 operator +( const LinearSpace2& a, const LinearSpace2& b ) { return LinearSpace2(a.vx+b.vx,a.vy+b.vy); } + template inline LinearSpace2 operator -( const LinearSpace2& a, const LinearSpace2& b ) { return LinearSpace2(a.vx-b.vx,a.vy-b.vy); } + + template inline LinearSpace2 operator*(const typename T::scalar_t & a, const LinearSpace2& b) { return LinearSpace2(a*b.vx, a*b.vy); } + template inline T operator*(const LinearSpace2& a, const T & b) { return b.x*a.vx + b.y*a.vy; } + template inline LinearSpace2 operator*(const LinearSpace2& a, const LinearSpace2& b) { return LinearSpace2(a*b.vx, a*b.vy); } + + template inline LinearSpace2 operator/(const LinearSpace2& a, const typename T::scalar_t & b) { return LinearSpace2(a.vx/b, a.vy/b); } + template inline LinearSpace2 operator/(const LinearSpace2& a, const LinearSpace2& b) { return a * rcp(b); } + + template inline LinearSpace2& operator *=( LinearSpace2& a, const LinearSpace2& b ) { return a = a * b; } + template inline LinearSpace2& operator /=( LinearSpace2& a, const LinearSpace2& b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template inline bool operator ==( const LinearSpace2& a, const LinearSpace2& b ) { return a.vx == b.vx && a.vy == b.vy; } + template inline bool operator !=( const LinearSpace2& a, const LinearSpace2& b ) { return a.vx != b.vx || a.vy != b.vy; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template static std::ostream& operator<<(std::ostream& cout, const LinearSpace2& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// 3D Linear Transform (3x3 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template + struct GDT_INTERFACE LinearSpace3 + { + // using vector_t = T; + using scalar_t = typename T::scalar_t; + using vector_t = T; + // using scalar_t = typename T::scalar_t; + + /*! default matrix constructor */ + inline LinearSpace3 ( ) = default; + inline __both__ LinearSpace3 ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; } + inline __both__ LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; } + + template inline __both__ LinearSpace3( const LinearSpace3& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {} + + /*! 
matrix construction from column vectors */
+    inline __both__ LinearSpace3(const vector_t& vx, const vector_t& vy, const vector_t& vz)
+      : vx(vx), vy(vy), vz(vz) {}
+
+    /*! construction from quaternion */
+    inline __both__ LinearSpace3( const QuaternionT<scalar_t>& q )
+      : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j))
+      , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i))
+      , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {}
+
+    /*! matrix construction from row-major data */
+    inline __both__ LinearSpace3(const scalar_t& m00, const scalar_t& m01, const scalar_t& m02,
+                                 const scalar_t& m10, const scalar_t& m11, const scalar_t& m12,
+                                 const scalar_t& m20, const scalar_t& m21, const scalar_t& m22)
+      : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {}
+
+    /*! compute the determinant of the matrix */
+    inline __both__ const scalar_t det() const { return dot(vx,cross(vy,vz)); }
+
+    /*! compute adjoint matrix */
+    inline __both__ const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); }
+
+    /*! compute inverse matrix */
+    inline __both__ const LinearSpace3 inverse() const { return adjoint()/det(); }
+
+    /*! compute transposed matrix */
+    inline __both__ const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); }
+
+    /*! returns first row of matrix */
+    inline __both__ const vector_t row0() const { return vector_t(vx.x,vy.x,vz.x); }
+
+    /*! returns second row of matrix */
+    inline __both__ const vector_t row1() const { return vector_t(vx.y,vy.y,vz.y); }
+
+    /*! returns third row of matrix */
+    inline __both__ const vector_t row2() const { return vector_t(vx.z,vy.z,vz.z); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    inline __both__ LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {}
+    inline __both__ LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {}
+
+    /*! return matrix for scaling */
+    static inline __both__ LinearSpace3 scale(const vector_t& s) {
+      return LinearSpace3(s.x, 0, 0,
+                          0 , s.y, 0,
+                          0 , 0, s.z);
+    }
+
+    /*! return matrix for rotation around arbitrary axis */
+    static inline __both__ LinearSpace3 rotate(const vector_t& _u, const scalar_t& r) {
+      vector_t u = normalize(_u);
+      scalar_t s = sin(r), c = cos(r);
+      return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c, u.x*u.y*(1-c)-u.z*s, u.x*u.z*(1-c)+u.y*s,
+                          u.x*u.y*(1-c)+u.z*s, u.y*u.y+(1-u.y*u.y)*c, u.y*u.z*(1-c)-u.x*s,
+                          u.x*u.z*(1-c)-u.y*s, u.y*u.z*(1-c)+u.x*s, u.z*u.z+(1-u.z*u.z)*c);
+    }
+
+  public:
+
+    /*!
the column vectors of the matrix */ + T vx,vy,vz; + }; + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template inline __both__ LinearSpace3 operator -( const LinearSpace3& a ) { return LinearSpace3(-a.vx,-a.vy,-a.vz); } + template inline __both__ LinearSpace3 operator +( const LinearSpace3& a ) { return LinearSpace3(+a.vx,+a.vy,+a.vz); } + template inline __both__ LinearSpace3 rcp ( const LinearSpace3& a ) { return a.inverse(); } + + /* constructs a coordinate frame form a normalized normal */ + template inline __both__ LinearSpace3 frame(const T& N) + { + const T dx0 = cross(T(one,zero,zero),N); + const T dx1 = cross(T(zero,one,zero),N); + const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3(dx,dy,N); + } + + /* constructs a coordinate frame from a normal and approximate x-direction */ + template inline __both__ LinearSpace3 frame(const T& N, const T& dxi) + { + if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel + const T dx = normalize(cross(dxi,N)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3(dx,dy,N); + } + + /* clamps linear space to range -1 to +1 */ + template inline __both__ LinearSpace3 clamp(const LinearSpace3& space) { + return LinearSpace3(clamp(space.vx,T(-1.0f),T(1.0f)), + clamp(space.vy,T(-1.0f),T(1.0f)), + clamp(space.vz,T(-1.0f),T(1.0f))); + } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template inline __both__ LinearSpace3 operator +( const LinearSpace3& a, const LinearSpace3& b ) { return LinearSpace3(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); } + template inline __both__ LinearSpace3 operator -( const LinearSpace3& a, const LinearSpace3& b ) { return LinearSpace3(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); } + + template inline __both__ LinearSpace3 operator*(const typename T::scalar_t & a, const LinearSpace3& b) { return LinearSpace3(a*b.vx, a*b.vy, a*b.vz); } + template inline T operator*(const LinearSpace3& a, const T & b) { return b.x*a.vx + b.y*a.vy + b.z*a.vz; } + template inline __both__ LinearSpace3 operator*(const LinearSpace3& a, const LinearSpace3& b) { return LinearSpace3(a*b.vx, a*b.vy, a*b.vz); } + + template __both__ inline LinearSpace3 operator/(const LinearSpace3& a, const typename T::scalar_t & b) { return LinearSpace3(a.vx/b, a.vy/b, a.vz/b); } + + template __both__ inline LinearSpace3 operator/(const LinearSpace3& a, const LinearSpace3& b) { return a * rcp(b); } + + template inline LinearSpace3& operator *=( LinearSpace3& a, const LinearSpace3& b ) { return a = a * b; } + template inline LinearSpace3& operator /=( LinearSpace3& a, const LinearSpace3& b ) { return a = a / b; } + + template inline __both__ T xfmPoint (const LinearSpace3& s, const T& a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z*s.vz))); } + template inline __both__ T xfmVector(const LinearSpace3& s, const T& a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z*s.vz))); } + template inline __both__ T xfmNormal(const LinearSpace3& s, const T& a) { return xfmVector(s.inverse().transposed(),a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + 
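`frame(N)` builds an orthonormal basis around a normal by crossing `N` with whichever coordinate axis is less parallel to it (the larger of the two candidate cross products), then completing the basis with a second cross. A standalone sketch that checks the result really is orthonormal (names illustrative):

```cpp
#include <cstdio>
#include <cmath>

// frame(N): cross N with the coordinate axis least parallel to it,
// then complete the basis with a second cross product.
struct V3 { double x, y, z; };
V3 cross(V3 a, V3 b) { return { a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x }; }
double dot(V3 a, V3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }
V3 normalize(V3 v) { double l = std::sqrt(dot(v,v)); return { v.x/l, v.y/l, v.z/l }; }

void frame(V3 N, V3 &dx, V3 &dy) {
    V3 dx0 = cross({1,0,0}, N), dx1 = cross({0,1,0}, N);
    dx = normalize(dot(dx0,dx0) > dot(dx1,dx1) ? dx0 : dx1);
    dy = normalize(cross(N, dx));
}

int main() {
    V3 N = normalize({1,2,3}), dx, dy;
    frame(N, dx, dy);
    std::printf("dx.N=%g dy.N=%g dx.dy=%g\n",
                dot(dx,N), dot(dy,N), dot(dx,dy)); // all ~0
}
```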
//////////////////////////////////////////////////////////////////////////////// + + template inline bool operator ==( const LinearSpace3& a, const LinearSpace3& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } + template inline bool operator !=( const LinearSpace3& a, const LinearSpace3& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template inline std::ostream& operator<<(std::ostream& cout, const LinearSpace3& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}"; + } + + /*! Shortcuts for common linear spaces. */ + using LinearSpace2f = LinearSpace2 ; + using LinearSpace3f = LinearSpace3 ; + using LinearSpace3fa = LinearSpace3; + + using linear2f = LinearSpace2f; + using linear3f = LinearSpace3f; +} // ::ospcommon diff --git a/crates/optix/examples/common/gdt/gdt/math/Quaternion.h b/crates/optix/examples/common/gdt/gdt/math/Quaternion.h new file mode 100644 index 00000000..ed62e924 --- /dev/null +++ b/crates/optix/examples/common/gdt/gdt/math/Quaternion.h @@ -0,0 +1,227 @@ +// ======================================================================== // +// Copyright 2018 Ingo Wald // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with the License. // +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. // +// ======================================================================== // + +/* originally taken (and adapted) from ospray, under following license */ + +// ======================================================================== // +// Copyright 2009-2018 Intel Corporation // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with the License. // +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. 
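The `LinearSpace3::rotate(u, r)` defined earlier spells out the Rodrigues rotation entry by entry; the equivalent vector form `R*v = v*cos r + (u x v)*sin r + u*(u.v)*(1 - cos r)` is easier to check numerically. A standalone sketch (names illustrative):

```cpp
#include <cstdio>
#include <cmath>

// Vector form of the Rodrigues rotation; u must already be unit length.
struct V3 { double x, y, z; };
V3 cross(V3 a, V3 b) { return { a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x }; }
double dot(V3 a, V3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

V3 rotate(V3 v, V3 u, double r) {
    double s = std::sin(r), c = std::cos(r), k = (1-c)*dot(u,v);
    V3 uxv = cross(u, v);
    return { v.x*c + uxv.x*s + u.x*k,
             v.y*c + uxv.y*s + u.y*k,
             v.z*c + uxv.z*s + u.z*k };
}

int main() {
    V3 q = rotate({1,0,0}, {0,0,1}, 1.5707963267948966); // 90 deg about z
    std::printf("(%.1f, %.1f, %.1f)\n", q.x, q.y, q.z);  // ~(0, 1, 0)
}
```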
// +// ======================================================================== // + +#pragma once + +#include "vec.h" + +namespace gdt +{ + //////////////////////////////////////////////////////////////// + // Quaternion Struct + //////////////////////////////////////////////////////////////// + + template + struct QuaternionT + { + typedef vec_t Vector; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __both__ QuaternionT ( void ) { } + __both__ QuaternionT ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; } + __both__ QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; } + + __both__ QuaternionT( const T& r ) : r(r), i(zero), j(zero), k(zero) {} + __both__ explicit QuaternionT( const Vector& v ) : r(zero), i(v.x), j(v.y), k(v.z) {} + __both__ QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {} + __both__ QuaternionT( const T& r, const Vector& v ) : r(r), i(v.x), j(v.y), k(v.z) {} + + __inline QuaternionT( const Vector& vx, const Vector& vy, const Vector& vz ); + __inline QuaternionT( const T& yaw, const T& pitch, const T& roll ); + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __both__ QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {} + __both__ QuaternionT( OneTy ) : r( one), i(zero), j(zero), k(zero) {} + + /*! return quaternion for rotation around arbitrary axis */ + static __both__ QuaternionT rotate(const Vector& u, const T& r) { + return QuaternionT(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u)); + } + + /*! 
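`QuaternionT::rotate(u, r)` stores the half-angle form q = (cos r/2, sin r/2 · u); the `operator*`/`xfmPoint` overloads later in this file then apply it as the sandwich product q·v·conj(q). A standalone sketch using the same Hamilton product as the `operator*` below (the `Q`/`mul` names are illustrative):

```cpp
#include <cstdio>
#include <cmath>

// Hamilton product and conjugate, applied as q * v * conj(q).
struct Q { double r, i, j, k; };

Q mul(Q a, Q b) {
    return { a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k,
             a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j,
             a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i,
             a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r };
}
Q conj(Q a) { return { a.r, -a.i, -a.j, -a.k }; }

int main() {
    double r = 1.5707963267948966;                 // 90 degrees
    Q q = { std::cos(r/2), 0, 0, std::sin(r/2) };  // rotate about u = (0,0,1)
    Q v = { 0, 1, 0, 0 };                          // pure quaternion for (1,0,0)
    Q w = mul(mul(q, v), conj(q));
    std::printf("(%.1f, %.1f, %.1f)\n", w.i, w.j, w.k); // ~(0, 1, 0)
}
```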
returns the rotation axis of the quaternion as a vector */ + __both__ const Vector v( ) const { return Vector(i, j, k); } + + public: + T r, i, j, k; + }; + + template __both__ QuaternionT operator *( const T & a, const QuaternionT& b ) { return QuaternionT(a * b.r, a * b.i, a * b.j, a * b.k); } + template __both__ QuaternionT operator *( const QuaternionT& a, const T & b ) { return QuaternionT(a.r * b, a.i * b, a.j * b, a.k * b); } + + //////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////// + + template __both__ QuaternionT operator +( const QuaternionT& a ) { return QuaternionT(+a.r, +a.i, +a.j, +a.k); } + template __both__ QuaternionT operator -( const QuaternionT& a ) { return QuaternionT(-a.r, -a.i, -a.j, -a.k); } + template __both__ QuaternionT conj ( const QuaternionT& a ) { return QuaternionT(a.r, -a.i, -a.j, -a.k); } + template __both__ T abs ( const QuaternionT& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template __both__ QuaternionT rcp ( const QuaternionT& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template __both__ QuaternionT normalize ( const QuaternionT& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + + //////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////// + + template __both__ QuaternionT operator +( const T & a, const QuaternionT& b ) { return QuaternionT(a + b.r, b.i, b.j, b.k); } + template __both__ QuaternionT operator +( const QuaternionT& a, const T & b ) { return QuaternionT(a.r + b, a.i, a.j, a.k); } + template __both__ QuaternionT operator +( const QuaternionT& a, const QuaternionT& b ) { return QuaternionT(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); } + template __both__ QuaternionT operator -( const T & a, const QuaternionT& b ) { return QuaternionT(a - b.r, -b.i, -b.j, -b.k); } + template __both__ QuaternionT operator -( const QuaternionT& a, const T & b ) { return QuaternionT(a.r - b, a.i, a.j, a.k); } + template __both__ QuaternionT operator -( const QuaternionT& a, const QuaternionT& b ) { return QuaternionT(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); } + + template __both__ typename QuaternionT::Vector operator *( const QuaternionT& a, const typename QuaternionT::Vector & b ) { return (a*QuaternionT(b)*conj(a)).v(); } + template __both__ QuaternionT operator *( const QuaternionT& a, const QuaternionT& b ) { + return QuaternionT(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k, + a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j, + a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i, + a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r); + } + template __both__ QuaternionT operator /( const T & a, const QuaternionT& b ) { return a*rcp(b); } + template __both__ QuaternionT operator /( const QuaternionT& a, const T & b ) { return a*rcp(b); } + template __both__ QuaternionT operator /( const QuaternionT& a, const QuaternionT& b ) { return a*rcp(b); } + + template __both__ QuaternionT& operator +=( QuaternionT& a, const T & b ) { return a = a+b; } + template __both__ QuaternionT& operator +=( QuaternionT& a, const QuaternionT& b ) { return a = a+b; } + template __both__ QuaternionT& operator -=( QuaternionT& a, const T & b ) { return a = a-b; } + template __both__ QuaternionT& operator -=( QuaternionT& a, const QuaternionT& b ) { return a = a-b; } + template __both__ QuaternionT& operator *=( QuaternionT& a, const T & b ) { return a = a*b; } + template 
__both__ QuaternionT& operator *=( QuaternionT& a, const QuaternionT& b ) { return a = a*b; } + template __both__ QuaternionT& operator /=( QuaternionT& a, const T & b ) { return a = a*rcp(b); } + template __both__ QuaternionT& operator /=( QuaternionT& a, const QuaternionT& b ) { return a = a*rcp(b); } + + template __both__ typename QuaternionT::Vector + xfmPoint ( const QuaternionT& a, + const typename QuaternionT::Vector& b ) + { return (a*QuaternionT(b)*conj(a)).v(); } + + template __both__ typename QuaternionT::Vector + xfmQuaternion( const QuaternionT& a, + const typename QuaternionT::Vector& b ) + { return (a*QuaternionT(b)*conj(a)).v(); } + + + template __both__ typename QuaternionT::Vector + xfmNormal( const QuaternionT& a, + const typename QuaternionT::Vector& b ) + { return (a*QuaternionT(b)*conj(a)).v(); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __both__ bool operator ==( const QuaternionT& a, const QuaternionT& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; } + + template __both__ bool operator !=( const QuaternionT& a, const QuaternionT& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Orientation Functions + //////////////////////////////////////////////////////////////////////////////// + + template + QuaternionT::QuaternionT(const typename QuaternionT::Vector& vx, + const typename QuaternionT::Vector& vy, + const typename QuaternionT::Vector& vz ) + { + if ( vx.x + vy.y + vz.z >= T(zero) ) + { + const T t = T(one) + (vx.x + vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = t*s; + i = (vy.z - vz.y)*s; + j = (vz.x - vx.z)*s; + k = (vx.y - vy.x)*s; + } + else if ( vx.x >= max(vy.y, vz.z) ) + { + const T t = (T(one) + vx.x) - (vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = (vy.z - vz.y)*s; + i = t*s; + j = (vx.y + vy.x)*s; + k = (vz.x + vx.z)*s; + } + else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) ) + { + const T t = (T(one) + vy.y) - (vz.z + vx.x); + const T s = rsqrt(t)*T(0.5f); + r = (vz.x - vx.z)*s; + i = (vx.y + vy.x)*s; + j = t*s; + k = (vy.z + vz.y)*s; + } + else //if ( vz.z >= max(vy.y, vx.x) ) + { + const T t = (T(one) + vz.z) - (vx.x + vy.y); + const T s = rsqrt(t)*T(0.5f); + r = (vx.y - vy.x)*s; + i = (vz.x + vx.z)*s; + j = (vy.z + vz.y)*s; + k = t*s; + } + } + + template QuaternionT::QuaternionT( const T& yaw, const T& pitch, const T& roll ) + { + const T cya = cos(yaw *T(0.5f)); + const T cpi = cos(pitch*T(0.5f)); + const T cro = cos(roll *T(0.5f)); + const T sya = sin(yaw *T(0.5f)); + const T spi = sin(pitch*T(0.5f)); + const T sro = sin(roll *T(0.5f)); + r = cro*cya*cpi + sro*sya*spi; + i = cro*cya*spi + sro*sya*cpi; + j = cro*sya*cpi - sro*cya*spi; + k = sro*cya*cpi - cro*sya*spi; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template static std::ostream& operator<<(std::ostream& cout, const QuaternionT& q) { + return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }"; + } + + /*! 
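The yaw/pitch/roll constructor above composes three half-angle rotations; with pitch = roll = 0 its four products collapse to (cos(yaw/2), 0, sin(yaw/2), 0), i.e. a pure rotation about the j axis, which makes for a cheap sanity check:

```cpp
#include <cstdio>
#include <cmath>

// With pitch = roll = 0 the constructor's formulas reduce to
// (cos(yaw/2), 0, sin(yaw/2), 0): a pure rotation about the j axis.
int main() {
    double yaw = 0.7, pitch = 0.0, roll = 0.0;
    double cya = std::cos(yaw*0.5),   sya = std::sin(yaw*0.5);
    double cpi = std::cos(pitch*0.5), spi = std::sin(pitch*0.5);
    double cro = std::cos(roll*0.5),  sro = std::sin(roll*0.5);
    double r = cro*cya*cpi + sro*sya*spi;
    double i = cro*cya*spi + sro*sya*cpi;
    double j = cro*sya*cpi - sro*cya*spi;
    double k = sro*cya*cpi - cro*sya*spi;
    std::printf("(%g, %g, %g, %g) vs (%g, 0, %g, 0)\n", r, i, j, k, cya, sya);
}
```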
default template instantiations */ + typedef QuaternionT Quaternion3f; + typedef QuaternionT Quaternion3d; +} diff --git a/crates/optix/examples/common/gdt/gdt/math/box.h b/crates/optix/examples/common/gdt/gdt/math/box.h new file mode 100644 index 00000000..def74c79 --- /dev/null +++ b/crates/optix/examples/common/gdt/gdt/math/box.h @@ -0,0 +1,223 @@ +// ======================================================================== // +// Copyright 2018 Ingo Wald // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with the License. // +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. // +// ======================================================================== // + +#pragma once + +#include "gdt/math/vec.h" + +namespace gdt { + + template + struct interval { + typedef T scalar_t; + inline __both__ interval() + : lower(gdt::empty_bounds_lower()), + upper(gdt::empty_bounds_upper()) + {} + inline __both__ interval(T begin, T end) : begin(begin), end(end) {} + + union { + T begin; + T lower; + T lo; + }; + union { + T end; + T upper; + T hi; + }; + + inline __both__ bool contains(const T &t) const { return t >= lower && t <= upper; } + inline __both__ bool is_empty() const { return begin > end; } + inline __both__ T center() const { return (begin+end)/2; } + inline __both__ T span() const { return end - begin; } + inline __both__ T diagonal() const { return end - begin; } + inline __both__ interval &extend(const T &t) + { lower = min(lower,t); upper = max(upper,t); return *this; } + inline __both__ interval &extend(const interval &t) + { lower = min(lower,t.lower); upper = max(upper,t.upper); return *this; } + + static inline __both__ interval positive() + { + return interval(0.f,gdt::open_range_upper()); + } + }; + + template + inline __both__ std::ostream &operator<<(std::ostream &o, const interval &b) + { +#ifndef __CUDACC__ + o << "[" << b.lower << ":" << b.upper << "]"; +#endif + return o; + } + + template + inline __both__ interval build_interval(const T &a, const T &b) + { return interval(min(a,b),max(a,b)); } + + template + inline __both__ interval intersect(const interval &a, const interval &b) + { return interval(max(a.lower,b.lower),min(a.upper,b.upper)); } + + template + inline __both__ interval operator-(const interval &a, const T &b) + { return interval(a.lower-b,a.upper-b); } + + template + inline __both__ interval operator*(const interval &a, const T &b) + { return build_interval(a.lower*b,a.upper*b); } + + template + inline __both__ bool operator==(const interval &a, const interval &b) + { return a.lower == b.lower && a.upper == b.upper; } + + template + inline __both__ bool operator!=(const interval &a, const interval &b) + { return !(a == b); } + + + + template + struct box_t { + typedef T vec_t; + typedef typename T::scalar_t scalar_t; + enum { dims = T::dims }; + + inline __both__ box_t() + : lower(gdt::empty_bounds_lower()), + upper(gdt::empty_bounds_upper()) + {} + + // /*! 
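`interval` starts out *empty* as [+inf..−inf], which is what makes the very first `extend()` correct with no special case; `intersect()` is just max-of-lowers/min-of-uppers. A standalone sketch (the `Interval` name is illustrative):

```cpp
#include <cstdio>
#include <algorithm>
#include <limits>

// Start from the empty interval [+inf..-inf] so the first extend()
// needs no special case, then clip against another range.
struct Interval {
    double lo, hi;
    Interval() : lo(std::numeric_limits<double>::infinity()),
                 hi(-std::numeric_limits<double>::infinity()) {}
    Interval(double lo_, double hi_) : lo(lo_), hi(hi_) {}
    void extend(double t) { lo = std::min(lo, t); hi = std::max(hi, t); }
    bool empty() const { return lo > hi; }
};

Interval intersect(const Interval &a, const Interval &b) {
    return Interval(std::max(a.lo, b.lo), std::min(a.hi, b.hi));
}

int main() {
    Interval t;                         // starts empty
    t.extend(7.0); t.extend(0.3);       // order does not matter
    Interval clipped = intersect(t, Interval(0.0, 1.0));
    std::printf("[%g, %g] empty=%d\n", clipped.lo, clipped.hi, clipped.empty());
    // [0.3, 1] empty=0
}
```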
construct a new, origin-oriented box of given size */ + // explicit inline __both__ box_t(const vec_t &box_size) + // : lower(vec_t(0)), + // upper(box_size) + // {} + /*! construct a new box around a single point */ + explicit inline __both__ box_t(const vec_t &v) + : lower(v), + upper(v) + {} + + /*! construct a new, origin-oriented box of given size */ + inline __both__ box_t(const vec_t &lo, const vec_t &hi) + : lower(lo), + upper(hi) + {} + + /*! returns new box including both ourselves _and_ the given point */ + inline __both__ box_t including(const vec_t &other) const + { return box_t(min(lower,other),max(upper,other)); } + + + /*! returns new box including both ourselves _and_ the given point */ + inline __both__ box_t &extend(const vec_t &other) + { lower = min(lower,other); upper = max(upper,other); return *this; } + /*! returns new box including both ourselves _and_ the given point */ + inline __both__ box_t &extend(const box_t &other) + { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } + + + /*! get the d-th dimensional slab (lo[dim]..hi[dim] */ + inline __both__ interval get_slab(const uint32_t dim) + { + return interval(lower[dim],upper[dim]); + } + + inline __both__ bool contains(const vec_t &point) const + { return !(any_less_than(point,lower) || any_greater_than(point,upper)); } + + inline __both__ bool overlaps(const box_t &other) const + { return !(any_less_than(other.upper,lower) || any_greater_than(other.lower,upper)); } + + inline __both__ vec_t center() const { return (lower+upper)/(typename vec_t::scalar_t)2; } + inline __both__ vec_t span() const { return upper-lower; } + inline __both__ vec_t size() const { return upper-lower; } + + inline __both__ typename long_type_of::type volume() const + { return gdt::volume(size()); } + + inline __both__ bool empty() const { return any_less_than(upper,lower); } + + vec_t lower, upper; + }; + + // ======================================================= + // default functions + // ======================================================= + + template + inline __both__ typename long_type_of::type area(const box_t> &b) + { return area(b.upper - b.lower); } + + template + inline __both__ typename long_type_of::type area(const box_t> &b) + { + const vec_t diag = b.upper - b.lower; + return 2.f*(area(vec_t(diag.x,diag.y))+ + area(vec_t(diag.y,diag.z))+ + area(vec_t(diag.z,diag.x))); + } + + template + inline __both__ typename long_type_of::type volume(const box_t> &b) + { + const vec_t diag = b.upper - b.lower; + return diag.x*diag.y*diag.z; + } + + template + inline __both__ std::ostream &operator<<(std::ostream &o, const box_t &b) + { +#ifndef __CUDACC__ + o << "[" << b.lower << ":" << b.upper << "]"; +#endif + return o; + } + + template + inline __both__ box_t intersection(const box_t &a, const box_t &b) + { return box_t(max(a.lower,b.lower),min(a.upper,b.upper)); } + + template + inline __both__ bool operator==(const box_t &a, const box_t &b) + { return a.lower == b.lower && a.upper == b.upper; } + + template + inline __both__ bool operator!=(const box_t &a, const box_t &b) + { return !(a == b); } + + + + + // ======================================================= + // default instantiations + // ======================================================= + +#define _define_box_types(T,t) \ + typedef box_t> box2##t; \ + typedef box_t> box3##t; \ + typedef box_t> box4##t; \ + typedef box_t> box3##t##a; \ + + _define_box_types(int,i); + _define_box_types(unsigned int,ui); + _define_box_types(float,f); + 
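The same empty-until-extended convention drives `box_t`: fold points in with `extend()` and the min/max pair grows into the bounding box. A standalone sketch (`V3`/`Box3` are illustrative names):

```cpp
#include <cstdio>
#include <algorithm>
#include <initializer_list>
#include <limits>

struct V3 { float x, y, z; };

// A box that starts empty (lower = +inf, upper = -inf) and grows over
// a point cloud via extend(), as box_t does.
struct Box3 {
    V3 lower, upper;
    Box3() : lower{ std::numeric_limits<float>::infinity(),
                    std::numeric_limits<float>::infinity(),
                    std::numeric_limits<float>::infinity() },
             upper{ -lower.x, -lower.y, -lower.z } {}
    void extend(const V3 &p) {
        lower = { std::min(lower.x,p.x), std::min(lower.y,p.y), std::min(lower.z,p.z) };
        upper = { std::max(upper.x,p.x), std::max(upper.y,p.y), std::max(upper.z,p.z) };
    }
};

int main() {
    Box3 b;
    for (V3 p : { V3{0,0,0}, V3{1,2,-3}, V3{-1,5,4} }) b.extend(p);
    std::printf("[%g %g %g]..[%g %g %g]\n",
                b.lower.x, b.lower.y, b.lower.z,
                b.upper.x, b.upper.y, b.upper.z); // [-1 0 -3]..[1 5 4]
}
```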
+#undef _define_box_types + +} // ::gdt diff --git a/crates/optix/examples/common/gdt/gdt/math/constants.h b/crates/optix/examples/common/gdt/gdt/math/constants.h new file mode 100644 index 00000000..bbacb22a --- /dev/null +++ b/crates/optix/examples/common/gdt/gdt/math/constants.h @@ -0,0 +1,185 @@ +// ======================================================================== // +// Copyright 2018 Ingo Wald // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with the License. // +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. // +// ======================================================================== // + +#pragma once + +#include "gdt/math/vec.h" +#ifdef __CUDA_ARCH__ +#include +#else +#include +#endif + +#ifndef M_PI + #define M_PI 3.141593f +#endif + +namespace gdt { + + static struct ZeroTy + { + __both__ operator double ( ) const { return 0; } + __both__ operator float ( ) const { return 0; } + __both__ operator long long( ) const { return 0; } + __both__ operator unsigned long long( ) const { return 0; } + __both__ operator long ( ) const { return 0; } + __both__ operator unsigned long ( ) const { return 0; } + __both__ operator int ( ) const { return 0; } + __both__ operator unsigned int ( ) const { return 0; } + __both__ operator short ( ) const { return 0; } + __both__ operator unsigned short ( ) const { return 0; } + __both__ operator char ( ) const { return 0; } + __both__ operator unsigned char ( ) const { return 0; } + } zero MAYBE_UNUSED; + + static struct OneTy + { + __both__ operator double ( ) const { return 1; } + __both__ operator float ( ) const { return 1; } + __both__ operator long long( ) const { return 1; } + __both__ operator unsigned long long( ) const { return 1; } + __both__ operator long ( ) const { return 1; } + __both__ operator unsigned long ( ) const { return 1; } + __both__ operator int ( ) const { return 1; } + __both__ operator unsigned int ( ) const { return 1; } + __both__ operator short ( ) const { return 1; } + __both__ operator unsigned short ( ) const { return 1; } + __both__ operator char ( ) const { return 1; } + __both__ operator unsigned char ( ) const { return 1; } + } one MAYBE_UNUSED; + + static struct NegInfTy + { +#ifdef __CUDA_ARCH__ + __device__ operator double ( ) const { return -CUDART_INF; } + __device__ operator float ( ) const { return -CUDART_INF_F; } +#else + __both__ operator double ( ) const { return -std::numeric_limits::infinity(); } + __both__ operator float ( ) const { return -std::numeric_limits::infinity(); } + __both__ operator long long( ) const { return std::numeric_limits::min(); } + __both__ operator unsigned long long( ) const { return std::numeric_limits::min(); } + __both__ operator long ( ) const { return std::numeric_limits::min(); } + __both__ operator unsigned long ( ) const { return std::numeric_limits::min(); } + __both__ operator int ( ) const { return std::numeric_limits::min(); } + __both__ operator unsigned int ( ) const { return std::numeric_limits::min(); } + __both__ operator short ( ) const { return std::numeric_limits::min(); 
} + __both__ operator unsigned short ( ) const { return std::numeric_limits::min(); } + __both__ operator char ( ) const { return std::numeric_limits::min(); } + __both__ operator unsigned char ( ) const { return std::numeric_limits::min(); } +#endif + } neg_inf MAYBE_UNUSED; + + inline __both__ float infty() { +#ifdef __CUDA_ARCH__ + return CUDART_INF_F; +#else + return std::numeric_limits::infinity(); +#endif + } + + static struct PosInfTy + { +#ifdef __CUDA_ARCH__ + __device__ operator double ( ) const { return CUDART_INF; } + __device__ operator float ( ) const { return CUDART_INF_F; } +#else + __both__ operator double ( ) const { return std::numeric_limits::infinity(); } + __both__ operator float ( ) const { return std::numeric_limits::infinity(); } + __both__ operator long long( ) const { return std::numeric_limits::max(); } + __both__ operator unsigned long long( ) const { return std::numeric_limits::max(); } + __both__ operator long ( ) const { return std::numeric_limits::max(); } + __both__ operator unsigned long ( ) const { return std::numeric_limits::max(); } + __both__ operator int ( ) const { return std::numeric_limits::max(); } + __both__ operator unsigned int ( ) const { return std::numeric_limits::max(); } + __both__ operator short ( ) const { return std::numeric_limits::max(); } + __both__ operator unsigned short ( ) const { return std::numeric_limits::max(); } + __both__ operator char ( ) const { return std::numeric_limits::max(); } + __both__ operator unsigned char ( ) const { return std::numeric_limits::max(); } +#endif + } inf MAYBE_UNUSED, pos_inf MAYBE_UNUSED; + + static struct NaNTy + { +#ifdef __CUDA_ARCH__ + __device__ operator double( ) const { return CUDART_NAN_F; } + __device__ operator float ( ) const { return CUDART_NAN; } +#else + __both__ operator double( ) const { return std::numeric_limits::quiet_NaN(); } + __both__ operator float ( ) const { return std::numeric_limits::quiet_NaN(); } +#endif + } nan MAYBE_UNUSED; + + static struct UlpTy + { +#ifdef __CUDACC__ + // todo +#else + __both__ operator double( ) const { return std::numeric_limits::epsilon(); } + __both__ operator float ( ) const { return std::numeric_limits::epsilon(); } +#endif + } ulp MAYBE_UNUSED; + + + + template + struct limits_traits; + + template<> struct limits_traits { + template static inline __both__ T value_limits_lower(T) { return std::numeric_limits::min(); } + template static inline __both__ T value_limits_upper(T) { return std::numeric_limits::max(); } + }; + template<> struct limits_traits { + template static inline __both__ T value_limits_lower(T) { return (T)NegInfTy(); }//{ return -std::numeric_limits::infinity(); } + template static inline __both__ T value_limits_upper(T) { return (T)PosInfTy(); }//{ return +std::numeric_limits::infinity(); } + }; + + /*! lower value of a completely *empty* range [+inf..-inf] */ + template inline __both__ T empty_bounds_lower() + { + return limits_traits::is_integer>::value_limits_upper(T()); + } + + /*! upper value of a completely *empty* range [+inf..-inf] */ + template inline __both__ T empty_bounds_upper() + { + return limits_traits::is_integer>::value_limits_lower(T()); + } + + /*! lower value of a completely *empty* range [+inf..-inf] */ + template inline __both__ T empty_range_lower() + { + return limits_traits::is_integer>::value_limits_upper(T()); + } + + /*! 
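`zero`, `one`, `inf`, and friends are instances of tag types whose implicit conversion operators produce the right constant for whatever scalar the context demands, so a single `vec_t(OneTy)`-style constructor covers every instantiation. A minimal sketch of the trick (names illustrative):

```cpp
#include <cstdio>

// A tag object with implicit conversions lets one constructor overload
// serve every scalar type; overload resolution picks the exact match.
struct ZeroTy {
    operator double() const { return 0; }
    operator float()  const { return 0; }
    operator int()    const { return 0; }
};
static const ZeroTy zero = {};

struct Vec3f {
    float x, y, z;
    Vec3f(ZeroTy) : x(zero), y(zero), z(zero) {} // operator float() is chosen
};

int main() {
    Vec3f v(zero);
    std::printf("(%g, %g, %g)\n", v.x, v.y, v.z); // (0, 0, 0)
}
```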
upper value of a completely *empty* range [+inf..-inf] */
+  template<typename T> inline __both__ T empty_range_upper()
+  {
+    return limits_traits<std::numeric_limits<T>::is_integer>::value_limits_lower(T());
+  }
+
+  /*! lower value of a completely open range [-inf..+inf] */
+  template<typename T> inline __both__ T open_range_lower()
+  {
+    return limits_traits<std::numeric_limits<T>::is_integer>::value_limits_lower(T());
+  }
+
+  /*! upper value of a completely open range [-inf..+inf] */
+  template<typename T> inline __both__ T open_range_upper()
+  {
+    return limits_traits<std::numeric_limits<T>::is_integer>::value_limits_upper(T());
+  }
+
+} // ::gdt
diff --git a/crates/optix/examples/common/gdt/gdt/math/fixedpoint.h b/crates/optix/examples/common/gdt/gdt/math/fixedpoint.h
new file mode 100644
index 00000000..069be114
--- /dev/null
+++ b/crates/optix/examples/common/gdt/gdt/math/fixedpoint.h
@@ -0,0 +1,36 @@
+// ======================================================================== //
+// Copyright 2018 Ingo Wald                                                 //
+//                                                                          //
+// Licensed under the Apache License, Version 2.0 (the "License");          //
+// you may not use this file except in compliance with the License.         //
+// You may obtain a copy of the License at                                  //
+//                                                                          //
+//   http://www.apache.org/licenses/LICENSE-2.0                             //
+//                                                                          //
+// Unless required by applicable law or agreed to in writing, software      //
+// distributed under the License is distributed on an "AS IS" BASIS,        //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and      //
+// limitations under the License.                                           //
+// ======================================================================== //
+
+#pragma once
+
+#include "gdt/gdt.h"
+#include "gdt/math/constants.h"
+#include <stdint.h>
+
+namespace gdt {
+
+  /*! a n-bit fixed-point float in the [0..1] region */
+  template<typename storageT, int Nbits>
+  struct FixedPoint {
+    FixedPoint();
+
+    /*! conversion mapping the raw bits to [0..1] */
+    inline operator float() const {
+      return bits / float((1ULL << Nbits)-1);
+    }
+    storageT bits;
+  };
+}
+
diff --git a/crates/optix/examples/common/gdt/gdt/math/vec.h b/crates/optix/examples/common/gdt/gdt/math/vec.h
new file mode 100644
index 00000000..72e3cf4a
--- /dev/null
+++ b/crates/optix/examples/common/gdt/gdt/math/vec.h
@@ -0,0 +1,400 @@
+// ======================================================================== //
+// Copyright 2018 Ingo Wald                                                 //
+//                                                                          //
+// Licensed under the Apache License, Version 2.0 (the "License");          //
+// you may not use this file except in compliance with the License.         //
+// You may obtain a copy of the License at                                  //
+//                                                                          //
+//   http://www.apache.org/licenses/LICENSE-2.0                             //
+//                                                                          //
+// Unless required by applicable law or agreed to in writing, software      //
+// distributed under the License is distributed on an "AS IS" BASIS,        //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and      //
+// limitations under the License.
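A standalone usage sketch of the `FixedPoint` idea (the `Fixed` name and the `uint8_t` instantiation are illustrative): an N-bit raw integer maps to [0..1] by dividing by the largest representable raw value:

```cpp
#include <cstdio>
#include <cstdint>

// N-bit unsigned raw bits reinterpreted as a float in [0,1].
template <typename StorageT, int NBits>
struct Fixed {
    StorageT bits;
    operator float() const { return bits / float((1ull << NBits) - 1); }
};

int main() {
    Fixed<uint8_t, 8> half{128};
    std::printf("%f\n", float(half)); // ~0.501961 (128/255)
}
```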
// +// ======================================================================== // + +#pragma once + +#include "gdt/gdt.h" +#include "gdt/math/constants.h" +#include +#include +#include + +#ifndef __CUDACC__ +// define builtins for IDE +struct float2 { + float x; + float y; +}; +struct float3 { + float x; + float y; + float z; +}; +struct float4 { + float x; + float y; + float z; + float w; +}; +#endif + +namespace gdt { + + template struct long_type_of { typedef T type; }; + template<> struct long_type_of { typedef int64_t type; }; + template<> struct long_type_of { typedef uint64_t type; }; + + template + struct GDT_INTERFACE vec_t { T t[N]; }; + + + template struct BinaryOpResultType; + + // Binary Result type: scalar type with itself always returns same type + template + struct BinaryOpResultType { typedef ScalarType type; }; + + template<> struct BinaryOpResultType { typedef float type; }; + template<> struct BinaryOpResultType { typedef float type; }; + template<> struct BinaryOpResultType { typedef float type; }; + template<> struct BinaryOpResultType { typedef float type; }; + + template<> struct BinaryOpResultType { typedef double type; }; + template<> struct BinaryOpResultType { typedef double type; }; + template<> struct BinaryOpResultType { typedef double type; }; + template<> struct BinaryOpResultType { typedef double type; }; + + // ------------------------------------------------------------------ + // vec1 - not really a vector, but makes a scalar look like a + // vector, so we can use it in, say, box1f + // ------------------------------------------------------------------ + template + struct GDT_INTERFACE vec_t { + enum { dims = 1 }; + typedef T scalar_t; + + inline __both__ vec_t() {} + inline __both__ vec_t(const T &v) : v(v) {} + + /*! assignment operator */ + inline __both__ vec_t &operator=(const vec_t &other) { + this->v = other.v; + return *this; + } + + /*! construct 2-vector from 2-vector of another type */ + template + inline __both__ explicit vec_t(const vec_t &o) : v(o.v) {} + + inline __both__ T &operator[](size_t dim) { return (&x)[dim]; } + inline __both__ const T &operator[](size_t dim) const { return (&x)[dim]; } + + union { + T v; + T x; //!< just to allow all vec types to use x,y,z,w,... + }; + }; + + // ------------------------------------------------------------------ + // vec2 + // ------------------------------------------------------------------ + template + struct GDT_INTERFACE vec_t { + enum { dims = 2 }; + typedef T scalar_t; + + inline __both__ vec_t() {} + inline __both__ vec_t(const T &t) : x(t), y(t) {} + inline __both__ vec_t(const T &x, const T &y) : x(x), y(y) {} +#ifdef __CUDACC__ + inline __both__ vec_t(const float2 v) : x(v.x), y(v.y) {} + inline __both__ vec_t(const int2 v) : x(v.x), y(v.y) {} + inline __both__ vec_t(const uint2 v) : x(v.x), y(v.y) {} + + inline __both__ operator float2() const { return make_float2(x,y); } + inline __both__ operator int2() const { return make_int2(x,y); } + inline __both__ operator uint2() const { return make_uint2(x,y); } + // inline __both__ vec_t(const size_t2 v) : x(v.x), y(v.y), z(v.z) {} + // inline __both__ operator size_t2() { return make_size_t2(x,y); } +#endif + + /*! assignment operator */ + inline __both__ vec_t &operator=(const vec_t &other) { + this->x = other.x; + this->y = other.y; + return *this; + } + + /*! 
construct 2-vector from 2-vector of another type */ + template + inline __both__ explicit vec_t(const vec_t &o) : x((T)o.x), y((T)o.y) {} + + inline __both__ T &operator[](size_t dim) { return (&x)[dim]; } + inline __both__ const T &operator[](size_t dim) const { return (&x)[dim]; } + + union { + struct { T x, y; }; + struct { T s, t; }; + struct { T u, v; }; + }; + }; + + // ------------------------------------------------------------------ + // vec3 + // ------------------------------------------------------------------ + template + struct GDT_INTERFACE vec_t { + enum { dims = 3 }; + typedef T scalar_t; + + inline __both__ vec_t() {} + inline __both__ vec_t(const T &t) : x(t), y(t), z(t) {} + inline __both__ vec_t(const T &_x, const T &_y, const T &_z) : x(_x), y(_y), z(_z) {} +#ifdef __CUDACC__ + inline __both__ vec_t(const int3 &v) : x(v.x), y(v.y), z(v.z) {} + inline __both__ vec_t(const uint3 &v) : x(v.x), y(v.y), z(v.z) {} + inline __both__ vec_t(const float3 &v) : x(v.x), y(v.y), z(v.z) {} + inline __both__ operator float3() const { return make_float3(x,y,z); } + inline __both__ operator int3() const { return make_int3(x,y,z); } + inline __both__ operator uint3() const { return make_uint3(x,y,z); } +#endif + inline __both__ explicit vec_t(const vec_t &v); + /*! construct 3-vector from 3-vector of another type */ + template + inline __both__ explicit vec_t(const vec_t &o) : x((T)o.x), y((T)o.y), z((T)o.z) {} + + /*! swizzle ... */ + inline __both__ vec_t yzx() const { return vec_t(y,z,x); } + + /*! assignment operator */ + inline __both__ vec_t &operator=(const vec_t &other) { + this->x = other.x; + this->y = other.y; + this->z = other.z; + return *this; + } + + inline __both__ T &operator[](size_t dim) { return (&x)[dim]; } + inline __both__ const T &operator[](size_t dim) const { return (&x)[dim]; } + + template + static inline __both__ vec_t make_from(const vec_t &v, const Lambda &lambda) + { return vec_t(lambda(v.x),lambda(v.y),lambda(v.z)); } + + union { + struct { T x, y, z; }; + struct { T r, s, t; }; + struct { T u, v, w; }; + }; + }; + + // ------------------------------------------------------------------ + // vec3a + // ------------------------------------------------------------------ + template + struct GDT_INTERFACE vec3a_t : public vec_t { + inline vec3a_t() {} + inline vec3a_t(const T &t) : vec_t(t) {} + inline vec3a_t(const T &x, const T &y, const T &z) : vec_t(x,y,z) {} + + template + inline vec3a_t(const vec_t &v) : vec_t(v.x,v.y,v.z) {} + + T a; + }; + + // ------------------------------------------------------------------ + // vec4 + // ------------------------------------------------------------------ + template + struct GDT_INTERFACE vec_t { + enum { dims = 4 }; + typedef T scalar_t; + + inline __both__ vec_t() {} + + inline __both__ vec_t(const T &t) + : x(t), y(t), z(t), w(t) + {} + inline __both__ vec_t(const vec_t &xyz, const T &_w) + : x(xyz.x), y(xyz.y), z(xyz.z), w(_w) + {} + inline __both__ vec_t(const T &_x, const T &_y, const T &_z, const T &_w) + : x(_x), y(_y), z(_z), w(_w) + {} + +#ifdef __CUDACC__ + inline __both__ vec_t(const float4 &v) + : x(v.x), y(v.y), z(v.z), w(v.w) + {} + inline __both__ vec_t(const int4 &v) + : x(v.x), y(v.y), z(v.z), w(v.w) + {} + inline __both__ operator float4() const { return make_float4(x,y,z,w); } + inline __both__ operator int4() const { return make_int4(x,y,z,w); } +#endif + /*! 
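`make_from` lifts a per-component callable over the vector's components, a compact way to define componentwise operations once. A standalone sketch (the non-template `V3` is illustrative):

```cpp
#include <cstdio>

// Lift a per-component lambda over a small vector type.
struct V3 { float x, y, z; };

template <typename F>
V3 make_from(const V3 &v, const F &f) { return { f(v.x), f(v.y), f(v.z) }; }

int main() {
    V3 sq = make_from(V3{1,-2,3}, [](float t) { return t*t; });
    std::printf("(%g, %g, %g)\n", sq.x, sq.y, sq.z); // (1, 4, 9)
}
```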
construct 3-vector from 3-vector of another type */ + template + inline __both__ explicit vec_t(const vec_t &o) + : x(o.x), y(o.y), z(o.z), w(o.w) + {} + inline __both__ vec_t(const vec_t &o) : x(o.x), y(o.y), z(o.z), w(o.w) {} + + /*! assignment operator */ + inline __both__ vec_t &operator=(const vec_t &other) { + this->x = other.x; + this->y = other.y; + this->z = other.z; + this->w = other.w; + return *this; + } + + inline __both__ T &operator[](size_t dim) { return (&x)[dim]; } + inline __both__ const T &operator[](size_t dim) const { return (&x)[dim]; } + + template + static inline __both__ vec_t make_from(const vec_t &v, + const Lambda &lambda) + { return vec_t(lambda(v.x),lambda(v.y),lambda(v.z),lambda(v.w)); } + + T x, y, z, w; + }; + + template + inline __both__ vec_t::vec_t(const vec_t &v) + : x(v.x), y(v.y), z(v.z) + {} + + // ======================================================= + // default functions + // ======================================================= + + template + inline __both__ typename long_type_of::type area(const vec_t &v) + { return (typename long_type_of::type)(v.x)*(typename long_type_of::type)(v.y); } + + + template + inline __both__ typename long_type_of::type volume(const vec_t &v) + { return + (typename long_type_of::type)(v.x)* + (typename long_type_of::type)(v.y)* + (typename long_type_of::type)(v.z); + } + + template + inline __both__ typename long_type_of::type volume(const vec_t &v) + { return + (typename long_type_of::type)(v.x)* + (typename long_type_of::type)(v.y)* + (typename long_type_of::type)(v.z)* + (typename long_type_of::type)(v.w); + } + + template + inline __both__ typename long_type_of::type area(const vec_t &v) + { return + T(2)*((typename long_type_of::type)(v.x)*v.y+ + (typename long_type_of::type)(v.y)*v.z+ + (typename long_type_of::type)(v.z)*v.x); + } + + + + /*! vector cross product */ + template + inline __both__ vec_t cross(const vec_t &a, const vec_t &b) + { + return vec_t(a.y*b.z-b.y*a.z, + a.z*b.x-b.z*a.x, + a.x*b.y-b.x*a.y); + } + + /*! vector cross product */ + template + inline __both__ T dot(const vec_t &a, const vec_t &b) + { + return a.x*b.x + a.y*b.y + a.z*b.z; + } + + /*! vector cross product */ + template + inline __both__ vec_t normalize(const vec_t &v) + { + return v * 1.f/gdt::overloaded::sqrt(dot(v,v)); + } + + /*! 
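`area()` and `volume()` compute their products in `long_type_of<T>::type`, the 64-bit sibling of a 32-bit component type, because extents that are individually representable can overflow when multiplied. A two-line illustration:

```cpp
#include <cstdio>
#include <cstdint>

// Components that fit in 32 bits can overflow when multiplied;
// promote to the 64-bit sibling type *before* multiplying.
int main() {
    int32_t w = 100000, h = 50000;
    long long area = (long long)w * h;   // 32-bit w*h would overflow
    std::printf("%lld\n", area);         // 5000000000 (> INT32_MAX)
}
```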
+  /*! vector length */
+  template<typename T>
+  inline __both__ T length(const vec_t<T,3> &v)
+  {
+    return gdt::overloaded::sqrt(dot(v,v));
+  }
+
+  template<typename T>
+  inline __gdt_host std::ostream &operator<<(std::ostream &o, const vec_t<T,1> &v)
+  {
+    o << "(" << v.x << ")";
+    return o;
+  }
+
+  template<typename T>
+  inline __gdt_host std::ostream &operator<<(std::ostream &o, const vec_t<T,2> &v)
+  {
+    o << "(" << v.x << "," << v.y << ")";
+    return o;
+  }
+
+  template<typename T>
+  inline __gdt_host std::ostream &operator<<(std::ostream &o, const vec_t<T,3> &v)
+  {
+    o << "(" << v.x << "," << v.y << "," << v.z << ")";
+    return o;
+  }
+
+  template<typename T>
+  inline __gdt_host std::ostream &operator<<(std::ostream &o, const vec_t<T,4> &v)
+  {
+    o << "(" << v.x << "," << v.y << "," << v.z << "," << v.w << ")";
+    return o;
+  }
+
+  // =======================================================
+  // default instantiations
+  // =======================================================
+
+#define _define_vec_types(T,t)     \
+  using vec2##t = vec_t<T,2>;      \
+  using vec3##t = vec_t<T,3>;      \
+  using vec4##t = vec_t<T,4>;      \
+  using vec3##t##a = vec3a_t<T>;   \
+
+//#define _define_vec_types(T,t)   \
+//  typedef vec_t<T,2> vec2##t;    \
+//  typedef vec_t<T,3> vec3##t;    \
+//  typedef vec_t<T,4> vec4##t;    \
+//  typedef vec3a_t<T> vec3##t##a; \
+
+  _define_vec_types(int8_t ,c);
+  _define_vec_types(int16_t ,s);
+  _define_vec_types(int32_t ,i);
+  _define_vec_types(int64_t ,l);
+  _define_vec_types(uint8_t ,uc);
+  _define_vec_types(uint16_t,us);
+  _define_vec_types(uint32_t,ui);
+  _define_vec_types(uint64_t,ul);
+  _define_vec_types(float,f);
+  _define_vec_types(double,d);
+
+#undef _define_vec_types
+
+} // ::gdt
+
+
+#include "vec/functors.h"
+// comparison operators
+#include "vec/compare.h"
+#include "vec/rotate.h"
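+// the instantiations above generate the short aliases used throughout the
+// examples, e.g.
+//   gdt::vec2i fb_size(1200, 1024);
+//   gdt::vec3f color(.2f, .8f, .2f);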
diff --git a/crates/optix/examples/common/gdt/gdt/math/vec/compare.h b/crates/optix/examples/common/gdt/gdt/math/vec/compare.h
new file mode 100644
index 00000000..0e50fdba
--- /dev/null
+++ b/crates/optix/examples/common/gdt/gdt/math/vec/compare.h
@@ -0,0 +1,59 @@
+// ======================================================================== //
+// Copyright 2018 Ingo Wald                                                 //
+//                                                                          //
+// Licensed under the Apache License, Version 2.0 (the "License");          //
+// you may not use this file except in compliance with the License.         //
+// You may obtain a copy of the License at                                  //
+//                                                                          //
+//     http://www.apache.org/licenses/LICENSE-2.0                           //
+//                                                                          //
+// Unless required by applicable law or agreed to in writing, software      //
+// distributed under the License is distributed on an "AS IS" BASIS,        //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and      //
+// limitations under the License.                                           //
+// ======================================================================== //
+
+#pragma once
+
+namespace gdt {
+
+  // ------------------------------------------------------------------
+  // ==
+  // ------------------------------------------------------------------
+
+#if __CUDACC__
+  template<typename T>
+  inline __both__ bool operator==(const vec_t<T,2> &a, const vec_t<T,2> &b)
+  { return (a.x==b.x) & (a.y==b.y); }
+
+  template<typename T>
+  inline __both__ bool operator==(const vec_t<T,3> &a, const vec_t<T,3> &b)
+  { return (a.x==b.x) & (a.y==b.y) & (a.z==b.z); }
+
+  template<typename T>
+  inline __both__ bool operator==(const vec_t<T,4> &a, const vec_t<T,4> &b)
+  { return (a.x==b.x) & (a.y==b.y) & (a.z==b.z) & (a.w==b.w); }
+#else
+  template<typename T>
+  inline __both__ bool operator==(const vec_t<T,2> &a, const vec_t<T,2> &b)
+  { return a.x==b.x && a.y==b.y; }
+
+  template<typename T>
+  inline __both__ bool operator==(const vec_t<T,3> &a, const vec_t<T,3> &b)
+  { return a.x==b.x && a.y==b.y && a.z==b.z; }
+
+  template<typename T>
+  inline __both__ bool operator==(const vec_t<T,4> &a, const vec_t<T,4> &b)
+  { return a.x==b.x && a.y==b.y && a.z==b.z && a.w==b.w; }
+#endif
+
+  // ------------------------------------------------------------------
+  // !=
+  // ------------------------------------------------------------------
+
+  template<typename T, int N>
+  inline __both__ bool operator!=(const vec_t<T,N> &a, const vec_t<T,N> &b)
+  { return !(a==b); }
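+  // design note: the __CUDACC__ '==' overloads above use bitwise '&' rather
+  // than '&&', so the whole comparison compiles to branch-free code on the
+  // GPU; the host variants keep the conventional short-circuit form.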
+
+} // ::gdt
diff --git a/crates/optix/examples/common/gdt/gdt/math/vec/functors.h b/crates/optix/examples/common/gdt/gdt/math/vec/functors.h
new file mode 100644
index 00000000..a2c8e741
--- /dev/null
+++ b/crates/optix/examples/common/gdt/gdt/math/vec/functors.h
@@ -0,0 +1,364 @@
+// ======================================================================== //
+// Copyright 2018 Ingo Wald                                                 //
+//                                                                          //
+// Licensed under the Apache License, Version 2.0 (the "License");          //
+// you may not use this file except in compliance with the License.         //
+// You may obtain a copy of the License at                                  //
+//                                                                          //
+//     http://www.apache.org/licenses/LICENSE-2.0                           //
+//                                                                          //
+// Unless required by applicable law or agreed to in writing, software      //
+// distributed under the License is distributed on an "AS IS" BASIS,        //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and      //
+// limitations under the License.                                           //
+// ======================================================================== //
+
+#pragma once
+
+#include <limits>
+
+namespace gdt {
+
+  // inline __both__ float min(float x, float y) { return fminf(x,y); }
+  // inline __both__ float max(float x, float y) { return fmaxf(x,y); }
+
+  // =======================================================
+  // scalar functors
+  // =======================================================
+
+  template<typename T>
+  inline __both__ T divRoundUp(const T &a, const T &b)
+  { //causes issues on ubuntu16-gcc: static_assert(std::numeric_limits<T>::is_integer);
+    return T((a+b-1)/b); }
+
+  // =======================================================
+  // vector specializations of those scalar functors
+  // =======================================================
+
+  template<typename T, int N> inline __both__
+  bool any_less_than(const vec_t<T,N> &a, const vec_t<T,N> &b)
+  { for (int i=0;i<N;i++) if (a[i] < b[i]) return true; return false; }
+
+  template<typename T, int N> inline __both__
+  bool any_greater_than(const vec_t<T,N> &a, const vec_t<T,N> &b)
+  { for (int i=0;i<N;i++) if (a[i] > b[i]) return true; return false; }
+
+  // -------------------------------------------------------
+  // unary functors
+  // -------------------------------------------------------
+
+  template<typename T>
+  inline __both__ T clamp(const T &val, const T &lo, const T &hi)
+  { return min(hi,max(lo,val)); }
+
+  template<typename T>
+  inline __both__ T clamp(const T &val, const T &hi)
+  { return clamp(val,(T)0,hi); }
+
+#define _define_float_functor(func)                                          \
+  template<typename T> inline __both__ vec_t<T,2> func(const vec_t<T,2> &v)  \
+  { return vec_t<T,2>(func(v.x),func(v.y)); }                                \
+                                                                             \
+  template<typename T> inline __both__ vec_t<T,3> func(const vec_t<T,3> &v)  \
+  { return vec_t<T,3>(func(v.x),func(v.y),func(v.z)); }                      \
+                                                                             \
+  template<typename T> inline __both__ vec_t<T,4> func(const vec_t<T,4> &v)  \
+  { return vec_t<T,4>(func(v.x),func(v.y),func(v.z),func(v.w)); }            \
+
+  _define_float_functor(rcp)
+  _define_float_functor(sin)
+  _define_float_functor(cos)
+  _define_float_functor(abs)
+  _define_float_functor(saturate)
+
+#undef _define_float_functor
+
+  // -------------------------------------------------------
+  // binary functors
+  // -------------------------------------------------------
+  // template<typename T>
+  // __both__ vec_t<T,2> divRoundUp(const vec_t<T,2> &a, const vec_t<T,2> &b)
+  // { return vec_t<T,2>(divRoundUp(a.x,b.x),divRoundUp(a.y,b.y)); }
+
+  // template<typename T>
+  // __both__ vec_t<T,3> divRoundUp(const vec_t<T,3> &a, const vec_t<T,3> &b)
+  // { return vec_t<T,3>(divRoundUp(a.x,b.x),divRoundUp(a.y,b.y),divRoundUp(a.z,b.z)); }
+
+#define _define_binary_functor(fct)                                     \
+  template<typename T>                                                  \
+  __both__ vec_t<T,1> fct(const vec_t<T,1> &a, const vec_t<T,1> &b)     \
+  {                                                                     \
+    return vec_t<T,1>(fct(a.x,b.x));                                    \
+  }                                                                     \
+                                                                        \
+  template<typename T>                                                  \
+  __both__ vec_t<T,2> fct(const vec_t<T,2> &a, const vec_t<T,2> &b)     \
+  {                                                                     \
+    return vec_t<T,2>(fct(a.x,b.x),                                     \
+                      fct(a.y,b.y));                                    \
+  }                                                                     \
+                                                                        \
+  template<typename T>                                                  \
+  __both__ vec_t<T,3> fct(const vec_t<T,3> &a, const vec_t<T,3> &b)     \
+  {                                                                     \
+    return vec_t<T,3>(fct(a.x,b.x),                                     \
+                      fct(a.y,b.y),                                     \
+                      fct(a.z,b.z));                                    \
+  }                                                                     \
+                                                                        \
+  template<typename T1, typename T2>                                    \
+  __both__ vec_t<typename BinaryOpResultType<T1,T2>::type,3>             \
+  fct(const vec_t<T1,3> &a, const vec_t<T2,3> &b)                        \
+  {                                                                      \
+    return vec_t<typename BinaryOpResultType<T1,T2>::type,3>             \
+      (fct(a.x,b.x),                                                     \
+       fct(a.y,b.y),                                                     \
+       fct(a.z,b.z));                                                    \
+  }                                                                      \
+                                                                         \
+  template<typename T>                                                   \
+  __both__ vec_t<T,4> fct(const vec_t<T,4> &a, const vec_t<T,4> &b)      \
+  {                                                                      \
+    return vec_t<T,4>(fct(a.x,b.x),                                      \
+                      fct(a.y,b.y),                                      \
+                      fct(a.z,b.z),                                      \
+                      fct(a.w,b.w));                                     \
+  }                                                                      \
+
+
+  _define_binary_functor(divRoundUp)
+  _define_binary_functor(min)
+  _define_binary_functor(max)
+#undef _define_binary_functor
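+  // e.g. divRoundUp is the usual launch-sizing helper:
+  //   int numBlocks = divRoundUp(numItems, blockSize);
+  // and the macro above extends it (plus min/max) component-wise to vec_t.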
+
+
+
+
+
+  // -------------------------------------------------------
+  // binary operators
+  // -------------------------------------------------------
+#define _define_operator(op)                                            \
+  /* vec op vec */                                                      \
+  template<typename T>                                                  \
+  inline __both__ vec_t<T,2> operator op(const vec_t<T,2> &a,           \
+                                         const vec_t<T,2> &b)           \
+  { return vec_t<T,2>(a.x op b.x, a.y op b.y); }                        \
+                                                                        \
+  template<typename T>                                                  \
+  inline __both__ vec_t<T,3> operator op(const vec_t<T,3> &a,           \
+                                         const vec_t<T,3> &b)           \
+  { return vec_t<T,3>(a.x op b.x, a.y op b.y, a.z op b.z); }            \
+                                                                        \
+  template<typename T>                                                  \
+  inline __both__ vec_t<T,4> operator op(const vec_t<T,4> &a,           \
+                                         const vec_t<T,4> &b)           \
+  { return vec_t<T,4>(a.x op b.x,a.y op b.y,a.z op b.z,a.w op b.w); }   \
+                                                                        \
+  /* vec op scalar */                                                   \
+  template<typename T>                                                  \
+  inline __both__ vec_t<T,2> operator op(const vec_t<T,2> &a,           \
+                                         const T &b)                    \
+  { return vec_t<T,2>(a.x op b, a.y op b); }                            \
+                                                                        \
+  template<typename T1, typename T2>                                    \
+  inline __both__ vec_t<typename BinaryOpResultType<T1,T2>::type,3>     \
+  operator op(const vec_t<T1,3> &a, const T2 &b)                        \
+  { return vec_t<typename BinaryOpResultType<T1,T2>::type,3>            \
+      (a.x op b, a.y op b, a.z op b);                                   \
+  }                                                                     \
+                                                                        \
+  template<typename T>                                                  \
+  inline __both__ vec_t<T,4> operator op(const vec_t<T,4> &a,           \
+                                         const T &b)                    \
+  { return vec_t<T,4>(a.x op b, a.y op b, a.z op b, a.w op b); }        \
+                                                                        \
+  /* scalar op vec */                                                   \
+  template<typename T>                                                  \
+  inline __both__ vec_t<T,2> operator op(const T &a,                    \
+                                         const vec_t<T,2> &b)           \
+  { return vec_t<T,2>(a op b.x, a op b.y); }                            \
+                                                                        \
+  template<typename T>                                                  \
+  inline __both__ vec_t<T,3> operator op(const T &a,                    \
+                                         const vec_t<T,3> &b)           \
+  { return vec_t<T,3>(a op b.x, a op b.y, a op b.z); }                  \
+                                                                        \
+  template<typename T>                                                  \
+  inline __both__ vec_t<T,4> operator op(const T &a,                    \
+                                         const vec_t<T,4> &b)           \
+  { return vec_t<T,4>(a op b.x, a op b.y, a op b.z, a op b.w); }        \
+                                                                        \
+                                                                        \
+
+  _define_operator(*);
+  _define_operator(/);
+  _define_operator(+);
+  _define_operator(-);
+
+#undef _define_operator
+
+
+
+
+  // -------------------------------------------------------
+  // unary operators
+  // -------------------------------------------------------
+
+  template<typename T>
+  inline __both__ vec_t<T,2> operator-(const vec_t<T,2> &v)
+  { return vec_t<T,2>(-v.x, -v.y); }
+
+  template<typename T>
+  inline __both__ vec_t<T,2> operator+(const vec_t<T,2> &v)
+  { return vec_t<T,2>(v.x, v.y); }
+
+  template<typename T>
+  inline __both__ vec_t<T,3> operator-(const vec_t<T,3> &v)
+  { return vec_t<T,3>(-v.x, -v.y, -v.z); }
+
+  template<typename T>
+  inline __both__ vec_t<T,3> operator+(const vec_t<T,3> &v)
+  { return vec_t<T,3>(v.x, v.y, v.z); }
+
+
+
+  // -------------------------------------------------------
+  // binary op-assign operators
+  // -------------------------------------------------------
+#define _define_op_assign_operator(operator_op,op)                      \
+  /* vec op vec */                                                      \
+  template<typename T, typename OT>                                     \
+  inline __both__ vec_t<T,2> &operator_op(vec_t<T,2> &a,                \
+                                          const vec_t<OT,2> &b)         \
+  {                                                                     \
+    a.x op (T)b.x;                                                      \
+    a.y op (T)b.y;                                                      \
+    return a;                                                           \
+  }                                                                     \
+                                                                        \
+  template<typename T, typename OT>                                     \
+  inline __both__ vec_t<T,3> &operator_op(vec_t<T,3> &a,                \
+                                          const vec_t<OT,3> &b)         \
+  {                                                                     \
+    a.x op (T)b.x;                                                      \
+    a.y op (T)b.y;                                                      \
+    a.z op (T)b.z;                                                      \
+    return a;                                                           \
+  }                                                                     \
+                                                                        \
+  template<typename T, typename OT>                                     \
+  inline __both__ vec_t<T,4> &operator_op(vec_t<T,4> &a,                \
+                                          const vec_t<OT,4> &b)         \
+  {                                                                     \
+    a.x op (T)b.x;                                                      \
+    a.y op (T)b.y;                                                      \
+    a.z op (T)b.z;                                                      \
+    a.w op (T)b.w;                                                      \
+    return a;                                                           \
+  }                                                                     \
+                                                                        \
+  /* vec op scalar */                                                   \
+  template<typename T, typename OT>                                     \
+  inline __both__ vec_t<T,2> &operator_op(vec_t<T,2> &a,                \
+                                          const OT &b)                  \
+  { a.x op (T)b; a.y op (T)b; return a; }                               \
+                                                                        \
+  template<typename T, typename OT>                                     \
+  inline __both__ vec_t<T,3> &operator_op(vec_t<T,3> &a,                \
+                                          const OT &b)                  \
+  { a.x op (T)b; a.y op (T)b; a.z op (T)b; return a; }                  \
+                                                                        \
+  template<typename T, typename OT>                                     \
+  inline __both__ vec_t<T,4> &operator_op(vec_t<T,4> &a,                \
+                                          const OT &b)                  \
+  { a.x op (T)b; a.y op (T)b; a.z op (T)b; a.w op (T)b; return a; }     \
+
+  _define_op_assign_operator(operator*=,*=);
+  _define_op_assign_operator(operator/=,/=);
+  _define_op_assign_operator(operator+=,+=);
+  _define_op_assign_operator(operator-=,-=);
+
+#undef _define_op_assign_operator
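+  // note: each _define_operator expansion above emits vec-vec, vec-scalar
+  // and scalar-vec overloads, so mixed expressions like
+  //   v * 2.f + offset
+  // work component-wise without extra boilerplate.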
+
+
+  template<typename T>
+  __both__ T reduce_min(const vec_t<T,1> &v) { return v.x; }
+  template<typename T>
+  __both__ T reduce_min(const vec_t<T,2> &v) { return min(v.x,v.y); }
+  template<typename T>
+  __both__ T reduce_min(const vec_t<T,3> &v) { return min(min(v.x,v.y),v.z); }
+  template<typename T>
+  __both__ T reduce_min(const vec_t<T,4> &v) { return min(min(v.x,v.y),min(v.z,v.w)); }
+  template<typename T>
+  __both__ T reduce_max(const vec_t<T,2> &v) { return max(v.x,v.y); }
+  template<typename T>
+  __both__ T reduce_max(const vec_t<T,3> &v) { return max(max(v.x,v.y),v.z); }
+  template<typename T>
+  __both__ T reduce_max(const vec_t<T,4> &v) { return max(max(v.x,v.y),max(v.z,v.w)); }
+
+
+  template<typename T, int N>
+  __both__ vec_t<T,N> madd(const vec_t<T,N> &a, const vec_t<T,N> &b, const vec_t<T,N> &c)
+  {
+    return a*b + c;
+  }
+
+
+  template<typename T, int N>
+  __both__ int arg_max(const vec_t<T,N> &v)
+  {
+    int biggestDim = 0;
+    for (int i=1;i<N;i++)
+      if (abs(v[i]) > abs(v[biggestDim])) biggestDim = i;
+    return biggestDim;
+  }
+
+
+  // less, for std::set, std::map, etc
+  template<typename T>
+  __both__ bool operator<(const vec_t<T,3> &a, const vec_t<T,3> &b)
+  {
+    if (a.x < b.x) return true;
+    if (a.x == b.x && a.y < b.y) return true;
+    if (a.x == b.x && a.y == b.y && a.z < b.z) return true;
+    return false;
+    // return
+    //   (a.x < b.x) |
+    //   ((a.x == b.x) & ((a.y < b.y) |
+    //                    ((a.y == b.y) & (a.z < b.z))));
+  }
+
+  /*! helper function that creates a semi-random color from an ID */
+  inline __both__ vec3f randomColor(int i)
+  {
+    int r = unsigned(i)*13*17 + 0x234235;
+    int g = unsigned(i)*7*3*5 + 0x773477;
+    int b = unsigned(i)*11*19 + 0x223766;
+    return vec3f((r&255)/255.f,
+                 (g&255)/255.f,
+                 (b&255)/255.f);
+  }
+
+  /*! helper function that creates a semi-random color from an ID */
+  inline __both__ vec3f randomColor(size_t idx)
+  {
+    unsigned int r = (unsigned int)(idx*13*17 + 0x234235);
+    unsigned int g = (unsigned int)(idx*7*3*5 + 0x773477);
+    unsigned int b = (unsigned int)(idx*11*19 + 0x223766);
+    return vec3f((r&255)/255.f,
+                 (g&255)/255.f,
+                 (b&255)/255.f);
+  }
+
+  /*! helper function that creates a semi-random color from an ID */
+  template<typename T>
+  inline __both__ vec3f randomColor(const T *ptr)
+  {
+    return randomColor((size_t)ptr);
+  }
+
+
+} // ::gdt
diff --git a/crates/optix/examples/common/gdt/gdt/math/vec/rotate.h b/crates/optix/examples/common/gdt/gdt/math/vec/rotate.h
new file mode 100644
index 00000000..2a7ba29a
--- /dev/null
+++ b/crates/optix/examples/common/gdt/gdt/math/vec/rotate.h
@@ -0,0 +1,40 @@
+// ======================================================================== //
+// Copyright 2018 Ingo Wald                                                 //
+//                                                                          //
+// Licensed under the Apache License, Version 2.0 (the "License");          //
+// you may not use this file except in compliance with the License.         //
+// You may obtain a copy of the License at                                  //
+//                                                                          //
+//     http://www.apache.org/licenses/LICENSE-2.0                           //
+//                                                                          //
+// Unless required by applicable law or agreed to in writing, software      //
+// distributed under the License is distributed on an "AS IS" BASIS,        //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and      //
+// limitations under the License.                                           //
+// ======================================================================== //
+
+#pragma once
+
+namespace gdt {
+
+  /*! perform 'rotation' of float a by amount b. Both a and b must be
+    in 0,1 range, the result will be (a+b) clamped to that same
+    range (ie, it is the value a shifted by the amount b to the
+    right, and re-entering the [0,1) range on the left if it
+    "rotates" out on the right */
+  inline __both__ float rotate(const float a, const float b)
+  {
+    float sum = a+b;
+    return (sum-1.f)<0.f?sum:(sum-1.f);
+  }
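+  // e.g. rotate(0.8f, 0.5f) yields 0.3f: the sum 1.3 re-enters [0,1) on the left.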
+
+  /*! perform 'rotation' of float a by amount b. Both a and b must be
+    in 0,1 range, the result will be (a+b) clamped to that same
+    range (ie, it is the value a shifted by the amount b to the
+    right, and re-entering the [0,1) range on the left if it
+    "rotates" out on the right */
+  inline __both__ vec2f rotate(const vec2f a, const vec2f b)
+  { return vec2f(rotate(a.x,b.x),rotate(a.y,b.y)); }
+
+} // ::gdt
diff --git a/crates/optix/examples/common/gdt/gdt/random/random.h b/crates/optix/examples/common/gdt/gdt/random/random.h
new file mode 100644
index 00000000..e2f77465
--- /dev/null
+++ b/crates/optix/examples/common/gdt/gdt/random/random.h
@@ -0,0 +1,91 @@
+// ======================================================================== //
+// Copyright 2018 Ingo Wald                                                 //
+//                                                                          //
+// Licensed under the Apache License, Version 2.0 (the "License");          //
+// you may not use this file except in compliance with the License.         //
+// You may obtain a copy of the License at                                  //
+//                                                                          //
+//     http://www.apache.org/licenses/LICENSE-2.0                           //
+//                                                                          //
+// Unless required by applicable law or agreed to in writing, software      //
+// distributed under the License is distributed on an "AS IS" BASIS,        //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and      //
+// limitations under the License.                                           //
+// ======================================================================== //
+
+/* pieces originally taken from optixPathTracer/random.h example,
+   under following license */
+
+/*
+ * Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "gdt/gdt.h"
+
+namespace gdt {
+
+  /*! simple 24-bit linear congruence generator */
+  template<unsigned int N=16>
+  struct LCG {
+
+    inline __both__ LCG()
+    { /* intentionally empty so we can use it in device vars that
+         don't allow dynamic initialization (ie, PRD) */
+    }
+    inline __both__ LCG(unsigned int val0, unsigned int val1)
+    { init(val0,val1); }
+
+    inline __both__ void init(unsigned int val0, unsigned int val1)
+    {
+      unsigned int v0 = val0;
+      unsigned int v1 = val1;
+      unsigned int s0 = 0;
+
+      for (unsigned int n = 0; n < N; n++) {
+        s0 += 0x9e3779b9;
+        v0 += ((v1<<4)+0xa341316c)^(v1+s0)^((v1>>5)+0xc8013ea4);
+        v1 += ((v0<<4)+0xad90777d)^(v0+s0)^((v0>>5)+0x7e95761e);
+      }
+      state = v0;
+    }
+
+    // Generate random float in [0, 1)
+    inline __both__ float operator() ()
+    {
+      const uint32_t LCG_A = 1664525u;
+      const uint32_t LCG_C = 1013904223u;
+      state = (LCG_A * state + LCG_C);
+      return (state & 0x00FFFFFF) / (float) 0x01000000;
+    }
+
+    uint32_t state;
+  };
+
+} // ::gdt
diff --git a/crates/optix/examples/ex02_pipeline/Cargo.toml b/crates/optix/examples/ex02_pipeline/Cargo.toml
new file mode 100644
index 00000000..caeaf8b6
--- /dev/null
+++ b/crates/optix/examples/ex02_pipeline/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "ex02_pipeline"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+optix = {path = "../../"}
+cust = {path = "../../../cust"}
+anyhow = "1.0.44"
+device = { path = "./device" }
+
+[build-dependencies]
+find_cuda_helper = { version = "0.2", path = "../../../find_cuda_helper" }
+cuda_builder = { version = "0.2", path = "../../../cuda_builder" }
diff --git a/crates/optix/examples/ex02_pipeline/build.rs b/crates/optix/examples/ex02_pipeline/build.rs
new file mode 100644
index 00000000..b7274457
--- /dev/null
+++ b/crates/optix/examples/ex02_pipeline/build.rs
@@ -0,0 +1,52 @@
+use cuda_builder::CudaBuilder;
+use find_cuda_helper::find_optix_root;
+
+fn main() {
+    let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap();
+
+    let mut optix_include = find_optix_root().expect(
+        "Unable to find the OptiX SDK, make sure you installed it and
+    that OPTIX_ROOT or OPTIX_ROOT_DIR are set",
+    );
+    optix_include = optix_include.join("include");
+
+    let args = vec![
+        format!("-I{}", optix_include.display()),
+        format!("-I{}/../common/gdt", manifest_dir),
+    ];
+
+    compile_to_ptx("src/ex02_pipeline.cu", &args);
+
+    let ptx_path = std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap()).join("device.ptx");
+
+    CudaBuilder::new("device")
+        .copy_to(ptx_path)
+        .arch(cuda_builder::NvvmArch::Compute75)
+        .optix(true)
+        .build()
+        .unwrap();
+}
+
+fn compile_to_ptx(cu_path: &str, args: &[String]) {
+    println!("cargo:rerun-if-changed={}", cu_path);
+
+    let full_path =
+        std::path::PathBuf::from(std::env::var("CARGO_MANIFEST_DIR").unwrap()).join(cu_path);
+
+    let mut ptx_path = std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap()).join(cu_path);
+    ptx_path.set_extension("ptx");
+    std::fs::create_dir_all(ptx_path.parent().unwrap()).unwrap();
+
+    let output = std::process::Command::new("nvcc")
+        .arg("-ptx")
+        .arg(&full_path)
+        .arg("-o")
+        .arg(&ptx_path)
+        .args(args)
+        .output()
+        .expect("failed to run nvcc");
+
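+    // nvcc reports compile errors on stderr, so surface them and make a
+    // broken .cu file fail the build with a readable message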
+    if !output.status.success() {
+        panic!("{}", unsafe { String::from_utf8_unchecked(output.stderr) });
+    }
+}
diff --git a/crates/optix/examples/ex02_pipeline/device/Cargo.toml b/crates/optix/examples/ex02_pipeline/device/Cargo.toml
new file mode 100644
index 00000000..0b3dfa59
--- /dev/null
+++ b/crates/optix/examples/ex02_pipeline/device/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "device"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+cuda_std = { version = "0.2", path = "../../../../cuda_std" }
+cust_core = { version = "0.1", path = "../../../../cust_core" }
+optix_device = { path = "../../../../optix_device" }
+
+[lib]
+crate-type = ["cdylib", "rlib"]
+
diff --git a/crates/optix/examples/ex02_pipeline/device/src/lib.rs b/crates/optix/examples/ex02_pipeline/device/src/lib.rs
new file mode 100644
index 00000000..45cabfee
--- /dev/null
+++ b/crates/optix/examples/ex02_pipeline/device/src/lib.rs
@@ -0,0 +1,72 @@
+#![feature(asm)]
+#![cfg_attr(
+    target_os = "cuda",
+    no_std,
+    feature(register_attr),
+    register_attr(nvvm_internal)
+)]
+// #![deny(warnings)]
+#![allow(clippy::missing_safety_doc)]
+
+use cuda_std::*;
+use cust_core::DeviceCopy;
+
+use optix_device as optix;
+
+extern crate alloc;
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct LaunchParams {
+    pub frame_id: i32,
+    pub fb_size: [u32; 2],
+    pub color_buffer: u64,
+}
+
+unsafe impl DeviceCopy for LaunchParams {}
+
+#[no_mangle]
+static PARAMS: LaunchParams = LaunchParams {
+    frame_id: 88,
+    fb_size: [1, 1],
+    color_buffer: 0,
+};
+
+extern "C" {
+    pub fn vprintf(format: *const u8, valist: *const core::ffi::c_void) -> i32;
+}
+
+#[kernel]
+pub unsafe fn __closesthit__radiance() {}
+
+#[kernel]
+pub unsafe fn __anyhit__radiance() {}
+
+#[kernel]
+pub unsafe fn __miss__radiance() {}
+
+#[kernel]
+pub unsafe fn __raygen__renderFrame() {
+    // let ix = _optix_get_launch_index_x();
+    // let iy = _optix_get_launch_index_y();
+
+    let idx = optix::get_launch_index();
+
+    if idx[0] == 3 && idx[1] == 4 {
+        vprintf(
+            b"Hello from Rust kernel!\n\0".as_ptr().cast(),
+            core::ptr::null::<core::ffi::c_void>(),
+        );
+
+        #[repr(C)]
+        struct PrintArgs(i32);
+
+        vprintf(
+            b"frame id is %d\n\0".as_ptr().cast(),
+            core::mem::transmute(&PrintArgs(core::ptr::read_volatile(&PARAMS.frame_id))),
+        );
+    }
+}
+
+// #[kernel]
+// pub unsafe fn render(fb: *mut Vec3, view: &Viewport) {}
diff --git a/crates/optix/examples/ex02_pipeline/src/ex02_pipeline.cu b/crates/optix/examples/ex02_pipeline/src/ex02_pipeline.cu
new file mode 100644
index 00000000..7ff52b2b
--- /dev/null
+++ b/crates/optix/examples/ex02_pipeline/src/ex02_pipeline.cu
@@ -0,0 +1,82 @@
+// ======================================================================== //
+// Copyright 2018-2019 Ingo Wald                                            //
+//                                                                          //
+// Licensed under the Apache License, Version 2.0 (the "License");          //
+// you may not use this file except in compliance with the License.         //
+// You may obtain a copy of the License at                                  //
+//                                                                          //
+//     http://www.apache.org/licenses/LICENSE-2.0                           //
+//                                                                          //
+// Unless required by applicable law or agreed to in writing, software      //
+// distributed under the License is distributed on an "AS IS" BASIS,        //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and      //
+// limitations under the License.                                           //
+// ======================================================================== //
+
+#include <optix_device.h>
+
+#include "gdt/math/vec.h"
+
+namespace osc {
+
+using namespace gdt;
+
+struct LaunchParams {
+    int frameID{0};
+    uint32_t* colorBuffer;
+    vec2i fbSize;
+};
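+// note: this .cu file is what build.rs hands to nvcc; the pipeline in
+// renderer.rs actually loads the PTX built from the Rust `device` crate,
+// whose #[repr(C)] LaunchParams plays the same role as this struct.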
+
+/*! launch parameters in constant memory, filled in by optix upon
+    optixLaunch (this gets filled in from the buffer we pass to
+    optixLaunch) */
+extern "C" __constant__ LaunchParams PARAMS;
+
+//------------------------------------------------------------------------------
+// closest hit and anyhit programs for radiance-type rays.
+//
+// Note eventually we will have to create one pair of those for each
+// ray type and each geometry type we want to render; but this
+// simple example doesn't use any actual geometries yet, so we only
+// create a single, dummy, set of them (we do have to have at least
+// one group of them to set up the SBT)
+//------------------------------------------------------------------------------
+
+extern "C" __global__ void
+__closesthit__radiance() { /*! for this simple example, this will remain empty
+                            */
+}
+
+extern "C" __global__ void
+__anyhit__radiance() { /*! for this simple example, this will remain empty */
+}
+
+//------------------------------------------------------------------------------
+// miss program that gets called for any ray that did not have a
+// valid intersection
+//
+// as with the anyhit/closest hit programs, in this example we only
+// need to have _some_ dummy function to set up a valid SBT
+// ------------------------------------------------------------------------------
+
+extern "C" __global__ void
+__miss__radiance() { /*! for this simple example, this will remain empty */
+}
+
+//------------------------------------------------------------------------------
+// ray gen program - the actual rendering happens in here
+//------------------------------------------------------------------------------
+extern "C" __global__ void __raygen__renderFrame() {
+    if (optixGetLaunchIndex().x == 0 &&
+        optixGetLaunchIndex().y == 0) {
+        // we could of course also have used optixGetLaunchDims to query
+        // the launch size, but accessing the PARAMS here
+        // makes sure they're not getting optimized away (because
+        // otherwise they'd not get used)
+        printf("Hello world from OptiX 7 c++!\n");
+        printf("frameID is %d\n", PARAMS.frameID);
+    }
+}
+
+} // namespace osc
diff --git a/crates/optix/examples/ex02_pipeline/src/launch_params.h b/crates/optix/examples/ex02_pipeline/src/launch_params.h
new file mode 100644
index 00000000..e4364e45
--- /dev/null
+++ b/crates/optix/examples/ex02_pipeline/src/launch_params.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include "gdt/math/vec.h"
+
+namespace osc {
+} // namespace osc
diff --git a/crates/optix/examples/ex02_pipeline/src/main.rs b/crates/optix/examples/ex02_pipeline/src/main.rs
new file mode 100644
index 00000000..19c21f42
--- /dev/null
+++ b/crates/optix/examples/ex02_pipeline/src/main.rs
@@ -0,0 +1,8 @@
+mod renderer;
+use renderer::Renderer;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let mut renderer = Renderer::new(256, 128)?;
+    renderer.render()?;
+    Ok(())
+}
diff --git a/crates/optix/examples/ex02_pipeline/src/renderer.rs b/crates/optix/examples/ex02_pipeline/src/renderer.rs
new file mode 100644
index 00000000..1abd3358
--- /dev/null
+++ b/crates/optix/examples/ex02_pipeline/src/renderer.rs
@@ -0,0 +1,222 @@
+use anyhow::{Context, Result};
+use cust::context::{Context as CuContext, ContextFlags};
+use cust::device::{Device, DeviceAttribute};
+use cust::memory::{CopyDestination, DeviceBox, DeviceBuffer, DevicePointer, DeviceVariable};
+use cust::stream::{Stream, StreamFlags};
+use cust::CudaFlags;
+use cust::DeviceCopy;
+use optix::{
+    context::DeviceContext,
+    pipeline::{
+        CompileDebugLevel, CompileOptimizationLevel,
ExceptionFlags, Module, ModuleCompileOptions, + Pipeline, PipelineCompileOptions, PipelineLinkOptions, ProgramGroup, ProgramGroupDesc, + TraversableGraphFlags, + }, + shader_binding_table::{SbtRecord, ShaderBindingTable}, +}; + +pub struct Renderer { + launch_params: DeviceVariable, + sbt: ShaderBindingTable, + pipeline: Pipeline, + buf_raygen: DeviceBuffer, + buf_hitgroup: DeviceBuffer, + buf_miss: DeviceBuffer, + color_buffer: DeviceBuffer, + ctx: DeviceContext, + stream: Stream, + cuda_context: CuContext, +} + +use device::LaunchParams; + +impl Renderer { + pub fn new(width: usize, height: usize) -> Result> { + init_optix()?; + + // create CUDA and OptiX contexts + let device = Device::get_device(0)?; + let tex_align = device.get_attribute(DeviceAttribute::TextureAlignment)?; + let srf_align = device.get_attribute(DeviceAttribute::SurfaceAlignment)?; + println!("tex align: {}\nsrf align: {}", tex_align, srf_align); + + let cuda_context = + CuContext::create_and_push(ContextFlags::SCHED_AUTO | ContextFlags::MAP_HOST, device)?; + let stream = Stream::new(StreamFlags::DEFAULT, None)?; + + let mut ctx = DeviceContext::new(&cuda_context, false)?; + ctx.set_log_callback(|_level, tag, msg| println!("[{}]: {}", tag, msg), 4)?; + + // create module + let module_compile_options = ModuleCompileOptions { + max_register_count: 50, + opt_level: CompileOptimizationLevel::Default, + debug_level: CompileDebugLevel::None, + }; + + let pipeline_compile_options = PipelineCompileOptions::new() + .pipeline_launch_params_variable_name("PARAMS") + .uses_motion_blur(false) + .num_attribute_values(2) + .num_payload_values(2) + .traversable_graph_flags(TraversableGraphFlags::ALLOW_SINGLE_GAS) + .exception_flags(ExceptionFlags::NONE); + + // let ptx = include_str!(concat!(env!("OUT_DIR"), "/src/ex02_pipeline.ptx")); + let ptx = include_str!(concat!(env!("OUT_DIR"), "/device.ptx")); + + let (module, _log) = Module::new( + &mut ctx, + &module_compile_options, + &pipeline_compile_options, + ptx, + ) + .context("Create module")?; + + // create raygen program + let pgdesc_raygen = ProgramGroupDesc::raygen(&module, "__raygen__renderFrame"); + + let (pg_raygen, _log) = ProgramGroup::new(&mut ctx, &[pgdesc_raygen])?; + + // create miss program + let pgdesc_miss = ProgramGroupDesc::miss(&module, "__miss__radiance"); + + let (pg_miss, _log) = ProgramGroup::new(&mut ctx, &[pgdesc_miss])?; + + let pgdesc_hitgroup = ProgramGroupDesc::hitgroup( + Some((&module, "__closesthit__radiance")), + Some((&module, "__anyhit__radiance")), + None, + ); + + // create hitgroup programs + let (pg_hitgroup, _log) = ProgramGroup::new(&mut ctx, &[pgdesc_hitgroup])?; + + // create SBT + let rec_raygen: Vec<_> = pg_raygen + .iter() + .map(|pg| RaygenRecord::pack(0, pg).expect("failed to pack raygen record")) + .collect(); + + let rec_miss: Vec<_> = pg_miss + .iter() + .map(|pg| MissRecord::pack(0, pg).expect("failed to pack miss record")) + .collect(); + + let num_objects = 1; + let rec_hitgroup: Vec<_> = (0..num_objects) + .map(|i| { + let object_type = 0; + let rec = HitgroupRecord::pack( + HitgroupSbtData { object_id: i }, + &pg_hitgroup[object_type], + ) + .expect("failed to pack hitgroup record"); + rec + }) + .collect(); + + let mut buf_raygen = DeviceBuffer::from_slice(&rec_raygen)?; + let mut buf_miss = DeviceBuffer::from_slice(&rec_miss)?; + let mut buf_hitgroup = DeviceBuffer::from_slice(&rec_hitgroup)?; + + let sbt = ShaderBindingTable::new(&mut buf_raygen) + .miss(&mut buf_miss) + .hitgroup(&mut buf_hitgroup); + + // create pipeline + 
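+        // every program group referenced by the SBT records above has to be
+        // linked into this one pipeline before optix::launch is called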
let mut program_groups = Vec::new(); + program_groups.extend(pg_raygen.into_iter()); + program_groups.extend(pg_miss.into_iter()); + program_groups.extend(pg_hitgroup.into_iter()); + + let pipeline_link_options = PipelineLinkOptions { + max_trace_depth: 2, + debug_level: CompileDebugLevel::LineInfo, + }; + + let (pipeline, _log) = Pipeline::new( + &mut ctx, + &pipeline_compile_options, + pipeline_link_options, + &program_groups, + )?; + + pipeline.set_stack_size(2 * 1024, 2 * 1024, 2 * 1024, 1)?; + + let color_buffer = unsafe { DeviceBuffer::uninitialized(width * height)? }; + + let launch_params = DeviceVariable::new(LaunchParams { + frame_id: 17, + fb_size: [width as u32, height as u32], + color_buffer: color_buffer.as_device_ptr(), + })?; + + Ok(Renderer { + ctx, + cuda_context, + stream, + launch_params, + buf_raygen, + buf_hitgroup, + buf_miss, + sbt, + pipeline, + color_buffer, + }) + } + + pub fn resize( + &mut self, + width: usize, + height: usize, + ) -> Result<(), Box> { + self.color_buffer = unsafe { DeviceBuffer::uninitialized(width * height)? }; + self.launch_params.fb_size[0] = width as u32; + self.launch_params.fb_size[1] = height as u32; + self.launch_params.color_buffer = self.color_buffer.as_device_ptr(); + Ok(()) + } + + pub fn render(&mut self) -> Result<(), Box> { + self.launch_params.frame_id = 555; + self.launch_params.copy_htod()?; + self.launch_params.frame_id = 777; + + unsafe { + optix::launch( + &self.pipeline, + &self.stream, + &self.launch_params, + &self.sbt, + self.launch_params.fb_size[0], + self.launch_params.fb_size[1], + 1, + )?; + } + + self.stream.synchronize()?; + + Ok(()) + } +} + +type RaygenRecord = SbtRecord; +type MissRecord = SbtRecord; + +#[derive(Copy, Clone, Default, DeviceCopy)] +struct HitgroupSbtData { + object_id: u32, +} +type HitgroupRecord = SbtRecord; + +fn init_optix() -> Result<(), Box> { + cust::init(CudaFlags::empty())?; + let device_count = Device::num_devices()?; + if device_count == 0 { + panic!("No CUDA devices found!"); + } + + optix::init()?; + Ok(()) +} diff --git a/crates/optix/examples/ex03_window/Cargo.toml b/crates/optix/examples/ex03_window/Cargo.toml new file mode 100644 index 00000000..c4d1fdf7 --- /dev/null +++ b/crates/optix/examples/ex03_window/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "ex03_window" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +optix = {path = "../../"} +cust = {path = "../../../cust"} +anyhow = "1.0.44" +glfw = "0.42.0" +gl = "0.14.0" +num-traits = "0.2.14" + +[build-dependencies] +find_cuda_helper = { version = "0.2", path = "../../../find_cuda_helper" } diff --git a/crates/optix/examples/ex03_window/build.rs b/crates/optix/examples/ex03_window/build.rs new file mode 100644 index 00000000..7a7bdade --- /dev/null +++ b/crates/optix/examples/ex03_window/build.rs @@ -0,0 +1,42 @@ +use find_cuda_helper::find_optix_root; + +fn main() { + let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap(); + + let mut optix_include = find_optix_root().expect( + "Unable to find the OptiX SDK, make sure you installed it and + that OPTIX_ROOT or OPTIX_ROOT_DIR are set", + ); + optix_include = optix_include.join("include"); + + let args = vec![ + format!("-I{}", optix_include.display()), + format!("-I{}/../common/gdt", manifest_dir), + ]; + + compile_to_ptx("src/ex03_window.cu", &args); +} + +fn compile_to_ptx(cu_path: &str, args: &[String]) { + println!("cargo:rerun-if-changed={}", cu_path); + + 
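+    // mirror the source tree under OUT_DIR and swap .cu for .ptx, so
+    // src/ex03_window.cu ends up as $OUT_DIR/src/ex03_window.ptx, the path
+    // renderer.rs pulls in via include_str!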
+    let full_path =
+        std::path::PathBuf::from(std::env::var("CARGO_MANIFEST_DIR").unwrap()).join(cu_path);
+
+    let mut ptx_path = std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap()).join(cu_path);
+    ptx_path.set_extension("ptx");
+    std::fs::create_dir_all(ptx_path.parent().unwrap()).unwrap();
+
+    let output = std::process::Command::new("nvcc")
+        .arg("-ptx")
+        .arg(&full_path)
+        .arg("-o")
+        .arg(&ptx_path)
+        .args(args)
+        .output()
+        .expect("failed to run nvcc");
+
+    if !output.status.success() {
+        panic!("{}", unsafe { String::from_utf8_unchecked(output.stderr) });
+    }
+}
diff --git a/crates/optix/examples/ex03_window/src/ex03_window.cu b/crates/optix/examples/ex03_window/src/ex03_window.cu
new file mode 100644
index 00000000..b35e319c
--- /dev/null
+++ b/crates/optix/examples/ex03_window/src/ex03_window.cu
@@ -0,0 +1,103 @@
+// ======================================================================== //
+// Copyright 2018-2019 Ingo Wald                                            //
+//                                                                          //
+// Licensed under the Apache License, Version 2.0 (the "License");          //
+// you may not use this file except in compliance with the License.         //
+// You may obtain a copy of the License at                                  //
+//                                                                          //
+//     http://www.apache.org/licenses/LICENSE-2.0                           //
+//                                                                          //
+// Unless required by applicable law or agreed to in writing, software      //
+// distributed under the License is distributed on an "AS IS" BASIS,        //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and      //
+// limitations under the License.                                           //
+// ======================================================================== //
+
+#include <optix_device.h>
+
+#include "gdt/math/vec.h"
+
+namespace osc {
+
+using namespace gdt;
+struct LaunchParams {
+    float4* color_buffer;
+    vec2i fb_size;
+    int frame_id{0};
+};
+
+/*! launch parameters in constant memory, filled in by optix upon
+    optixLaunch (this gets filled in from the buffer we pass to
+    optixLaunch) */
+extern "C" __constant__ LaunchParams PARAMS;
+
+//------------------------------------------------------------------------------
+// closest hit and anyhit programs for radiance-type rays.
+//
+// Note eventually we will have to create one pair of those for each
+// ray type and each geometry type we want to render; but this
+// simple example doesn't use any actual geometries yet, so we only
+// create a single, dummy, set of them (we do have to have at least
+// one group of them to set up the SBT)
+//------------------------------------------------------------------------------
+
+extern "C" __global__ void
+__closesthit__radiance() { /*! for this simple example, this will remain empty
+                            */
+}
+
+extern "C" __global__ void
+__anyhit__radiance() { /*! for this simple example, this will remain empty */
+}
+
+//------------------------------------------------------------------------------
+// miss program that gets called for any ray that did not have a
+// valid intersection
+//
+// as with the anyhit/closest hit programs, in this example we only
+// need to have _some_ dummy function to set up a valid SBT
+// ------------------------------------------------------------------------------
+
+extern "C" __global__ void
+__miss__radiance() { /*!
for this simple example, this will remain empty */ +} + + +//------------------------------------------------------------------------------ +// ray gen program - the actual rendering happens in here +//------------------------------------------------------------------------------ +extern "C" __global__ void __raygen__renderFrame() { + if (PARAMS.frame_id == 0 && optixGetLaunchIndex().x == 0 && + optixGetLaunchIndex().y == 0) { + // we could of course also have used optixGetLaunchDims to query + // the launch size, but accessing the PARAMS here + // makes sure they're not getting optimized away (because + // otherwise they'd not get used) + printf("############################################\n"); + printf("Hello world from OptiX 7 raygen program!\n(within a " + "%ix%i-sized launch)\n", + PARAMS.fb_size.x, PARAMS.fb_size.y); + printf("############################################\n"); + } + + // ------------------------------------------------------------------ + // for this example, produce a simple test pattern: + // ------------------------------------------------------------------ + + // compute a test pattern based on pixel ID + const int ix = optixGetLaunchIndex().x; + const int iy = optixGetLaunchIndex().y; + + int frameID = PARAMS.frame_id; + + const float r = float((ix + frameID) % 256) / 255.0f; + const float g = float((iy + frameID) % 256) / 255.0f; + const float b = float((ix + iy + frameID) % 256) / 255.0f; + + // and write to frame buffer ... + const unsigned fb_index = ix + iy * PARAMS.fb_size.x; + PARAMS.color_buffer[fb_index] = make_float4(r, g, b, 1.0f); +} + +} // namespace osc diff --git a/crates/optix/examples/ex03_window/src/gl_util.rs b/crates/optix/examples/ex03_window/src/gl_util.rs new file mode 100644 index 00000000..9cbc48c9 --- /dev/null +++ b/crates/optix/examples/ex03_window/src/gl_util.rs @@ -0,0 +1,557 @@ +use gl; +use gl::types::{GLchar, GLenum, GLint, GLsizeiptr, GLuint, GLvoid}; +use std::ffi::{CStr, CString}; + +use crate::vector::*; + +pub struct Shader { + id: GLuint, +} + +impl Shader { + pub fn from_source(source: &CStr, shader_type: GLenum) -> Result { + let id = unsafe { gl::CreateShader(shader_type) }; + + unsafe { + gl::ShaderSource(id, 1, &source.as_ptr(), std::ptr::null()); + gl::CompileShader(id); + } + + let mut success: GLint = 1; + unsafe { + gl::GetShaderiv(id, gl::COMPILE_STATUS, &mut success); + } + + if success == 0 { + let mut len: GLint = 0; + unsafe { + gl::GetShaderiv(id, gl::INFO_LOG_LENGTH, &mut len); + } + let error = create_whitespace_cstring(len as usize); + unsafe { + gl::GetShaderInfoLog(id, len, std::ptr::null_mut(), error.as_ptr() as *mut GLchar); + } + Err(error.to_string_lossy().into_owned()) + } else { + Ok(Shader { id }) + } + } + + pub fn vertex_from_source(source: &CStr) -> Result { + Shader::from_source(source, gl::VERTEX_SHADER) + } + + pub fn fragment_from_source(source: &CStr) -> Result { + Shader::from_source(source, gl::FRAGMENT_SHADER) + } + + pub fn id(&self) -> GLuint { + self.id + } +} + +impl Drop for Shader { + fn drop(&mut self) { + unsafe { gl::DeleteShader(self.id) }; + } +} + +pub struct Program { + id: GLuint, +} + +impl Program { + pub fn from_shaders(shaders: &[Shader]) -> Result { + let id = unsafe { gl::CreateProgram() }; + + for shader in shaders { + unsafe { gl::AttachShader(id, shader.id()) }; + } + + unsafe { gl::LinkProgram(id) }; + + let mut success: GLint = 1; + unsafe { + gl::GetProgramiv(id, gl::LINK_STATUS, &mut success); + } + + if success == 0 { + let mut len: GLint = 0; + unsafe { + 
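+                // query the log length first so the buffer can be sized exactly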
gl::GetProgramiv(id, gl::INFO_LOG_LENGTH, &mut len); + } + let error = create_whitespace_cstring(len as usize); + unsafe { + gl::GetProgramInfoLog(id, len, std::ptr::null_mut(), error.as_ptr() as *mut GLchar); + } + return Err(error.to_string_lossy().into_owned()); + } + + for shader in shaders { + unsafe { gl::DetachShader(id, shader.id()) } + } + + Ok(Program { id }) + } + + pub fn id(&self) -> GLuint { + self.id + } + + pub fn use_program(&self) { + unsafe { + gl::UseProgram(self.id); + } + } + + pub fn get_location(&self, name: &str) -> Result { + let cname = CString::new(name).unwrap(); + let loc = unsafe { gl::GetUniformLocation(self.id, cname.as_ptr() as *mut GLchar) }; + + if loc != -1 { + Ok(loc) + } else { + Err("Could not get location".to_owned()) + } + } + + pub fn set_uniform(&self, loc: GLint, v: i32) { + unsafe { + gl::ProgramUniform1i(self.id, loc, v); + } + } +} + +fn create_whitespace_cstring(len: usize) -> CString { + let mut buffer: Vec = Vec::with_capacity(len as usize + 1); + buffer.extend([b' '].iter().cycle().take(len as usize)); + unsafe { CString::from_vec_unchecked(buffer) } +} + +#[repr(u32)] +#[derive(Copy, Clone)] +pub enum BufferType { + ArrayBuffer = gl::ARRAY_BUFFER, +} + +#[repr(u32)] +#[derive(Copy, Clone)] +pub enum BufferUsage { + StaticDraw = gl::STATIC_DRAW, + StreamDraw = gl::STREAM_DRAW, +} + +pub struct Buffer { + id: GLuint, + buffer_type: BufferType, + _phantom: std::marker::PhantomData, +} + +impl Buffer { + pub fn new(buffer_type: BufferType) -> Buffer { + let mut id: GLuint = 0; + unsafe { + gl::GenBuffers(1, &mut id); + } + Buffer { + id, + buffer_type, + _phantom: std::marker::PhantomData, + } + } + + pub fn id(&self) -> GLuint { + self.id + } + + pub fn buffer_data(&self, data: &[T], usage: BufferUsage) { + unsafe { + gl::BindBuffer(self.buffer_type as GLuint, self.id); + gl::BufferData( + self.buffer_type as GLuint, + (data.len() * std::mem::size_of::()) as GLsizeiptr, + data.as_ptr() as *const GLvoid, + usage as GLenum, + ); + gl::BindBuffer(self.buffer_type as GLuint, 0); + } + } + + pub fn bind(&self) { + unsafe { + gl::BindBuffer(self.buffer_type as GLuint, self.id); + } + } + + pub fn unbind(&self) { + unsafe { + gl::BindBuffer(self.buffer_type as GLuint, 0); + } + } +} + +impl Drop for Buffer { + fn drop(&mut self) { + unsafe { + gl::DeleteBuffers(1, &self.id as *const GLuint); + } + } +} + +pub struct VertexArray { + id: GLuint, +} + +impl VertexArray { + pub fn new() -> VertexArray { + let mut id: GLuint = 0; + unsafe { + gl::GenVertexArrays(1, &mut id); + } + + VertexArray { id } + } + + pub fn id(&self) -> GLuint { + self.id + } + + pub fn bind(&self) { + unsafe { + gl::BindVertexArray(self.id); + } + } + + pub fn unbind(&self) { + unsafe { + gl::BindVertexArray(0); + } + } +} + +impl Drop for VertexArray { + fn drop(&mut self) { + unsafe { + gl::DeleteVertexArrays(1, &self.id as *const GLuint); + } + } +} + +#[allow(non_camel_case_types)] +#[repr(C, packed)] +#[derive(Copy, Clone, Debug)] +pub struct f32x2 { + x: f32, + y: f32, +} + +impl f32x2 { + pub fn new(x: f32, y: f32) -> f32x2 { + f32x2 { x, y } + } + + pub fn num_components() -> usize { + 2 + } +} + +#[allow(non_camel_case_types)] +#[repr(C, packed)] +#[derive(Copy, Clone, Debug)] +pub struct f32x3 { + x: f32, + y: f32, + z: f32, +} + +impl f32x3 { + pub fn new(x: f32, y: f32, z: f32) -> f32x3 { + f32x3 { x, y, z } + } + + pub fn num_components() -> usize { + 3 + } +} + +#[allow(non_camel_case_types)] +#[repr(C, packed)] +#[derive(Copy, Clone, Debug)] +pub struct f32x4 { 
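+    // four packed f32 components, matching the RGBA32F texel layout the
+    // fullscreen quad uploads below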
+ x: f32, + y: f32, + z: f32, + w: f32, +} + +impl f32x4 { + pub fn new(x: f32, y: f32, z: f32, w: f32) -> f32x4 { + f32x4 { x, y, z, w } + } + + pub fn zero() -> f32x4 { + f32x4::new(0.0, 0.0, 0.0, 0.0) + } + + pub fn set(&mut self, x: f32, y: f32, z: f32, w: f32) { + self.x = x; + self.y = y; + self.z = z; + self.w = w; + } + + pub fn num_components() -> usize { + 4 + } +} + +#[derive(Copy, Clone, Debug)] +#[repr(C, packed)] +pub struct Vertex { + p: f32x3, + st: f32x2, +} +impl Vertex { + pub fn new(p: f32x3, st: f32x2) -> Vertex { + Vertex { p, st } + } + + unsafe fn vertex_attrib_pointer( + num_components: usize, + stride: usize, + location: usize, + offset: usize, + ) { + gl::EnableVertexAttribArray(location as gl::types::GLuint); // location(0) + gl::VertexAttribPointer( + location as gl::types::GLuint, // index of the vertex attribute + num_components as gl::types::GLint, /* number of components per + * vertex attrib */ + gl::FLOAT, + gl::FALSE, // normalized (int-to-float conversion), + stride as gl::types::GLint, /* byte stride between + * successive elements */ + offset as *const gl::types::GLvoid, /* offset of the first + * element */ + ); + } + + pub fn vertex_attrib_pointers() { + let stride = std::mem::size_of::(); + + let location = 0; + let offset = 0; + + // and configure the vertex array + unsafe { + Vertex::vertex_attrib_pointer(f32x3::num_components(), stride, location, offset); + } + + let location = location + 1; + let offset = offset + std::mem::size_of::(); + + // and configure the st array + unsafe { + Vertex::vertex_attrib_pointer(f32x2::num_components(), stride, location, offset); + } + } +} + +pub struct FullscreenQuad { + width: u32, + height: u32, + vertex_array: VertexArray, + program: Program, + texture_id: GLuint, + loc_progression: GLint, +} + +impl FullscreenQuad { + pub fn new(width: u32, height: u32) -> Result { + let vert_shader = Shader::vertex_from_source( + CStr::from_bytes_with_nul( + b" + #version 330 core + layout (location = 0) in vec3 _p; + layout (location = 1) in vec2 _st; + out vec2 st; + void main() { + gl_Position = vec4(_p, 1.0); + st = _st; + } + \0", + ) + .unwrap(), + )?; + + let frag_shader = Shader::fragment_from_source( + CStr::from_bytes_with_nul( + b" + #version 330 core + in vec2 st; + out vec4 Color; + + uniform sampler2D smp2d_0; + uniform int progression; + + void main() { + vec4 col = texture(smp2d_0, st); + col.r = pow(col.r / progression, 1/2.2); + col.g = pow(col.g / progression, 1/2.2); + col.b = pow(col.b / progression, 1/2.2); + Color = col; + } + \0", + ) + .unwrap(), + )?; + + let program = Program::from_shaders(&[vert_shader, frag_shader])?; + program.use_program(); + let loc_progression = program.get_location("progression")?; + + let vertices: Vec = vec![ + Vertex::new(f32x3::new(-1.0, -1.0, 0.0), f32x2::new(0.0, 0.0)), + Vertex::new(f32x3::new(1.0, -1.0, 0.0), f32x2::new(1.0, 0.0)), + Vertex::new(f32x3::new(1.0, 1.0, 0.0), f32x2::new(1.0, 1.0)), + Vertex::new(f32x3::new(-1.0, -1.0, 0.0), f32x2::new(0.0, 0.0)), + Vertex::new(f32x3::new(1.0, 1.0, 0.0), f32x2::new(1.0, 1.0)), + Vertex::new(f32x3::new(-1.0, 1.0, 0.0), f32x2::new(0.0, 1.0)), + ]; + let vertex_buffer = Buffer::::new(BufferType::ArrayBuffer); + vertex_buffer.buffer_data(&vertices, BufferUsage::StaticDraw); + + // Generate and bind the VAO + let vertex_array = VertexArray::new(); + + vertex_array.bind(); + // Re-bind the VBO to associate the two. 
We could just have left it + // bound earlier and let the association happen when we + // configure the VAO but this way at least makes the connection + // between the two seem more explicit, despite the magical + // state machine hiding in OpenGL + vertex_buffer.bind(); + + // Set up the vertex attribute pointers for all locations + Vertex::vertex_attrib_pointers(); + + // now unbind both the vbo and vao to keep everything cleaner + vertex_buffer.unbind(); + vertex_array.unbind(); + + // generate test texture data using the image width rather than the + // framebuffer width + let mut tex_data = Vec::with_capacity((width * height) as usize); + for y in 0..height { + for x in 0..width { + tex_data.push(f32x4::new( + (x as f32) / width as f32, + (y as f32) / height as f32, + 1.0, + 0.0, + )); + } + } + + // generate the texture for the quad + let mut texture_id: gl::types::GLuint = 0; + unsafe { + gl::GenTextures(1, &mut texture_id); + gl::ActiveTexture(gl::TEXTURE0); + gl::Enable(gl::TEXTURE_2D); + gl::BindTexture(gl::TEXTURE_2D, texture_id); + gl::TexParameteri( + gl::TEXTURE_2D, + gl::TEXTURE_WRAP_S, + gl::CLAMP_TO_BORDER as gl::types::GLint, + ); + gl::TexParameteri( + gl::TEXTURE_2D, + gl::TEXTURE_WRAP_T, + gl::CLAMP_TO_BORDER as gl::types::GLint, + ); + gl::TexParameteri( + gl::TEXTURE_2D, + gl::TEXTURE_MIN_FILTER, + gl::NEAREST as gl::types::GLint, + ); + gl::TexParameteri( + gl::TEXTURE_2D, + gl::TEXTURE_MAG_FILTER, + gl::NEAREST as gl::types::GLint, + ); + gl::TexImage2D( + gl::TEXTURE_2D, + 0, + gl::RGBA32F as gl::types::GLint, + width as gl::types::GLint, + height as gl::types::GLint, + 0, + gl::RGBA, + gl::FLOAT, + tex_data.as_ptr() as *const gl::types::GLvoid, + ); + } + + Ok(FullscreenQuad { + width, + height, + vertex_array, + program, + texture_id, + loc_progression, + }) + } + + pub fn draw(&self) { + self.program.use_program(); + self.vertex_array.bind(); + unsafe { + gl::DrawArrays( + gl::TRIANGLES, + 0, // starting index in the enabled array + 6, // number of indices to draw + ) + } + self.vertex_array.unbind(); + } + + pub fn update_texture(&self, data: &[V4f32]) { + unsafe { + gl::BindTexture(gl::TEXTURE_2D, self.texture_id); + gl::TexSubImage2D( + gl::TEXTURE_2D, + 0, + 0, + 0, + self.width as GLint, + self.height as GLint, + gl::RGBA, + gl::FLOAT, + data.as_ptr() as *const GLvoid, + ); + } + } + + pub fn resize(&mut self, width: u32, height: u32) { + unsafe { + gl::TexImage2D( + gl::TEXTURE_2D, + 0, + gl::RGBA32F as gl::types::GLint, + width as gl::types::GLint, + height as gl::types::GLint, + 0, + gl::RGBA, + gl::FLOAT, + std::ptr::null(), + ); + } + self.width = width; + self.height = height; + } + + pub fn set_progression(&self, progression: i32) { + self.program.set_uniform(self.loc_progression, progression); + } +} diff --git a/crates/optix/examples/ex03_window/src/main.rs b/crates/optix/examples/ex03_window/src/main.rs new file mode 100644 index 00000000..08bc998e --- /dev/null +++ b/crates/optix/examples/ex03_window/src/main.rs @@ -0,0 +1,88 @@ +#![allow(warnings)] + +mod renderer; +use renderer::Renderer; + +mod vector; +pub use vector::*; +mod gl_util; +use gl_util::FullscreenQuad; +use glfw::{Action, Context, Key}; + +fn main() -> Result<(), Box> { + let mut glfw = glfw::init(glfw::FAIL_ON_ERRORS).unwrap(); + glfw.window_hint(glfw::WindowHint::ContextVersion(4, 1)); + glfw.window_hint(glfw::WindowHint::OpenGlForwardCompat(true)); + glfw.window_hint(glfw::WindowHint::OpenGlProfile( + glfw::OpenGlProfileHint::Core, + )); + + let mut width = 960u32; + let mut 
height = 540u32; + + let mut renderer = Renderer::new(width, height)?; + + let (mut window, events) = glfw + .create_window( + width, + height, + "Example 03: in window", + glfw::WindowMode::Windowed, + ) + .expect("failed to create glfw window"); + + window.set_key_polling(true); + window.make_current(); + + // retina displays will return a higher res for the framebuffer + // which we need to use for the viewport + let (fb_width, fb_height) = window.get_framebuffer_size(); + + gl::load_with(|s| glfw.get_proc_address_raw(s) as *const std::os::raw::c_void); + + let mut fsq = FullscreenQuad::new(width, height).unwrap(); + + let mut image_data = vec![v4f32(0.0, 0.0, 0.0, 0.0); (width * height) as usize]; + + unsafe { + gl::Viewport(0, 0, fb_width, fb_height); + }; + + while !window.should_close() { + glfw.poll_events(); + for (_, event) in glfw::flush_messages(&events) { + handle_window_event(&mut window, event); + } + + let (w, h) = window.get_framebuffer_size(); + let w = w as u32; + let h = h as u32; + if w != width || h != height { + fsq.resize(w, h); + renderer.resize(w, h)?; + width = w; + height = h; + image_data.resize((width * height) as usize, v4f32(0.0, 0.0, 0.0, 0.0)); + } + + renderer.render()?; + renderer.download_pixels(&mut image_data)?; + fsq.update_texture(&image_data); + fsq.set_progression(1); + + // draw the quad + fsq.draw(); + + window.swap_buffers(); + } + + renderer.render()?; + Ok(()) +} + +fn handle_window_event(window: &mut glfw::Window, event: glfw::WindowEvent) { + match event { + glfw::WindowEvent::Key(Key::Escape, _, Action::Press, _) => window.set_should_close(true), + _ => {} + } +} diff --git a/crates/optix/examples/ex03_window/src/renderer.rs b/crates/optix/examples/ex03_window/src/renderer.rs new file mode 100644 index 00000000..03acd884 --- /dev/null +++ b/crates/optix/examples/ex03_window/src/renderer.rs @@ -0,0 +1,235 @@ +use anyhow::{Context, Result}; +use cust::context::{Context as CuContext, ContextFlags}; +use cust::device::Device; +use cust::memory::{CopyDestination, DeviceBox, DeviceBuffer, DevicePointer, DeviceVariable}; +use cust::stream::{Stream, StreamFlags}; +use cust::{CudaFlags, DeviceCopy}; +use optix::{ + context::DeviceContext, + pipeline::{ + CompileDebugLevel, CompileOptimizationLevel, ExceptionFlags, Module, ModuleCompileOptions, + Pipeline, PipelineCompileOptions, PipelineLinkOptions, ProgramGroup, ProgramGroupDesc, + TraversableGraphFlags, + }, + shader_binding_table::{SbtRecord, ShaderBindingTable}, +}; + +use crate::vector::V4f32; + +pub struct Renderer { + launch_params: DeviceVariable, + sbt: ShaderBindingTable, + buf_raygen: DeviceBuffer, + buf_hitgroup: DeviceBuffer, + buf_miss: DeviceBuffer, + pipeline: Pipeline, + color_buffer: DeviceBuffer, + ctx: DeviceContext, + stream: Stream, + cuda_context: CuContext, +} + +impl Renderer { + pub fn new(width: u32, height: u32) -> Result> { + init_optix()?; + + // create CUDA and OptiX contexts + let device = Device::get_device(0)?; + + let cuda_context = CuContext::new(device)?; + let stream = Stream::new(StreamFlags::DEFAULT, None)?; + + let mut ctx = DeviceContext::new(&cuda_context, false)?; + ctx.set_log_callback(|_level, tag, msg| println!("[{}]: {}", tag, msg), 4)?; + + // create module + let module_compile_options = ModuleCompileOptions { + max_register_count: 50, + opt_level: CompileOptimizationLevel::Default, + debug_level: CompileDebugLevel::None, + }; + + let pipeline_compile_options = PipelineCompileOptions::new() + .pipeline_launch_params_variable_name("PARAMS") + 
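+            // must name the `extern "C" __constant__ LaunchParams PARAMS`
+            // symbol declared in ex03_window.cu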
diff --git a/crates/optix/examples/ex03_window/src/renderer.rs b/crates/optix/examples/ex03_window/src/renderer.rs new file mode 100644 index 00000000..03acd884 --- /dev/null +++ b/crates/optix/examples/ex03_window/src/renderer.rs @@ -0,0 +1,235 @@ +use anyhow::{Context, Result}; +use cust::context::{Context as CuContext, ContextFlags}; +use cust::device::Device; +use cust::memory::{CopyDestination, DeviceBox, DeviceBuffer, DevicePointer, DeviceVariable}; +use cust::stream::{Stream, StreamFlags}; +use cust::{CudaFlags, DeviceCopy}; +use optix::{ + context::DeviceContext, + pipeline::{ + CompileDebugLevel, CompileOptimizationLevel, ExceptionFlags, Module, ModuleCompileOptions, + Pipeline, PipelineCompileOptions, PipelineLinkOptions, ProgramGroup, ProgramGroupDesc, + TraversableGraphFlags, + }, + shader_binding_table::{SbtRecord, ShaderBindingTable}, +}; + +use crate::vector::V4f32; + +pub struct Renderer { + launch_params: DeviceVariable<LaunchParams>, + sbt: ShaderBindingTable, + buf_raygen: DeviceBuffer<RaygenRecord>, + buf_hitgroup: DeviceBuffer<HitgroupRecord>, + buf_miss: DeviceBuffer<MissRecord>, + pipeline: Pipeline, + color_buffer: DeviceBuffer<V4f32>, + ctx: DeviceContext, + stream: Stream, + cuda_context: CuContext, +} + +impl Renderer { + pub fn new(width: u32, height: u32) -> Result<Renderer, Box<dyn std::error::Error>> { + init_optix()?; + + // create CUDA and OptiX contexts + let device = Device::get_device(0)?; + + let cuda_context = CuContext::new(device)?; + let stream = Stream::new(StreamFlags::DEFAULT, None)?; + + let mut ctx = DeviceContext::new(&cuda_context, false)?; + ctx.set_log_callback(|_level, tag, msg| println!("[{}]: {}", tag, msg), 4)?; + + // create module + let module_compile_options = ModuleCompileOptions { + max_register_count: 50, + opt_level: CompileOptimizationLevel::Default, + debug_level: CompileDebugLevel::None, + }; + + let pipeline_compile_options = PipelineCompileOptions::new() + .pipeline_launch_params_variable_name("PARAMS") + .uses_motion_blur(false) + .num_attribute_values(2) + .num_payload_values(2) + .traversable_graph_flags(TraversableGraphFlags::ALLOW_SINGLE_GAS) + .exception_flags(ExceptionFlags::NONE); + + let ptx = include_str!(concat!(env!("OUT_DIR"), "/src/ex03_window.ptx")); + + let (module, _log) = Module::new( + &mut ctx, + &module_compile_options, + &pipeline_compile_options, + ptx, + ) + .context("Create module")?; + + // create raygen program + let pgdesc_raygen = ProgramGroupDesc::raygen(&module, "__raygen__renderFrame"); + + let (pg_raygen, _log) = ProgramGroup::new(&mut ctx, &[pgdesc_raygen])?; + + // create miss program + let pgdesc_miss = ProgramGroupDesc::miss(&module, "__miss__radiance"); + + let (pg_miss, _log) = ProgramGroup::new(&mut ctx, &[pgdesc_miss])?; + + let pgdesc_hitgroup = ProgramGroupDesc::hitgroup( + Some((&module, "__closesthit__radiance")), + Some((&module, "__anyhit__radiance")), + None, + ); + + // create hitgroup programs + let (pg_hitgroup, _log) = ProgramGroup::new(&mut ctx, &[pgdesc_hitgroup])?; + + // create SBT + let rec_raygen: Vec<_> = pg_raygen + .iter() + .map(|pg| RaygenRecord::pack(0, pg).expect("failed to pack raygen record")) + .collect(); + + let rec_miss: Vec<_> = pg_miss + .iter() + .map(|pg| MissRecord::pack(0, pg).expect("failed to pack miss record")) + .collect(); + + let num_objects = 1; + let rec_hitgroup: Vec<_> = (0..num_objects) + .map(|i| { + let object_type = 0; + let rec = HitgroupRecord::pack( + HitgroupSbtData { object_id: i }, + &pg_hitgroup[object_type], + ) + .expect("failed to pack hitgroup record"); + rec + }) + .collect(); + + let mut buf_raygen = DeviceBuffer::from_slice(&rec_raygen)?; + let mut buf_miss = DeviceBuffer::from_slice(&rec_miss)?; + let mut buf_hitgroup = DeviceBuffer::from_slice(&rec_hitgroup)?; + + let sbt = ShaderBindingTable::new(&mut buf_raygen) + .miss(&mut buf_miss) + .hitgroup(&mut buf_hitgroup); + + // create pipeline + let mut program_groups = Vec::new(); + program_groups.extend(pg_raygen.into_iter()); + program_groups.extend(pg_miss.into_iter()); + program_groups.extend(pg_hitgroup.into_iter()); + + let pipeline_link_options = PipelineLinkOptions { + max_trace_depth: 2, + debug_level: CompileDebugLevel::LineInfo, + }; + + let (pipeline, _log) = Pipeline::new( + &mut ctx, + &pipeline_compile_options, + pipeline_link_options, + &program_groups, + )?; + + pipeline.set_stack_size(2 * 1024, 2 * 1024, 2 * 1024, 1)?; + + let mut color_buffer = + unsafe { DeviceBuffer::uninitialized(width as usize * height as usize)? }; + + let launch_params = DeviceVariable::new(LaunchParams { + frame_id: 0, + color_buffer: color_buffer.as_device_ptr(), + fb_size: Point2i { + x: width as i32, + y: height as i32, + }, + })?; + + Ok(Renderer { + ctx, + cuda_context, + stream, + launch_params, + buf_raygen, + buf_hitgroup, + buf_miss, + sbt, + pipeline, + color_buffer, + }) + }
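The DeviceVariable built at the end of new() is the host-side mirror of the PARAMS constant named in pipeline_launch_params_variable_name above; the contract is purely byte layout, which is why the struct defined further down is #[repr(C)] and DeviceCopy. A sketch of the correspondence (the CUDA side is paraphrased):

// host (this file):
//   #[repr(C)] struct LaunchParams { color_buffer, fb_size, frame_id }
// device (the PTX compiled into ex03_window.ptx):
//   extern "C" __constant__ LaunchParams PARAMS;
// optix::launch hands the DeviceVariable's bytes to the launch as PARAMS,
// so field order and padding must agree exactly on both sides; after
// mutating fields on the host, render() pushes them with copy_htod().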
pub fn resize(&mut self, width: u32, height: u32) -> Result<(), Box<dyn std::error::Error>> { + self.color_buffer = unsafe { DeviceBuffer::uninitialized((width * height) as usize)? }; + self.launch_params.fb_size.x = width as i32; + self.launch_params.fb_size.y = height as i32; + self.launch_params.color_buffer = self.color_buffer.as_device_ptr(); + Ok(()) + } + + pub fn render(&mut self) -> Result<(), Box<dyn std::error::Error>> { + self.launch_params.copy_htod()?; + self.launch_params.frame_id += 1; + + unsafe { + optix::launch( + &self.pipeline, + &self.stream, + &self.launch_params, + &self.sbt, + self.launch_params.fb_size.x as u32, + self.launch_params.fb_size.y as u32, + 1, + )?; + } + + self.stream.synchronize()?; + + Ok(()) + } + + pub fn download_pixels(&self, slice: &mut [V4f32]) -> Result<(), Box<dyn std::error::Error>> { + self.color_buffer.copy_to(slice)?; + Ok(()) + } +} + +#[repr(C)] +#[derive(Copy, Clone, DeviceCopy)] +struct Point2i { + pub x: i32, + pub y: i32, +} + +#[repr(C)] +#[derive(Copy, Clone, DeviceCopy)] +struct LaunchParams { + pub color_buffer: DevicePointer<V4f32>, + pub fb_size: Point2i, + pub frame_id: i32, +} + +type RaygenRecord = SbtRecord<i32>; +type MissRecord = SbtRecord<i32>; + +#[derive(Copy, Clone, Default, DeviceCopy)] +struct HitgroupSbtData { + object_id: u32, +} +type HitgroupRecord = SbtRecord<HitgroupSbtData>; + +fn init_optix() -> Result<(), Box<dyn std::error::Error>> { + cust::init(CudaFlags::empty())?; + let device_count = Device::num_devices()?; + if device_count == 0 { + panic!("No CUDA devices found!"); + } + + optix::init()?; + Ok(()) +} diff --git a/crates/optix/examples/ex03_window/src/vector.rs b/crates/optix/examples/ex03_window/src/vector.rs new file mode 100644 index 00000000..589a4134 --- /dev/null +++ b/crates/optix/examples/ex03_window/src/vector.rs @@ -0,0 +1,301 @@ +use core::ops; +pub use num_traits::{One, Zero}; + +pub trait Scalar: num_traits::One + num_traits::Zero {} + +impl Scalar for i8 {} +impl Scalar for i16 {} +impl Scalar for i32 {} +impl Scalar for i64 {} +impl Scalar for f32 {} +impl Scalar for f64 {} + +pub trait Vector { + type Component: Scalar; + + fn dot(&self, v: &Self) -> Self::Component; + + #[inline] + fn length2(&self) -> Self::Component { + self.dot(&self) + } +} + +macro_rules!
vec_impl { + ($name:ident: $t:ty, $sc:ident, $align:expr, ($($c:ident),+)) => { + #[repr(C)] + #[derive(Clone, Copy, Default, PartialEq, Debug)] + pub struct $name + { + $( + pub $c: $t, + )+ + } + + impl $name + { + pub fn new($($c: $t),+) -> Self + { + Self { + $( + $c, + )+ + } + } + } + + impl Vector for $name + { + type Component = $t; + + #[inline] + fn dot(&self, v: &Self) -> $t + { + <$t>::zero() $( + + self.$c * v.$c + )+ + } + } + + impl From<$t> for $name + { + fn from(x: $t) -> Self + { + Self { + $( + $c: x, + )+ + } + } + } + + impl ops::Neg for $name + { + type Output = Self; + + fn neg(self) -> Self + { + Self { + $( + $c: -self.$c, + )+ + } + } + } + + impl ops::Add for $name + { + type Output = Self; + + #[inline] + fn add(self, v: Self) -> Self + { + Self { + $( + $c: self.$c + v.$c, + )+ + } + } + } + + impl ops::AddAssign for $name + { + #[inline] + fn add_assign(&mut self, v: Self) + { + $( + self.$c += v.$c; + )+ + } + } + + impl ops::Sub for $name + { + type Output = Self; + + #[inline] + fn sub(self, v: Self) -> Self + { + Self { + $( + $c: self.$c - v.$c, + )+ + } + } + } + + impl ops::SubAssign for $name + { + #[inline] + fn sub_assign(&mut self, v: Self) + { + $( + self.$c -= v.$c; + )+ + } + } + + impl ops::Mul for $name + { + type Output = Self; + + #[inline] + fn mul(self, v: Self) -> Self + { + Self { + $( + $c: self.$c * v.$c, + )+ + } + } + } + + impl ops::MulAssign for $name + { + #[inline] + fn mul_assign(&mut self, v: Self) + { + $( + self.$c *= v.$c; + )+ + } + } + + impl ops::Mul<$t> for $name + { + type Output = Self; + + #[inline] + fn mul(self, v: $t) -> Self + { + Self { + $( + $c: self.$c * v, + )+ + } + } + } + + impl ops::MulAssign<$t> for $name + { + #[inline] + fn mul_assign(&mut self, v: $t) + { + $( + self.$c *= v; + )+ + } + } + + impl ops::Div<$t> for $name + { + type Output = Self; + + #[inline] + fn div(self, v: $t) -> Self + { + Self { + $( + $c: self.$c / v, + )+ + } + } + } + + impl ops::DivAssign<$t> for $name + { + #[inline] + fn div_assign(&mut self, v: $t) + { + $( + self.$c /= v; + )+ + } + } + + impl ops::Mul<$name> for $t + { + type Output = $name; + + #[inline] + fn mul(self, v: $name) -> $name + { + $name { + $( + $c: self * v.$c, + )+ + } + } + } + + impl ops::Div<$name> for $t + { + type Output = $name; + + #[inline] + fn div(self, v: $name) -> $name + { + $name { + $( + $c: self / v.$c, + )+ + } + } + } + + pub fn $sc($($c: $t),+) -> $name + { + $name { + $( + $c, + )+ + } + } + + unsafe impl cust::memory::DeviceCopy for $name { + // fn device_align() -> usize { + // $align + // } + } + }; + +} + +vec_impl!(V2i8: i8, v2i8, 1, (x, y)); +vec_impl!(V2i16: i16, v2i16, 2, (x, y)); +vec_impl!(V2i32: i32, v2i32, 8, (x, y)); +vec_impl!(V2i64: i64, v2i64, 8, (x, y)); +vec_impl!(V3i8: i8, v3i8, 1, (x, y, z)); +vec_impl!(V3i16: i16, v3i16, 2, (x, y, z)); +vec_impl!(V3i32: i32, v3i32, 4, (x, y, z)); +vec_impl!(V3i64: i64, v3i64, 8, (x, y, z)); +vec_impl!(V4i8: i8, v4i8, 1, (x, y, z, w)); +vec_impl!(V4i16: i16, v4i16, 2, (x, y, z, w)); +vec_impl!(V4i32: i32, v4i32, 16, (x, y, z, w)); +vec_impl!(V4i64: i64, v4i64, 8, (x, y, z, w)); + +vec_impl!(V2f32: f32, v2f32, 8, (x, y)); +vec_impl!(V2f64: f64, v2f64, 8, (x, y)); +vec_impl!(V3f32: f32, v3f32, 4, (x, y, z)); +vec_impl!(V3f64: f64, v3f64, 8, (x, y, z)); +vec_impl!(V4f32: f32, v4f32, 16, (x, y, z, w)); +vec_impl!(V4f64: f64, v4f64, 8, (x, y, z, w)); + +vec_impl!(P2f32: f32, p2f32, 8, (x, y)); +vec_impl!(P2f64: f64, p2f64, 8, (x, y)); +vec_impl!(P3f32: f32, p3f32, 4, (x, y, z)); 
+vec_impl!(P3f64: f64, p3f64, 8, (x, y, z)); +vec_impl!(P4f32: f32, p4f32, 16, (x, y, z, w)); +vec_impl!(P4f64: f64, p4f64, 8, (x, y, z, w)); + +vec_impl!(N2f32: f32, n2f32, 8, (x, y)); +vec_impl!(N2f64: f64, n2f64, 8, (x, y)); +vec_impl!(N3f32: f32, n3f32, 4, (x, y, z)); +vec_impl!(N3f64: f64, n3f64, 8, (x, y, z)); +vec_impl!(N4f32: f32, n4f32, 16, (x, y, z, w)); +vec_impl!(N4f64: f64, n4f64, 8, (x, y, z, w)); + +#[inline] +pub fn dot<T: Vector>(a: &T, b: &T) -> T::Component { + a.dot(b) +} diff --git a/crates/optix/examples/ex04_mesh/Cargo.toml b/crates/optix/examples/ex04_mesh/Cargo.toml new file mode 100644 index 00000000..53725824 --- /dev/null +++ b/crates/optix/examples/ex04_mesh/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "ex04_mesh" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +optix = {path = "../../"} +cust = {path = "../../../cust", features=["impl_glam"]} +anyhow = "1.0.44" +glfw = "0.42.0" +gl = "0.14.0" +num-traits = "0.2.14" +glam = { version = "0.20", features=["cuda"] } + +[build-dependencies] +find_cuda_helper = { version = "0.2", path = "../../../find_cuda_helper" } diff --git a/crates/optix/examples/ex04_mesh/build.rs b/crates/optix/examples/ex04_mesh/build.rs new file mode 100644 index 00000000..b7c67358 --- /dev/null +++ b/crates/optix/examples/ex04_mesh/build.rs @@ -0,0 +1,42 @@ +use find_cuda_helper::find_optix_root; + +fn main() { + let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap(); + + let mut optix_include = find_optix_root().expect( + "Unable to find the OptiX SDK, make sure you installed it and + that OPTIX_ROOT or OPTIX_ROOT_DIR are set", + ); + optix_include = optix_include.join("include"); + + let args = vec![ + format!("-I{}", optix_include.display()), + format!("-I{}/../common/gdt", manifest_dir), + ]; + + compile_to_ptx("src/ex04_mesh.cu", &args); +} + +fn compile_to_ptx(cu_path: &str, args: &[String]) { + println!("cargo:rerun-if-changed={}", cu_path); + + let full_path = + std::path::PathBuf::from(std::env::var("CARGO_MANIFEST_DIR").unwrap()).join(cu_path); + + let mut ptx_path = std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap()).join(cu_path); + ptx_path.set_extension("ptx"); + std::fs::create_dir_all(ptx_path.parent().unwrap()).unwrap(); + + let output = std::process::Command::new("nvcc") + .arg("-ptx") + .arg(&full_path) + .arg("-o") + .arg(&ptx_path) + .args(args) + .output() + .expect("failed to run nvcc"); + + if !output.status.success() { + panic!("{}", unsafe { String::from_utf8_unchecked(output.stderr) }); + } +}
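One caveat in compile_to_ptx above: the failure path prints nvcc's stderr via String::from_utf8_unchecked, which is undefined behavior if the compiler ever emits bytes that are not valid UTF-8. A safe equivalent using only std, should that matter:

if !output.status.success() {
    // from_utf8_lossy replaces invalid byte sequences with U+FFFD instead of risking UB
    panic!("{}", String::from_utf8_lossy(&output.stderr));
}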
diff --git a/crates/optix/examples/ex04_mesh/src/ex04_mesh.cu b/crates/optix/examples/ex04_mesh/src/ex04_mesh.cu new file mode 100644 index 00000000..43573084 --- /dev/null +++ b/crates/optix/examples/ex04_mesh/src/ex04_mesh.cu @@ -0,0 +1,145 @@ +// ======================================================================== // +// Copyright 2018-2019 Ingo Wald // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with the License. // +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. // +// ======================================================================== // + +#include <optix_device.h> + +#include "gdt/math/vec.h" + +namespace osc { + +using namespace gdt; +struct LaunchParams { + struct { + float4* colorBuffer; + vec2i size; + } frame; + + struct { + vec3f position; + vec3f direction; + vec3f horizontal; + vec3f vertical; + } camera; + + OptixTraversableHandle traversable; +}; + +/*! launch parameters in constant memory, filled in by optix upon + optixLaunch (this gets filled in from the buffer we pass to + optixLaunch) */ +extern "C" __constant__ LaunchParams PARAMS; +// for this simple example, we have a single ray type +enum { SURFACE_RAY_TYPE = 0, RAY_TYPE_COUNT }; + +static __forceinline__ __device__ void* unpackPointer(uint32_t i0, + uint32_t i1) { + const uint64_t uptr = static_cast<uint64_t>(i0) << 32 | i1; + void* ptr = reinterpret_cast<void*>(uptr); + return ptr; +} + +static __forceinline__ __device__ void packPointer(void* ptr, uint32_t& i0, + uint32_t& i1) { + const uint64_t uptr = reinterpret_cast<uint64_t>(ptr); + i0 = uptr >> 32; + i1 = uptr & 0x00000000ffffffff; +} + +template <typename T> static __forceinline__ __device__ T* getPRD() { + const uint32_t u0 = optixGetPayload_0(); + const uint32_t u1 = optixGetPayload_1(); + return reinterpret_cast<T*>(unpackPointer(u0, u1)); +} + +//------------------------------------------------------------------------------ +// closest hit and anyhit programs for radiance-type rays. +// +// Note eventually we will have to create one pair of those for each +// ray type and each geometry type we want to render; but this +// simple example doesn't use any actual geometries yet, so we only +// create a single, dummy, set of them (we do have to have at least +// one group of them to set up the SBT) +//------------------------------------------------------------------------------ + +extern "C" __global__ void __closesthit__radiance() { + const int primID = optixGetPrimitiveIndex(); + vec3f& prd = *(vec3f*)getPRD<vec3f>(); + prd = gdt::randomColor(primID); +} + +extern "C" __global__ void +__anyhit__radiance() { /*! for this simple example, this will remain empty */ +} + +//------------------------------------------------------------------------------ +// miss program that gets called for any ray that did not have a +// valid intersection +// +// as with the anyhit/closest hit programs, in this example we only +// need to have _some_ dummy function to set up a valid SBT +// ------------------------------------------------------------------------------ + +extern "C" __global__ void __miss__radiance() { + vec3f& prd = *(vec3f*)getPRD<vec3f>(); + // set to constant white as background color + prd = vec3f(1.f); +} + +//------------------------------------------------------------------------------ +// ray gen program - the actual rendering happens in here +//------------------------------------------------------------------------------ +extern "C" __global__ void __raygen__renderFrame() { + // compute a test pattern based on pixel ID + const int ix = optixGetLaunchIndex().x; + const int iy = optixGetLaunchIndex().y; + + const auto& camera = PARAMS.camera; + + // our per-ray data for this example.
what we initialize it to + // won't matter, since this value will be overwritten by either + // the miss or hit program, anyway + vec3f pixelColorPRD = vec3f(0.f); + + // the values we store the PRD pointer in: + uint32_t u0, u1; + packPointer(&pixelColorPRD, u0, u1); + + // normalized screen plane position, in [0,1]^2 + const vec2f screen(vec2f(ix + .5f, iy + .5f) / vec2f(PARAMS.frame.size)); + + // generate ray direction + vec3f rayDir = + normalize(camera.direction + (screen.x - 0.5f) * camera.horizontal + + (screen.y - 0.5f) * camera.vertical); + + optixTrace(PARAMS.traversable, camera.position, rayDir, + 0.f, // tmin + 1e20f, // tmax + 0.0f, // rayTime + OptixVisibilityMask(255), + OPTIX_RAY_FLAG_DISABLE_ANYHIT, // OPTIX_RAY_FLAG_NONE, + SURFACE_RAY_TYPE, // SBT offset + RAY_TYPE_COUNT, // SBT stride + SURFACE_RAY_TYPE, // missSBTIndex + u0, u1); + + // and write to frame buffer ... + const uint32_t fbIndex = ix + iy * PARAMS.frame.size.x; + PARAMS.frame.colorBuffer[fbIndex] = + make_float4(pixelColorPRD.x, pixelColorPRD.y, pixelColorPRD.z, 1.0f); +} + +} // namespace osc
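The raygen program above threads the per-ray color through two 32-bit payload registers (u0/u1 hold the halves of a 64-bit pointer), which is exactly what the host-side num_payload_values(2) promises; num_attribute_values(2) likewise covers the two built-in triangle hit attributes. A sketch of that coupling, using the builder that appears in renderer.rs below:

let pipeline_compile_options = PipelineCompileOptions::new()
    // must match `extern "C" __constant__ LaunchParams PARAMS` above
    .pipeline_launch_params_variable_name("PARAMS")
    // u0/u1 in optixTrace: the packed &pixelColorPRD pointer
    .num_payload_values(2)
    // two floats of attributes for built-in triangle barycentrics
    .num_attribute_values(2);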
diff --git a/crates/optix/examples/ex04_mesh/src/gl_util.rs b/crates/optix/examples/ex04_mesh/src/gl_util.rs new file mode 100644 index 00000000..2199181a --- /dev/null +++ b/crates/optix/examples/ex04_mesh/src/gl_util.rs @@ -0,0 +1,557 @@ +use gl; +use gl::types::{GLchar, GLenum, GLint, GLsizeiptr, GLuint, GLvoid}; +use std::ffi::{CStr, CString}; + +use glam::Vec4; + +pub struct Shader { + id: GLuint, +} + +impl Shader { + pub fn from_source(source: &CStr, shader_type: GLenum) -> Result<Shader, String> { + let id = unsafe { gl::CreateShader(shader_type) }; + + unsafe { + gl::ShaderSource(id, 1, &source.as_ptr(), std::ptr::null()); + gl::CompileShader(id); + } + + let mut success: GLint = 1; + unsafe { + gl::GetShaderiv(id, gl::COMPILE_STATUS, &mut success); + } + + if success == 0 { + let mut len: GLint = 0; + unsafe { + gl::GetShaderiv(id, gl::INFO_LOG_LENGTH, &mut len); + } + let error = create_whitespace_cstring(len as usize); + unsafe { + gl::GetShaderInfoLog(id, len, std::ptr::null_mut(), error.as_ptr() as *mut GLchar); + } + Err(error.to_string_lossy().into_owned()) + } else { + Ok(Shader { id }) + } + } + + pub fn vertex_from_source(source: &CStr) -> Result<Shader, String> { + Shader::from_source(source, gl::VERTEX_SHADER) + } + + pub fn fragment_from_source(source: &CStr) -> Result<Shader, String> { + Shader::from_source(source, gl::FRAGMENT_SHADER) + } + + pub fn id(&self) -> GLuint { + self.id + } +} + +impl Drop for Shader { + fn drop(&mut self) { + unsafe { gl::DeleteShader(self.id) }; + } +} + +pub struct Program { + id: GLuint, +} + +impl Program { + pub fn from_shaders(shaders: &[Shader]) -> Result<Program, String> { + let id = unsafe { gl::CreateProgram() }; + + for shader in shaders { + unsafe { gl::AttachShader(id, shader.id()) }; + } + + unsafe { gl::LinkProgram(id) }; + + let mut success: GLint = 1; + unsafe { + gl::GetProgramiv(id, gl::LINK_STATUS, &mut success); + } + + if success == 0 { + let mut len: GLint = 0; + unsafe { + gl::GetProgramiv(id, gl::INFO_LOG_LENGTH, &mut len); + } + let error = create_whitespace_cstring(len as usize); + unsafe { + gl::GetProgramInfoLog(id, len, std::ptr::null_mut(), error.as_ptr() as *mut GLchar); + } + return Err(error.to_string_lossy().into_owned()); + } + + for shader in shaders { + unsafe { gl::DetachShader(id, shader.id()) } + } + + Ok(Program { id }) + } + + pub fn id(&self) -> GLuint { + self.id + } + + pub fn use_program(&self) { + unsafe { + gl::UseProgram(self.id); + } + } + + pub fn get_location(&self, name: &str) -> Result<GLint, String> { + let cname = CString::new(name).unwrap(); + let loc = unsafe { gl::GetUniformLocation(self.id, cname.as_ptr() as *mut GLchar) }; + + if loc != -1 { + Ok(loc) + } else { + Err("Could not get location".to_owned()) + } + } + + pub fn set_uniform(&self, loc: GLint, v: i32) { + unsafe { + gl::ProgramUniform1i(self.id, loc, v); + } + } +} + +fn create_whitespace_cstring(len: usize) -> CString { + let mut buffer: Vec<u8> = Vec::with_capacity(len as usize + 1); + buffer.extend([b' '].iter().cycle().take(len as usize)); + unsafe { CString::from_vec_unchecked(buffer) } +} + +#[repr(u32)] +#[derive(Copy, Clone)] +pub enum BufferType { + ArrayBuffer = gl::ARRAY_BUFFER, +} + +#[repr(u32)] +#[derive(Copy, Clone)] +pub enum BufferUsage { + StaticDraw = gl::STATIC_DRAW, + StreamDraw = gl::STREAM_DRAW, +} + +pub struct Buffer<T> { + id: GLuint, + buffer_type: BufferType, + _phantom: std::marker::PhantomData<T>, +} + +impl<T> Buffer<T> { + pub fn new(buffer_type: BufferType) -> Buffer<T> { + let mut id: GLuint = 0; + unsafe { + gl::GenBuffers(1, &mut id); + } + Buffer { + id, + buffer_type, + _phantom: std::marker::PhantomData, + } + } + + pub fn id(&self) -> GLuint { + self.id + } + + pub fn buffer_data(&self, data: &[T], usage: BufferUsage) { + unsafe { + gl::BindBuffer(self.buffer_type as GLuint, self.id); + gl::BufferData( + self.buffer_type as GLuint, + (data.len() * std::mem::size_of::<T>()) as GLsizeiptr, + data.as_ptr() as *const GLvoid, + usage as GLenum, + ); + gl::BindBuffer(self.buffer_type as GLuint, 0); + } + } + + pub fn bind(&self) { + unsafe { + gl::BindBuffer(self.buffer_type as GLuint, self.id); + } + } + + pub fn unbind(&self) { + unsafe { + gl::BindBuffer(self.buffer_type as GLuint, 0); + } + } +} + +impl<T> Drop for Buffer<T> { + fn drop(&mut self) { + unsafe { + gl::DeleteBuffers(1, &self.id as *const GLuint); + } + } +} + +pub struct VertexArray { + id: GLuint, +} + +impl VertexArray { + pub fn new() -> VertexArray { + let mut id: GLuint = 0; + unsafe { + gl::GenVertexArrays(1, &mut id); + } + + VertexArray { id } + } + + pub fn id(&self) -> GLuint { + self.id + } + + pub fn bind(&self) { + unsafe { + gl::BindVertexArray(self.id); + } + } + + pub fn unbind(&self) { + unsafe { + gl::BindVertexArray(0); + } + } +} + +impl Drop for VertexArray { + fn drop(&mut self) { + unsafe { + gl::DeleteVertexArrays(1, &self.id as *const GLuint); + } + } +} + +#[allow(non_camel_case_types)] +#[repr(C, packed)] +#[derive(Copy, Clone, Debug)] +pub struct f32x2 { + x: f32, + y: f32, +} + +impl f32x2 { + pub fn new(x: f32, y: f32) -> f32x2 { + f32x2 { x, y } + } + + pub fn num_components() -> usize { + 2 + } +} + +#[allow(non_camel_case_types)] +#[repr(C, packed)] +#[derive(Copy, Clone, Debug)] +pub struct f32x3 { + x: f32, + y: f32, + z: f32, +} + +impl f32x3 { + pub fn new(x: f32, y: f32, z: f32) -> f32x3 { + f32x3 { x, y, z } + } + + pub fn num_components() -> usize { + 3 + } +} + +#[allow(non_camel_case_types)] +#[repr(C, packed)] +#[derive(Copy, Clone, Debug)] +pub struct f32x4 { + x: f32, + y: f32, + z: f32, + w: f32, +} + +impl f32x4 { + pub fn new(x: f32, y: f32, z: f32, w: f32) -> f32x4 { + f32x4 { x, y, z, w } + } + + pub fn zero() -> f32x4 { + f32x4::new(0.0, 0.0, 0.0, 0.0) + } + + pub fn set(&mut self, x: f32, y: f32, z: f32, w: f32) { + self.x = x; + self.y = y; + self.z = z; + self.w = w; + } + + pub fn num_components() -> usize { + 4 + } +}
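The Vertex type defined next is #[repr(C, packed)] precisely so its field offsets can be handed straight to glVertexAttribPointer. A sketch of the arithmetic the attribute setup relies on (sizes follow from the field types; packed means no padding is inserted):

use std::mem;
// location 0: p (three f32) at offset 0
// location 1: st (two f32) immediately after p
let stride = mem::size_of::<f32x3>() + mem::size_of::<f32x2>(); // 12 + 8 = 20 bytes
let st_offset = mem::size_of::<f32x3>();                        // 12 bytes
// with `packed`, stride equals size_of::<Vertex>() exactly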
#[derive(Copy, Clone, Debug)] +#[repr(C, packed)] +pub struct Vertex { + p: f32x3, + st: f32x2, +} +impl Vertex { + pub fn new(p: f32x3, st: f32x2) -> Vertex { + Vertex { p, st } + } + + unsafe fn vertex_attrib_pointer( + num_components: usize, + stride: usize, + location: usize, + offset: usize, + ) { + gl::EnableVertexAttribArray(location as gl::types::GLuint); // location(0) + gl::VertexAttribPointer( + location as gl::types::GLuint, // index of the vertex attribute + num_components as gl::types::GLint, /* number of components per + * vertex attrib */ + gl::FLOAT, + gl::FALSE, // normalized (int-to-float conversion), + stride as gl::types::GLint, /* byte stride between + * successive elements */ + offset as *const gl::types::GLvoid, /* offset of the first + * element */ + ); + } + + pub fn vertex_attrib_pointers() { + let stride = std::mem::size_of::<Self>(); + + let location = 0; + let offset = 0; + + // and configure the vertex array + unsafe { + Vertex::vertex_attrib_pointer(f32x3::num_components(), stride, location, offset); + } + + let location = location + 1; + let offset = offset + std::mem::size_of::<f32x3>(); + + // and configure the st array + unsafe { + Vertex::vertex_attrib_pointer(f32x2::num_components(), stride, location, offset); + } + } +} + +pub struct FullscreenQuad { + width: u32, + height: u32, + vertex_array: VertexArray, + program: Program, + texture_id: GLuint, + loc_progression: GLint, +} + +impl FullscreenQuad { + pub fn new(width: u32, height: u32) -> Result<FullscreenQuad, String> { + let vert_shader = Shader::vertex_from_source( + CStr::from_bytes_with_nul( + b" + #version 330 core + layout (location = 0) in vec3 _p; + layout (location = 1) in vec2 _st; + out vec2 st; + void main() { + gl_Position = vec4(_p, 1.0); + st = _st; + } + \0", + ) + .unwrap(), + )?; + + let frag_shader = Shader::fragment_from_source( + CStr::from_bytes_with_nul( + b" + #version 330 core + in vec2 st; + out vec4 Color; + + uniform sampler2D smp2d_0; + uniform int progression; + + void main() { + vec4 col = texture(smp2d_0, st); + col.r = pow(col.r / progression, 1/2.2); + col.g = pow(col.g / progression, 1/2.2); + col.b = pow(col.b / progression, 1/2.2); + Color = col; + } + \0", + ) + .unwrap(), + )?; + + let program = Program::from_shaders(&[vert_shader, frag_shader])?; + program.use_program(); + let loc_progression = program.get_location("progression")?; + + let vertices: Vec<Vertex> = vec![ + Vertex::new(f32x3::new(-1.0, -1.0, 0.0), f32x2::new(0.0, 0.0)), + Vertex::new(f32x3::new(1.0, -1.0, 0.0), f32x2::new(1.0, 0.0)), + Vertex::new(f32x3::new(1.0, 1.0, 0.0), f32x2::new(1.0, 1.0)), + Vertex::new(f32x3::new(-1.0, -1.0, 0.0), f32x2::new(0.0, 0.0)), + Vertex::new(f32x3::new(1.0, 1.0, 0.0), f32x2::new(1.0, 1.0)), + Vertex::new(f32x3::new(-1.0, 1.0, 0.0), f32x2::new(0.0, 1.0)), + ]; + let vertex_buffer = Buffer::<Vertex>::new(BufferType::ArrayBuffer); + vertex_buffer.buffer_data(&vertices, BufferUsage::StaticDraw); + + // Generate and bind the VAO + let vertex_array = VertexArray::new(); + + vertex_array.bind(); + // Re-bind the VBO to associate the two.
We could just have left it + // bound earlier and let the association happen when we + // configure the VAO but this way at least makes the connection + // between the two seem more explicit, despite the magical + // state machine hiding in OpenGL + vertex_buffer.bind(); + + // Set up the vertex attribute pointers for all locations + Vertex::vertex_attrib_pointers(); + + // now unbind both the vbo and vao to keep everything cleaner + vertex_buffer.unbind(); + vertex_array.unbind(); + + // generate test texture data using the image width rather than the + // framebuffer width + let mut tex_data = Vec::with_capacity((width * height) as usize); + for y in 0..height { + for x in 0..width { + tex_data.push(f32x4::new( + (x as f32) / width as f32, + (y as f32) / height as f32, + 1.0, + 0.0, + )); + } + } + + // generate the texture for the quad + let mut texture_id: gl::types::GLuint = 0; + unsafe { + gl::GenTextures(1, &mut texture_id); + gl::ActiveTexture(gl::TEXTURE0); + gl::Enable(gl::TEXTURE_2D); + gl::BindTexture(gl::TEXTURE_2D, texture_id); + gl::TexParameteri( + gl::TEXTURE_2D, + gl::TEXTURE_WRAP_S, + gl::CLAMP_TO_BORDER as gl::types::GLint, + ); + gl::TexParameteri( + gl::TEXTURE_2D, + gl::TEXTURE_WRAP_T, + gl::CLAMP_TO_BORDER as gl::types::GLint, + ); + gl::TexParameteri( + gl::TEXTURE_2D, + gl::TEXTURE_MIN_FILTER, + gl::NEAREST as gl::types::GLint, + ); + gl::TexParameteri( + gl::TEXTURE_2D, + gl::TEXTURE_MAG_FILTER, + gl::NEAREST as gl::types::GLint, + ); + gl::TexImage2D( + gl::TEXTURE_2D, + 0, + gl::RGBA32F as gl::types::GLint, + width as gl::types::GLint, + height as gl::types::GLint, + 0, + gl::RGBA, + gl::FLOAT, + tex_data.as_ptr() as *const gl::types::GLvoid, + ); + } + + Ok(FullscreenQuad { + width, + height, + vertex_array, + program, + texture_id, + loc_progression, + }) + } + + pub fn draw(&self) { + self.program.use_program(); + self.vertex_array.bind(); + unsafe { + gl::DrawArrays( + gl::TRIANGLES, + 0, // starting index in the enabled array + 6, // number of indices to draw + ) + } + self.vertex_array.unbind(); + } + + pub fn update_texture(&self, data: &[Vec4]) { + unsafe { + gl::BindTexture(gl::TEXTURE_2D, self.texture_id); + gl::TexSubImage2D( + gl::TEXTURE_2D, + 0, + 0, + 0, + self.width as GLint, + self.height as GLint, + gl::RGBA, + gl::FLOAT, + data.as_ptr() as *const GLvoid, + ); + } + } + + pub fn resize(&mut self, width: u32, height: u32) { + unsafe { + gl::TexImage2D( + gl::TEXTURE_2D, + 0, + gl::RGBA32F as gl::types::GLint, + width as gl::types::GLint, + height as gl::types::GLint, + 0, + gl::RGBA, + gl::FLOAT, + std::ptr::null(), + ); + } + self.width = width; + self.height = height; + } + + pub fn set_progression(&self, progression: i32) { + self.program.set_uniform(self.loc_progression, progression); + } +}
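Unlike ex03's hand-rolled f32x4, this version stages pixels as glam::Vec4. That works with the RGBA32F upload path because a Vec4 is a plain 16-byte quadruple of f32, i.e. exactly one texel; a quick check of the assumption:

// 4 channels * 4 bytes = one RGBA32F texel
assert_eq!(std::mem::size_of::<glam::Vec4>(), 16);
let texel = glam::Vec4::new(0.25, 0.5, 1.0, 0.0); // r, g, b, a
// a &[Vec4] slice can therefore be handed to glTexSubImage2D as *const GLvoid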
diff --git a/crates/optix/examples/ex04_mesh/src/main.rs b/crates/optix/examples/ex04_mesh/src/main.rs new file mode 100644 index 00000000..e10ffac7 --- /dev/null +++ b/crates/optix/examples/ex04_mesh/src/main.rs @@ -0,0 +1,88 @@ +#![allow(warnings)] + +mod renderer; +use renderer::Renderer; + +use glam::{vec4, Vec4}; + +mod gl_util; +use gl_util::FullscreenQuad; +use glfw::{Action, Context, Key}; + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let mut glfw = glfw::init(glfw::FAIL_ON_ERRORS).unwrap(); + glfw.window_hint(glfw::WindowHint::ContextVersion(4, 1)); + glfw.window_hint(glfw::WindowHint::OpenGlForwardCompat(true)); + glfw.window_hint(glfw::WindowHint::OpenGlProfile( + glfw::OpenGlProfileHint::Core, + )); + + let mut width = 960u32; + let mut height = 540u32; + + let mut renderer = Renderer::new(width, height)?; + + let (mut window, events) = glfw + .create_window( + width, + height, + "Example 04: mesh", + glfw::WindowMode::Windowed, + ) + .expect("failed to create glfw window"); + + window.set_key_polling(true); + window.make_current(); + + // retina displays will return a higher res for the framebuffer + // which we need to use for the viewport + let (fb_width, fb_height) = window.get_framebuffer_size(); + + gl::load_with(|s| glfw.get_proc_address_raw(s) as *const std::os::raw::c_void); + + let mut fsq = FullscreenQuad::new(width, height).unwrap(); + + let mut image_data = vec![vec4(0.0, 0.0, 0.0, 0.0); (width * height) as usize]; + + unsafe { + gl::Viewport(0, 0, fb_width, fb_height); + }; + + while !window.should_close() { + glfw.poll_events(); + for (_, event) in glfw::flush_messages(&events) { + handle_window_event(&mut window, event); + } + + let (w, h) = window.get_framebuffer_size(); + let w = w as u32; + let h = h as u32; + if w != width || h != height { + fsq.resize(w, h); + renderer.resize(w, h)?; + width = w; + height = h; + image_data.resize((width * height) as usize, vec4(0.0, 0.0, 0.0, 0.0)); + } + + renderer.render()?; + renderer.download_pixels(&mut image_data)?; + fsq.update_texture(&image_data); + fsq.set_progression(1); + + // draw the quad + fsq.draw(); + + window.swap_buffers(); + } + + renderer.render()?; + Ok(()) +} + +fn handle_window_event(window: &mut glfw::Window, event: glfw::WindowEvent) { + match event { + glfw::WindowEvent::Key(Key::Escape, _, Action::Press, _) => window.set_should_close(true), + _ => {} + } +}
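renderer.rs below packs one SBT record per program group, with a HitgroupSbtData { object_id } payload that a closest-hit program could use to index per-object data. Conceptually a packed record looks like the sketch here (an illustration of the OptiX record layout, not the crate's actual definition):

#[repr(C)]
struct SbtRecordSketch<T> {
    // OPTIX_SBT_RECORD_HEADER_SIZE (32) bytes, filled in by pack() from the ProgramGroup
    header: [u8; 32],
    // the user payload, which must be DeviceCopy (e.g. HitgroupSbtData)
    data: T,
}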
diff --git a/crates/optix/examples/ex04_mesh/src/renderer.rs b/crates/optix/examples/ex04_mesh/src/renderer.rs new file mode 100644 index 00000000..e99cd4e1 --- /dev/null +++ b/crates/optix/examples/ex04_mesh/src/renderer.rs @@ -0,0 +1,322 @@ +use anyhow::{Context, Result}; +use cust::context::{Context as CuContext, ContextFlags}; +use cust::device::Device; +use cust::memory::{CopyDestination, DeviceBox, DeviceBuffer, DevicePointer, DeviceVariable}; +use cust::stream::{Stream, StreamFlags}; +use cust::{CudaFlags, DeviceCopy}; + +use optix::{ + acceleration::IndexedTriangleArray, + acceleration::{ + Accel, AccelBuildOptions, BuildFlags, GeometryFlags, Traversable, TraversableHandle, + }, + context::DeviceContext, + pipeline::{ + CompileDebugLevel, CompileOptimizationLevel, ExceptionFlags, Module, ModuleCompileOptions, + Pipeline, PipelineCompileOptions, PipelineLinkOptions, ProgramGroup, ProgramGroupDesc, + TraversableGraphFlags, + }, + shader_binding_table::{SbtRecord, ShaderBindingTable}, +}; + +use glam::{ivec2, vec3, IVec2, IVec3, Vec3, Vec4}; + +pub struct Renderer { + launch_params: DeviceVariable<LaunchParams>, + sbt: ShaderBindingTable, + gas: Accel, + buf_raygen: DeviceBuffer<RaygenRecord>, + buf_hitgroup: DeviceBuffer<HitgroupRecord>, + buf_miss: DeviceBuffer<MissRecord>, + pipeline: Pipeline, + color_buffer: DeviceBuffer<Vec4>, + ctx: DeviceContext, + stream: Stream, + cuda_context: CuContext, +} + +impl Renderer { + pub fn new(width: u32, height: u32) -> Result<Renderer, Box<dyn std::error::Error>> { + init_optix()?; + + // create CUDA and OptiX contexts + let device = Device::get_device(0)?; + + let cuda_context = CuContext::new(device)?; + let stream = Stream::new(StreamFlags::DEFAULT, None)?; + + let mut ctx = DeviceContext::new(&cuda_context, true)?; + ctx.set_log_callback(|_level, tag, msg| println!("[{}]: {}", tag, msg), 4)?; + + // create module + let module_compile_options = ModuleCompileOptions { + max_register_count: 50, + opt_level: CompileOptimizationLevel::Default, + debug_level: CompileDebugLevel::None, + }; + + let pipeline_compile_options = PipelineCompileOptions::new() + .pipeline_launch_params_variable_name("PARAMS") + .uses_motion_blur(false) + .num_attribute_values(2) + .num_payload_values(2) + .traversable_graph_flags(TraversableGraphFlags::ALLOW_SINGLE_GAS) + .exception_flags(ExceptionFlags::NONE); + + let ptx = include_str!(concat!(env!("OUT_DIR"), "/src/ex04_mesh.ptx")); + + let (module, _log) = Module::new( + &mut ctx, + &module_compile_options, + &pipeline_compile_options, + ptx, + ) + .context("Create module")?; + + // create raygen program + let pgdesc_raygen = ProgramGroupDesc::raygen(&module, "__raygen__renderFrame"); + + let (pg_raygen, _log) = ProgramGroup::new(&mut ctx, &[pgdesc_raygen])?; + + // create miss program + let pgdesc_miss = ProgramGroupDesc::miss(&module, "__miss__radiance"); + + let (pg_miss, _log) = ProgramGroup::new(&mut ctx, &[pgdesc_miss])?; + + let pgdesc_hitgroup = ProgramGroupDesc::hitgroup( + Some((&module, "__closesthit__radiance")), + Some((&module, "__anyhit__radiance")), + None, + ); + + // create hitgroup programs + let (pg_hitgroup, _log) = ProgramGroup::new(&mut ctx, &[pgdesc_hitgroup])?; + + // create geometry and accels + let mut vertices = Vec::new(); + let mut indices = Vec::new(); + add_cube( + vec3(0.0, -1.5, 0.0), + vec3(10.0, 0.1, 10.0), + &mut vertices, + &mut indices, + ); + add_cube( + vec3(0.0, 0.0, 0.0), + vec3(2.0, 2.0, 2.0), + &mut vertices, + &mut indices, + ); + + let buf_vertex = DeviceBuffer::from_slice(&vertices)?; + let buf_indices = DeviceBuffer::from_slice(&indices)?; + + let geometry_flags = GeometryFlags::None; + let build_inputs = + IndexedTriangleArray::new(&[&buf_vertex], &buf_indices, &[geometry_flags]); + + let accel_options = + AccelBuildOptions::new(BuildFlags::PREFER_FAST_TRACE | BuildFlags::ALLOW_COMPACTION); + + // build and compact the GAS + let gas = Accel::build(&ctx, &stream, &[accel_options], &[build_inputs], true)?; + + stream.synchronize()?; + + // create SBT + let rec_raygen: Vec<_> = pg_raygen + .iter() + .map(|pg| RaygenRecord::pack(0, pg).expect("failed to pack raygen record")) + .collect(); + + let rec_miss: Vec<_> = pg_miss + .iter() + .map(|pg| MissRecord::pack(0, pg).expect("failed to pack miss record")) + .collect(); + + let num_objects = 1; + let rec_hitgroup: Vec<_> = (0..num_objects) + .map(|i| { + let object_type = 0; + let rec = HitgroupRecord::pack( + HitgroupSbtData { object_id: i }, + &pg_hitgroup[object_type], + ) + .expect("failed to pack hitgroup record"); + rec + }) + .collect(); + + let mut buf_raygen = DeviceBuffer::from_slice(&rec_raygen)?; + let mut buf_miss = DeviceBuffer::from_slice(&rec_miss)?; + let mut buf_hitgroup = DeviceBuffer::from_slice(&rec_hitgroup)?; + + let sbt = ShaderBindingTable::new(&buf_raygen) + .miss(&buf_miss) + .hitgroup(&buf_hitgroup); + + // create pipeline + let mut program_groups = Vec::new(); + program_groups.extend(pg_raygen.into_iter()); + program_groups.extend(pg_miss.into_iter()); + program_groups.extend(pg_hitgroup.into_iter()); + + let pipeline_link_options = PipelineLinkOptions { + max_trace_depth: 2, + debug_level: CompileDebugLevel::LineInfo, + }; + + let (pipeline, _log) = Pipeline::new( + &mut ctx, + &pipeline_compile_options, + pipeline_link_options, + &program_groups, + )?; + + pipeline.set_stack_size(2 * 1024, 2 * 1024, 2 * 1024, 1)?; + + let color_buffer = + unsafe { DeviceBuffer::uninitialized(width as usize * height as usize)? }; + + let from = vec3(-10.0, 2.0, -12.0); + let at = vec3(0.0, 0.0, 0.0); + let up = vec3(0.0, 1.0, 0.0); + + let cosfovy = 0.66f32; + let aspect = width as f32 / height as f32; + let direction = (at - from).normalize(); + let horizontal = cosfovy * aspect * direction.cross(up).normalize(); + let vertical = cosfovy * horizontal.cross(direction).normalize(); + + let launch_params = DeviceVariable::new(LaunchParams { + frame: Frame { + color_buffer: color_buffer.as_device_ptr(), + size: ivec2(width as i32, height as i32), + }, + camera: RenderCamera { + position: from, + direction, + horizontal, + vertical, + }, + traversable: gas.handle(), + })?; + + Ok(Renderer { + ctx, + cuda_context, + stream, + launch_params, + gas, + buf_raygen, + buf_hitgroup, + buf_miss, + sbt, + pipeline, + color_buffer, + }) + }
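The camera block near the end of new() builds the three vectors the device-side raygen combines as normalize(direction + (sx - 0.5) * horizontal + (sy - 0.5) * vertical). Annotated, under the reading that the image plane sits one unit along direction:

let aspect = width as f32 / height as f32;
let direction = (at - from).normalize();
// a full-screen step in x sweeps `horizontal`; aspect keeps pixels square
let horizontal = cosfovy * aspect * direction.cross(up).normalize();
let vertical = cosfovy * horizontal.cross(direction).normalize();
// half the vertical sweep is 0.5 * cosfovy = 0.33, so despite its name
// cosfovy acts as 2*tan(fovy/2), i.e. a vertical FOV of roughly 37 degrees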
pub fn resize(&mut self, width: u32, height: u32) -> Result<(), Box<dyn std::error::Error>> { + self.color_buffer = unsafe { DeviceBuffer::uninitialized((width * height) as usize)? }; + self.launch_params.frame.size.x = width as i32; + self.launch_params.frame.size.y = height as i32; + self.launch_params.frame.color_buffer = self.color_buffer.as_device_ptr(); + Ok(()) + } + + pub fn render(&mut self) -> Result<(), Box<dyn std::error::Error>> { + self.launch_params.copy_htod()?; + + unsafe { + optix::launch( + &self.pipeline, + &self.stream, + &self.launch_params, + &self.sbt, + self.launch_params.frame.size.x as u32, + self.launch_params.frame.size.y as u32, + 1, + )?; + } + + self.stream.synchronize()?; + + Ok(()) + } + + pub fn download_pixels(&self, slice: &mut [Vec4]) -> Result<(), Box<dyn std::error::Error>> { + self.color_buffer.copy_to(slice)?; + Ok(()) + } +} + +#[repr(C)] +#[derive(Copy, Clone, DeviceCopy)] +pub struct Frame { + color_buffer: DevicePointer<Vec4>, + size: IVec2, +} + +#[repr(C)] +#[derive(Copy, Clone, DeviceCopy)] +pub struct RenderCamera { + position: Vec3, + direction: Vec3, + horizontal: Vec3, + vertical: Vec3, +} + +#[repr(C)] +#[derive(Copy, Clone, DeviceCopy)] +pub struct LaunchParams { + pub frame: Frame, + pub camera: RenderCamera, + pub traversable: TraversableHandle, +} + +type RaygenRecord = SbtRecord<i32>; +type MissRecord = SbtRecord<i32>; + +#[derive(Copy, Clone, Default, DeviceCopy)] +struct HitgroupSbtData { + object_id: u32, +} +type HitgroupRecord = SbtRecord<HitgroupSbtData>; + +fn init_optix() -> Result<(), Box<dyn std::error::Error>> { + cust::init(CudaFlags::empty())?; + let device_count = Device::num_devices()?; + if device_count == 0 { + panic!("No CUDA devices found!"); + } + + optix::init()?; + Ok(()) +} + +pub fn add_cube(center: Vec3, size: Vec3, vertices: &mut Vec<Vec3>, indices: &mut Vec<IVec3>) { + let start_index = vertices.len() as i32; + + vertices.push((vec3(0.0, 0.0, 0.0)) * size + center - 0.5 * size); + vertices.push((vec3(1.0, 0.0, 0.0)) * size + center - 0.5 * size); + vertices.push((vec3(0.0, 1.0, 0.0)) * size + center - 0.5 * size); + vertices.push((vec3(1.0, 1.0, 0.0)) * size + center - 0.5 * size); + vertices.push((vec3(0.0, 0.0, 1.0)) * size + center - 0.5 * size); + vertices.push((vec3(1.0, 0.0, 1.0)) * size + center - 0.5 * size); + vertices.push((vec3(0.0, 1.0, 1.0)) * size + center - 0.5 * size); + vertices.push((vec3(1.0, 1.0, 1.0)) * size + center - 0.5 * size); + + const idx: [i32; 36] = [ + 0, 1, 3, 2, 3, 0, 5, 7, 6, 5, 6, 4, 0, 4, 5, 0, 5, 1, 2, 3, 7, 2, 7, 6, 1, 5, 6, 1, 7, 3, + 4, 0, 2, 4, 2, 6, + ]; + + for c in idx.chunks(3) { + indices.push(IVec3::new( + c[0] + start_index, + c[1] + start_index, + c[2] + start_index, + )); + } +} diff --git a/crates/optix/examples/ex04_mesh/src/vector.rs b/crates/optix/examples/ex04_mesh/src/vector.rs new
file mode 100644 index 00000000..589a4134 --- /dev/null +++ b/crates/optix/examples/ex04_mesh/src/vector.rs @@ -0,0 +1,301 @@ +use core::ops; +pub use num_traits::{One, Zero}; + +pub trait Scalar: num_traits::One + num_traits::Zero {} + +impl Scalar for i8 {} +impl Scalar for i16 {} +impl Scalar for i32 {} +impl Scalar for i64 {} +impl Scalar for f32 {} +impl Scalar for f64 {} + +pub trait Vector { + type Component: Scalar; + + fn dot(&self, v: &Self) -> Self::Component; + + #[inline] + fn length2(&self) -> Self::Component { + self.dot(&self) + } +} + +macro_rules! vec_impl { + ($name:ident: $t:ty, $sc:ident, $align:expr, ($($c:ident),+)) => { + #[repr(C)] + #[derive(Clone, Copy, Default, PartialEq, Debug)] + pub struct $name + { + $( + pub $c: $t, + )+ + } + + impl $name + { + pub fn new($($c: $t),+) -> Self + { + Self { + $( + $c, + )+ + } + } + } + + impl Vector for $name + { + type Component = $t; + + #[inline] + fn dot(&self, v: &Self) -> $t + { + <$t>::zero() $( + + self.$c * v.$c + )+ + } + } + + impl From<$t> for $name + { + fn from(x: $t) -> Self + { + Self { + $( + $c: x, + )+ + } + } + } + + impl ops::Neg for $name + { + type Output = Self; + + fn neg(self) -> Self + { + Self { + $( + $c: -self.$c, + )+ + } + } + } + + impl ops::Add for $name + { + type Output = Self; + + #[inline] + fn add(self, v: Self) -> Self + { + Self { + $( + $c: self.$c + v.$c, + )+ + } + } + } + + impl ops::AddAssign for $name + { + #[inline] + fn add_assign(&mut self, v: Self) + { + $( + self.$c += v.$c; + )+ + } + } + + impl ops::Sub for $name + { + type Output = Self; + + #[inline] + fn sub(self, v: Self) -> Self + { + Self { + $( + $c: self.$c - v.$c, + )+ + } + } + } + + impl ops::SubAssign for $name + { + #[inline] + fn sub_assign(&mut self, v: Self) + { + $( + self.$c -= v.$c; + )+ + } + } + + impl ops::Mul for $name + { + type Output = Self; + + #[inline] + fn mul(self, v: Self) -> Self + { + Self { + $( + $c: self.$c * v.$c, + )+ + } + } + } + + impl ops::MulAssign for $name + { + #[inline] + fn mul_assign(&mut self, v: Self) + { + $( + self.$c *= v.$c; + )+ + } + } + + impl ops::Mul<$t> for $name + { + type Output = Self; + + #[inline] + fn mul(self, v: $t) -> Self + { + Self { + $( + $c: self.$c * v, + )+ + } + } + } + + impl ops::MulAssign<$t> for $name + { + #[inline] + fn mul_assign(&mut self, v: $t) + { + $( + self.$c *= v; + )+ + } + } + + impl ops::Div<$t> for $name + { + type Output = Self; + + #[inline] + fn div(self, v: $t) -> Self + { + Self { + $( + $c: self.$c / v, + )+ + } + } + } + + impl ops::DivAssign<$t> for $name + { + #[inline] + fn div_assign(&mut self, v: $t) + { + $( + self.$c /= v; + )+ + } + } + + impl ops::Mul<$name> for $t + { + type Output = $name; + + #[inline] + fn mul(self, v: $name) -> $name + { + $name { + $( + $c: self * v.$c, + )+ + } + } + } + + impl ops::Div<$name> for $t + { + type Output = $name; + + #[inline] + fn div(self, v: $name) -> $name + { + $name { + $( + $c: self / v.$c, + )+ + } + } + } + + pub fn $sc($($c: $t),+) -> $name + { + $name { + $( + $c, + )+ + } + } + + unsafe impl cust::memory::DeviceCopy for $name { + // fn device_align() -> usize { + // $align + // } + } + }; + +} + +vec_impl!(V2i8: i8, v2i8, 1, (x, y)); +vec_impl!(V2i16: i16, v2i16, 2, (x, y)); +vec_impl!(V2i32: i32, v2i32, 8, (x, y)); +vec_impl!(V2i64: i64, v2i64, 8, (x, y)); +vec_impl!(V3i8: i8, v3i8, 1, (x, y, z)); +vec_impl!(V3i16: i16, v3i16, 2, (x, y, z)); +vec_impl!(V3i32: i32, v3i32, 4, (x, y, z)); +vec_impl!(V3i64: i64, v3i64, 8, (x, y, z)); +vec_impl!(V4i8: i8, v4i8, 1, 
(x, y, z, w)); +vec_impl!(V4i16: i16, v4i16, 2, (x, y, z, w)); +vec_impl!(V4i32: i32, v4i32, 16, (x, y, z, w)); +vec_impl!(V4i64: i64, v4i64, 8, (x, y, z, w)); + +vec_impl!(V2f32: f32, v2f32, 8, (x, y)); +vec_impl!(V2f64: f64, v2f64, 8, (x, y)); +vec_impl!(V3f32: f32, v3f32, 4, (x, y, z)); +vec_impl!(V3f64: f64, v3f64, 8, (x, y, z)); +vec_impl!(V4f32: f32, v4f32, 16, (x, y, z, w)); +vec_impl!(V4f64: f64, v4f64, 8, (x, y, z, w)); + +vec_impl!(P2f32: f32, p2f32, 8, (x, y)); +vec_impl!(P2f64: f64, p2f64, 8, (x, y)); +vec_impl!(P3f32: f32, p3f32, 4, (x, y, z)); +vec_impl!(P3f64: f64, p3f64, 8, (x, y, z)); +vec_impl!(P4f32: f32, p4f32, 16, (x, y, z, w)); +vec_impl!(P4f64: f64, p4f64, 8, (x, y, z, w)); + +vec_impl!(N2f32: f32, n2f32, 8, (x, y)); +vec_impl!(N2f64: f64, n2f64, 8, (x, y)); +vec_impl!(N3f32: f32, n3f32, 4, (x, y, z)); +vec_impl!(N3f64: f64, n3f64, 8, (x, y, z)); +vec_impl!(N4f32: f32, n4f32, 16, (x, y, z, w)); +vec_impl!(N4f64: f64, n4f64, 8, (x, y, z, w)); + +#[inline] +pub fn dot<T: Vector>(a: &T, b: &T) -> T::Component { + a.dot(b) +} diff --git a/crates/optix/images/example_sbt.jpg b/crates/optix/images/example_sbt.jpg new file mode 100644 index 00000000..1898e0a0 Binary files /dev/null and b/crates/optix/images/example_sbt.jpg differ diff --git a/crates/optix/images/example_sbt.png b/crates/optix/images/example_sbt.png new file mode 100644 index 00000000..03a86851 Binary files /dev/null and b/crates/optix/images/example_sbt.png differ diff --git a/crates/optix/images/optix_programs.jpg b/crates/optix/images/optix_programs.jpg new file mode 100644 index 00000000..850fd7e8 Binary files /dev/null and b/crates/optix/images/optix_programs.jpg differ diff --git a/crates/optix/images/scene_graph.jpg b/crates/optix/images/scene_graph.jpg new file mode 100644 index 00000000..8f24b0a3 Binary files /dev/null and b/crates/optix/images/scene_graph.jpg differ diff --git a/crates/optix/images/scene_graph.png b/crates/optix/images/scene_graph.png new file mode 100644 index 00000000..360becf7 Binary files /dev/null and b/crates/optix/images/scene_graph.png differ diff --git a/crates/optix/images/traversables_graph.jpg b/crates/optix/images/traversables_graph.jpg new file mode 100644 index 00000000..7250201e Binary files /dev/null and b/crates/optix/images/traversables_graph.jpg differ diff --git a/crates/optix_sys/optix_stubs.c b/crates/optix/optix_stubs.c similarity index 99% rename from crates/optix_sys/optix_stubs.c rename to crates/optix/optix_stubs.c index 01529541..3325d867 100644 --- a/crates/optix_sys/optix_stubs.c +++ b/crates/optix/optix_stubs.c @@ -603,3 +603,4 @@ extern "C" #ifdef __cplusplus } #endif + diff --git a/crates/optix_sys/optix.rs b/crates/optix/optix_wrapper.rs similarity index 57% rename from crates/optix_sys/optix.rs rename to crates/optix/optix_wrapper.rs index 2643576e..24bad9e9 100644 --- a/crates/optix_sys/optix.rs +++ b/crates/optix/optix_wrapper.rs @@ -1,4 +1,4 @@ -/* automatically generated by rust-bindgen 0.58.1 */ +/* automatically generated by rust-bindgen 0.59.2 */ #[repr(C)] pub struct __BindgenUnionField<T>(::std::marker::PhantomData<T>); @@ -43,16 +43,6 @@ impl<T> ::std::cmp::PartialEq for __BindgenUnionField<T> { } } impl<T> ::std::cmp::Eq for __BindgenUnionField<T> {} -pub const OPTIX_VERSION: u32 = 70300; -pub const OPTIX_SBT_RECORD_ALIGNMENT: u32 = 16; -pub const OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT: u32 = 128; -pub const OPTIX_INSTANCE_BYTE_ALIGNMENT: u32 = 16; -pub const OPTIX_AABB_BUFFER_BYTE_ALIGNMENT: u32 = 8; -pub const OPTIX_GEOMETRY_TRANSFORM_BYTE_ALIGNMENT: u32 = 16; -pub
const OPTIX_TRANSFORM_BYTE_ALIGNMENT: u32 = 64; -pub const OPTIX_COMPILE_DEFAULT_MAX_REGISTER_COUNT: u32 = 0; -pub const OPTIX_COMPILE_DEFAULT_MAX_PAYLOAD_VALUE_COUNT: u32 = 8; -pub const OPTIX_ABI_VERSION: u32 = 47; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct OptixDeviceContext_t { @@ -85,59 +75,131 @@ pub struct OptixDenoiser_t { pub type OptixDenoiser = *mut OptixDenoiser_t; pub type OptixTraversableHandle = ::std::os::raw::c_ulonglong; pub type OptixVisibilityMask = ::std::os::raw::c_uint; -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixResult { - OPTIX_SUCCESS = 0, - OPTIX_ERROR_INVALID_VALUE = 7001, - OPTIX_ERROR_HOST_OUT_OF_MEMORY = 7002, - OPTIX_ERROR_INVALID_OPERATION = 7003, - OPTIX_ERROR_FILE_IO_ERROR = 7004, - OPTIX_ERROR_INVALID_FILE_FORMAT = 7005, - OPTIX_ERROR_DISK_CACHE_INVALID_PATH = 7010, - OPTIX_ERROR_DISK_CACHE_PERMISSION_ERROR = 7011, - OPTIX_ERROR_DISK_CACHE_DATABASE_ERROR = 7012, - OPTIX_ERROR_DISK_CACHE_INVALID_DATA = 7013, - OPTIX_ERROR_LAUNCH_FAILURE = 7050, - OPTIX_ERROR_INVALID_DEVICE_CONTEXT = 7051, - OPTIX_ERROR_CUDA_NOT_INITIALIZED = 7052, - OPTIX_ERROR_VALIDATION_FAILURE = 7053, - OPTIX_ERROR_INVALID_PTX = 7200, - OPTIX_ERROR_INVALID_LAUNCH_PARAMETER = 7201, - OPTIX_ERROR_INVALID_PAYLOAD_ACCESS = 7202, - OPTIX_ERROR_INVALID_ATTRIBUTE_ACCESS = 7203, - OPTIX_ERROR_INVALID_FUNCTION_USE = 7204, - OPTIX_ERROR_INVALID_FUNCTION_ARGUMENTS = 7205, - OPTIX_ERROR_PIPELINE_OUT_OF_CONSTANT_MEMORY = 7250, - OPTIX_ERROR_PIPELINE_LINK_ERROR = 7251, - OPTIX_ERROR_ILLEGAL_DURING_TASK_EXECUTE = 7270, - OPTIX_ERROR_INTERNAL_COMPILER_ERROR = 7299, - OPTIX_ERROR_DENOISER_MODEL_NOT_SET = 7300, - OPTIX_ERROR_DENOISER_NOT_INITIALIZED = 7301, - OPTIX_ERROR_ACCEL_NOT_COMPATIBLE = 7400, - OPTIX_ERROR_NOT_SUPPORTED = 7800, - OPTIX_ERROR_UNSUPPORTED_ABI_VERSION = 7801, - OPTIX_ERROR_FUNCTION_TABLE_SIZE_MISMATCH = 7802, - OPTIX_ERROR_INVALID_ENTRY_FUNCTION_OPTIONS = 7803, - OPTIX_ERROR_LIBRARY_NOT_FOUND = 7804, - OPTIX_ERROR_ENTRY_SYMBOL_NOT_FOUND = 7805, - OPTIX_ERROR_LIBRARY_UNLOAD_FAILURE = 7806, - OPTIX_ERROR_CUDA_ERROR = 7900, - OPTIX_ERROR_INTERNAL_ERROR = 7990, - OPTIX_ERROR_UNKNOWN = 7999, +impl OptixResult { + pub const OPTIX_SUCCESS: OptixResult = OptixResult(0); } -#[repr(i32)] +impl OptixResult { + pub const OPTIX_ERROR_INVALID_VALUE: OptixResult = OptixResult(7001); +} +impl OptixResult { + pub const OPTIX_ERROR_HOST_OUT_OF_MEMORY: OptixResult = OptixResult(7002); +} +impl OptixResult { + pub const OPTIX_ERROR_INVALID_OPERATION: OptixResult = OptixResult(7003); +} +impl OptixResult { + pub const OPTIX_ERROR_FILE_IO_ERROR: OptixResult = OptixResult(7004); +} +impl OptixResult { + pub const OPTIX_ERROR_INVALID_FILE_FORMAT: OptixResult = OptixResult(7005); +} +impl OptixResult { + pub const OPTIX_ERROR_DISK_CACHE_INVALID_PATH: OptixResult = OptixResult(7010); +} +impl OptixResult { + pub const OPTIX_ERROR_DISK_CACHE_PERMISSION_ERROR: OptixResult = OptixResult(7011); +} +impl OptixResult { + pub const OPTIX_ERROR_DISK_CACHE_DATABASE_ERROR: OptixResult = OptixResult(7012); +} +impl OptixResult { + pub const OPTIX_ERROR_DISK_CACHE_INVALID_DATA: OptixResult = OptixResult(7013); +} +impl OptixResult { + pub const OPTIX_ERROR_LAUNCH_FAILURE: OptixResult = OptixResult(7050); +} +impl OptixResult { + pub const OPTIX_ERROR_INVALID_DEVICE_CONTEXT: OptixResult = OptixResult(7051); +} +impl OptixResult { + pub const OPTIX_ERROR_CUDA_NOT_INITIALIZED: OptixResult = OptixResult(7052); +} +impl OptixResult { + pub const OPTIX_ERROR_VALIDATION_FAILURE: 
OptixResult = OptixResult(7053); +} +impl OptixResult { + pub const OPTIX_ERROR_INVALID_PTX: OptixResult = OptixResult(7200); +} +impl OptixResult { + pub const OPTIX_ERROR_INVALID_LAUNCH_PARAMETER: OptixResult = OptixResult(7201); +} +impl OptixResult { + pub const OPTIX_ERROR_INVALID_PAYLOAD_ACCESS: OptixResult = OptixResult(7202); +} +impl OptixResult { + pub const OPTIX_ERROR_INVALID_ATTRIBUTE_ACCESS: OptixResult = OptixResult(7203); +} +impl OptixResult { + pub const OPTIX_ERROR_INVALID_FUNCTION_USE: OptixResult = OptixResult(7204); +} +impl OptixResult { + pub const OPTIX_ERROR_INVALID_FUNCTION_ARGUMENTS: OptixResult = OptixResult(7205); +} +impl OptixResult { + pub const OPTIX_ERROR_PIPELINE_OUT_OF_CONSTANT_MEMORY: OptixResult = OptixResult(7250); +} +impl OptixResult { + pub const OPTIX_ERROR_PIPELINE_LINK_ERROR: OptixResult = OptixResult(7251); +} +impl OptixResult { + pub const OPTIX_ERROR_ILLEGAL_DURING_TASK_EXECUTE: OptixResult = OptixResult(7270); +} +impl OptixResult { + pub const OPTIX_ERROR_INTERNAL_COMPILER_ERROR: OptixResult = OptixResult(7299); +} +impl OptixResult { + pub const OPTIX_ERROR_DENOISER_MODEL_NOT_SET: OptixResult = OptixResult(7300); +} +impl OptixResult { + pub const OPTIX_ERROR_DENOISER_NOT_INITIALIZED: OptixResult = OptixResult(7301); +} +impl OptixResult { + pub const OPTIX_ERROR_ACCEL_NOT_COMPATIBLE: OptixResult = OptixResult(7400); +} +impl OptixResult { + pub const OPTIX_ERROR_NOT_SUPPORTED: OptixResult = OptixResult(7800); +} +impl OptixResult { + pub const OPTIX_ERROR_UNSUPPORTED_ABI_VERSION: OptixResult = OptixResult(7801); +} +impl OptixResult { + pub const OPTIX_ERROR_FUNCTION_TABLE_SIZE_MISMATCH: OptixResult = OptixResult(7802); +} +impl OptixResult { + pub const OPTIX_ERROR_INVALID_ENTRY_FUNCTION_OPTIONS: OptixResult = OptixResult(7803); +} +impl OptixResult { + pub const OPTIX_ERROR_LIBRARY_NOT_FOUND: OptixResult = OptixResult(7804); +} +impl OptixResult { + pub const OPTIX_ERROR_ENTRY_SYMBOL_NOT_FOUND: OptixResult = OptixResult(7805); +} +impl OptixResult { + pub const OPTIX_ERROR_LIBRARY_UNLOAD_FAILURE: OptixResult = OptixResult(7806); +} +impl OptixResult { + pub const OPTIX_ERROR_CUDA_ERROR: OptixResult = OptixResult(7900); +} +impl OptixResult { + pub const OPTIX_ERROR_INTERNAL_ERROR: OptixResult = OptixResult(7990); +} +impl OptixResult { + pub const OPTIX_ERROR_UNKNOWN: OptixResult = OptixResult(7999); +} +#[repr(transparent)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixDeviceProperty { - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_TRACE_DEPTH = 8193, - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_TRAVERSABLE_GRAPH_DEPTH = 8194, - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_PRIMITIVES_PER_GAS = 8195, - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCES_PER_IAS = 8196, - OPTIX_DEVICE_PROPERTY_RTCORE_VERSION = 8197, - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID = 8198, - OPTIX_DEVICE_PROPERTY_LIMIT_NUM_BITS_INSTANCE_VISIBILITY_MASK = 8199, - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_SBT_RECORDS_PER_GAS = 8200, - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_SBT_OFFSET = 8201, +pub struct OptixResult(pub ::std::os::raw::c_int); +pub mod OptixDeviceProperty { + pub type Type = ::std::os::raw::c_int; + pub const OPTIX_DEVICE_PROPERTY_LIMIT_MAX_TRACE_DEPTH: Type = 8193; + pub const OPTIX_DEVICE_PROPERTY_LIMIT_MAX_TRAVERSABLE_GRAPH_DEPTH: Type = 8194; + pub const OPTIX_DEVICE_PROPERTY_LIMIT_MAX_PRIMITIVES_PER_GAS: Type = 8195; + pub const OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCES_PER_IAS: Type = 8196; + pub const OPTIX_DEVICE_PROPERTY_RTCORE_VERSION: Type = 8197; + pub const 
OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID: Type = 8198; + pub const OPTIX_DEVICE_PROPERTY_LIMIT_NUM_BITS_INSTANCE_VISIBILITY_MASK: Type = 8199; + pub const OPTIX_DEVICE_PROPERTY_LIMIT_MAX_SBT_RECORDS_PER_GAS: Type = 8200; + pub const OPTIX_DEVICE_PROPERTY_LIMIT_MAX_SBT_OFFSET: Type = 8201; } pub type OptixLogCallback = ::std::option::Option< unsafe extern "C" fn( @@ -147,14 +209,13 @@ pub type OptixLogCallback = ::std::option::Option< cbdata: *mut ::std::os::raw::c_void, ), >; -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixDeviceContextValidationMode { - OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_OFF = 0, - OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL = -1, -} +pub const OptixDeviceContextValidationMode_OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_OFF: + OptixDeviceContextValidationMode = 0; +pub const OptixDeviceContextValidationMode_OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL: + OptixDeviceContextValidationMode = -1; +pub type OptixDeviceContextValidationMode = ::std::os::raw::c_int; #[repr(C)] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq)] pub struct OptixDeviceContextOptions { pub logCallbackFunction: OptixLogCallback, pub logCallbackData: *mut ::std::os::raw::c_void, @@ -163,46 +224,31 @@ pub struct OptixDeviceContextOptions { } impl Default for OptixDeviceContextOptions { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixGeometryFlags { - OPTIX_GEOMETRY_FLAG_NONE = 0, - OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT = 1, - OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL = 2, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixHitKind { - OPTIX_HIT_KIND_TRIANGLE_FRONT_FACE = 254, - OPTIX_HIT_KIND_TRIANGLE_BACK_FACE = 255, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixIndicesFormat { - OPTIX_INDICES_FORMAT_NONE = 0, - OPTIX_INDICES_FORMAT_UNSIGNED_SHORT3 = 8450, - OPTIX_INDICES_FORMAT_UNSIGNED_INT3 = 8451, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixVertexFormat { - OPTIX_VERTEX_FORMAT_NONE = 0, - OPTIX_VERTEX_FORMAT_FLOAT3 = 8481, - OPTIX_VERTEX_FORMAT_FLOAT2 = 8482, - OPTIX_VERTEX_FORMAT_HALF3 = 8483, - OPTIX_VERTEX_FORMAT_HALF2 = 8484, - OPTIX_VERTEX_FORMAT_SNORM16_3 = 8485, - OPTIX_VERTEX_FORMAT_SNORM16_2 = 8486, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixTransformFormat { - OPTIX_TRANSFORM_FORMAT_NONE = 0, - OPTIX_TRANSFORM_FORMAT_MATRIX_FLOAT12 = 8673, -} +pub const OptixHitKind_OPTIX_HIT_KIND_TRIANGLE_FRONT_FACE: OptixHitKind = 254; +pub const OptixHitKind_OPTIX_HIT_KIND_TRIANGLE_BACK_FACE: OptixHitKind = 255; +pub type OptixHitKind = ::std::os::raw::c_int; +pub const OptixIndicesFormat_OPTIX_INDICES_FORMAT_NONE: OptixIndicesFormat = 0; +pub const OptixIndicesFormat_OPTIX_INDICES_FORMAT_UNSIGNED_SHORT3: OptixIndicesFormat = 8450; +pub const OptixIndicesFormat_OPTIX_INDICES_FORMAT_UNSIGNED_INT3: OptixIndicesFormat = 8451; +pub type OptixIndicesFormat = ::std::os::raw::c_int; +pub const OptixVertexFormat_OPTIX_VERTEX_FORMAT_NONE: OptixVertexFormat = 0; +pub const OptixVertexFormat_OPTIX_VERTEX_FORMAT_FLOAT3: OptixVertexFormat = 8481; +pub const OptixVertexFormat_OPTIX_VERTEX_FORMAT_FLOAT2: OptixVertexFormat = 8482; +pub const OptixVertexFormat_OPTIX_VERTEX_FORMAT_HALF3: 
OptixVertexFormat = 8483; +pub const OptixVertexFormat_OPTIX_VERTEX_FORMAT_HALF2: OptixVertexFormat = 8484; +pub const OptixVertexFormat_OPTIX_VERTEX_FORMAT_SNORM16_3: OptixVertexFormat = 8485; +pub const OptixVertexFormat_OPTIX_VERTEX_FORMAT_SNORM16_2: OptixVertexFormat = 8486; +pub type OptixVertexFormat = ::std::os::raw::c_int; +pub const OptixTransformFormat_OPTIX_TRANSFORM_FORMAT_NONE: OptixTransformFormat = 0; +pub const OptixTransformFormat_OPTIX_TRANSFORM_FORMAT_MATRIX_FLOAT12: OptixTransformFormat = 8673; +pub type OptixTransformFormat = ::std::os::raw::c_int; #[repr(C)] pub struct OptixBuildInputTriangleArray { pub vertexBuffers: *const CUdeviceptr, @@ -224,27 +270,30 @@ pub struct OptixBuildInputTriangleArray { } impl Default for OptixBuildInputTriangleArray { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixPrimitiveType { - OPTIX_PRIMITIVE_TYPE_CUSTOM = 9472, - OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE = 9473, - OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE = 9474, - OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR = 9475, - OPTIX_PRIMITIVE_TYPE_TRIANGLE = 9521, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixPrimitiveTypeFlags { - OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM = 1, - OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE = 2, - OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE = 4, - OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_LINEAR = 8, - OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE = -2147483648, -} +pub const OptixPrimitiveType_OPTIX_PRIMITIVE_TYPE_CUSTOM: OptixPrimitiveType = 9472; +pub const OptixPrimitiveType_OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE: OptixPrimitiveType = + 9473; +pub const OptixPrimitiveType_OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE: OptixPrimitiveType = 9474; +pub const OptixPrimitiveType_OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR: OptixPrimitiveType = 9475; +pub const OptixPrimitiveType_OPTIX_PRIMITIVE_TYPE_TRIANGLE: OptixPrimitiveType = 9521; +pub type OptixPrimitiveType = ::std::os::raw::c_int; +pub const OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM: OptixPrimitiveTypeFlags = 1; +pub const OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE: + OptixPrimitiveTypeFlags = 2; +pub const OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE: + OptixPrimitiveTypeFlags = 4; +pub const OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_LINEAR: OptixPrimitiveTypeFlags = + 8; +pub const OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE: OptixPrimitiveTypeFlags = + -2147483648; +pub type OptixPrimitiveTypeFlags = ::std::os::raw::c_int; #[repr(C)] pub struct OptixBuildInputCurveArray { pub curveType: OptixPrimitiveType, @@ -263,7 +312,11 @@ pub struct OptixBuildInputCurveArray { } impl Default for OptixBuildInputCurveArray { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] @@ -290,7 +343,11 @@ pub struct OptixBuildInputCustomPrimitiveArray { } impl Default for OptixBuildInputCustomPrimitiveArray { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] @@ -300,23 +357,19 @@ pub 
struct OptixBuildInputInstanceArray { } impl Default for OptixBuildInputInstanceArray { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixBuildInputType { - OPTIX_BUILD_INPUT_TYPE_TRIANGLES = 8513, - OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES = 8514, - OPTIX_BUILD_INPUT_TYPE_INSTANCES = 8515, - OPTIX_BUILD_INPUT_TYPE_INSTANCE_POINTERS = 8516, - OPTIX_BUILD_INPUT_TYPE_CURVES = 8517, -} -#[repr(C)] -pub struct OptixBuildInput { - pub type_: OptixBuildInputType, - pub __bindgen_anon_1: OptixBuildInput__bindgen_ty_1, -} +pub const OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_TRIANGLES: OptixBuildInputType = 8513; +pub const OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES: OptixBuildInputType = 8514; +pub const OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_INSTANCES: OptixBuildInputType = 8515; +pub const OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_INSTANCE_POINTERS: OptixBuildInputType = 8516; +pub const OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_CURVES: OptixBuildInputType = 8517; +pub type OptixBuildInputType = ::std::os::raw::c_int; #[repr(C)] pub struct OptixBuildInput__bindgen_ty_1 { pub triangleArray: __BindgenUnionField, @@ -328,24 +381,21 @@ pub struct OptixBuildInput__bindgen_ty_1 { } impl Default for OptixBuildInput__bindgen_ty_1 { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } -impl Default for OptixBuildInput { - fn default() -> Self { - unsafe { ::std::mem::zeroed() } - } -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixInstanceFlags { - OPTIX_INSTANCE_FLAG_NONE = 0, - OPTIX_INSTANCE_FLAG_DISABLE_TRIANGLE_FACE_CULLING = 1, - OPTIX_INSTANCE_FLAG_FLIP_TRIANGLE_FACING = 2, - OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT = 4, - OPTIX_INSTANCE_FLAG_ENFORCE_ANYHIT = 8, - OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM = 64, -} +pub const OptixInstanceFlags_OPTIX_INSTANCE_FLAG_NONE: OptixInstanceFlags = 0; +pub const OptixInstanceFlags_OPTIX_INSTANCE_FLAG_DISABLE_TRIANGLE_FACE_CULLING: OptixInstanceFlags = + 1; +pub const OptixInstanceFlags_OPTIX_INSTANCE_FLAG_FLIP_TRIANGLE_FACING: OptixInstanceFlags = 2; +pub const OptixInstanceFlags_OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT: OptixInstanceFlags = 4; +pub const OptixInstanceFlags_OPTIX_INSTANCE_FLAG_ENFORCE_ANYHIT: OptixInstanceFlags = 8; +pub const OptixInstanceFlags_OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM: OptixInstanceFlags = 64; +pub type OptixInstanceFlags = ::std::os::raw::c_int; #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct OptixInstance { @@ -357,30 +407,21 @@ pub struct OptixInstance { pub traversableHandle: OptixTraversableHandle, pub pad: [::std::os::raw::c_uint; 2usize], } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixBuildFlags { - OPTIX_BUILD_FLAG_NONE = 0, - OPTIX_BUILD_FLAG_ALLOW_UPDATE = 1, - OPTIX_BUILD_FLAG_ALLOW_COMPACTION = 2, - OPTIX_BUILD_FLAG_PREFER_FAST_TRACE = 4, - OPTIX_BUILD_FLAG_PREFER_FAST_BUILD = 8, - OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS = 16, - OPTIX_BUILD_FLAG_ALLOW_RANDOM_INSTANCE_ACCESS = 32, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixBuildOperation { - OPTIX_BUILD_OPERATION_BUILD = 8545, - OPTIX_BUILD_OPERATION_UPDATE = 
8546, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixMotionFlags { - OPTIX_MOTION_FLAG_NONE = 0, - OPTIX_MOTION_FLAG_START_VANISH = 1, - OPTIX_MOTION_FLAG_END_VANISH = 2, -} +pub const OptixBuildFlags_OPTIX_BUILD_FLAG_NONE: OptixBuildFlags = 0; +pub const OptixBuildFlags_OPTIX_BUILD_FLAG_ALLOW_UPDATE: OptixBuildFlags = 1; +pub const OptixBuildFlags_OPTIX_BUILD_FLAG_ALLOW_COMPACTION: OptixBuildFlags = 2; +pub const OptixBuildFlags_OPTIX_BUILD_FLAG_PREFER_FAST_TRACE: OptixBuildFlags = 4; +pub const OptixBuildFlags_OPTIX_BUILD_FLAG_PREFER_FAST_BUILD: OptixBuildFlags = 8; +pub const OptixBuildFlags_OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS: OptixBuildFlags = 16; +pub const OptixBuildFlags_OPTIX_BUILD_FLAG_ALLOW_RANDOM_INSTANCE_ACCESS: OptixBuildFlags = 32; +pub type OptixBuildFlags = ::std::os::raw::c_int; +pub const OptixBuildOperation_OPTIX_BUILD_OPERATION_BUILD: OptixBuildOperation = 8545; +pub const OptixBuildOperation_OPTIX_BUILD_OPERATION_UPDATE: OptixBuildOperation = 8546; +pub type OptixBuildOperation = ::std::os::raw::c_int; +pub const OptixMotionFlags_OPTIX_MOTION_FLAG_NONE: OptixMotionFlags = 0; +pub const OptixMotionFlags_OPTIX_MOTION_FLAG_START_VANISH: OptixMotionFlags = 1; +pub const OptixMotionFlags_OPTIX_MOTION_FLAG_END_VANISH: OptixMotionFlags = 2; +pub type OptixMotionFlags = ::std::os::raw::c_int; #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct OptixMotionOptions { @@ -398,22 +439,31 @@ pub struct OptixAccelBuildOptions { } impl Default for OptixAccelBuildOptions { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] -#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] pub struct OptixAccelBufferSizes { - pub outputSizeInBytes: usize, - pub tempSizeInBytes: usize, - pub tempUpdateSizeInBytes: usize, + pub outputSizeInBytes: size_t, + pub tempSizeInBytes: size_t, + pub tempUpdateSizeInBytes: size_t, } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixAccelPropertyType { - OPTIX_PROPERTY_TYPE_COMPACTED_SIZE = 8577, - OPTIX_PROPERTY_TYPE_AABBS = 8578, +impl Default for OptixAccelBufferSizes { + fn default() -> Self { + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } + } } +pub const OptixAccelPropertyType_OPTIX_PROPERTY_TYPE_COMPACTED_SIZE: OptixAccelPropertyType = 8577; +pub const OptixAccelPropertyType_OPTIX_PROPERTY_TYPE_AABBS: OptixAccelPropertyType = 8578; +pub type OptixAccelPropertyType = ::std::os::raw::c_int; #[repr(C)] pub struct OptixAccelEmitDesc { pub result: CUdeviceptr, @@ -421,11 +471,15 @@ pub struct OptixAccelEmitDesc { } impl Default for OptixAccelEmitDesc { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] -#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct OptixAccelRelocationInfo { pub info: [::std::os::raw::c_ulonglong; 4usize], } @@ -473,24 +527,22 @@ pub struct OptixSRTMotionTransform { pub pad: [::std::os::raw::c_uint; 3usize], pub srtData: [OptixSRTData; 2usize], } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixTraversableType { - OPTIX_TRAVERSABLE_TYPE_STATIC_TRANSFORM = 8641, 
- OPTIX_TRAVERSABLE_TYPE_MATRIX_MOTION_TRANSFORM = 8642, - OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM = 8643, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixPixelFormat { - OPTIX_PIXEL_FORMAT_HALF2 = 8711, - OPTIX_PIXEL_FORMAT_HALF3 = 8705, - OPTIX_PIXEL_FORMAT_HALF4 = 8706, - OPTIX_PIXEL_FORMAT_FLOAT2 = 8712, - OPTIX_PIXEL_FORMAT_FLOAT3 = 8707, - OPTIX_PIXEL_FORMAT_FLOAT4 = 8708, - OPTIX_PIXEL_FORMAT_UCHAR3 = 8709, - OPTIX_PIXEL_FORMAT_UCHAR4 = 8710, +pub const OptixTraversableType_OPTIX_TRAVERSABLE_TYPE_STATIC_TRANSFORM: OptixTraversableType = 8641; +pub const OptixTraversableType_OPTIX_TRAVERSABLE_TYPE_MATRIX_MOTION_TRANSFORM: + OptixTraversableType = 8642; +pub const OptixTraversableType_OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM: OptixTraversableType = + 8643; +pub type OptixTraversableType = ::std::os::raw::c_int; +pub mod OptixPixelFormat { + pub type Type = ::std::os::raw::c_int; + pub const OPTIX_PIXEL_FORMAT_HALF2: Type = 8711; + pub const OPTIX_PIXEL_FORMAT_HALF3: Type = 8705; + pub const OPTIX_PIXEL_FORMAT_HALF4: Type = 8706; + pub const OPTIX_PIXEL_FORMAT_FLOAT2: Type = 8712; + pub const OPTIX_PIXEL_FORMAT_FLOAT3: Type = 8707; + pub const OPTIX_PIXEL_FORMAT_FLOAT4: Type = 8708; + pub const OPTIX_PIXEL_FORMAT_UCHAR3: Type = 8709; + pub const OPTIX_PIXEL_FORMAT_UCHAR4: Type = 8710; } #[repr(C)] pub struct OptixImage2D { @@ -499,23 +551,26 @@ pub struct OptixImage2D { pub height: ::std::os::raw::c_uint, pub rowStrideInBytes: ::std::os::raw::c_uint, pub pixelStrideInBytes: ::std::os::raw::c_uint, - pub format: OptixPixelFormat, + pub format: OptixPixelFormat::Type, } impl Default for OptixImage2D { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixDenoiserModelKind { - OPTIX_DENOISER_MODEL_KIND_LDR = 8994, - OPTIX_DENOISER_MODEL_KIND_HDR = 8995, - OPTIX_DENOISER_MODEL_KIND_AOV = 8996, - OPTIX_DENOISER_MODEL_KIND_TEMPORAL = 8997, +pub mod OptixDenoiserModelKind { + pub type Type = ::std::os::raw::c_int; + pub const OPTIX_DENOISER_MODEL_KIND_LDR: Type = 8994; + pub const OPTIX_DENOISER_MODEL_KIND_HDR: Type = 8995; + pub const OPTIX_DENOISER_MODEL_KIND_AOV: Type = 8996; + pub const OPTIX_DENOISER_MODEL_KIND_TEMPORAL: Type = 8997; } #[repr(C)] -#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct OptixDenoiserOptions { pub guideAlbedo: ::std::os::raw::c_uint, pub guideNormal: ::std::os::raw::c_uint, @@ -528,7 +583,11 @@ pub struct OptixDenoiserGuideLayer { } impl Default for OptixDenoiserGuideLayer { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] @@ -539,7 +598,11 @@ pub struct OptixDenoiserLayer { } impl Default for OptixDenoiserLayer { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] @@ -551,117 +614,127 @@ pub struct OptixDenoiserParams { } impl Default for OptixDenoiserParams { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + 
s.assume_init() + } } } #[repr(C)] -#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] pub struct OptixDenoiserSizes { - pub stateSizeInBytes: usize, - pub withOverlapScratchSizeInBytes: usize, - pub withoutOverlapScratchSizeInBytes: usize, + pub stateSizeInBytes: size_t, + pub withOverlapScratchSizeInBytes: size_t, + pub withoutOverlapScratchSizeInBytes: size_t, pub overlapWindowSizeInPixels: ::std::os::raw::c_uint, } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixRayFlags { - OPTIX_RAY_FLAG_NONE = 0, - OPTIX_RAY_FLAG_DISABLE_ANYHIT = 1, - OPTIX_RAY_FLAG_ENFORCE_ANYHIT = 2, - OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT = 4, - OPTIX_RAY_FLAG_DISABLE_CLOSESTHIT = 8, - OPTIX_RAY_FLAG_CULL_BACK_FACING_TRIANGLES = 16, - OPTIX_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES = 32, - OPTIX_RAY_FLAG_CULL_DISABLED_ANYHIT = 64, - OPTIX_RAY_FLAG_CULL_ENFORCED_ANYHIT = 128, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixTransformType { - OPTIX_TRANSFORM_TYPE_NONE = 0, - OPTIX_TRANSFORM_TYPE_STATIC_TRANSFORM = 1, - OPTIX_TRANSFORM_TYPE_MATRIX_MOTION_TRANSFORM = 2, - OPTIX_TRANSFORM_TYPE_SRT_MOTION_TRANSFORM = 3, - OPTIX_TRANSFORM_TYPE_INSTANCE = 4, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixTraversableGraphFlags { - OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY = 0, - OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS = 1, - OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING = 2, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixCompileOptimizationLevel { - OPTIX_COMPILE_OPTIMIZATION_DEFAULT = 0, - OPTIX_COMPILE_OPTIMIZATION_LEVEL_0 = 9024, - OPTIX_COMPILE_OPTIMIZATION_LEVEL_1 = 9025, - OPTIX_COMPILE_OPTIMIZATION_LEVEL_2 = 9026, - OPTIX_COMPILE_OPTIMIZATION_LEVEL_3 = 9027, +impl Default for OptixDenoiserSizes { + fn default() -> Self { + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } + } } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixCompileDebugLevel { - OPTIX_COMPILE_DEBUG_LEVEL_DEFAULT = 0, - OPTIX_COMPILE_DEBUG_LEVEL_NONE = 9040, - OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO = 9041, - OPTIX_COMPILE_DEBUG_LEVEL_FULL = 9042, +pub const OptixRayFlags_OPTIX_RAY_FLAG_NONE: OptixRayFlags = 0; +pub const OptixRayFlags_OPTIX_RAY_FLAG_DISABLE_ANYHIT: OptixRayFlags = 1; +pub const OptixRayFlags_OPTIX_RAY_FLAG_ENFORCE_ANYHIT: OptixRayFlags = 2; +pub const OptixRayFlags_OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT: OptixRayFlags = 4; +pub const OptixRayFlags_OPTIX_RAY_FLAG_DISABLE_CLOSESTHIT: OptixRayFlags = 8; +pub const OptixRayFlags_OPTIX_RAY_FLAG_CULL_BACK_FACING_TRIANGLES: OptixRayFlags = 16; +pub const OptixRayFlags_OPTIX_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES: OptixRayFlags = 32; +pub const OptixRayFlags_OPTIX_RAY_FLAG_CULL_DISABLED_ANYHIT: OptixRayFlags = 64; +pub const OptixRayFlags_OPTIX_RAY_FLAG_CULL_ENFORCED_ANYHIT: OptixRayFlags = 128; +pub type OptixRayFlags = ::std::os::raw::c_int; +pub const OptixTransformType_OPTIX_TRANSFORM_TYPE_NONE: OptixTransformType = 0; +pub const OptixTransformType_OPTIX_TRANSFORM_TYPE_STATIC_TRANSFORM: OptixTransformType = 1; +pub const OptixTransformType_OPTIX_TRANSFORM_TYPE_MATRIX_MOTION_TRANSFORM: OptixTransformType = 2; +pub const OptixTransformType_OPTIX_TRANSFORM_TYPE_SRT_MOTION_TRANSFORM: OptixTransformType = 3; +pub const OptixTransformType_OPTIX_TRANSFORM_TYPE_INSTANCE: OptixTransformType = 4; +pub type OptixTransformType = 
::std::os::raw::c_int; +pub mod OptixTraversableGraphFlags { + pub type Type = ::std::os::raw::c_int; + pub const OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY: Type = 0; + pub const OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS: Type = 1; + pub const OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING: Type = 2; +} +pub mod OptixCompileOptimizationLevel { + pub type Type = ::std::os::raw::c_int; + pub const OPTIX_COMPILE_OPTIMIZATION_DEFAULT: Type = 0; + pub const OPTIX_COMPILE_OPTIMIZATION_LEVEL_0: Type = 9024; + pub const OPTIX_COMPILE_OPTIMIZATION_LEVEL_1: Type = 9025; + pub const OPTIX_COMPILE_OPTIMIZATION_LEVEL_2: Type = 9026; + pub const OPTIX_COMPILE_OPTIMIZATION_LEVEL_3: Type = 9027; +} +pub mod OptixCompileDebugLevel { + pub type Type = ::std::os::raw::c_int; + pub const OPTIX_COMPILE_DEBUG_LEVEL_DEFAULT: Type = 0; + pub const OPTIX_COMPILE_DEBUG_LEVEL_NONE: Type = 9040; + pub const OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO: Type = 9041; + pub const OPTIX_COMPILE_DEBUG_LEVEL_FULL: Type = 9042; } #[repr(C)] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct OptixModuleCompileBoundValueEntry { - pub pipelineParamOffsetInBytes: usize, - pub sizeInBytes: usize, + pub pipelineParamOffsetInBytes: size_t, + pub sizeInBytes: size_t, pub boundValuePtr: *const ::std::os::raw::c_void, pub annotation: *const ::std::os::raw::c_char, } impl Default for OptixModuleCompileBoundValueEntry { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq)] pub struct OptixModuleCompileOptions { pub maxRegisterCount: ::std::os::raw::c_int, - pub optLevel: OptixCompileOptimizationLevel, - pub debugLevel: OptixCompileDebugLevel, + pub optLevel: OptixCompileOptimizationLevel::Type, + pub debugLevel: OptixCompileDebugLevel::Type, pub boundValues: *const OptixModuleCompileBoundValueEntry, pub numBoundValues: ::std::os::raw::c_uint, } impl Default for OptixModuleCompileOptions { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixProgramGroupKind { - OPTIX_PROGRAM_GROUP_KIND_RAYGEN = 9249, - OPTIX_PROGRAM_GROUP_KIND_MISS = 9250, - OPTIX_PROGRAM_GROUP_KIND_EXCEPTION = 9251, - OPTIX_PROGRAM_GROUP_KIND_HITGROUP = 9252, - OPTIX_PROGRAM_GROUP_KIND_CALLABLES = 9253, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixProgramGroupFlags { - OPTIX_PROGRAM_GROUP_FLAGS_NONE = 0, +pub mod OptixProgramGroupKind { + pub type Type = ::std::os::raw::c_int; + pub const OPTIX_PROGRAM_GROUP_KIND_RAYGEN: Type = 9249; + pub const OPTIX_PROGRAM_GROUP_KIND_MISS: Type = 9250; + pub const OPTIX_PROGRAM_GROUP_KIND_EXCEPTION: Type = 9251; + pub const OPTIX_PROGRAM_GROUP_KIND_HITGROUP: Type = 9252; + pub const OPTIX_PROGRAM_GROUP_KIND_CALLABLES: Type = 9253; } +pub const OptixProgramGroupFlags_OPTIX_PROGRAM_GROUP_FLAGS_NONE: OptixProgramGroupFlags = 0; +pub type OptixProgramGroupFlags = ::std::os::raw::c_int; #[repr(C)] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq)] pub struct OptixProgramGroupSingleModule { pub module: OptixModule, pub entryFunctionName: *const ::std::os::raw::c_char, } impl Default for 
OptixProgramGroupSingleModule { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq)] pub struct OptixProgramGroupHitgroup { pub moduleCH: OptixModule, pub entryFunctionNameCH: *const ::std::os::raw::c_char, @@ -672,11 +745,15 @@ pub struct OptixProgramGroupHitgroup { } impl Default for OptixProgramGroupHitgroup { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq)] pub struct OptixProgramGroupCallables { pub moduleDC: OptixModule, pub entryFunctionNameDC: *const ::std::os::raw::c_char, @@ -685,13 +762,17 @@ pub struct OptixProgramGroupCallables { } impl Default for OptixProgramGroupCallables { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] #[derive(Copy, Clone)] pub struct OptixProgramGroupDesc { - pub kind: OptixProgramGroupKind, + pub kind: OptixProgramGroupKind::Type, pub flags: ::std::os::raw::c_uint, pub __bindgen_anon_1: OptixProgramGroupDesc__bindgen_ty_1, } @@ -706,52 +787,68 @@ pub union OptixProgramGroupDesc__bindgen_ty_1 { } impl Default for OptixProgramGroupDesc__bindgen_ty_1 { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } impl Default for OptixProgramGroupDesc { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] -#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct OptixProgramGroupOptions { pub reserved: ::std::os::raw::c_int, } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixExceptionCodes { - OPTIX_EXCEPTION_CODE_STACK_OVERFLOW = -1, - OPTIX_EXCEPTION_CODE_TRACE_DEPTH_EXCEEDED = -2, - OPTIX_EXCEPTION_CODE_TRAVERSAL_DEPTH_EXCEEDED = -3, - OPTIX_EXCEPTION_CODE_TRAVERSAL_INVALID_TRAVERSABLE = -5, - OPTIX_EXCEPTION_CODE_TRAVERSAL_INVALID_MISS_SBT = -6, - OPTIX_EXCEPTION_CODE_TRAVERSAL_INVALID_HIT_SBT = -7, - OPTIX_EXCEPTION_CODE_UNSUPPORTED_PRIMITIVE_TYPE = -8, - OPTIX_EXCEPTION_CODE_INVALID_RAY = -9, - OPTIX_EXCEPTION_CODE_CALLABLE_PARAMETER_MISMATCH = -10, - OPTIX_EXCEPTION_CODE_BUILTIN_IS_MISMATCH = -11, - OPTIX_EXCEPTION_CODE_CALLABLE_INVALID_SBT = -12, - OPTIX_EXCEPTION_CODE_CALLABLE_NO_DC_SBT_RECORD = -13, - OPTIX_EXCEPTION_CODE_CALLABLE_NO_CC_SBT_RECORD = -14, - OPTIX_EXCEPTION_CODE_UNSUPPORTED_SINGLE_LEVEL_GAS = -15, - OPTIX_EXCEPTION_CODE_INVALID_VALUE_ARGUMENT_0 = -16, - OPTIX_EXCEPTION_CODE_INVALID_VALUE_ARGUMENT_1 = -17, - OPTIX_EXCEPTION_CODE_INVALID_VALUE_ARGUMENT_2 = -18, - OPTIX_EXCEPTION_CODE_UNSUPPORTED_DATA_ACCESS = -32, -} -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixExceptionFlags { - OPTIX_EXCEPTION_FLAG_NONE = 0, - OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW = 1, - OPTIX_EXCEPTION_FLAG_TRACE_DEPTH = 2, - 
OPTIX_EXCEPTION_FLAG_USER = 4, - OPTIX_EXCEPTION_FLAG_DEBUG = 8, +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_STACK_OVERFLOW: OptixExceptionCodes = -1; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_TRACE_DEPTH_EXCEEDED: OptixExceptionCodes = -2; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_TRAVERSAL_DEPTH_EXCEEDED: OptixExceptionCodes = + -3; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_TRAVERSAL_INVALID_TRAVERSABLE: + OptixExceptionCodes = -5; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_TRAVERSAL_INVALID_MISS_SBT: OptixExceptionCodes = + -6; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_TRAVERSAL_INVALID_HIT_SBT: OptixExceptionCodes = + -7; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_UNSUPPORTED_PRIMITIVE_TYPE: OptixExceptionCodes = + -8; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_INVALID_RAY: OptixExceptionCodes = -9; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_CALLABLE_PARAMETER_MISMATCH: + OptixExceptionCodes = -10; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_BUILTIN_IS_MISMATCH: OptixExceptionCodes = -11; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_CALLABLE_INVALID_SBT: OptixExceptionCodes = -12; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_CALLABLE_NO_DC_SBT_RECORD: OptixExceptionCodes = + -13; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_CALLABLE_NO_CC_SBT_RECORD: OptixExceptionCodes = + -14; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_UNSUPPORTED_SINGLE_LEVEL_GAS: + OptixExceptionCodes = -15; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_INVALID_VALUE_ARGUMENT_0: OptixExceptionCodes = + -16; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_INVALID_VALUE_ARGUMENT_1: OptixExceptionCodes = + -17; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_INVALID_VALUE_ARGUMENT_2: OptixExceptionCodes = + -18; +pub const OptixExceptionCodes_OPTIX_EXCEPTION_CODE_UNSUPPORTED_DATA_ACCESS: OptixExceptionCodes = + -32; +pub type OptixExceptionCodes = ::std::os::raw::c_int; +pub mod OptixExceptionFlags { + pub type Type = ::std::os::raw::c_int; + pub const OPTIX_EXCEPTION_FLAG_NONE: Type = 0; + pub const OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW: Type = 1; + pub const OPTIX_EXCEPTION_FLAG_TRACE_DEPTH: Type = 2; + pub const OPTIX_EXCEPTION_FLAG_USER: Type = 4; + pub const OPTIX_EXCEPTION_FLAG_DEBUG: Type = 8; } #[repr(C)] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct OptixPipelineCompileOptions { pub usesMotionBlur: ::std::os::raw::c_int, pub traversableGraphFlags: ::std::os::raw::c_uint, @@ -761,22 +858,30 @@ pub struct OptixPipelineCompileOptions { pub pipelineLaunchParamsVariableName: *const ::std::os::raw::c_char, pub usesPrimitiveTypeFlags: ::std::os::raw::c_uint, pub reserved: ::std::os::raw::c_uint, - pub reserved2: usize, + pub reserved2: size_t, } impl Default for OptixPipelineCompileOptions { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq)] pub struct OptixPipelineLinkOptions { pub maxTraceDepth: ::std::os::raw::c_uint, - pub debugLevel: OptixCompileDebugLevel, + pub debugLevel: OptixCompileDebugLevel::Type, } impl Default for OptixPipelineLinkOptions { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() 
+ } } } #[repr(C)] @@ -795,11 +900,15 @@ pub struct OptixShaderBindingTable { } impl Default for OptixShaderBindingTable { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } #[repr(C)] -#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct OptixStackSizes { pub cssRG: ::std::os::raw::c_uint, pub cssMS: ::std::os::raw::c_uint, @@ -809,11 +918,9 @@ pub struct OptixStackSizes { pub cssCC: ::std::os::raw::c_uint, pub dssDC: ::std::os::raw::c_uint, } -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum OptixQueryFunctionTableOptions { - OPTIX_QUERY_FUNCTION_TABLE_OPTION_DUMMY = 0, -} +pub const OptixQueryFunctionTableOptions_OPTIX_QUERY_FUNCTION_TABLE_OPTION_DUMMY: + OptixQueryFunctionTableOptions = 0; +pub type OptixQueryFunctionTableOptions = ::std::os::raw::c_int; pub type OptixQueryFunctionTable_t = ::std::option::Option< unsafe extern "C" fn( abiId: ::std::os::raw::c_int, @@ -821,18 +928,22 @@ pub type OptixQueryFunctionTable_t = ::std::option::Option< arg1: *mut OptixQueryFunctionTableOptions, arg2: *mut *const ::std::os::raw::c_void, functionTable: *mut ::std::os::raw::c_void, - sizeOfTable: usize, + sizeOfTable: size_t, ) -> OptixResult, >; #[repr(C)] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq)] pub struct OptixBuiltinISOptions { pub builtinISModuleType: OptixPrimitiveType, pub usesMotionBlur: ::std::os::raw::c_int, } impl Default for OptixBuiltinISOptions { fn default() -> Self { - unsafe { ::std::mem::zeroed() } + let mut s = ::std::mem::MaybeUninit::::uninit(); + unsafe { + ::std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } } } extern "C" { @@ -854,9 +965,9 @@ extern "C" { extern "C" { pub fn optixDeviceContextGetProperty( context: OptixDeviceContext, - property: OptixDeviceProperty, + property: OptixDeviceProperty::Type, value: *mut ::std::os::raw::c_void, - sizeInBytes: usize, + sizeInBytes: size_t, ) -> OptixResult; } extern "C" { @@ -882,8 +993,8 @@ extern "C" { extern "C" { pub fn optixDeviceContextSetCacheDatabaseSizes( context: OptixDeviceContext, - lowWaterMark: usize, - highWaterMark: usize, + lowWaterMark: size_t, + highWaterMark: size_t, ) -> OptixResult; } extern "C" { @@ -896,14 +1007,14 @@ extern "C" { pub fn optixDeviceContextGetCacheLocation( context: OptixDeviceContext, location: *mut ::std::os::raw::c_char, - locationSize: usize, + locationSize: size_t, ) -> OptixResult; } extern "C" { pub fn optixDeviceContextGetCacheDatabaseSizes( context: OptixDeviceContext, - lowWaterMark: *mut usize, - highWaterMark: *mut usize, + lowWaterMark: *mut size_t, + highWaterMark: *mut size_t, ) -> OptixResult; } extern "C" { @@ -914,7 +1025,7 @@ extern "C" { programGroups: *const OptixProgramGroup, numProgramGroups: ::std::os::raw::c_uint, logString: *mut ::std::os::raw::c_char, - logStringSize: *mut usize, + logStringSize: *mut size_t, pipeline: *mut OptixPipeline, ) -> OptixResult; } @@ -936,9 +1047,9 @@ extern "C" { moduleCompileOptions: *const OptixModuleCompileOptions, pipelineCompileOptions: *const OptixPipelineCompileOptions, PTX: *const ::std::os::raw::c_char, - PTXsize: usize, + PTXsize: size_t, logString: *mut ::std::os::raw::c_char, - logStringSize: *mut usize, + logStringSize: *mut size_t, module: *mut OptixModule, ) -> OptixResult; } @@ -967,7 +1078,7 @@ extern "C" { numProgramGroups: 
::std::os::raw::c_uint, options: *const OptixProgramGroupOptions, logString: *mut ::std::os::raw::c_char, - logStringSize: *mut usize, + logStringSize: *mut size_t, programGroups: *mut OptixProgramGroup, ) -> OptixResult; } @@ -979,7 +1090,7 @@ extern "C" { pipeline: OptixPipeline, stream: CUstream, pipelineParams: CUdeviceptr, - pipelineParamsSize: usize, + pipelineParamsSize: size_t, sbt: *const OptixShaderBindingTable, width: ::std::os::raw::c_uint, height: ::std::os::raw::c_uint, @@ -1009,9 +1120,9 @@ extern "C" { buildInputs: *const OptixBuildInput, numBuildInputs: ::std::os::raw::c_uint, tempBuffer: CUdeviceptr, - tempBufferSizeInBytes: usize, + tempBufferSizeInBytes: size_t, outputBuffer: CUdeviceptr, - outputBufferSizeInBytes: usize, + outputBufferSizeInBytes: size_t, outputHandle: *mut OptixTraversableHandle, emittedProperties: *const OptixAccelEmitDesc, numEmittedProperties: ::std::os::raw::c_uint, @@ -1037,9 +1148,9 @@ extern "C" { stream: CUstream, info: *const OptixAccelRelocationInfo, instanceTraversableHandles: CUdeviceptr, - numInstanceTraversableHandles: usize, + numInstanceTraversableHandles: size_t, targetAccel: CUdeviceptr, - targetAccelSizeInBytes: usize, + targetAccelSizeInBytes: size_t, targetHandle: *mut OptixTraversableHandle, ) -> OptixResult; } @@ -1049,7 +1160,7 @@ extern "C" { stream: CUstream, inputHandle: OptixTraversableHandle, outputBuffer: CUdeviceptr, - outputBufferSizeInBytes: usize, + outputBufferSizeInBytes: size_t, outputHandle: *mut OptixTraversableHandle, ) -> OptixResult; } @@ -1064,7 +1175,7 @@ extern "C" { extern "C" { pub fn optixDenoiserCreate( context: OptixDeviceContext, - modelKind: OptixDenoiserModelKind, + modelKind: OptixDenoiserModelKind::Type, options: *const OptixDenoiserOptions, denoiser: *mut OptixDenoiser, ) -> OptixResult; @@ -1073,7 +1184,7 @@ extern "C" { pub fn optixDenoiserCreateWithUserModel( context: OptixDeviceContext, userData: *const ::std::os::raw::c_void, - userDataSizeInBytes: usize, + userDataSizeInBytes: size_t, denoiser: *mut OptixDenoiser, ) -> OptixResult; } @@ -1095,9 +1206,9 @@ extern "C" { inputWidth: ::std::os::raw::c_uint, inputHeight: ::std::os::raw::c_uint, denoiserState: CUdeviceptr, - denoiserStateSizeInBytes: usize, + denoiserStateSizeInBytes: size_t, scratch: CUdeviceptr, - scratchSizeInBytes: usize, + scratchSizeInBytes: size_t, ) -> OptixResult; } extern "C" { @@ -1106,14 +1217,14 @@ extern "C" { stream: CUstream, params: *const OptixDenoiserParams, denoiserState: CUdeviceptr, - denoiserStateSizeInBytes: usize, + denoiserStateSizeInBytes: size_t, guideLayer: *const OptixDenoiserGuideLayer, layers: *const OptixDenoiserLayer, numLayers: ::std::os::raw::c_uint, inputOffsetX: ::std::os::raw::c_uint, inputOffsetY: ::std::os::raw::c_uint, scratch: CUdeviceptr, - scratchSizeInBytes: usize, + scratchSizeInBytes: size_t, ) -> OptixResult; } extern "C" { @@ -1123,7 +1234,7 @@ extern "C" { inputImage: *const OptixImage2D, outputIntensity: CUdeviceptr, scratch: CUdeviceptr, - scratchSizeInBytes: usize, + scratchSizeInBytes: size_t, ) -> OptixResult; } extern "C" { @@ -1133,11 +1244,11 @@ extern "C" { inputImage: *const OptixImage2D, outputAverageColor: CUdeviceptr, scratch: CUdeviceptr, - scratchSizeInBytes: usize, + scratchSizeInBytes: size_t, ) -> OptixResult; } #[repr(C)] -#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct OptixFunctionTable { pub optixGetErrorName: ::std::option::Option< unsafe extern "C" fn(result: OptixResult) -> *const 
::std::os::raw::c_char, @@ -1157,9 +1268,9 @@ pub struct OptixFunctionTable { pub optixDeviceContextGetProperty: ::std::option::Option< unsafe extern "C" fn( context: OptixDeviceContext, - property: OptixDeviceProperty, + property: OptixDeviceProperty::Type, value: *mut ::std::os::raw::c_void, - sizeInBytes: usize, + sizeInBytes: size_t, ) -> OptixResult, >, pub optixDeviceContextSetLogCallback: ::std::option::Option< @@ -1185,8 +1296,8 @@ pub struct OptixFunctionTable { pub optixDeviceContextSetCacheDatabaseSizes: ::std::option::Option< unsafe extern "C" fn( context: OptixDeviceContext, - lowWaterMark: usize, - highWaterMark: usize, + lowWaterMark: size_t, + highWaterMark: size_t, ) -> OptixResult, >, pub optixDeviceContextGetCacheEnabled: ::std::option::Option< @@ -1199,14 +1310,14 @@ pub struct OptixFunctionTable { unsafe extern "C" fn( context: OptixDeviceContext, location: *mut ::std::os::raw::c_char, - locationSize: usize, + locationSize: size_t, ) -> OptixResult, >, pub optixDeviceContextGetCacheDatabaseSizes: ::std::option::Option< unsafe extern "C" fn( context: OptixDeviceContext, - lowWaterMark: *mut usize, - highWaterMark: *mut usize, + lowWaterMark: *mut size_t, + highWaterMark: *mut size_t, ) -> OptixResult, >, pub optixModuleCreateFromPTX: ::std::option::Option< @@ -1215,9 +1326,9 @@ pub struct OptixFunctionTable { moduleCompileOptions: *const OptixModuleCompileOptions, pipelineCompileOptions: *const OptixPipelineCompileOptions, PTX: *const ::std::os::raw::c_char, - PTXsize: usize, + PTXsize: size_t, logString: *mut ::std::os::raw::c_char, - logStringSize: *mut usize, + logStringSize: *mut size_t, module: *mut OptixModule, ) -> OptixResult, >, @@ -1239,7 +1350,7 @@ pub struct OptixFunctionTable { numProgramGroups: ::std::os::raw::c_uint, options: *const OptixProgramGroupOptions, logString: *mut ::std::os::raw::c_char, - logStringSize: *mut usize, + logStringSize: *mut size_t, programGroups: *mut OptixProgramGroup, ) -> OptixResult, >, @@ -1259,7 +1370,7 @@ pub struct OptixFunctionTable { programGroups: *const OptixProgramGroup, numProgramGroups: ::std::os::raw::c_uint, logString: *mut ::std::os::raw::c_char, - logStringSize: *mut usize, + logStringSize: *mut size_t, pipeline: *mut OptixPipeline, ) -> OptixResult, >, @@ -1291,9 +1402,9 @@ pub struct OptixFunctionTable { buildInputs: *const OptixBuildInput, numBuildInputs: ::std::os::raw::c_uint, tempBuffer: CUdeviceptr, - tempBufferSizeInBytes: usize, + tempBufferSizeInBytes: size_t, outputBuffer: CUdeviceptr, - outputBufferSizeInBytes: usize, + outputBufferSizeInBytes: size_t, outputHandle: *mut OptixTraversableHandle, emittedProperties: *const OptixAccelEmitDesc, numEmittedProperties: ::std::os::raw::c_uint, @@ -1319,9 +1430,9 @@ pub struct OptixFunctionTable { stream: CUstream, info: *const OptixAccelRelocationInfo, instanceTraversableHandles: CUdeviceptr, - numInstanceTraversableHandles: usize, + numInstanceTraversableHandles: size_t, targetAccel: CUdeviceptr, - targetAccelSizeInBytes: usize, + targetAccelSizeInBytes: size_t, targetHandle: *mut OptixTraversableHandle, ) -> OptixResult, >, @@ -1331,7 +1442,7 @@ pub struct OptixFunctionTable { stream: CUstream, inputHandle: OptixTraversableHandle, outputBuffer: CUdeviceptr, - outputBufferSizeInBytes: usize, + outputBufferSizeInBytes: size_t, outputHandle: *mut OptixTraversableHandle, ) -> OptixResult, >, @@ -1354,7 +1465,7 @@ pub struct OptixFunctionTable { pipeline: OptixPipeline, stream: CUstream, pipelineParams: CUdeviceptr, - pipelineParamsSize: usize, + 
pipelineParamsSize: size_t,
             sbt: *const OptixShaderBindingTable,
             width: ::std::os::raw::c_uint,
             height: ::std::os::raw::c_uint,
@@ -1364,7 +1475,7 @@ pub struct OptixFunctionTable {
     pub optixDenoiserCreate: ::std::option::Option<
         unsafe extern "C" fn(
             context: OptixDeviceContext,
-            modelKind: OptixDenoiserModelKind,
+            modelKind: OptixDenoiserModelKind::Type,
             options: *const OptixDenoiserOptions,
             returnHandle: *mut OptixDenoiser,
         ) -> OptixResult,
@@ -1386,9 +1497,9 @@ pub struct OptixFunctionTable {
             inputWidth: ::std::os::raw::c_uint,
             inputHeight: ::std::os::raw::c_uint,
             state: CUdeviceptr,
-            stateSizeInBytes: usize,
+            stateSizeInBytes: size_t,
             scratch: CUdeviceptr,
-            scratchSizeInBytes: usize,
+            scratchSizeInBytes: size_t,
         ) -> OptixResult,
     >,
     pub optixDenoiserInvoke: ::std::option::Option<
         unsafe extern "C" fn(
             denoiser: OptixDenoiser,
             stream: CUstream,
             params: *const OptixDenoiserParams,
             denoiserState: CUdeviceptr,
-            denoiserStateSizeInBytes: usize,
+            denoiserStateSizeInBytes: size_t,
             guideLayer: *const OptixDenoiserGuideLayer,
             layers: *const OptixDenoiserLayer,
             numLayers: ::std::os::raw::c_uint,
             inputOffsetX: ::std::os::raw::c_uint,
             inputOffsetY: ::std::os::raw::c_uint,
             scratch: CUdeviceptr,
-            scratchSizeInBytes: usize,
+            scratchSizeInBytes: size_t,
         ) -> OptixResult,
     >,
     pub optixDenoiserComputeIntensity: ::std::option::Option<
         unsafe extern "C" fn(
             handle: OptixDenoiser,
             stream: CUstream,
             inputImage: *const OptixImage2D,
             outputIntensity: CUdeviceptr,
             scratch: CUdeviceptr,
-            scratchSizeInBytes: usize,
+            scratchSizeInBytes: size_t,
         ) -> OptixResult,
     >,
     pub optixDenoiserComputeAverageColor: ::std::option::Option<
         unsafe extern "C" fn(
             handle: OptixDenoiser,
             stream: CUstream,
             inputImage: *const OptixImage2D,
             outputAverageColor: CUdeviceptr,
             scratch: CUdeviceptr,
-            scratchSizeInBytes: usize,
+            scratchSizeInBytes: size_t,
         ) -> OptixResult,
     >,
     pub optixDenoiserCreateWithUserModel: ::std::option::Option<
         unsafe extern "C" fn(
             context: OptixDeviceContext,
             data: *const ::std::os::raw::c_void,
-            dataSizeInBytes: usize,
+            dataSizeInBytes: size_t,
             returnHandle: *mut OptixDenoiser,
         ) -> OptixResult,
     >,
 }
+pub const OptixSbtRecordHeaderSize: size_t = 32;
+pub const OptixSbtRecordAlignment: size_t = 16;
+pub const OptixAccelBufferByteAlignment: size_t = 128;
+pub const OptixInstanceByteAlignment: size_t = 16;
+pub const OptixAabbBufferByteAlignment: size_t = 8;
+pub const OptixGeometryTransformByteAlignment: size_t = 16;
+pub const OptixTransformByteAlignment: size_t = 64;
+pub const OptixVersion: size_t = 70300;
+pub const OptixBuildInputSize: size_t = 1032;
+pub const OptixShaderBindingTableSize: size_t = 64;
+#[repr(i32)]
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+pub enum OptixGeometryFlags {
+    None = 0,
+    DisableAnyHit = 1,
+    RequireSingleAnyHitCall = 2,
+}
diff --git a/crates/optix/src/acceleration.md b/crates/optix/src/acceleration.md
new file mode 100644
index 00000000..aefb0f61
--- /dev/null
+++ b/crates/optix/src/acceleration.md
@@ -0,0 +1,857 @@
+# Acceleration Structures
+
+```no_run
+use cust::prelude as cu;
+use optix::prelude as ox;
+# fn doit() -> Result<(), Box<dyn std::error::Error>> {
+# cust::init(cu::CudaFlags::empty())?;
+# ox::init()?;
+# let device = cu::Device::get_device(0)?;
+# let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+# cu::ContextFlags::MAP_HOST, device)?;
+# let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+# let vertices: Vec<[f32; 3]> = Vec::new();
+# let indices: Vec<[u32; 3]> = Vec::new();
+# let stream = cu::Stream::new(cu::StreamFlags::DEFAULT, None)?;
+
+// Allocate buffers and copy vertex and index data to device
+let buf_vertex = cu::DeviceBuffer::from_slice(&vertices)?;
+let buf_indices = cu::DeviceBuffer::from_slice(&indices)?;
+
+// Tell OptiX the structure of our triangle mesh
+let geometry_flags = ox::GeometryFlags::None;
+let triangle_input =
+    ox::IndexedTriangleArray::new(
+        &[&buf_vertex],
+        &buf_indices,
+        &[geometry_flags]
+    );
+
+// Tell OptiX we'd prefer a faster traversal over a faster bvh build.
+let accel_options =
+    ox::AccelBuildOptions::new(
+        ox::BuildFlags::PREFER_FAST_TRACE,
+        ox::BuildOperation::Build
+    );
+
+// Build the accel asynchronously
+let gas = ox::Accel::build(
+    &ctx,
+    &stream,
+    &[accel_options],
+    &[triangle_input],
+    true
+)?;
+# Ok(())
+# }
+```
+
+# Programming Guide...
+<details>
+<summary>Click here to expand programming guide</summary>
+
+## Contents
+
+- [Building](#building)
+    - [Building Safe API](#building-safe-api)
+    - [Building Unsafe API](#building-unsafe-api)
+- [Primitive Build Inputs](#primitive-build-inputs)
+- [Build Flags](#build-flags)
+- [Dynamic Updates](#dynamic-updates)
+    - [Dynamic Updates Safe API](#dynamic-updates-safe-api)
+    - [Dynamic Updates Unsafe API](#dynamic-updates-unsafe-api)
+- [Compaction](#compaction)
+    - [Compaction Safe API](#compaction-safe-api)
+    - [Compaction Unsafe API](#compaction-unsafe-api)
+- [Traversable Objects](#traversable-objects)
+    - [Traversable Objects Safe API](#traversable-objects-safe-api)
+    - [Traversable Objects Unsafe API](#traversable-objects-unsafe-api)
+- [Motion Blur](#motion-blur)
+    - [Basics](#basics)
+    - [Motion Geometry Acceleration Structure](#motion-geometry-acceleration-structure)
+    - [Motion Instance Acceleration Structure](#motion-instance-acceleration-structure)
+    - [Motion Matrix Transform](#motion-matrix-transform)
+    - [Motion Scale Rotate Translate Transform](#motion-scale-rotate-translate-transform)
+    - [Transforms Trade-Offs](#transforms-trade-offs)
+
+
+NVIDIA OptiX 7 provides acceleration structures to optimize the search for the
+intersection of rays with the geometric data in the scene. Acceleration structures
+can contain two types of data: geometric primitives (a geometry-AS) or instances
+(an instance-AS). Acceleration structures are created on the device using a set
+of functions. These functions enable overlapping and pipelining of acceleration
+structure creation, called a build. The functions use one or more [`BuildInput`]
+structs to specify the geometry plus a set of parameters to control the build.
+
+Acceleration structures have size limits, listed in “Limits”. For an instance
+acceleration structure, the number of instances has an upper limit. For a geometry
+acceleration structure, the number of geometric primitives is limited,
+specifically the total number of primitives in its build inputs, multiplied by the
+number of motion keys.
+
+The following acceleration structure types are supported:
+
+#### Instance acceleration structures
+- [`InstanceArray`](crate::instance_array::InstanceArray)
+- [`InstancePointerArray`](crate::instance_array::InstancePointerArray)
+
+#### Geometry acceleration structure containing built-in triangles
+- [`TriangleArray`](crate::triangle_array::TriangleArray)
+- [`IndexedTriangleArray`](crate::triangle_array::IndexedTriangleArray)
+
+#### Geometry acceleration structure containing built-in curves
+- [`CurveArray`](crate::curve_array::CurveArray)
+
+#### Geometry acceleration structure containing custom primitives
+- [`CustomPrimitiveArray`](crate::custom_primitive_array::CustomPrimitiveArray)
+
+## Building
+
+For geometry-AS builds, each build input can specify a set of triangles, a set
+of curves, or a set of user-defined primitives bounded by specified axis-aligned
+bounding boxes. Multiple build inputs can be passed as an array to [`Accel::build()`]
+to combine different meshes into a single acceleration structure (see the
+sketch below). All build
+inputs for a single build must agree on the build input type.
+
+Instance acceleration structures have a single build input and specify an array
+of instances. Each [`Instance`] includes a ray transformation and a
+[`TraversableHandle`] that refers to a geometry-AS, a transform node, or another
+instance acceleration structure.
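+
+For example, two meshes can be combined into a single geometry-AS by passing
+one build input per mesh to a single build call. The following is a minimal
+sketch of that, reusing the same safe API as the examples below (the `mesh_*`
+buffer names are illustrative):
+
+```no_run
+# use cust::prelude as cu;
+# use optix::prelude as ox;
+# fn doit() -> Result<(), Box<dyn std::error::Error>> {
+# cust::init(cu::CudaFlags::empty())?;
+# ox::init()?;
+# let device = cu::Device::get_device(0)?;
+# let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+# cu::ContextFlags::MAP_HOST, device)?;
+# let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+# let stream = cu::Stream::new(cu::StreamFlags::DEFAULT, None)?;
+# let mesh_a_vertices: Vec<[f32; 3]> = Vec::new();
+# let mesh_a_indices: Vec<[u32; 3]> = Vec::new();
+# let mesh_b_vertices: Vec<[f32; 3]> = Vec::new();
+# let mesh_b_indices: Vec<[u32; 3]> = Vec::new();
+let buf_vertex_a = cu::DeviceBuffer::from_slice(&mesh_a_vertices)?;
+let buf_indices_a = cu::DeviceBuffer::from_slice(&mesh_a_indices)?;
+let buf_vertex_b = cu::DeviceBuffer::from_slice(&mesh_b_vertices)?;
+let buf_indices_b = cu::DeviceBuffer::from_slice(&mesh_b_indices)?;
+
+// One build input per mesh; all inputs in a single build must be of the
+// same type (here, indexed triangles).
+let input_a = ox::IndexedTriangleArray::new(
+    &[&buf_vertex_a],
+    &buf_indices_a,
+    &[ox::GeometryFlags::None]
+);
+let input_b = ox::IndexedTriangleArray::new(
+    &[&buf_vertex_b],
+    &buf_indices_b,
+    &[ox::GeometryFlags::None]
+);
+
+let accel_options = ox::AccelBuildOptions::new(
+    ox::BuildFlags::PREFER_FAST_TRACE,
+    ox::BuildOperation::Build
+);
+
+// Both meshes are baked into one acceleration structure behind a single
+// traversable handle.
+let gas = ox::Accel::build(
+    &ctx,
+    &stream,
+    &[accel_options],
+    &[input_a, input_b],
+    true
+)?;
+stream.synchronize()?;
+# Ok(())
+# }
+```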
+
+### Building Safe API
+
+The easiest way to build an acceleration structure is using [`Accel::build()`]
+to which you just pass a slice of [`BuildInput`]s and the function handles
+memory allocation and synchronization for you.
+
+This is handy for getting something working with the minimum of fuss, but
+means reallocating temporary storage each time. It also means synchronizing
+after each build rather than potentially processing many builds on a stream
+and synchronizing at the end.
+
+```no_run
+use cust::prelude as cu;
+use optix::prelude as ox;
+# fn doit() -> Result<(), Box<dyn std::error::Error>> {
+# cust::init(cu::CudaFlags::empty())?;
+# ox::init()?;
+# let device = cu::Device::get_device(0)?;
+# let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+# cu::ContextFlags::MAP_HOST, device)?;
+# let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+# let vertices: Vec<[f32; 3]> = Vec::new();
+# let indices: Vec<[u32; 3]> = Vec::new();
+# let stream = cu::Stream::new(cu::StreamFlags::DEFAULT, None)?;
+
+let buf_vertex = cu::DeviceBuffer::from_slice(&vertices)?;
+let buf_indices = cu::DeviceBuffer::from_slice(&indices)?;
+
+let geometry_flags = ox::GeometryFlags::None;
+let triangle_input =
+    ox::IndexedTriangleArray::new(
+        &[&buf_vertex],
+        &buf_indices,
+        &[geometry_flags]
+    );
+
+let accel_options =
+    ox::AccelBuildOptions::new(
+        ox::BuildFlags::ALLOW_COMPACTION,
+        ox::BuildOperation::Build
+    );
+
+let build_inputs = vec![triangle_input];
+
+let gas = ox::Accel::build(
+    &ctx,
+    &stream,
+    &[accel_options],
+    &build_inputs,
+    true
+)?;
+
+stream.synchronize()?;
+# Ok(())
+# }
+```
+
+### Building Unsafe API
+
+As an alternative, you can also use the unsafe functions [`accel_build()`],
+[`accel_compact()`], and [`Accel::from_raw_parts()`] to handle the memory
+allocation yourself, meaning you can reuse buffers between accel builds.
+
+To prepare for a build, the required memory sizes are queried by passing an
+initial set of build inputs and parameters to [`accel_compute_memory_usage()`].
+It returns three different sizes:
+
+* `output_size_in_bytes` - Size of the memory region where the resulting
+acceleration structure is placed. This size is an upper bound and may be
+substantially larger than the final acceleration structure. (See “Compacting acceleration structures”.)
+* `temp_size_in_bytes` - Size of the memory region that is temporarily used during
+the build.
+* `temp_update_size_in_bytes` - Size of the memory region that is temporarily
+required to update the acceleration structure.
+
+Using these sizes, the application allocates memory for the output and temporary
+memory buffers on the device. The pointers to these buffers must be aligned to
+a 128-byte boundary. These buffers are actively used for the duration of the
+build. For this reason, they cannot be shared with other currently active build
+requests.
+
+Note that [`accel_compute_memory_usage()`] does not initiate any activity on the
+device; pointers to device memory or contents of input buffers are not required to point to allocated memory.
+
+The function [`accel_build()`] takes the same array of [`BuildInput`] structs as
+[`accel_compute_memory_usage()`] and builds a single acceleration structure from
+these inputs. This acceleration structure can contain either geometry or
+instances, depending on the inputs to the build.
+
+The build operation is executed on the device in the specified CUDA stream and
+runs asynchronously on the device, similar to CUDA kernel launches. The
+application may choose to block the host-side thread or synchronize with other
+CUDA streams by using available CUDA synchronization functionality such as
+[`Stream::synchronize()`](cust::stream::Stream::synchronize) or CUDA events.
+The traversable handle returned is computed on the host and is returned from
+the function immediately, without waiting for the build to finish. By producing
+handles at acceleration time, custom handles can also be generated based on
+input to the builder.
+
+The acceleration structure constructed by [`accel_build()`] does not reference
+any of the device buffers referenced in the build inputs. All relevant data
+is copied from these buffers into the acceleration output buffer, possibly in
+a different format.
+
+The application is free to release this memory after the build without
+invalidating the acceleration structure. However, instance-AS builds will
+continue to refer to other instance-AS and geometry-AS instances and transform
+nodes.
+
+```no_run
+use cust::prelude as cu;
+use optix::prelude as ox;
+# fn doit() -> Result<(), Box<dyn std::error::Error>> {
+# cust::init(cu::CudaFlags::empty())?;
+# ox::init()?;
+# let device = cu::Device::get_device(0)?;
+# let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+# cu::ContextFlags::MAP_HOST, device)?;
+# let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+# let vertices: Vec<[f32; 3]> = Vec::new();
+# let indices: Vec<[u32; 3]> = Vec::new();
+# let stream = cu::Stream::new(cu::StreamFlags::DEFAULT, None)?;
+
+let buf_vertex = cu::DeviceBuffer::from_slice(&vertices)?;
+let buf_indices = cu::DeviceBuffer::from_slice(&indices)?;
+
+let geometry_flags = ox::GeometryFlags::None;
+
+let build_inputs =
+    [ox::IndexedTriangleArray::new(
+        &[&buf_vertex],
+        &buf_indices,
+        &[geometry_flags]
+    )];
+
+let accel_options =
+    ox::AccelBuildOptions::new(
+        ox::BuildFlags::ALLOW_COMPACTION,
+        ox::BuildOperation::Build
+    );
+
+// Get the storage requirements for temporary and output buffers
+let sizes = accel_compute_memory_usage(ctx, accel_options, build_inputs)?;
+
+// Allocate temporary and output buffers
+let mut output_buffer =
+    unsafe { DeviceBuffer::<u8>::uninitialized(sizes.output_size_in_bytes)? };
+let mut temp_buffer =
+    unsafe { DeviceBuffer::<u8>::uninitialized(sizes.temp_size_in_bytes)? };
+
+// This build emits no properties, so pass an empty list of
+// `AccelEmitDesc`s; the compaction example below shows emitting the
+// compacted size.
+let mut properties: Vec<ox::AccelEmitDesc> = Vec::new();
+
+// Build the accel
+let hnd = unsafe {
+    accel_build(
+        ctx,
+        stream,
+        accel_options,
+        build_inputs,
+        &mut temp_buffer,
+        &mut output_buffer,
+        &mut properties,
+    )?
+};
+
+// The accel build is asynchronous
+stream.synchronize()?;
+
+# Ok(())
+# }
+```
+
+## Primitive Build Inputs
+The [`accel_build`] function accepts multiple build inputs per call, but they
+must be all triangle inputs, all curve inputs, or all AABB inputs. Mixing build
+input types in a single geometry-AS is not allowed.
+
+Each build input maps to one or more consecutive records in the shader binding
+table (SBT), which controls program dispatch. (See [Shader binding table](crate::shader_binding_table).) If
+multiple records in the SBT are required, the application needs to provide a
+device buffer with per-primitive SBT record indices for that build input. If
+only a single SBT record is requested, all primitives reference this same unique
+SBT record. Note that there is a limit to the number of referenced SBT records
+per geometry-AS. (Limits are discussed in “Limits”.)
+
+Each build input also specifies an array of [`GeometryFlags`], one for each SBT
+record. The flags for one record apply to all primitives mapped to this SBT record.
+
+The following flags are supported:
+
+* [`GeometryFlags::None`](crate::acceleration::GeometryFlags) - Applies the default behavior when calling the any-hit
+program, possibly multiple times, allowing the acceleration-structure builder
+to apply all optimizations.
+* [`GeometryFlags::RequireSingleAnyHitCall`](crate::acceleration::GeometryFlags) - Disables some optimizations
+specific to acceleration-structure builders. By default, traversal may call
+the any-hit program more than once for each intersected primitive. Setting
+the flag ensures that the any-hit program is called only once for a hit with a
+primitive. However, setting this flag may change traversal performance. The
+usage of this flag may be required for correctness of some rendering algorithms;
+for example, in cases where opacity or transparency information is accumulated
+in an any-hit program.
+* [`GeometryFlags::DisableAnyHit`](crate::acceleration::GeometryFlags) - Indicates that traversal should not call
+the any-hit program for this primitive even if the corresponding SBT record
+contains an any-hit program. Setting this flag usually improves performance
+even if no any-hit program is present in the SBT.
+
+Primitives inside a build input are indexed starting from zero. This primitive
+index is accessible inside the intersection, any-hit, and closest-hit programs.
+If the application chooses to offset this index for all primitives in a build
+input, there is no overhead at runtime. This can be particularly useful when
+data for consecutive build inputs is stored consecutively in device memory.
+The `primitive_index_offset` value is only used when reporting the intersection
+primitive.
+
+## Build Flags
+
+An acceleration structure build can be controlled using the values of the
+[`BuildFlags`] enum. To enable random vertex access on an acceleration structure,
+use [`BuildFlags::ALLOW_RANDOM_VERTEX_ACCESS`](crate::acceleration::BuildFlags).
+To steer trade-offs between build performance, runtime traversal performance
+and acceleration structure memory usage, use [`BuildFlags::PREFER_FAST_TRACE`](crate::acceleration::BuildFlags)
+and [`BuildFlags::PREFER_FAST_BUILD`](crate::acceleration::BuildFlags). For curve primitives in particular,
+these flags control splitting; see “Splitting curve segments”.
+
+The flags [`BuildFlags::PREFER_FAST_TRACE`](crate::acceleration::BuildFlags) and [`BuildFlags::PREFER_FAST_BUILD`](crate::acceleration::BuildFlags)
+are mutually exclusive. To combine multiple flags that are not mutually exclusive,
+use the logical “or” operator.
+
+## Dynamic Updates
+
+Building an acceleration structure can be computationally costly. Applications
+may choose to update an existing acceleration structure using modified vertex
+data or bounding boxes. Updating an existing acceleration structure is generally
+much faster than rebuilding. However, the quality of the acceleration structure
+may degrade if the data changes too much with an update, for example, through
+explosions or other chaotic transitions—even if for only parts of the mesh.
+The degraded acceleration structure may result in slower traversal performance
+as compared to an acceleration structure built from scratch from the modified
+input data.
+
+### Dynamic Updates Safe API
+
+The simplest way to use dynamic updates is with the [`DynamicAccel`] structure,
+which wraps an [`Accel`] and adds extra checks and functionality to support
+dynamic updates to the acceleration structure.
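+
+The following is a minimal sketch of that flow. The exact signatures of
+[`DynamicAccel::build()`] and [`DynamicAccel::update()`] are assumed here to
+mirror [`Accel::build()`], so treat this as illustrative rather than
+definitive:
+
+```no_run
+# use cust::prelude as cu;
+# use optix::prelude as ox;
+# fn doit() -> Result<(), Box<dyn std::error::Error>> {
+# cust::init(cu::CudaFlags::empty())?;
+# ox::init()?;
+# let device = cu::Device::get_device(0)?;
+# let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+# cu::ContextFlags::MAP_HOST, device)?;
+# let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+# let vertices: Vec<[f32; 3]> = Vec::new();
+# let indices: Vec<[u32; 3]> = Vec::new();
+# let stream = cu::Stream::new(cu::StreamFlags::DEFAULT, None)?;
+let buf_vertex = cu::DeviceBuffer::from_slice(&vertices)?;
+let buf_indices = cu::DeviceBuffer::from_slice(&indices)?;
+let build_inputs = [ox::IndexedTriangleArray::new(
+    &[&buf_vertex],
+    &buf_indices,
+    &[ox::GeometryFlags::None]
+)];
+
+// Build once, with ALLOW_UPDATE set so the accel can be updated later.
+let accel_options = ox::AccelBuildOptions::new(
+    ox::BuildFlags::ALLOW_UPDATE,
+    ox::BuildOperation::Build
+);
+let mut accel = ox::DynamicAccel::build(
+    &ctx,
+    &stream,
+    &[accel_options],
+    &build_inputs,
+    true
+)?;
+stream.synchronize()?;
+
+// ... overwrite `buf_vertex` with animated positions, keeping the number
+// of vertices and the topology identical ...
+
+// Refit the existing structure with the modified inputs (hypothetical
+// `update()` signature).
+accel.update(&ctx, &stream, &build_inputs)?;
+stream.synchronize()?;
+# Ok(())
+# }
+```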
+
+Simply call [`DynamicAccel::build()`] as you would with [`Accel`], and then
+call [`DynamicAccel::update()`] with the updated build inputs when you want
+to update the acceleration structure.
+
+Note that the inputs to [`DynamicAccel::update()`] must have the same structure,
+i.e. the number of motion keys, AABBs, triangle topology, etc. must be the same,
+although the underlying data (including the data pointers) can be different.
+If the data has a different structure, the behavior is undefined.
+[`DynamicAccel`] guards against this by hashing the inputs and returning an
+error if they do not match.
+
+### Dynamic Updates Unsafe API
+
+To allow for future updates of an acceleration structure, set
+[`BuildFlags::ALLOW_UPDATE`](crate::acceleration::BuildFlags) in the build flags when building the acceleration
+structure initially.
+
+To update the previously built acceleration structure, set the operation to
+[`BuildOperation::Update`](crate::acceleration::BuildOperation) and then call [`accel_build()`] on the same output
+data. All other options are required to be identical to the original build.
+The update is done in-place on the output data.
+
+Updating an acceleration structure usually requires a different amount of temporary memory than the original build.
+
+When updating an existing acceleration structure, only the device pointers and/or
+their buffer content may be changed. You cannot change the number of build inputs,
+the build input types, build flags, traversable handles for instances (for an
+instance-AS), or the number of vertices, indices, AABBs, instances, SBT records
+or motion keys. Changes to any of these things may result in undefined behavior,
+including GPU faults.
+
+Note the following:
+
+* When using indices, changing the connectivity or, in general, using shuffled
+vertex positions will work, but the quality of the acceleration structure will
+likely degrade substantially.
+* During an animation operation, geometry that should be invisible to the camera
+should not be “removed” from the scene, either by moving it very far away or
+by converting it into a degenerate form. Such changes to the geometry will also
+degrade the acceleration structure.
+* In these cases, it is more efficient to re-build the geometry-AS and/or the
+instance-AS, or to use the respective masking and flags.
+
+When an acceleration structure is updated, any other acceleration structure
+that directly or indirectly uses it as a child must also be updated or rebuilt.
+
+## Compaction
+A post-process can compact an acceleration structure after construction. This
+process can significantly reduce memory usage, but it requires an additional
+pass. The build and compact operations are best performed in batches to ensure
+that device synchronization does not degrade performance. The compacted size
+depends on the acceleration structure type and its properties and on the device
+architecture.
+
+### Compaction Safe API
+To compact an [`Accel`] or [`DynamicAccel`] when building, simply pass `true`
+for the `compact` parameter. This handles all buffer allocation and management
+internally, providing safety and simplicity at the cost of not being able to
+re-use temporary buffers.
+
+### Compaction Unsafe API
+
+To compact the acceleration structure as a post-process, do the following:
+
+* Build flag [`BuildFlags::ALLOW_COMPACTION`](crate::acceleration::BuildFlags) must be set in the
+  [`AccelBuildOptions`] passed to [`accel_build()`].
+* The emit property [`AccelEmitDesc::CompactedSize`](crate::acceleration::AccelEmitDesc) must be passed to
+  [`accel_build()`]. This property is generated on the device and it must be
+  copied back to the host if it is required for allocating the new output
+  buffer. The application may then choose to compact the acceleration structure
+  using [`accel_compact()`].
+
+The [`accel_compact()`] call should be guarded by an
+`if compacted_size < output_size` (or similar) to avoid the compacting pass in
+cases where it is not beneficial. Note that this check requires a copy of the
+compacted size (as queried by [`accel_build()`]) from the device memory to host
+memory.
+
+Just like an uncompacted acceleration structure, it is possible to traverse,
+update, or relocate a compacted acceleration structure.
+
+For example:
+```no_run
+use cust::prelude as cu;
+use optix::prelude as ox;
+# fn doit() -> Result<(), Box<dyn std::error::Error>> {
+# cust::init(cu::CudaFlags::empty())?;
+# ox::init()?;
+# let device = cu::Device::get_device(0)?;
+# let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+# cu::ContextFlags::MAP_HOST, device)?;
+# let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+# let vertices: Vec<[f32; 3]> = Vec::new();
+# let indices: Vec<[u32; 3]> = Vec::new();
+# let stream = cu::Stream::new(cu::StreamFlags::DEFAULT, None)?;
+
+let buf_vertex = cu::DeviceBuffer::from_slice(&vertices)?;
+let buf_indices = cu::DeviceBuffer::from_slice(&indices)?;
+
+let geometry_flags = ox::GeometryFlags::None;
+
+let build_inputs =
+    [ox::IndexedTriangleArray::new(
+        &[&buf_vertex],
+        &buf_indices,
+        &[geometry_flags]
+    )];
+
+let accel_options =
+    ox::AccelBuildOptions::new(ox::BuildFlags::ALLOW_COMPACTION)
+        .build_operation(ox::BuildOperation::Build);
+
+// Get the storage requirements for temporary and output buffers
+let sizes = ox::accel_compute_memory_usage(&ctx, &[accel_options], &build_inputs)?;
+
+// Allocate temporary and output buffers
+let mut output_buffer =
+    unsafe { cu::DeviceBuffer::<u8>::uninitialized(sizes.output_size_in_bytes)? };
+let mut temp_buffer =
+    unsafe { cu::DeviceBuffer::<u8>::uninitialized(sizes.temp_size_in_bytes)? };
+
+// Ask the build to emit the compacted size
+let compacted_size_buffer = unsafe { cu::DeviceBox::<usize>::uninitialized()? };
+let mut properties = vec![ox::AccelEmitDesc::CompactedSize(
+    compacted_size_buffer.as_device_ptr(),
+)];
+
+// Build the accel
+let hnd = unsafe {
+    ox::accel_build(
+        &ctx,
+        &stream,
+        &[accel_options],
+        &build_inputs,
+        &mut temp_buffer,
+        &mut output_buffer,
+        &mut properties,
+    )?
+};
+
+stream.synchronize()?;
+
+let mut compacted_size = 0usize;
+compacted_size_buffer.copy_to(&mut compacted_size)?;
+
+let accel = if compacted_size < sizes.output_size_in_bytes {
+    let mut buf = unsafe { cu::DeviceBuffer::<u8>::uninitialized(compacted_size)? };
+    let hnd = unsafe { ox::accel_compact(&ctx, &stream, hnd, &mut buf)? };
+
+    stream.synchronize()?;
+    unsafe { ox::Accel::from_raw_parts(buf, hnd) }
+} else {
+    unsafe { ox::Accel::from_raw_parts(output_buffer, hnd) }
+};
+
+# Ok(())
+# }
+```
+
+## Traversable Objects
+
+### Traversable Objects Safe API
+
+The transform traversable types, [`StaticTransform`](crate::transform::StaticTransform),
+[`MatrixMotionTransform`](crate::transform::MatrixMotionTransform), and
+[`SrtMotionTransform`](crate::transform::SrtMotionTransform) handle all
+necessary memory allocation and pointer conversion for you in their `new()`
+constructors.
+
+### Traversable Objects Unsafe API
+The instances in an instance-AS may reference transform traversables, as well
+as geometry-ASs. Transform traversables are fully managed by the application.
+The application needs to create these traversables manually in device memory
+in a specific form.
The function [`convert_pointer_to_traversable_handle`]
+converts a raw pointer into a traversable handle of the specified type. The
+traversable handle can then be used to link traversables together.
+
+In device memory, all traversable objects need to be 64-byte aligned. Note that
+moving a traversable to another location in memory invalidates the traversable
+handle. The application is responsible for constructing a new traversable handle
+and updating any other traversables referencing the invalidated traversable
+handle.
+
+The traversable handle is considered opaque and the application should not rely
+on any particular mapping of a pointer to the traversable handle.
+
+### Traversal of a Single Geometry Acceleration Structure
+The traversable handle passed to `optixTrace` can be a traversable handle
+created from a geometry-AS. This can be useful for scenes where single
+geometry-AS objects represent the root of the scene graph.
+
+If the modules and pipeline only need to support single geometry-AS traversables,
+it is beneficial to change the
+[`PipelineCompileOptions::traversable_graph_flags`](crate::module::PipelineCompileOptions) from
+[`TraversableGraphFlags::ALLOW_ANY`](crate::module::TraversableGraphFlags) to
+[`TraversableGraphFlags::ALLOW_SINGLE_GAS`](crate::module::TraversableGraphFlags).
+
+This signals to NVIDIA OptiX 7 that no other traversable types require support
+during traversal.
+
+## Motion Blur
+
+Motion support in OptiX targets the rendering of images with motion blur using a
+stochastic sampling of time. OptiX supports two types of motion as part of the
+scene: transform motion and vertex motion, often called deformation motion. When
+setting up the scene traversal graph and building the acceleration structures,
+motion options can be specified per acceleration structure as well as per motion
+transform traversable. At run time, a time parameter is passed to the trace call
+to perform the intersection of a ray against the scene at the selected point in
+time.
+
+The design of the motion feature in OptiX strikes a balance between a high
+degree of freedom (many parameters, with a simple mapping from scene
+descriptions to those parameters) and high traversal performance. As such,
+OptiX supports the following key features:
+
+* Vertex and transformation motion
+* Matrix as well as SRT (scale rotation translation) transformations
+* Arbitrary time ranges (ranges not limited to [0,1]) and flags to specify behavior outside the time range
+* Arbitrary concatenations of transformations (for example, a matrix transformation on top of an SRT transformation)
+* Per-ray timestamps
+
+Scene descriptions with motion need to map easily to traversable objects and
+their motion options as offered by OptiX. As such, the idea is that the motion
+options are derived directly from the scene description, delivering high
+traversal performance without the need for any performance-driven adjustments.
+However, due to the complexity of the subject, there are a few exceptions that
+are discussed in this section.
+
+This section details the usage of the motion options on the different traversable
+types and how to map scene options best to avoid potential performance pitfalls.
+
+### Basics
+Motion is supported by
+[`MatrixMotionTransform`],
+[`SrtMotionTransform`], and
+acceleration structure traversables.
The general motion characteristics are
+specified per traversable as motion options: the number of motion keys, flags,
+and the beginning and ending motion times corresponding to the first and last
+key. The remaining motion keys are evenly spaced between the beginning and
+ending times. The motion keys are the data at specific points in time, and the
+data is interpolated between neighboring keys. The motion options are
+specified in the [`MotionOptions`] struct.
+
+The motion options are always specified per traversable (acceleration structure
+or motion transform). There is no dependency between the motion options of
+traversables; given an instance referencing a geometry acceleration structure
+with motion, it is not required to build an instance acceleration structure
+with motion. The same goes for motion transforms. Even if an instance references
+a motion transform as child traversable, the instance acceleration structure
+itself may or may not have motion.
+
+Motion transforms must specify at least two motion keys. Acceleration structures,
+however, also accept [`AccelBuildOptions`] with field [`MotionOptions`] set to
+`default()`. This effectively disables motion for the acceleration structure and
+ignores the motion beginning and ending times, along with the motion flags.
+
+OptiX also supports static transform traversables in addition to the static
+transform of an instance. Static transforms are intended for scenes that also
+contain motion transforms. Without any motion transforms
+([`MatrixMotionTransform`] or
+[`SrtMotionTransform`]) in the traversable
+graph, any static transformation should be baked into the instance transform.
+However, if there is a motion transform, it may be required to apply a static
+transformation on a traversable (for example, on a geometry-AS) first, before
+applying the motion transform. For example, a motion transform may be specified
+in world coordinates, but the geometry it applies to needs to be placed into the
+scene first (an object-to-world transformation, which is usually done using the
+instance transform). In this case, a static transform pointing at the geometry
+acceleration structure can be used for the object-to-world transformation, and
+the instance transform pointing to the motion transform has an identity matrix
+as its transformation.
+
+Motion boundary conditions are specified by using flags. By default, the
+behavior for any time outside the time range is as if time was clamped to the
+range, meaning the traversable appears static and visible. Alternatively, to
+remove the traversable before the beginning time, set [`MotionFlags::START_VANISH`](crate::acceleration::MotionFlags); to
+remove it after the ending time, set [`MotionFlags::END_VANISH`](crate::acceleration::MotionFlags).
+
+For example:
+```
+let motion_options = MotionOptions {
+    num_keys: 3,
+    flags: MotionFlags::NONE,
+    time_begin: -1.0,
+    time_end: 1.5,
+};
+```
+
+OptiX offers two types of motion transforms: SRTs (scale-rotation-translation)
+and 3x4 affine matrices, each specifying one transform (SRT or matrix)
+per motion key. The transformations are always specified as object-to-world
+transformations, just like the instance transformation. During traversal, OptiX
+performs a per-component linear interpolation of the two nearest keys. The
+rotation component (expressed as a quaternion) of the SRT is an exception:
+OptiX ensures that the interpolated quaternion of two SRTs is of unit length
+by using nlerp interpolation, for performance reasons.
This results in a smooth,
+scale-preserving rotation in Cartesian space, though with non-constant velocity.
+
+For vertex motion, OptiX applies a linear interpolation between the vertex data
+that are provided by the application. If intersection programs are used and
+AABBs are supplied for the custom primitives, the AABBs are also linearly
+interpolated for intersection. The AABBs at the motion keys must therefore be
+big enough to contain any motion path of the underlying custom primitive.
+
+There are several device-side functions that take a time parameter, such as
+`optixTrace`, and respect the motion options as set on the traversables. The
+result of these device-side functions is always that of the specified point
+in time, e.g., the intersection of the ray with the scene at the selected point
+in time. Device-side functions are discussed in detail in “Device-side functions”.
+
+### Motion Geometry Acceleration Structure
+Use [`Accel::build()`] to build a motion acceleration structure. The motion
+options are part of the build options ([`AccelBuildOptions`]) and apply to all
+build inputs. Build inputs must specify primitive vertex buffers (for
+[`TriangleArray`] and [`CurveArray`]), radius buffers (for [`CurveArray`]), and
+AABB buffers (for [`CustomPrimitiveArray`] and [`InstanceArray`]) for all motion
+keys. These are interpolated during traversal to obtain the continuous motion
+vertices and AABBs between the beginning and ending times.
+
+The motion options are typically defined by the mesh data, which should directly
+map to the motion options on the geometry acceleration structure. For example,
+if a triangle mesh has three per-vertex motion values, the geometry acceleration
+structure needs to have three motion keys. Just as for non-motion meshes, it is
+possible to combine meshes within a single geometry acceleration structure to
+potentially increase traversal performance (this is generally recommended if
+there is only a single instance of each mesh and the meshes overlap or are close
+together). However, these meshes need to share the same motion options (as they
+are specified per geometry acceleration structure). The usual trade-offs apply
+in case meshes need to be updated from one frame to another, as in an interactive
+application. The entire geometry acceleration structure needs to be rebuilt or
+refitted if the vertices of at least one mesh change.
+
+It is possible to use a custom intersection program to decouple the actual vertex
+data and the motion options of the geometry acceleration structure. Intersection
+programs allow any kind of intersection routine. For example, it is possible to
+implement a three-motion-key-triangle intersection, but build a static geometry
+acceleration structure over AABBs by passing AABBs to the geometry acceleration
+structure build that enclose the full motion path of the triangles. However, this
+is generally not recommended for two reasons: First, the AABBs tend to increase
+in size very quickly even with very little motion. Second, it prevents the use
+of hardware intersection routines. Both of these effects can have a tremendous
+impact on performance.
+
+### Motion Instance Acceleration Structure
+
+Just as for a geometry acceleration structure, the motion options for an
+instance acceleration structure are specified as part of the build options. The
+notable difference from a geometry acceleration structure is that the motion
+options for an instance acceleration structure almost only impact performance.
Hence, whether or not to build a motion instance acceleration structure has no
+impact on the correctness of the rendering (determining which instances can be
+intersected), but it impacts memory usage as well as traversal performance. The
+only exception is the vanish flags, as these force any instance of the instance
+acceleration structure to be non-intersectable for any ray time outside of the
+time range of the instance acceleration structure.
+
+In the following, guidelines are provided on setting the motion options to
+achieve good performance and avoid pitfalls. We will focus on the number of
+motion keys, usually the main discriminator for traversal performance and the
+only factor for memory usage. The optimal number of motion keys used for the
+instance acceleration structure build depends on the amount and linearity of
+the motion of the traversables referenced by the instances. The beginning and
+ending of the time range are usually defined by what is required to render the
+current frame. The recommendations given here may change in the future.
+
+The following advice should be considered a simplified heuristic. A more
+detailed derivation of whether or not to use motion is given below. For RTCores
+version 1.0 (Turing architecture), do not use motion for the instance
+acceleration structure, but instead build a static instance acceleration
+structure that can leverage hardware-accelerated traversal. For any other
+device (devices without RTCores, or RTCores version >= 2.0), build a motion
+instance acceleration structure if any of the instances references a motion
+transform or a motion acceleration structure as a traversable child.
+
+If a motion instance acceleration structure is built, it is often sufficient to
+use a low number of motion keys (two or three) to avoid high memory costs.
+Also, it is not required to use a large number of motion keys just because one
+of the referenced motion transforms has many motion keys (such as the maximum
+number of motion keys over all traversables referenced by the instances). The
+motion options have no dependency between traversable objects, and a high
+number of motion keys on the instance acceleration structure causes a high
+memory overhead. Clearly, motion should not be used for an instance
+acceleration structure if the instances only reference static traversables.
+
+Further considerations when using motion blur:
+
+#### Is motion enabled?
+An instance acceleration structure should be built with motion on (with more
+than one motion key) if the overall amount of motion of the instanced
+traversables is non-minimal. For a single instance this can be quantified by
+the amount of change of its AABB over time. Hence, in case of a simple
+translation (for example, due to a matrix motion transform), the metric is the
+amount of the translation in comparison to the size of the AABB. In case of a
+scaling, it is the ratio of the size of the AABB at different points in time.
+If sufficiently many instanced traversables exhibit a non-minimal amount of
+change of their AABB over time, build a motion instance acceleration structure.
+Inversely, a static instance acceleration structure can yield higher traversal
+performance if many instanced traversables have no motion at all or only very
+little. The latter can happen for rotations. A rotation around the center of an
+object causes a rather small difference in the AABB of the object. However, if
+the rotational pivot point is not the center, it is likely to cause a big
+difference in the AABB of the object. A crude way of quantifying this
+AABB-based metric is sketched below.
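+
+The following is an illustrative sketch of such an AABB-change metric in plain
+Rust. The `Bounds` type and the thresholds are hypothetical and not part of
+this crate's API; a renderer would evaluate this per instance over the frame's
+time range.
+
+```
+#[derive(Copy, Clone)]
+struct Bounds {
+    min: [f32; 3],
+    max: [f32; 3],
+}
+
+// Compare an instance's AABB at the beginning and end of the frame interval:
+// the translation of the center relative to the extent, and the ratio of the
+// extents, serve as a crude "amount of motion" metric.
+fn has_significant_motion(t0: Bounds, t1: Bounds) -> bool {
+    let size = |b: Bounds, i: usize| (b.max[i] - b.min[i]).max(1e-6);
+    let center = |b: Bounds, i: usize| 0.5 * (b.min[i] + b.max[i]);
+    (0..3).any(|i| {
+        let translation = (center(t1, i) - center(t0, i)).abs() / size(t0, i);
+        let scale_ratio = size(t1, i) / size(t0, i);
+        translation > 0.1 || !(0.9..=1.1).contains(&scale_ratio)
+    })
+}
+```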
+
+As it is typically hard to actually quantify the amount of motion for the
+instances, switch to motion if sufficiently many instanced traversables have or
+are expected to have motion. Even so, it is difficult to predict exactly when
+it pays off to use or not use motion on the instance acceleration structure.
+
+#### If motion is enabled, how many keys should be defined?
+
+A reasonable metric to determine the required number of motion keys for an
+instance acceleration structure is the linearity of the motion of the instanced
+traversables. If there are motion transforms with many motion keys, rotations,
+or a hierarchical set of motion transforms, more motion keys on the instance
+acceleration structure may increase traversal performance. Transformations like
+a simple translation, a rotation around the center of an object, a small scale,
+or even all of those together are usually handled well by a two-motion-key
+instance acceleration structure.
+
+Finally, the quality of the instance acceleration structure is also affected by
+the number of motion keys of the traversables referenced by the instances. As
+such, it is desirable to have the motion options of the instance acceleration
+structure match the motion options of any referenced motion transform. For
+example, if all instances reference motion transforms with three keys, it is
+reasonable to also use three motion keys for the instance acceleration
+structure. Note that the statement from above still applies in this case:
+using more motion keys only helps if the underlying transformation results in
+non-linear motion.
+
+### Motion Matrix Transform
+
+The motion matrix transform traversable ([`MatrixMotionTransform`]) transforms
+the ray during traversal using a motion matrix. The traversable provides a 3x4
+row-major object-to-world transformation matrix for each motion key. The final
+motion matrix is constructed during traversal by interpolating the elements of
+the matrices at the nearest motion keys.
+
+The [`MatrixMotionTransform`] can be created with an arbitrary number of keys
+using its [`new()`](crate::transform::MatrixMotionTransform::new) constructor.
+
+### Motion Scale Rotate Translate Transform
+
+The behavior of the motion transform [`SrtMotionTransform`] is similar to the
+matrix motion transform [`MatrixMotionTransform`]. In [`SrtMotionTransform`]
+the object-to-world transforms per motion key are specified as a scale,
+rotation and translation (SRT) decomposition instead of a single 3x4 matrix.
+Each motion key is a struct of type [`SrtData`], which consists of 16 floats:
+
+```
+struct SrtData {
+    pub sx: f32,
+    pub a: f32,
+    pub b: f32,
+    pub pvx: f32,
+    pub sy: f32,
+    pub c: f32,
+    pub pvy: f32,
+    pub sz: f32,
+    pub pvz: f32,
+    pub qx: f32,
+    pub qy: f32,
+    pub qz: f32,
+    pub qw: f32,
+    pub tx: f32,
+    pub ty: f32,
+    pub tz: f32,
+}
+```
+
+* The scaling matrix,
+$$
+S=\begin{bmatrix}
+sx & a & b & pvx \cr 0 & sy & c & pvy \cr 0 & 0 & sz & pvz
+\end{bmatrix}
+$$
+
+defines an affine transformation that can include scale, shear, and a translation.
+The translation makes it possible to define the pivot point for the subsequent
+rotation.
+
+* The rotation quaternion
+$$
+R = [qx, qy, qz, qw]
+$$
+describes a rotation with angular
+component $qw = \cos(\theta / 2)$ and other components
+$$
+[qx, qy, qz] = \sin(\theta / 2) \cdot [ax, ay, az]
+$$ where the axis $[ax, ay, az]$ is normalized.
+
+* The translation matrix,
+$$
+T = \begin{bmatrix} 1 & 0 & 0 & tx \cr 0 & 1 & 0 & ty \cr 0 & 0 & 1 & tz \end{bmatrix}
+$$
+defines another translation that is applied after the rotation. Typically, this
+translation includes the inverse translation from the matrix $S$ to reverse the
+translation for the pivot point for $R$.
+
+To obtain the effective transformation at time $t$, the elements of the components
+of $S$, $R$, and $T$ are interpolated linearly. The components are then
+multiplied to obtain the combined transformation $C = T \times R \times S$. The
+transformation $C$ is the effective object-to-world transformation at time $t$,
+and $C^{-1}$ is the effective world-to-object transformation at time $t$.
+
+#### Example 1 - rotation about the origin:
+
+Use two motion keys. Set the first key to identity values. For the second key,
+define a quaternion from an axis and angle; for example, a 60-degree rotation
+about the z axis is given by:
+
+$$ Q = [0, 0, \sin(\pi/6), \cos(\pi/6)] $$
+
+#### Example 2 - rotation about a pivot point:
+Use two motion keys. Set the first key to identity values. Represent the pivot
+point as a translation $P$, and define the second key as follows:
+$$
+S^{\prime} = P^{-1} \times S \newline
+T^{\prime} = T \times P \newline
+C = T^{\prime} \times R \times S^{\prime}
+$$
+
+#### Example 3 - scaling about a pivot point:
+
+Use two motion keys. Set the first key to identity values. Represent the pivot
+as a translation $G = [G_x, G_y, G_z]$ and modify the pivot-point translation
+$P$ described above:
+
+$$
+P_x^{\prime} = P_x + (-S_x \times G_x + G_x) \newline
+P_y^{\prime} = P_y + (-S_y \times G_y + G_y) \newline
+P_z^{\prime} = P_z + (-S_z \times G_z + G_z) \newline
+$$
+
+### Transforms trade-offs
+Several trade-offs must be considered when using transforms.
+
+#### SRTs compared to matrix motion transforms
+Use SRTs for any transformations containing a rotation. Only SRTs produce a
+smooth rotation without distortion. They also avoid any oversampling of matrix
+transforms to approximate a rotation. However, note that the maximum angle of
+rotation between two neighboring SRT keys needs to be less than 180 degrees;
+hence, the dot product of the quaternions needs to be positive. This way the
+rotations are interpolated using the shortest path. If a rotation of 180
+degrees or more is required, additional keys need to be specified such that the
+rotation between two keys is less than 180 degrees. OptiX uses nlerp to
+interpolate quaternions at runtime. While nlerp produces the best traversal
+performance, it causes non-constant velocity in the rotation. The variation of
+rotational velocity is directly dependent on the amount of the rotation. If
+near-constant rotational velocity is required, more SRT keys can be used.
+
+Due to the complexity of the rotation, instance acceleration structure builds
+with instances that reference SRT transforms can be relatively slow. For
+real-time or interactive applications, it can be advantageous to use matrix
+transforms to have fast rebuilds or refits of the instance acceleration
+structure.
+
+#### Motion options for motion transforms
+The motion options for motion transforms should be derived from the scene setup
+and used as needed. The number of keys is defined by the number of
+transformations specified by the scene description. The beginning and ending
+times should be as needed for the frame, or tighter if specified by the scene
+description.
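+
+As a concrete sketch, the motion options for a build might be derived from
+scene data like this, using the [`AccelBuildOptions`] builder methods; the key
+count and shutter interval here are hypothetical placeholders:
+
+```
+# use optix::prelude::*;
+// Hypothetical scene data: three transform keys over the frame's shutter interval.
+let num_scene_keys = 3u16;
+let (shutter_open, shutter_close) = (0.0f32, 1.0f32);
+
+let accel_options = AccelBuildOptions::new(BuildFlags::PREFER_FAST_TRACE)
+    .num_keys(num_scene_keys)
+    .time_interval(shutter_open, shutter_close)
+    .motion_flags(MotionFlags::NONE);
+```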
+
+Avoid duplicating instances of motion transforms to achieve a motion behavior
+that can also be expressed by a single motion transform with many motion keys.
+An example is the handling of irregular keys, which is discussed in the
+following section.
+
+#### Dealing with irregular keys
+OptiX only supports regular time intervals in its motion options. Irregular
+keys should be resampled to fit regular keys, potentially with a much higher
+number of keys if needed; a sketch of such a resampling is shown at the end of
+this section.
+
+A practical example of this is a motion matrix transform that performs a
+rotation. Since the matrix elements are linearly interpolated between keys, the
+rotation is not an actual rotation, but a scale/shear/translation. To avoid
+visual artifacts, the rotation needs to be sampled with potentially many matrix
+motion keys. Such a sampling bounds the maximum error in the approximation of
+the rotation by the linear interpolation of matrices. The sampling should not
+try to minimize the number of motion keys by outputting irregular motion keys,
+but rather oversample the rotation with many keys.
+
+Duplicate motion transforms should not be used as a workaround for irregular
+keys (one copy per key, each with different beginning and ending times and the
+vanish motion flags set). This duplication creates traversal overhead, as all
+copies need to be intersected and their motion times compared to the ray's
+time.
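+
+The following is a minimal sketch of such a resampling in plain Rust, under the
+simplifying assumption of scalar per-key values; a real implementation would
+interpolate full matrix or SRT keys component-wise in the same manner.
+
+```
+// Resample irregularly spaced keyframes onto `n` regularly spaced keys over
+// the input time range by linear interpolation between the two nearest keys.
+fn resample_regular(times: &[f32], values: &[f32], n: usize) -> Vec<f32> {
+    assert!(times.len() == values.len() && times.len() >= 2 && n >= 2);
+    let (t0, t1) = (times[0], times[times.len() - 1]);
+    (0..n)
+        .map(|i| {
+            let t = t0 + (t1 - t0) * i as f32 / (n - 1) as f32;
+            // Find the input segment containing t.
+            let j = times.partition_point(|&k| k <= t).clamp(1, times.len() - 1);
+            let (ta, tb) = (times[j - 1], times[j]);
+            let w = if tb > ta { (t - ta) / (tb - ta) } else { 0.0 };
+            values[j - 1] * (1.0 - w) + values[j] * w
+        })
+        .collect()
+}
+```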
+
+[`Accel::build()`]: crate::acceleration::Accel::build
+[`Accel::from_raw_parts()`]: crate::acceleration::Accel::from_raw_parts
+[`Accel`]: crate::acceleration::Accel
+[`Instance`]: crate::instance_array::Instance
+[`TriangleArray`]: crate::triangle_array::TriangleArray
+[`CurveArray`]: crate::curve_array::CurveArray
+[`InstanceArray`]: crate::instance_array::InstanceArray
+[`MatrixMotionTransform`]: crate::transform::MatrixMotionTransform
+[`SrtMotionTransform`]: crate::transform::SrtMotionTransform
+[`BuildInput`]: crate::acceleration::BuildInput
+[`TraversableHandle`]: crate::acceleration::TraversableHandle
+[`accel_build()`]: crate::acceleration::accel_build
+[`accel_compute_memory_usage()`]: crate::acceleration::accel_compute_memory_usage
+[`accel_compact()`]: crate::acceleration::accel_compact
+[`GeometryFlags`]: crate::acceleration::GeometryFlags
+[`BuildFlags`]: crate::acceleration::BuildFlags
+[`DynamicAccel`]: crate::acceleration::DynamicAccel
+[`DynamicAccel::build()`]: crate::acceleration::DynamicAccel::build
+[`DynamicAccel::update()`]: crate::acceleration::DynamicAccel::update
+[`AccelBuildOptions`]: crate::acceleration::AccelBuildOptions
+[`convert_pointer_to_traversable_handle`]: crate::acceleration::convert_pointer_to_traversable_handle
+[`MotionOptions`]: crate::acceleration::MotionOptions
diff --git a/crates/optix/src/acceleration.rs b/crates/optix/src/acceleration.rs
new file mode 100644
index 00000000..3b65aaa5
--- /dev/null
+++ b/crates/optix/src/acceleration.rs
@@ -0,0 +1,2082 @@
+#![allow(clippy::missing_safety_doc)]
+
+use crate::{const_assert, const_assert_eq, context::DeviceContext, error::Error, optix_call, sys};
+use cust::{
+    memory::{CopyDestination, DeviceBox, DeviceBuffer, DeviceCopy, DevicePointer, DeviceSlice},
+    DeviceCopy,
+};
+type Result<T> = std::result::Result<T, Error>;
+
+use memoffset::offset_of;
+use std::ffi::c_void;
+use std::mem::size_of;
+use std::ops::Deref;
+use std::{
+    collections::hash_map::DefaultHasher,
+    hash::{Hash, Hasher},
+    marker::PhantomData,
+};
+
+use cust_raw::CUdeviceptr;
+use mint::{RowMatrix3x4, Vector3};
+
+// Kinda nasty hack to work around the fact that bindgen generates an i32 for enums on windows,
+// but a u32 on linux
+#[cfg(windows)]
+type OptixEnumBaseType = i32;
+#[cfg(unix)]
+type OptixEnumBaseType = u32;
+
+pub trait BuildInput: std::hash::Hash {
+    fn to_sys(&self) -> sys::OptixBuildInput;
+}
+
+pub trait Traversable {
+    fn handle(&self) -> TraversableHandle;
+}
+
+/// Wrapper struct containing the storage and handle for a static acceleration
+/// structure.
+///
+/// An Accel can be built by providing a slice of [`BuildInput`]s over which to
+/// build the acceleration structure, together with a matching slice of
+/// [`AccelBuildOptions`].
+///
+/// ```no_run
+/// use cust::prelude as cu;
+/// use optix::prelude as ox;
+/// # fn doit() -> Result<(), Box<dyn std::error::Error>> {
+/// # cust::init(cu::CudaFlags::empty())?;
+/// # ox::init()?;
+/// # let device = cu::Device::get_device(0)?;
+/// # let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+/// # cu::ContextFlags::MAP_HOST, device)?;
+/// # let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+/// # let vertices: Vec<[f32; 3]> = Vec::new();
+/// # let indices: Vec<[u32; 3]> = Vec::new();
+/// # let stream = cu::Stream::new(cu::StreamFlags::DEFAULT, None)?;
+///
+/// let buf_vertex = cu::DeviceBuffer::from_slice(&vertices)?;
+/// let buf_indices = cu::DeviceBuffer::from_slice(&indices)?;
+///
+/// let geometry_flags = ox::GeometryFlags::None;
+/// let triangle_input =
+///     ox::IndexedTriangleArray::new(
+///         &[&buf_vertex],
+///         &buf_indices,
+///         &[geometry_flags]
+///     );
+///
+/// let accel_options =
+///     ox::AccelBuildOptions::new(ox::BuildFlags::ALLOW_COMPACTION)
+///         .build_operation(ox::BuildOperation::Build);
+///
+/// let build_inputs = vec![triangle_input];
+///
+/// let gas = ox::Accel::build(
+///     &ctx,
+///     &stream,
+///     &[accel_options],
+///     &build_inputs,
+///     true
+/// )?;
+///
+/// stream.synchronize()?;
+/// # Ok(())
+/// # }
+/// ```
+pub struct Accel {
+    #[allow(dead_code)]
+    buf: DeviceBuffer<u8>,
+    hnd: TraversableHandle,
+}
+
+impl Traversable for Accel {
+    /// Get the [`TraversableHandle`] that represents this accel.
+    fn handle(&self) -> TraversableHandle {
+        self.hnd
+    }
+}
+
+impl Accel {
+    /// Build and (optionally) compact the acceleration structure for the given
+    /// `build_inputs`.
+    ///
+    /// This will handle all necessary memory allocation internally, synchronizing
+    /// all internal steps, but NOT the final build or compaction.
+    ///
+    /// If you want to re-use buffers between builds and line up multiple builds
+    /// at once for more performance/efficiency, you should use the unsafe API.
+    ///
+    /// ```no_run
+    /// use cust::prelude as cu;
+    /// use optix::prelude as ox;
+    /// # fn doit() -> Result<(), Box<dyn std::error::Error>> {
+    /// # cust::init(cu::CudaFlags::empty())?;
+    /// # ox::init()?;
+    /// # let device = cu::Device::get_device(0)?;
+    /// # let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+    /// # cu::ContextFlags::MAP_HOST, device)?;
+    /// # let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+    /// # let vertices: Vec<[f32; 3]> = Vec::new();
+    /// # let indices: Vec<[u32; 3]> = Vec::new();
+    /// # let stream = cu::Stream::new(cu::StreamFlags::DEFAULT, None)?;
+    ///
+    /// let buf_vertex = cu::DeviceBuffer::from_slice(&vertices)?;
+    /// let buf_indices = cu::DeviceBuffer::from_slice(&indices)?;
+    ///
+    /// let geometry_flags = ox::GeometryFlags::None;
+    /// let triangle_input =
+    ///     ox::IndexedTriangleArray::new(
+    ///         &[&buf_vertex],
+    ///         &buf_indices,
+    ///         &[geometry_flags]
+    ///     );
+    ///
+    /// let accel_options =
+    ///     ox::AccelBuildOptions::new(ox::BuildFlags::ALLOW_COMPACTION)
+    ///         .build_operation(ox::BuildOperation::Build);
+    ///
+    /// let build_inputs = vec![triangle_input];
+    ///
+    /// let gas = ox::Accel::build(
+    ///     &ctx,
+    ///     &stream,
+    ///     &[accel_options],
+    ///     &build_inputs,
+    ///     true
+    /// )?;
+    ///
+    /// stream.synchronize()?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn build<I: BuildInput>(
+        ctx: &DeviceContext,
+        stream: &cust::stream::Stream,
+        accel_options: &[AccelBuildOptions],
+        build_inputs: &[I],
+        compact: bool,
+    ) -> Result<Accel> {
+        let sizes = accel_compute_memory_usage(ctx, accel_options, build_inputs)?;
+        let mut output_buffer =
+            unsafe { DeviceBuffer::<u8>::uninitialized(sizes.output_size_in_bytes)? };
+
+        let mut temp_buffer =
+            unsafe { DeviceBuffer::<u8>::uninitialized(sizes.temp_size_in_bytes)? };
+
+        let compacted_size_buffer = unsafe { DeviceBox::<usize>::uninitialized()? };
+
+        let mut properties = vec![AccelEmitDesc::CompactedSize(
+            compacted_size_buffer.as_device_ptr(),
+        )];
+
+        let hnd = unsafe {
+            accel_build(
+                ctx,
+                stream,
+                accel_options,
+                build_inputs,
+                &mut temp_buffer,
+                &mut output_buffer,
+                &mut properties,
+            )?
+        };
+
+        if compact {
+            stream.synchronize()?;
+
+            let mut compacted_size = 0usize;
+            compacted_size_buffer.copy_to(&mut compacted_size)?;
+
+            if compacted_size < sizes.output_size_in_bytes {
+                let mut buf = unsafe { DeviceBuffer::<u8>::uninitialized(compacted_size)? };
+                let hnd = unsafe { accel_compact(ctx, stream, hnd, &mut buf)? };
+                Ok(Accel { buf, hnd })
+            } else {
+                Ok(Accel {
+                    buf: output_buffer,
+                    hnd,
+                })
+            }
+        } else {
+            Ok(Accel {
+                buf: output_buffer,
+                hnd,
+            })
+        }
+    }
+
+    /// Construct a new Accel from a handle and buffer.
+    pub unsafe fn from_raw_parts(buf: DeviceBuffer<u8>, hnd: TraversableHandle) -> Accel {
+        Accel { buf, hnd }
+    }
+
+    /// Obtain opaque relocation information for this accel in the given [`DeviceContext`].
+    ///
+    /// The location information may be passed to
+    /// [`check_relocation_compatibility()`](Accel::check_relocation_compatibility) to
+    /// determine if this acceleration structure can be relocated to a different device's
+    /// memory space.
+    ///
+    /// When used with [`relocate`](Accel::relocate) it provides the data necessary
+    /// for doing the relocation.
+    ///
+    /// If this acceleration structure is copied multiple times, the same
+    /// [`AccelRelocationInfo`] can also be used on all copies.
+    pub fn get_relocation_info(&self, ctx: &DeviceContext) -> Result<AccelRelocationInfo> {
+        let mut inner = sys::OptixAccelRelocationInfo::default();
+        unsafe {
+            Ok(optix_call!(optixAccelGetRelocationInfo(
+                ctx.raw,
+                self.hnd.inner,
+                &mut inner
+            ))
+            .map(|_| AccelRelocationInfo { inner })?)
+        }
+    }
+}
+
+/// Acceleration structure supporting dynamic updates.
+///
+/// Building an acceleration structure can be computationally costly. Applications
+/// may choose to update an existing acceleration structure using modified vertex
+/// data or bounding boxes. Updating an existing acceleration structure is generally
+/// much faster than rebuilding. However, the quality of the acceleration structure
+/// may degrade if the data changes too much with an update, for example, through
+/// explosions or other chaotic transitions—even if for only parts of the mesh.
+/// The degraded acceleration structure may result in slower traversal performance
+/// as compared to an acceleration structure built from scratch from the modified
+/// input data.
+pub struct DynamicAccel {
+    accel: Accel,
+    hash: u64,
+}
+
+impl Traversable for DynamicAccel {
+    /// Get the [`TraversableHandle`] that represents this accel.
+    fn handle(&self) -> TraversableHandle {
+        self.accel.hnd
+    }
+}
+
+impl Deref for DynamicAccel {
+    type Target = Accel;
+
+    fn deref(&self) -> &Self::Target {
+        &self.accel
+    }
+}
+
+impl DynamicAccel {
+    /// Build and compact the acceleration structure for the given inputs.
+    ///
+    /// This forces the ALLOW_UPDATE flag for the build flags to make sure the
+    /// resulting accel can be updated.
+    pub fn build<I: BuildInput>(
+        ctx: &DeviceContext,
+        stream: &cust::stream::Stream,
+        accel_options: &mut [AccelBuildOptions],
+        build_inputs: &[I],
+        compact: bool,
+    ) -> Result<DynamicAccel> {
+        // Force ALLOW_UPDATE
+        for opt in accel_options.iter_mut() {
+            opt.build_flags |= BuildFlags::ALLOW_UPDATE;
+            opt.operation = BuildOperation::Build;
+        }
+
+        let sizes = accel_compute_memory_usage(ctx, accel_options, build_inputs)?;
+        let mut output_buffer =
+            unsafe { DeviceBuffer::<u8>::uninitialized(sizes.output_size_in_bytes)? };
+
+        let mut temp_buffer =
+            unsafe { DeviceBuffer::<u8>::uninitialized(sizes.temp_size_in_bytes)? };
+
+        let compacted_size_buffer = unsafe { DeviceBox::<usize>::uninitialized()? };
+
+        let mut properties = vec![AccelEmitDesc::CompactedSize(
+            compacted_size_buffer.as_device_ptr(),
+        )];
+
+        let hnd = unsafe {
+            accel_build(
+                ctx,
+                stream,
+                accel_options,
+                build_inputs,
+                &mut temp_buffer,
+                &mut output_buffer,
+                &mut properties,
+            )?
+        };
+
+        let mut hasher = DefaultHasher::new();
+        build_inputs.hash(&mut hasher);
+        let hash = hasher.finish();
+
+        if compact {
+            stream.synchronize()?;
+
+            let mut compacted_size = 0usize;
+            compacted_size_buffer.copy_to(&mut compacted_size)?;
+
+            let mut buf = unsafe { DeviceBuffer::<u8>::uninitialized(compacted_size)? };
+
+            let hnd = unsafe { accel_compact(ctx, stream, hnd, &mut buf)? };
+
+            Ok(DynamicAccel {
+                accel: Accel { buf, hnd },
+                hash,
+            })
+        } else {
+            Ok(DynamicAccel {
+                accel: Accel {
+                    buf: output_buffer,
+                    hnd,
+                },
+                hash,
+            })
+        }
+    }
+
+    /// Update the acceleration structure
+    ///
+    /// This forces the build operation to Update.
+    ///
+    /// # Errors
+    /// * [`Error::AccelUpdateMismatch`] - if the provided `build_inputs` do
+    /// not match the structure of those provided to [`build()`](DynamicAccel::build)
+    pub fn update<I: BuildInput>(
+        &mut self,
+        ctx: &DeviceContext,
+        stream: &cust::stream::Stream,
+        accel_options: &mut [AccelBuildOptions],
+        build_inputs: &[I],
+    ) -> Result<()> {
+        for opt in accel_options.iter_mut() {
+            opt.build_flags |= BuildFlags::ALLOW_UPDATE;
+            opt.operation = BuildOperation::Update;
+        }
+
+        let mut hasher = DefaultHasher::new();
+        build_inputs.hash(&mut hasher);
+        let hash = hasher.finish();
+
+        if hash != self.hash {
+            return Err(Error::AccelUpdateMismatch);
+        }
+
+        let sizes = accel_compute_memory_usage(ctx, accel_options, build_inputs)?;
+        let mut output_buffer =
+            unsafe { DeviceBuffer::<u8>::uninitialized(sizes.output_size_in_bytes)? };
+
+        let mut temp_buffer =
+            unsafe { DeviceBuffer::<u8>::uninitialized(sizes.temp_size_in_bytes)? };
+
+        let compacted_size_buffer = unsafe { DeviceBox::<usize>::uninitialized()? };
+
+        let mut properties = vec![AccelEmitDesc::CompactedSize(
+            compacted_size_buffer.as_device_ptr(),
+        )];
+
+        let hnd = unsafe {
+            accel_build(
+                ctx,
+                stream,
+                accel_options,
+                build_inputs,
+                &mut temp_buffer,
+                &mut output_buffer,
+                &mut properties,
+            )?
+        };
+
+        self.accel = Accel {
+            buf: output_buffer,
+            hnd,
+        };
+
+        Ok(())
+    }
+}
+
+/// Opaque handle to a traversable acceleration structure.
+///
+/// # Safety
+/// You should consider this handle to be a raw pointer, thus you can copy it
+/// and it provides no tracking of lifetime or ownership. You are responsible
+/// for ensuring that the device memory containing the acceleration structures
+/// this handle references are alive if you try to use this handle
+#[repr(transparent)]
+#[derive(Copy, Clone, Debug, PartialEq, DeviceCopy, Default)]
+pub struct TraversableHandle {
+    pub(crate) inner: u64,
+}
+
+/// Computes the device memory required for temporary and output buffers
+/// when building the acceleration structure. Use the returned sizes to
+/// allocate enough memory to pass to [`accel_build()`].
+///
+/// # Examples
+/// ```no_run
+/// use cust::prelude as cu;
+/// use optix::prelude as ox;
+/// # fn doit() -> Result<(), Box<dyn std::error::Error>> {
+/// # cust::init(cu::CudaFlags::empty())?;
+/// # ox::init()?;
+/// # let device = cu::Device::get_device(0)?;
+/// # let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+/// # cu::ContextFlags::MAP_HOST, device)?;
+/// # let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+/// # let vertices: Vec<[f32; 3]> = Vec::new();
+/// # let indices: Vec<[u32; 3]> = Vec::new();
+/// # let stream = cu::Stream::new(cu::StreamFlags::DEFAULT, None)?;
+/// let buf_vertex = cu::DeviceBuffer::from_slice(&vertices)?;
+/// let buf_indices = cu::DeviceBuffer::from_slice(&indices)?;
+///
+/// let geometry_flags = ox::GeometryFlags::None;
+/// let build_inputs = [ox::IndexedTriangleArray::new(
+///     &[&buf_vertex],
+///     &buf_indices,
+///     &[geometry_flags],
+/// )];
+/// let accel_options =
+///     ox::AccelBuildOptions::new(ox::BuildFlags::ALLOW_COMPACTION)
+///         .build_operation(ox::BuildOperation::Build);
+///
+/// let sizes = ox::accel_compute_memory_usage(&ctx, &[accel_options], &build_inputs)?;
+/// let mut output_buffer =
+///     unsafe { cu::DeviceBuffer::<u8>::uninitialized(sizes.output_size_in_bytes)? };
+///
+/// let mut temp_buffer =
+///     unsafe { cu::DeviceBuffer::<u8>::uninitialized(sizes.temp_size_in_bytes)? };
+///
+/// let mut compacted_size_buffer = unsafe { cu::DeviceBox::<usize>::uninitialized()? };
+///
+/// let mut properties = vec![ox::AccelEmitDesc::CompactedSize(
+///     compacted_size_buffer.as_device_ptr(),
+/// )];
+///
+/// let hnd = unsafe {
+///     ox::accel_build(
+///         &ctx,
+///         &stream,
+///         &[accel_options],
+///         &build_inputs,
+///         &mut temp_buffer,
+///         &mut output_buffer,
+///         &mut properties,
+///     )?
+/// };
+///
+/// # Ok(())
+/// # }
+/// ```
+pub fn accel_compute_memory_usage<I: BuildInput>(
+    ctx: &DeviceContext,
+    accel_options: &[AccelBuildOptions],
+    build_inputs: &[I],
+) -> Result<AccelBufferSizes> {
+    let mut buffer_sizes = AccelBufferSizes::default();
+    let build_sys: Vec<_> = build_inputs.iter().map(|b| b.to_sys()).collect();
+
+    unsafe {
+        Ok(optix_call!(optixAccelComputeMemoryUsage(
+            ctx.raw,
+            accel_options.as_ptr() as *const _,
+            build_sys.as_ptr(),
+            build_sys.len() as u32,
+            &mut buffer_sizes as *mut _ as *mut _,
+        ))
+        .map(|_| buffer_sizes)?)
+    }
+}
+
+/// Builds the acceleration structure.
+/// `temp_buffer` and `output_buffer` must be at least as large as the sizes
+/// returned by `accel_compute_memory_usage()`
+///
+/// # Examples
+/// ```no_run
+/// use cust::prelude as cu;
+/// use optix::prelude as ox;
+/// # fn doit() -> Result<(), Box<dyn std::error::Error>> {
+/// # cust::init(cu::CudaFlags::empty())?;
+/// # ox::init()?;
+/// # let device = cu::Device::get_device(0)?;
+/// # let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+/// # cu::ContextFlags::MAP_HOST, device)?;
+/// # let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+/// # let vertices: Vec<[f32; 3]> = Vec::new();
+/// # let indices: Vec<[u32; 3]> = Vec::new();
+/// # let stream = cu::Stream::new(cu::StreamFlags::DEFAULT, None)?;
+/// let buf_vertex = cu::DeviceBuffer::from_slice(&vertices)?;
+/// let buf_indices = cu::DeviceBuffer::from_slice(&indices)?;
+///
+/// let geometry_flags = ox::GeometryFlags::None;
+/// let build_inputs = [ox::IndexedTriangleArray::new(
+///     &[&buf_vertex],
+///     &buf_indices,
+///     &[geometry_flags],
+/// )];
+/// let accel_options =
+///     ox::AccelBuildOptions::new(ox::BuildFlags::ALLOW_COMPACTION)
+///         .build_operation(ox::BuildOperation::Build);
+///
+/// let sizes = ox::accel_compute_memory_usage(&ctx, &[accel_options], &build_inputs)?;
+/// let mut output_buffer =
+///     unsafe { cu::DeviceBuffer::<u8>::uninitialized(sizes.output_size_in_bytes)? };
+///
+/// let mut temp_buffer =
+///     unsafe { cu::DeviceBuffer::<u8>::uninitialized(sizes.temp_size_in_bytes)? };
+///
+/// let mut compacted_size_buffer = unsafe { cu::DeviceBox::<usize>::uninitialized()? };
+///
+/// let mut properties = vec![ox::AccelEmitDesc::CompactedSize(
+///     compacted_size_buffer.as_device_ptr(),
+/// )];
+///
+/// let hnd = unsafe {
+///     ox::accel_build(
+///         &ctx,
+///         &stream,
+///         &[accel_options],
+///         &build_inputs,
+///         &mut temp_buffer,
+///         &mut output_buffer,
+///         &mut properties,
+///     )?
+/// };
+///
+/// # Ok(())
+/// # }
+/// ```
+pub unsafe fn accel_build<I: BuildInput>(
+    ctx: &DeviceContext,
+    stream: &cust::stream::Stream,
+    accel_options: &[AccelBuildOptions],
+    build_inputs: &[I],
+    temp_buffer: &mut DeviceSlice<u8>,
+    output_buffer: &mut DeviceSlice<u8>,
+    emitted_properties: &mut [AccelEmitDesc],
+) -> Result<TraversableHandle> {
+    let mut traversable_handle = TraversableHandle { inner: 0 };
+    let properties: Vec<sys::OptixAccelEmitDesc> =
+        emitted_properties.iter_mut().map(|p| p.into()).collect();
+
+    let build_sys: Vec<_> = build_inputs.iter().map(|b| b.to_sys()).collect();
+
+    Ok(optix_call!(optixAccelBuild(
+        ctx.raw,
+        stream.as_inner(),
+        accel_options.as_ptr() as *const _,
+        build_sys.as_ptr(),
+        build_sys.len() as u32,
+        temp_buffer.as_device_ptr().as_raw(),
+        temp_buffer.len(),
+        output_buffer.as_device_ptr().as_raw(),
+        output_buffer.len(),
+        &mut traversable_handle as *mut _ as *mut _,
+        properties.as_ptr() as *const _,
+        properties.len() as u32,
+    ))
+    .map(|_| traversable_handle)?)
+}
+
+/// Compacts the acceleration structure referenced by `input_handle`,
+/// storing the result in `output_buffer` and returning a handle to the
+/// newly compacted structure
+///
+/// # Examples
+/// ```no_run
+/// use cust::prelude as cu;
+/// use optix::prelude as ox;
+/// # fn doit() -> Result<(), Box<dyn std::error::Error>> {
+/// # cust::init(cu::CudaFlags::empty())?;
+/// # ox::init()?;
+/// # let device = cu::Device::get_device(0)?;
+/// # let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+/// # cu::ContextFlags::MAP_HOST, device)?;
+/// # let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+/// # let vertices: Vec<[f32; 3]> = Vec::new();
+/// # let indices: Vec<[u32; 3]> = Vec::new();
+/// # let stream = cu::Stream::new(cu::StreamFlags::DEFAULT, None)?;
+/// let buf_vertex = cu::DeviceBuffer::from_slice(&vertices)?;
+/// let buf_indices = cu::DeviceBuffer::from_slice(&indices)?;
+///
+/// let geometry_flags = ox::GeometryFlags::None;
+/// let build_inputs = [ox::IndexedTriangleArray::new(
+///     &[&buf_vertex],
+///     &buf_indices,
+///     &[geometry_flags],
+/// )];
+/// let accel_options =
+///     ox::AccelBuildOptions::new(ox::BuildFlags::ALLOW_COMPACTION)
+///         .build_operation(ox::BuildOperation::Build);
+///
+/// let sizes = ox::accel_compute_memory_usage(&ctx, &[accel_options], &build_inputs)?;
+/// let mut output_buffer =
+///     unsafe { cu::DeviceBuffer::<u8>::uninitialized(sizes.output_size_in_bytes)? };
+///
+/// let mut temp_buffer =
+///     unsafe { cu::DeviceBuffer::<u8>::uninitialized(sizes.temp_size_in_bytes)? };
+///
+/// // Storage for the size of the compacted buffer
+/// let mut compacted_size_buffer = unsafe { cu::DeviceBox::<usize>::uninitialized()? };
+///
+/// // Tell OptiX that we want to know how big the compacted buffer needs to be
+/// let mut properties = vec![ox::AccelEmitDesc::CompactedSize(
+///     compacted_size_buffer.as_device_ptr(),
+/// )];
+///
+/// let hnd = unsafe {
+///     ox::accel_build(
+///         &ctx,
+///         &stream,
+///         &[accel_options],
+///         &build_inputs,
+///         &mut temp_buffer,
+///         &mut output_buffer,
+///         &mut properties,
+///     )?
+/// };
+///
+/// // The build is asynchronous, so we need to block on the stream before
+/// // reading back the emitted compacted size
+/// stream.synchronize()?;
+///
+/// // Copy the returned size needed for the compacted buffer and allocate
+/// // storage
+/// let mut compacted_size = 0usize;
+/// compacted_size_buffer.copy_to(&mut compacted_size)?;
+///
+/// let mut buf = unsafe { cu::DeviceBuffer::<u8>::uninitialized(compacted_size)? };
+///
+/// // Compact the accel structure.
+/// let hnd = unsafe { ox::accel_compact(&ctx, &stream, hnd, &mut buf)? };
+///
+/// # Ok(())
+/// # }
+/// ```
+pub unsafe fn accel_compact(
+    ctx: &DeviceContext,
+    stream: &cust::stream::Stream,
+    input_handle: TraversableHandle,
+    output_buffer: &mut DeviceSlice<u8>,
+) -> Result<TraversableHandle> {
+    let mut traversable_handle = TraversableHandle { inner: 0 };
+    Ok(optix_call!(optixAccelCompact(
+        ctx.raw,
+        stream.as_inner(),
+        input_handle.inner,
+        output_buffer.as_device_ptr().as_raw(),
+        output_buffer.len(),
+        &mut traversable_handle as *mut _ as *mut _,
+    ))
+    .map(|_| traversable_handle)?)
+}
+
+bitflags::bitflags! {
+    /// Flags providing configuration options to acceleration structure build.
+    ///
+    /// * `ALLOW_UPDATE` - Must be provided if the accel is to support dynamic updates.
+    /// * `ALLOW_COMPACTION` - Must be provided to enable memory compaction for the accel.
+    /// * `PREFER_FAST_TRACE` - Accel build is slower, but tracing against it will be faster.
+    /// * `PREFER_FAST_BUILD` - Accel build is faster, but tracing against it will be slower.
+    /// * `ALLOW_RANDOM_VERTEX_ACCESS` - Must be provided to be able to get at vertex data from CH
+    /// and AH programs on the device. May affect the performance of the accel (the structure
+    /// seems to be larger).
+    ///
+    /// Note that `PREFER_FAST_TRACE` and `PREFER_FAST_BUILD` are mutually exclusive.
+    #[derive(Default)]
+    pub struct BuildFlags: OptixEnumBaseType {
+        const NONE = sys::OptixBuildFlags_OPTIX_BUILD_FLAG_NONE;
+        const ALLOW_UPDATE = sys::OptixBuildFlags_OPTIX_BUILD_FLAG_ALLOW_UPDATE;
+        const ALLOW_COMPACTION = sys::OptixBuildFlags_OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+        const PREFER_FAST_TRACE = sys::OptixBuildFlags_OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
+        const PREFER_FAST_BUILD = sys::OptixBuildFlags_OPTIX_BUILD_FLAG_PREFER_FAST_BUILD;
+        const ALLOW_RANDOM_VERTEX_ACCESS = sys::OptixBuildFlags_OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS;
+    }
+}
+
+/// Select which operation to perform with [`accel_build()`].
+#[cfg_attr(windows, repr(i32))]
+#[cfg_attr(unix, repr(u32))]
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub enum BuildOperation {
+    Build = sys::OptixBuildOperation_OPTIX_BUILD_OPERATION_BUILD,
+    Update = sys::OptixBuildOperation_OPTIX_BUILD_OPERATION_UPDATE,
+}
+
+impl Default for BuildOperation {
+    fn default() -> Self {
+        BuildOperation::Build
+    }
+}
+
+bitflags::bitflags! {
+    /// Configure how to handle ray times that are outside of the provided motion keys.
+    ///
+    /// By default, the object will appear static (clamped) to the nearest motion
+    /// key for rays outside of the range of key times.
+    ///
+    /// * `START_VANISH` - The object will be invisible to rays with a time less
+    /// than the first provided motion key
+    /// * `END_VANISH` - The object will be invisible to rays with a time greater
+    /// than the last provided motion key
+    #[derive(DeviceCopy)]
+    pub struct MotionFlags: u16 {
+        const NONE = sys::OptixMotionFlags_OPTIX_MOTION_FLAG_NONE as u16;
+        const START_VANISH = sys::OptixMotionFlags_OPTIX_MOTION_FLAG_START_VANISH as u16;
+        const END_VANISH = sys::OptixMotionFlags_OPTIX_MOTION_FLAG_END_VANISH as u16;
+    }
+}
+
+/// Provide an accel build with motion keys for motion blur.
+///
+/// The motion options are always specified per traversable (acceleration structure
+/// or motion transform). There is no dependency between the motion options of
+/// traversables; given an instance referencing a geometry acceleration structure
+/// with motion, it is not required to build an instance acceleration structure
+/// with motion. The same goes for motion transforms.
+/// Even if an instance references
+/// a motion transform as child traversable, the instance acceleration structure
+/// itself may or may not have motion.
+///
+/// Motion transforms must specify at least two motion keys. Acceleration structures,
+/// however, also accept [`AccelBuildOptions`] with field `motion_options` set
+/// to zero. This effectively disables motion for the acceleration structure and
+/// ignores the motion beginning and ending times, along with the motion flags.
+#[repr(C)]
+#[derive(Debug, Copy, Clone, PartialEq, DeviceCopy)]
+pub struct MotionOptions {
+    pub num_keys: u16,
+    pub flags: MotionFlags,
+    pub time_begin: f32,
+    pub time_end: f32,
+}
+
+impl Default for MotionOptions {
+    fn default() -> Self {
+        MotionOptions {
+            num_keys: 0,
+            flags: MotionFlags::NONE,
+            time_begin: 0.0,
+            time_end: 0.0,
+        }
+    }
+}
+
+const_assert_eq!(
+    std::mem::size_of::<MotionOptions>(),
+    std::mem::size_of::<sys::OptixMotionOptions>(),
+);
+
+/// Options to configure the [`accel_build()`]
+#[repr(C)]
+#[derive(Debug, Copy, Clone, PartialEq, Default)]
+pub struct AccelBuildOptions {
+    build_flags: BuildFlags,
+    operation: BuildOperation,
+    motion_options: MotionOptions,
+}
+
+impl AccelBuildOptions {
+    /// Create a new AccelBuildOptions with the given flags, the default
+    /// `Build` operation, and no motion blur.
+    pub fn new(build_flags: BuildFlags) -> Self {
+        AccelBuildOptions {
+            build_flags,
+            operation: BuildOperation::Build,
+            motion_options: MotionOptions {
+                num_keys: 1,
+                flags: MotionFlags::NONE,
+                time_begin: 0.0f32,
+                time_end: 1.0f32,
+            },
+        }
+    }
+
+    /// Set the build operation to either build or update
+    pub fn build_operation(mut self, op: BuildOperation) -> Self {
+        self.operation = op;
+        self
+    }
+
+    /// Set the number of motion keys.
+    ///
+    /// This must either be 0 for no motion blur, or >= 2.
+    pub fn num_keys(mut self, num_keys: u16) -> Self {
+        self.motion_options.num_keys = num_keys;
+        self
+    }
+
+    /// Set the start and end time that the first and last motion keys represent.
+    pub fn time_interval(mut self, time_begin: f32, time_end: f32) -> Self {
+        self.motion_options.time_begin = time_begin;
+        self.motion_options.time_end = time_end;
+        self
+    }
+
+    /// Set the flags describing how to handle out-of-range time samples.
+    pub fn motion_flags(mut self, flags: MotionFlags) -> Self {
+        self.motion_options.flags = flags;
+        self
+    }
+}
+
+/// Opaque relocation information for an [`Accel`] in a given [`DeviceContext`].
+///
+/// The location information may be passed to
+/// [`check_relocation_compatibility()`](Accel::check_relocation_compatibility) to
+/// determine if the associated acceleration structure can be relocated to a different device's
+/// memory space.
+///
+/// When used with [`relocate`](Accel::relocate) it provides the data necessary
+/// for doing the relocation.
+///
+/// If the acceleration structure is copied multiple times, the same
+/// [`AccelRelocationInfo`] can also be used on all copies.
+#[repr(transparent)]
+pub struct AccelRelocationInfo {
+    #[allow(dead_code)]
+    inner: sys::OptixAccelRelocationInfo,
+}
+
+/// Struct used for OptiX to communicate the necessary buffer sizes for accel
+/// temp and final outputs.
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct AccelBufferSizes {
+    pub output_size_in_bytes: usize,
+    pub temp_size_in_bytes: usize,
+    pub temp_update_size_in_bytes: usize,
+}
+
+/// Struct used for OptiX to communicate the compacted size or list of bounding
+/// boxes back from an accel build.
+///
+/// # Examples
+/// ```
+/// // Copy the returned size needed for the compacted buffer and allocate
+/// // storage
+/// let mut compacted_size = 0usize;
+/// compacted_size_buffer.copy_to(&mut compacted_size)?;
+///
+/// let mut buf = unsafe { DeviceBuffer::<u8>::uninitialized(compacted_size)? };
+///
+/// // Compact the accel structure.
+/// let hnd = unsafe { accel_compact(ctx, stream, hnd, &mut buf)? };
+/// ```
+pub enum AccelEmitDesc {
+    CompactedSize(DevicePointer<usize>),
+    Aabbs(DevicePointer<Aabb>),
+}
+
+/// An axis-aligned bounding box.
+///
+/// Used to communicate bounds info to and from OptiX for bounding custom primitives
+/// and instances.
+#[repr(C)]
+#[derive(DeviceCopy, Copy, Clone)]
+pub struct Aabb {
+    min: Vector3<f32>,
+    max: Vector3<f32>,
+}
+
+impl Aabb {
+    /// Create a new Aabb by supplying the min and max points.
+    pub fn new<V: Into<Vector3<f32>>>(min: V, max: V) -> Self {
+        let min = min.into();
+        let max = max.into();
+        Self { min, max }
+    }
+}
+
+impl From<&mut AccelEmitDesc> for sys::OptixAccelEmitDesc {
+    fn from(aed: &mut AccelEmitDesc) -> Self {
+        match aed {
+            AccelEmitDesc::CompactedSize(p) => Self {
+                result: p.as_raw(),
+                type_: sys::OptixAccelPropertyType_OPTIX_PROPERTY_TYPE_COMPACTED_SIZE,
+            },
+            AccelEmitDesc::Aabbs(p) => Self {
+                result: p.as_raw(),
+                type_: sys::OptixAccelPropertyType_OPTIX_PROPERTY_TYPE_AABBS,
+            },
+        }
+    }
+}
+
+/// Per-geometry tracing requirements used to allow potential optimizations.
+///
+/// * `GeometryFlags::None` - Applies the default behavior when calling the
+/// any-hit program, possibly multiple times, allowing the acceleration-structure
+/// builder to apply all optimizations.
+/// * `GeometryFlags::DisableAnyHit` - Indicates that traversal should not call
+/// the any-hit program for this primitive even if the corresponding SBT
+/// record contains an any-hit program. Setting this flag usually improves
+/// performance even if no any-hit program is present in the SBT.
+/// * `GeometryFlags::RequireSingleAnyHitCall` - Disables some optimizations
+/// specific to acceleration-structure builders. By default, traversal may call
+/// the any-hit program more than once for each intersected primitive. Setting
+/// the flag ensures that the any-hit program is called only once for a hit with
+/// a primitive. However, setting this flag may change traversal performance.
+/// The usage of this flag may be required for correctness of some rendering
+/// algorithms; for example, in cases where opacity or transparency information
+/// is accumulated in an any-hit program.
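+///
+/// For example, a path tracer that accumulates transparency in an any-hit program
+/// needs the any-hit program to run exactly once per intersected primitive
+/// (a hypothetical sketch; `vertex_buffer` is assumed to exist):
+///
+/// ```
+/// // One entry per SBT record in the build input.
+/// let flags = [GeometryFlags::RequireSingleAnyHitCall];
+/// let build_input = TriangleArray::new(&[&vertex_buffer], &flags);
+/// ```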
+#[repr(u32)]
+#[derive(Copy, Clone, PartialEq, Hash)]
+pub enum GeometryFlags {
+    None = sys::OptixGeometryFlags::None as u32,
+    DisableAnyHit = sys::OptixGeometryFlags::DisableAnyHit as u32,
+    RequireSingleAnyHitCall = sys::OptixGeometryFlags::RequireSingleAnyHitCall as u32,
+}
+
+impl From<GeometryFlags> for sys::OptixGeometryFlags {
+    fn from(f: GeometryFlags) -> Self {
+        match f {
+            GeometryFlags::None => sys::OptixGeometryFlags::None,
+            GeometryFlags::DisableAnyHit => sys::OptixGeometryFlags::DisableAnyHit,
+            GeometryFlags::RequireSingleAnyHitCall => {
+                sys::OptixGeometryFlags::RequireSingleAnyHitCall
+            }
+        }
+    }
+}
+
+impl From<GeometryFlags> for u32 {
+    fn from(f: GeometryFlags) -> Self {
+        match f {
+            GeometryFlags::None => sys::OptixGeometryFlags::None as u32,
+            GeometryFlags::DisableAnyHit => sys::OptixGeometryFlags::DisableAnyHit as u32,
+            GeometryFlags::RequireSingleAnyHitCall => {
+                sys::OptixGeometryFlags::RequireSingleAnyHitCall as u32
+            }
+        }
+    }
+}
+
+/// Specify acceleration structure build input data for a curves geometry.
+///
+/// A curve is a swept surface defined by a 3D spline curve and a varying width
+/// (radius). A curve (or "strand") of degree d (3=cubic, 2=quadratic, 1=linear)
+/// is represented by N > d vertices and N width values, and comprises N - d
+/// segments. Each segment is defined by d+1 consecutive vertices. Each curve
+/// may have a different number of vertices.
+///
+/// OptiX describes the curve array as a list of curve segments. The primitive
+/// id is the segment number. It is the user's responsibility to maintain a
+/// mapping between curves and curve segments. Each index buffer entry
+/// i = indexBuffer[primid] specifies the start of a curve segment, represented
+/// by d+1 consecutive vertices in the vertex buffer, and d+1 consecutive widths
+/// in the width buffer. Width is interpolated the same way vertices are
+/// interpolated, that is, using the curve basis.
+///
+/// Each curves build input has only one SBT record. To create curves with
+/// different materials in the same BVH, use multiple build inputs.
+pub struct CurveArray<'v, 'w, 'i> {
+    curve_type: CurveType,
+    num_primitives: u32,
+    vertex_buffers: PhantomData<&'v f32>,
+    num_vertices: u32,
+    d_vertex_buffers: Vec<CUdeviceptr>,
+    vertex_stride_in_bytes: u32,
+    width_buffers: PhantomData<&'w f32>,
+    num_width_buffers: u32,
+    d_width_buffers: Vec<CUdeviceptr>,
+    width_stride_in_bytes: u32,
+    index_buffer: &'i DeviceSlice<u32>,
+    index_stride_in_bytes: u32,
+    flags: GeometryFlags,
+    primitive_index_offset: u32,
+}
+
+impl<'v, 'w, 'i> Hash for CurveArray<'v, 'w, 'i> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.curve_type.hash(state);
+        state.write_u32(self.num_primitives);
+        state.write_u32(self.num_vertices);
+        state.write_usize(self.d_vertex_buffers.len());
+        state.write_u32(self.vertex_stride_in_bytes);
+        state.write_u32(self.num_width_buffers);
+        state.write_usize(self.d_width_buffers.len());
+        state.write_u32(self.width_stride_in_bytes);
+        state.write_usize(self.index_buffer.len());
+        state.write_u32(self.index_stride_in_bytes);
+        self.flags.hash(state);
+        state.write_u32(self.primitive_index_offset);
+    }
+}
+
+impl<'v, 'w, 'i> CurveArray<'v, 'w, 'i> {
+    /// Constructor
+    ///
+    /// # Parameters
+    /// * `curve_type` - Curve degree and basis
+    /// * `vertex_buffers` - A slice of device buffers, one per motion step.
+    /// The length of this slice must match the number of motion keys specified
+    /// in [`AccelBuildOptions::motion_options`]
+    /// * `width_buffers` - Parallel to `vertex_buffers` with matching lengths and
+    /// number of motion steps.
+    /// One value per vertex specifying the width of the curve.
+    /// * `index_buffer` - An array of u32, one per curve segment. Each index is
+    /// the start of `degree+1` consecutive vertices in `vertex_buffers`, and
+    /// corresponding widths in `width_buffers`. These define a single segment.
+    /// The length of this array is therefore the number of curve segments.
+    pub fn new(
+        curve_type: CurveType,
+        vertex_buffers: &[&'v DeviceSlice<f32>],
+        width_buffers: &[&'w DeviceSlice<f32>],
+        index_buffer: &'i DeviceSlice<u32>,
+    ) -> Result<CurveArray<'v, 'w, 'i>> {
+        // TODO (AL): Do some sanity checking on the values here
+        let num_vertices = vertex_buffers[0].len() as u32;
+        let d_vertex_buffers: Vec<_> = vertex_buffers
+            .iter()
+            .map(|b| b.as_device_ptr().as_raw())
+            .collect();
+
+        let num_width_buffers = width_buffers.len() as u32;
+        let d_width_buffers: Vec<_> = width_buffers
+            .iter()
+            .map(|b| b.as_device_ptr().as_raw())
+            .collect();
+
+        Ok(CurveArray {
+            curve_type,
+            num_primitives: index_buffer.len() as u32,
+            vertex_buffers: PhantomData,
+            num_vertices,
+            d_vertex_buffers,
+            vertex_stride_in_bytes: 0,
+            width_buffers: PhantomData,
+            num_width_buffers,
+            d_width_buffers,
+            width_stride_in_bytes: 0,
+            index_buffer,
+            index_stride_in_bytes: 0,
+            flags: GeometryFlags::None,
+            primitive_index_offset: 0,
+        })
+    }
+
+    /// Stride between vertices. If not specified, vertices are assumed to be
+    /// tightly packed.
+    pub fn vertex_stride(mut self, stride_in_bytes: u32) -> Self {
+        self.vertex_stride_in_bytes = stride_in_bytes;
+        self
+    }
+
+    /// Stride between width values. If not specified, values are assumed to be
+    /// tightly packed.
+    pub fn width_stride(mut self, stride_in_bytes: u32) -> Self {
+        self.width_stride_in_bytes = stride_in_bytes;
+        self
+    }
+
+    /// Stride between indices. If not specified, indices are assumed to be
+    /// tightly packed.
+    pub fn index_stride(mut self, stride_in_bytes: u32) -> Self {
+        self.index_stride_in_bytes = stride_in_bytes;
+        self
+    }
+
+    /// Combination of [`GeometryFlags`] specifying the primitive behaviour.
+    pub fn flags(mut self, flags: GeometryFlags) -> Self {
+        self.flags = flags;
+        self
+    }
+
+    /// Primitive index bias, applied on the device in `optixGetPrimitiveIndex()`.
+    ///
+    /// The sum of `primitive_index_offset` and the number of primitives must not
+    /// overflow 32 bits.
+    pub fn primitive_index_offset(mut self, offset: u32) -> Self {
+        self.primitive_index_offset = offset;
+        self
+    }
+}
+
+impl<'v, 'w, 'i> BuildInput for CurveArray<'v, 'w, 'i> {
+    fn to_sys(&self) -> sys::OptixBuildInput {
+        sys::OptixBuildInput {
+            type_: sys::OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_CURVES,
+            input: sys::OptixBuildInputUnion {
+                curve_array: std::mem::ManuallyDrop::new(sys::OptixBuildInputCurveArray {
+                    curveType: self.curve_type.into(),
+                    numPrimitives: self.num_primitives,
+                    vertexBuffers: self.d_vertex_buffers.as_ptr() as *const CUdeviceptr,
+                    numVertices: self.num_vertices,
+                    vertexStrideInBytes: self.vertex_stride_in_bytes,
+                    widthBuffers: self.d_width_buffers.as_ptr() as *const CUdeviceptr,
+                    widthStrideInBytes: self.width_stride_in_bytes,
+                    normalBuffers: std::ptr::null(),
+                    normalStrideInBytes: 0,
+                    indexBuffer: self.index_buffer.as_device_ptr().as_raw(),
+                    indexStrideInBytes: self.index_stride_in_bytes,
+                    flag: self.flags as u32,
+                    primitiveIndexOffset: self.primitive_index_offset,
+                }),
+            },
+        }
+    }
+}
+
+/// Specifies the type of curves, either linear, quadratic or cubic b-splines.
+#[derive(Debug, Copy, Clone, PartialEq, Hash)]
+pub enum CurveType {
+    RoundLinear,
+    RoundQuadraticBSpline,
+    RoundCubicBSpline,
+}
+
+impl From<CurveType> for sys::OptixPrimitiveType {
+    fn from(c: CurveType) -> Self {
+        match c {
+            CurveType::RoundLinear => sys::OptixPrimitiveType_OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR,
+            CurveType::RoundQuadraticBSpline => {
+                sys::OptixPrimitiveType_OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE
+            }
+            CurveType::RoundCubicBSpline => {
+                sys::OptixPrimitiveType_OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE
+            }
+        }
+    }
+}
+
+/// Specifies the type of vertex data.
+#[repr(u32)]
+#[derive(Copy, Clone, PartialEq)]
+pub enum VertexFormat {
+    None = sys::OptixVertexFormat_OPTIX_VERTEX_FORMAT_NONE as u32,
+    Float3 = sys::OptixVertexFormat_OPTIX_VERTEX_FORMAT_FLOAT3 as u32,
+    Float2 = sys::OptixVertexFormat_OPTIX_VERTEX_FORMAT_FLOAT2 as u32,
+    Half3 = sys::OptixVertexFormat_OPTIX_VERTEX_FORMAT_HALF3 as u32,
+    Half2 = sys::OptixVertexFormat_OPTIX_VERTEX_FORMAT_HALF2 as u32,
+    SNorm16 = sys::OptixVertexFormat_OPTIX_VERTEX_FORMAT_SNORM16_3 as u32,
+    SNorm32 = sys::OptixVertexFormat_OPTIX_VERTEX_FORMAT_SNORM16_2 as u32,
+}
+
+/// Specifies the type of index data.
+#[cfg_attr(windows, repr(i32))]
+#[cfg_attr(unix, repr(u32))]
+#[derive(Copy, Clone, PartialEq)]
+pub enum IndicesFormat {
+    None = sys::OptixIndicesFormat_OPTIX_INDICES_FORMAT_NONE,
+    Short3 = sys::OptixIndicesFormat_OPTIX_INDICES_FORMAT_UNSIGNED_SHORT3,
+    Int3 = sys::OptixIndicesFormat_OPTIX_INDICES_FORMAT_UNSIGNED_INT3,
+}
+
+/// Specifies the format of transform data.
+#[cfg_attr(windows, repr(i32))]
+#[cfg_attr(unix, repr(u32))]
+#[derive(Copy, Clone, PartialEq)]
+pub enum TransformFormat {
+    None = sys::OptixTransformFormat_OPTIX_TRANSFORM_FORMAT_NONE,
+    MatrixFloat12 = sys::OptixTransformFormat_OPTIX_TRANSFORM_FORMAT_MATRIX_FLOAT12,
+}
+
+/// Trait allowing the triangle builds to be generic over the input vertex data.
+///
+/// For instance, if you had a custom vertex type:
+/// ```
+/// struct MyVertex {
+///     x: i16,
+///     y: i16,
+///     z: i16,
+///     nx: f32,
+///     ny: f32,
+///     nz: f32,
+/// }
+///
+/// impl Vertex for MyVertex {
+///     const FORMAT: VertexFormat = VertexFormat::SNorm16;
+///     const STRIDE: u32 = 18;
+/// }
+/// ```
+pub trait Vertex: cust::memory::DeviceCopy {
+    const FORMAT: VertexFormat;
+    const STRIDE: u32 = 0;
+}
+
+#[cfg(feature = "impl_half")]
+impl Vertex for [half::f16; 2] {
+    const FORMAT: VertexFormat = VertexFormat::Half2;
+}
+
+#[cfg(feature = "impl_half")]
+impl Vertex for [half::f16; 3] {
+    const FORMAT: VertexFormat = VertexFormat::Half3;
+}
+
+#[cfg(feature = "impl_half")]
+impl Vertex for mint::Vector2<half::f16> {
+    const FORMAT: VertexFormat = VertexFormat::Half2;
+}
+
+#[cfg(feature = "impl_half")]
+impl Vertex for mint::Vector3<half::f16> {
+    const FORMAT: VertexFormat = VertexFormat::Half3;
+}
+
+impl Vertex for [f32; 2] {
+    const FORMAT: VertexFormat = VertexFormat::Float2;
+}
+
+impl Vertex for [f32; 3] {
+    const FORMAT: VertexFormat = VertexFormat::Float3;
+}
+
+impl Vertex for [i16; 3] {
+    const FORMAT: VertexFormat = VertexFormat::SNorm16;
+}
+
+impl Vertex for [i32; 3] {
+    const FORMAT: VertexFormat = VertexFormat::SNorm32;
+}
+
+impl Vertex for mint::Vector2<f32> {
+    const FORMAT: VertexFormat = VertexFormat::Float2;
+}
+
+impl Vertex for mint::Vector3<f32> {
+    const FORMAT: VertexFormat = VertexFormat::Float3;
+}
+
+impl Vertex for mint::Vector3<i16> {
+    const FORMAT: VertexFormat = VertexFormat::SNorm16;
+}
+
+impl Vertex for mint::Vector3<i32> {
+    const FORMAT: VertexFormat = VertexFormat::SNorm32;
+}
+
+/// Trait allowing build inputs to be generic over the index type.
+pub trait IndexTriple: cust::memory::DeviceCopy {
+    const FORMAT: IndicesFormat;
+    const STRIDE: u32 = 0;
+}
+
+impl IndexTriple for [u16; 3] {
+    const FORMAT: IndicesFormat = IndicesFormat::Short3;
+}
+
+impl IndexTriple for [u32; 3] {
+    const FORMAT: IndicesFormat = IndicesFormat::Int3;
+}
+
+impl IndexTriple for mint::Vector3<u16> {
+    const FORMAT: IndicesFormat = IndicesFormat::Short3;
+}
+
+impl IndexTriple for mint::Vector3<u32> {
+    const FORMAT: IndicesFormat = IndicesFormat::Int3;
+}
+
+/// Build input for specifying a (non-indexed) triangle geometry.
+pub struct TriangleArray<'v, 'g, V: Vertex> {
+    // We hold slices here to make sure the referenced device memory remains
+    // valid for the lifetime of the build input
+    vertex_buffers: PhantomData<&'v V>,
+    num_vertices: u32,
+    d_vertex_buffers: Vec<CUdeviceptr>,
+    // per-sbt-record geometry flags
+    geometry_flags: &'g [GeometryFlags],
+    pre_transform: Option<DevicePointer<[f32; 12]>>,
+}
+
+impl<'v, 'g, V: Vertex> TriangleArray<'v, 'g, V> {
+    pub fn new(vertex_buffers: &[&'v DeviceSlice<V>], geometry_flags: &'g [GeometryFlags]) -> Self {
+        // TODO (AL): do some sanity checking on the slice lengths here
+        let num_vertices = vertex_buffers[0].len() as u32;
+        let d_vertex_buffers: Vec<_> = vertex_buffers
+            .iter()
+            .map(|b| b.as_device_ptr().as_raw())
+            .collect();
+        TriangleArray {
+            vertex_buffers: PhantomData,
+            num_vertices,
+            d_vertex_buffers,
+            geometry_flags,
+            pre_transform: None,
+        }
+    }
+
+    pub fn pre_transform(mut self, pre_transform: DevicePointer<[f32; 12]>) -> Self {
+        self.pre_transform = Some(pre_transform);
+        self
+    }
+}
+
+impl<'v, 'g, V: Vertex> Hash for TriangleArray<'v, 'g, V> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        state.write_u32(self.num_vertices);
+        state.write_usize(self.d_vertex_buffers.len());
+        self.geometry_flags.hash(state);
+    }
+}
+
+impl<'v, 'g, V: Vertex> BuildInput for TriangleArray<'v, 'g, V> {
+    fn to_sys(&self) -> sys::OptixBuildInput {
+        sys::OptixBuildInput {
+            type_: sys::OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_TRIANGLES,
+            input: sys::OptixBuildInputUnion {
+                triangle_array: std::mem::ManuallyDrop::new(sys::OptixBuildInputTriangleArray {
+                    vertexBuffers: self.d_vertex_buffers.as_ptr() as *const u64,
+                    numVertices: self.num_vertices,
+                    vertexFormat: V::FORMAT as _,
+                    vertexStrideInBytes: V::STRIDE,
+                    indexBuffer: 0,
+                    numIndexTriplets: 0,
+                    indexFormat: 0,
+                    indexStrideInBytes: 0,
+                    flags: self.geometry_flags.as_ptr() as *const _,
+                    numSbtRecords: 1,
+                    sbtIndexOffsetBuffer: 0,
+                    sbtIndexOffsetSizeInBytes: 0,
+                    sbtIndexOffsetStrideInBytes: 0,
+                    primitiveIndexOffset: 0,
+                    preTransform: if let Some(t) = self.pre_transform {
+                        t.as_raw()
+                    } else {
+                        0
+                    },
+                    transformFormat: if self.pre_transform.is_some() {
+                        sys::OptixTransformFormat_OPTIX_TRANSFORM_FORMAT_MATRIX_FLOAT12
+                    } else {
+                        sys::OptixTransformFormat_OPTIX_TRANSFORM_FORMAT_NONE
+                    },
+                }),
+            },
+        }
+    }
+}
+
+pub struct IndexedTriangleArray<'v, 'i, V: Vertex, I: IndexTriple> {
+    // We hold slices here to make sure the referenced device memory remains
+    // valid for the lifetime of the build input
+    vertex_buffers: PhantomData<&'v V>,
+    num_vertices: u32,
+    d_vertex_buffers: Vec<CUdeviceptr>,
+    index_buffer: &'i DeviceSlice<I>,
+    // per-object geometry flags
+    geometry_flags: Vec<GeometryFlags>,
+    pre_transform: Option<DevicePointer<[f32; 12]>>,
+}
+
+impl<'v, 'i, V: Vertex, I: IndexTriple> IndexedTriangleArray<'v, 'i, V, I> {
+    pub fn new(
+        vertex_buffers: &[&'v DeviceSlice<V>],
+        index_buffer: &'i DeviceSlice<I>,
+        geometry_flags: &[GeometryFlags],
+    ) -> Self {
+        let num_vertices = vertex_buffers[0].len() as u32;
+        let d_vertex_buffers: Vec<_> = vertex_buffers
+            .iter()
+            .map(|b| b.as_device_ptr().as_raw())
+            .collect();
+        IndexedTriangleArray {
+            vertex_buffers: PhantomData,
+            num_vertices,
+            d_vertex_buffers,
+            geometry_flags: geometry_flags.to_vec(),
+            index_buffer,
+            pre_transform: None,
+        }
+    }
+
+    pub fn pre_transform(mut self, pre_transform: DevicePointer<[f32; 12]>) -> Self {
+        self.pre_transform = Some(pre_transform);
+        self
+    }
+}
+
+impl<'v, 'i, V: Vertex, I: IndexTriple> Hash for IndexedTriangleArray<'v, 'i, V, I> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        state.write_u32(self.num_vertices);
+        state.write_usize(self.d_vertex_buffers.len());
+        self.geometry_flags.hash(state);
+        state.write_usize(self.index_buffer.len());
+    }
+}
+
+impl<'v, 'i, V: Vertex, I: IndexTriple> BuildInput for IndexedTriangleArray<'v, 'i, V, I> {
+    fn to_sys(&self) -> sys::OptixBuildInput {
+        sys::OptixBuildInput {
+            type_: sys::OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_TRIANGLES,
+            input: sys::OptixBuildInputUnion {
+                triangle_array: std::mem::ManuallyDrop::new(sys::OptixBuildInputTriangleArray {
+                    vertexBuffers: self.d_vertex_buffers.as_ptr() as *const u64,
+                    numVertices: self.num_vertices,
+                    vertexFormat: V::FORMAT as _,
+                    vertexStrideInBytes: V::STRIDE,
+                    indexBuffer: self.index_buffer.as_device_ptr().as_raw(),
+                    numIndexTriplets: self.index_buffer.len() as u32,
+                    indexFormat: I::FORMAT as _,
+                    indexStrideInBytes: I::STRIDE,
+                    flags: self.geometry_flags.as_ptr() as *const _,
+                    numSbtRecords: 1,
+                    sbtIndexOffsetBuffer: 0,
+                    sbtIndexOffsetSizeInBytes: 0,
+                    sbtIndexOffsetStrideInBytes: 0,
+                    primitiveIndexOffset: 0,
+                    preTransform: if let Some(t) = self.pre_transform {
+                        t.as_raw()
+                    } else {
+                        0
+                    },
+                    transformFormat: if self.pre_transform.is_some() {
+                        sys::OptixTransformFormat_OPTIX_TRANSFORM_FORMAT_MATRIX_FLOAT12
+                    } else {
+                        sys::OptixTransformFormat_OPTIX_TRANSFORM_FORMAT_NONE
+                    },
+                }),
+            },
+        }
+    }
+}
+
+pub struct CustomPrimitiveArray<'a, 's> {
+    aabb_buffers: Vec<CUdeviceptr>,
+    aabb_buffers_marker: PhantomData<&'a Aabb>,
+    num_primitives: u32,
+    stride_in_bytes: u32,
+    flags: Vec<GeometryFlags>,
+    num_sbt_records: u32,
+    sbt_index_offset_buffer: Option<&'s DeviceSlice<u32>>,
+    sbt_index_offset_stride_in_bytes: u32,
+    primitive_index_offset: u32,
+}
+
+impl<'a, 's> Hash for CustomPrimitiveArray<'a, 's> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        state.write_usize(self.aabb_buffers.len());
+        state.write_u32(self.num_primitives);
+        state.write_u32(self.stride_in_bytes);
+        self.flags.hash(state);
+        state.write_u32(self.num_sbt_records);
+        if let Some(b) = self.sbt_index_offset_buffer {
+            state.write_usize(b.len());
+        } else {
+            state.write_usize(0);
+        }
+        state.write_u32(self.sbt_index_offset_stride_in_bytes);
+        state.write_u32(self.primitive_index_offset);
+    }
+}
+
+impl<'a, 's> CustomPrimitiveArray<'a, 's> {
+    pub fn new(
+        aabb_buffers: &[&'a DeviceSlice<Aabb>],
+        flags: &[GeometryFlags],
+    ) -> Result<CustomPrimitiveArray<'a, 's>> {
+        let num_primitives = aabb_buffers.len() as u32;
+        let aabb_buffers: Vec<_> = aabb_buffers
+            .iter()
+            .map(|b| b.as_device_ptr().as_raw())
+            .collect();
+
+        Ok(CustomPrimitiveArray {
+            aabb_buffers,
+            aabb_buffers_marker: PhantomData,
+            num_primitives,
+            stride_in_bytes: 0,
+            flags: flags.to_vec(),
+            num_sbt_records: 1,
+            sbt_index_offset_buffer: None,
+            sbt_index_offset_stride_in_bytes: 0,
+            primitive_index_offset: 0,
+        })
+    }
+
+    pub fn stride(mut self, stride_in_bytes: u32) -> Self {
+        self.stride_in_bytes = stride_in_bytes;
+        self
+    }
+
+    pub fn primitive_index_offset(mut self, offset: u32) -> Self {
+        self.primitive_index_offset = offset;
+        self
+    }
+
+    pub fn num_sbt_records(mut self, num_sbt_records: u32) -> Self {
+        self.num_sbt_records = num_sbt_records;
+        self
+    }
+
+    pub fn sbt_index_offset_buffer(
+        mut self,
+        sbt_index_offset_buffer: &'s DeviceSlice<u32>,
+    ) -> Self {
+        self.sbt_index_offset_buffer = Some(sbt_index_offset_buffer);
+        self
+    }
+
+    pub fn sbt_index_offset_buffer_stride(mut self, stride_in_bytes: u32) -> Self {
+        self.sbt_index_offset_stride_in_bytes = stride_in_bytes;
+        self
+    }
+}
+
+impl<'a, 's> BuildInput for CustomPrimitiveArray<'a, 's> {
+    fn to_sys(&self) -> sys::OptixBuildInput {
+        sys::OptixBuildInput {
+            type_: sys::OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES,
+            input: sys::OptixBuildInputUnion {
+                custom_primitive_array: std::mem::ManuallyDrop::new(
+                    sys::OptixBuildInputCustomPrimitiveArray {
+                        aabbBuffers: self.aabb_buffers.as_ptr(),
+                        numPrimitives: self.num_primitives,
+                        strideInBytes: self.stride_in_bytes,
+                        flags: self.flags.as_ptr() as *const u32,
+                        numSbtRecords: self.num_sbt_records,
+                        sbtIndexOffsetBuffer: if let Some(sbt_index_offset_buffer) =
+                            self.sbt_index_offset_buffer
+                        {
+                            sbt_index_offset_buffer.as_device_ptr().as_raw()
+                        } else {
+                            0
+                        },
+                        sbtIndexOffsetSizeInBytes: 4,
+                        sbtIndexOffsetStrideInBytes: self.sbt_index_offset_stride_in_bytes,
+                        primitiveIndexOffset: self.primitive_index_offset,
+                    },
+                ),
+            },
+        }
+    }
+}
+
+#[repr(C, align(16))]
+#[derive(Debug, Copy, Clone, DeviceCopy)]
+pub struct Instance<'a> {
+    transform: RowMatrix3x4<f32>,
+    instance_id: u32,
+    sbt_offset: u32,
+    visibility_mask: u32,
+    flags: InstanceFlags,
+    traversable_handle: TraversableHandle,
+    pad: [u32; 2],
+    accel: PhantomData<&'a ()>,
+}
+
+const_assert_eq!(
+    std::mem::align_of::<Instance>(),
+    sys::OptixInstanceByteAlignment
+);
+const_assert_eq!(
+    std::mem::size_of::<Instance>(),
+    std::mem::size_of::<sys::OptixInstance>()
+);
+
+bitflags::bitflags!
+{
+    #[derive(DeviceCopy)]
+    pub struct InstanceFlags: OptixEnumBaseType {
+        const NONE = sys::OptixInstanceFlags_OPTIX_INSTANCE_FLAG_NONE;
+        const DISABLE_TRIANGLE_FACE_CULLING = sys::OptixInstanceFlags_OPTIX_INSTANCE_FLAG_DISABLE_TRIANGLE_FACE_CULLING;
+        const FLIP_TRIANGLE_FACING = sys::OptixInstanceFlags_OPTIX_INSTANCE_FLAG_FLIP_TRIANGLE_FACING;
+        const DISABLE_ANYHIT = sys::OptixInstanceFlags_OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT;
+        const ENFORCE_ANYHIT = sys::OptixInstanceFlags_OPTIX_INSTANCE_FLAG_ENFORCE_ANYHIT;
+        const DISABLE_TRANSFORM = sys::OptixInstanceFlags_OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+    }
+}
+
+impl<'a> Instance<'a> {
+    pub fn new<T: Traversable>(accel: &'a T) -> Instance<'a> {
+        #[allow(clippy::deprecated_cfg_attr)]
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        Instance {
+            transform: [
+                1.0, 0.0, 0.0, 0.0,
+                0.0, 1.0, 0.0, 0.0,
+                0.0, 0.0, 1.0, 0.0].into(),
+            instance_id: 0,
+            sbt_offset: 0,
+            visibility_mask: 255,
+            flags: InstanceFlags::NONE,
+            traversable_handle: accel.handle(),
+            pad: [0; 2],
+            accel: PhantomData,
+        }
+    }
+
+    pub unsafe fn from_handle(traversable_handle: TraversableHandle) -> Instance<'static> {
+        #[allow(clippy::deprecated_cfg_attr)]
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        Instance {
+            transform: [
+                1.0, 0.0, 0.0, 0.0,
+                0.0, 1.0, 0.0, 0.0,
+                0.0, 0.0, 1.0, 0.0].into(),
+            instance_id: 0,
+            sbt_offset: 0,
+            visibility_mask: 255,
+            flags: InstanceFlags::NONE,
+            traversable_handle,
+            pad: [0; 2],
+            accel: PhantomData,
+        }
+    }
+
+    pub fn transform<T: Into<RowMatrix3x4<f32>>>(mut self, transform: T) -> Instance<'a> {
+        self.transform = transform.into();
+        self
+    }
+
+    pub fn instance_id(mut self, instance_id: u32) -> Instance<'a> {
+        self.instance_id = instance_id;
+        self
+    }
+
+    pub fn sbt_offset(mut self, sbt_offset: u32) -> Instance<'a> {
+        self.sbt_offset = sbt_offset;
+        self
+    }
+
+    pub fn visibility_mask(mut self, visibility_mask: u8) -> Instance<'a> {
+        self.visibility_mask = visibility_mask as u32;
+        self
+    }
+
+    pub fn flags(mut self, flags: InstanceFlags) -> Instance<'a> {
+        self.flags = flags;
+        self
+    }
+}
+
+pub struct InstanceArray<'i, 'a> {
+    instances: &'i DeviceSlice<Instance<'a>>,
+}
+
+impl<'i, 'a> InstanceArray<'i, 'a> {
+    pub fn new(instances: &'i DeviceSlice<Instance<'a>>) -> InstanceArray<'i, 'a> {
+        InstanceArray { instances }
+    }
+}
+
+impl<'i, 'a> Hash for InstanceArray<'i, 'a> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        state.write_usize(self.instances.len());
+    }
+}
+
+impl<'i, 'a> BuildInput for InstanceArray<'i, 'a> {
+    fn to_sys(&self) -> sys::OptixBuildInput {
+        cfg_if::cfg_if!
+        {
+            if #[cfg(any(feature="optix72", feature="optix73"))] {
+                sys::OptixBuildInput {
+                    type_: sys::OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_INSTANCES,
+                    input: sys::OptixBuildInputUnion {
+                        instance_array: std::mem::ManuallyDrop::new(sys::OptixBuildInputInstanceArray {
+                            instances: self.instances.as_device_ptr().as_raw(),
+                            numInstances: self.instances.len() as u32,
+                        })
+                    }
+                }
+            } else {
+                sys::OptixBuildInput {
+                    type_: sys::OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_INSTANCES,
+                    input: sys::OptixBuildInputUnion {
+                        instance_array: std::mem::ManuallyDrop::new(sys::OptixBuildInputInstanceArray {
+                            instances: self.instances.as_device_ptr().as_raw(),
+                            numInstances: self.instances.len() as u32,
+                            aabbs: 0,
+                            numAabbs: 0,
+                        })
+                    }
+                }
+            }
+        }
+    }
+}
+
+pub struct InstancePointerArray<'i> {
+    instances: &'i DeviceSlice<CUdeviceptr>,
+}
+
+impl<'i> InstancePointerArray<'i> {
+    pub fn new(instances: &'i DeviceSlice<CUdeviceptr>) -> InstancePointerArray {
+        InstancePointerArray { instances }
+    }
+}
+
+impl<'i> Hash for InstancePointerArray<'i> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        state.write_usize(self.instances.len());
+    }
+}
+
+impl<'i> BuildInput for InstancePointerArray<'i> {
+    fn to_sys(&self) -> sys::OptixBuildInput {
+        cfg_if::cfg_if! {
+            if #[cfg(any(feature="optix72", feature="optix73"))] {
+                sys::OptixBuildInput {
+                    type_: sys::OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_INSTANCE_POINTERS,
+                    input: sys::OptixBuildInputUnion {
+                        instance_array: std::mem::ManuallyDrop::new(sys::OptixBuildInputInstanceArray {
+                            instances: self.instances.as_device_ptr().as_raw(),
+                            numInstances: self.instances.len() as u32,
+                        })
+                    }
+                }
+            } else {
+                sys::OptixBuildInput {
+                    type_: sys::OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_INSTANCE_POINTERS,
+                    input: sys::OptixBuildInputUnion {
+                        instance_array: std::mem::ManuallyDrop::new(sys::OptixBuildInputInstanceArray {
+                            instances: self.instances.as_device_ptr().as_raw(),
+                            numInstances: self.instances.len() as u32,
+                            aabbs: 0,
+                            numAabbs: 0,
+                        })
+                    }
+                }
+            }
+        }
+    }
+}
+
+/// A scene graph node holding a child node with a transform to be applied during
+/// ray traversal.
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct StaticTransformWrapper(sys::OptixStaticTransform);
+
+unsafe impl DeviceCopy for StaticTransformWrapper {}
+
+const_assert_eq!(
+    std::mem::size_of::<StaticTransformWrapper>(),
+    std::mem::size_of::<sys::OptixStaticTransform>(),
+);
+
+/// Stores the device memory and the [`TraversableHandle`] for a [`sys::OptixStaticTransform`].
+pub struct StaticTransform {
+    #[allow(dead_code)]
+    buf: DeviceBox<StaticTransformWrapper>,
+    hnd: TraversableHandle,
+}
+
+impl StaticTransform {
+    /// Create a new StaticTransform by copying the given transform data
+    /// to the device and converting the resulting pointer to an OptiX [`Traversable`].
+    pub fn new<T: Traversable, M: Into<RowMatrix3x4<f32>> + Clone>(
+        ctx: &DeviceContext,
+        child: &T,
+        transform: &M,
+        inv_transform: &M,
+    ) -> Result<StaticTransform> {
+        let transform = (*transform).clone().into();
+        let inv_transform = (*inv_transform).clone().into();
+        let buf = DeviceBox::new(&StaticTransformWrapper(sys::OptixStaticTransform {
+            child: child.handle().inner,
+            transform: transform.into(),
+            invTransform: inv_transform.into(),
+            ..Default::default()
+        }))?;
+        let hnd = unsafe {
+            convert_pointer_to_traversable_handle(
+                ctx,
+                buf.as_device_ptr().as_raw(),
+                TraversableType::StaticTransform,
+            )?
+        };
+
+        Ok(StaticTransform { buf, hnd })
+    }
+
+    /// Create a new StaticTransform from device memory and a pre-converted
+    /// handle.
+    pub unsafe fn from_raw_parts(
+        buf: DeviceBox<StaticTransformWrapper>,
+        hnd: TraversableHandle,
+    ) -> Self {
+        Self { buf, hnd }
+    }
+}
+
+impl Traversable for StaticTransform {
+    fn handle(&self) -> TraversableHandle {
+        self.hnd
+    }
+}
+
+/// A scene graph node holding a child node with a motion transform to be applied
+/// during ray traversal, represented as a series of 3x4 row-major matrices.
+///
+/// Stores the device memory and the [`TraversableHandle`] for a [`sys::OptixMatrixMotionTransform`]
+/// and an arbitrary number of motion keys.
+pub struct MatrixMotionTransform {
+    #[allow(dead_code)]
+    buf: DeviceBuffer<u8>,
+    hnd: TraversableHandle,
+}
+
+impl MatrixMotionTransform {
+    /// Create a new MatrixMotionTransform with the given time range, flags and
+    /// motion keys.
+    ///
+    /// This method handles all memory allocation and copying the data to the
+    /// device.
+    ///
+    /// # Errors
+    /// * [`Error::TooFewMotionKeys`] - If `transforms.len() < 2`
+    /// * [`Error::Optix`] - Any internal OptiX error
+    /// * [`Error::Cuda`] - Any internal CUDA error
+    pub fn new<T: Traversable>(
+        ctx: &DeviceContext,
+        child: &T,
+        time_begin: f32,
+        time_end: f32,
+        flags: MotionFlags,
+        transforms: &[RowMatrix3x4<f32>],
+    ) -> Result<Self> {
+        let num_keys = transforms.len();
+        if num_keys < 2 {
+            return Err(Error::TooFewMotionKeys(num_keys));
+        }
+
+        let mmt = sys::OptixMatrixMotionTransform {
+            child: child.handle().inner,
+            motionOptions: sys::OptixMotionOptions {
+                numKeys: num_keys as u16,
+                timeBegin: time_begin,
+                timeEnd: time_end,
+                flags: flags.bits(),
+            },
+            ..Default::default()
+        };
+
+        let size =
+            size_of::<sys::OptixMatrixMotionTransform>() + size_of::<f32>() * 12 * (num_keys - 2);
+
+        // copy the transform data
+        unsafe {
+            // allocate memory for the transform struct and all the matrices
+            let buf = DeviceBuffer::<u8>::uninitialized(size)?;
+
+            // get the offset of the matrix data from the base of the struct
+            let transform_ptr = buf
+                .as_device_ptr()
+                .add(offset_of!(sys::OptixMatrixMotionTransform, transform));
+
+            // copy the transform data.
+            // Note we're writing data for the struct's transform field here that
+            // we'll just overwrite with the next copy, but it's probably more
+            // efficient to do that than to write each field individually
+            cust::memory::memcpy_htod(
+                buf.as_device_ptr().as_raw(),
+                &mmt as *const _ as *const c_void,
+                size_of::<sys::OptixMatrixMotionTransform>(),
+            )?;
+
+            // copy the matrix data
+            cust::memory::memcpy_htod(
+                transform_ptr.as_raw(),
+                transforms.as_ptr() as *const c_void,
+                std::mem::size_of::<RowMatrix3x4<f32>>() * num_keys,
+            )?;
+
+            let hnd = convert_pointer_to_traversable_handle(
+                ctx,
+                buf.as_device_ptr().as_raw(),
+                TraversableType::MatrixMotionTransform,
+            )?;
+
+            Ok(Self { buf, hnd })
+        }
+    }
+
+    /// Create a new MatrixMotionTransform from device memory and a pre-converted
+    /// handle.
+    pub unsafe fn from_raw_parts(buf: DeviceBuffer<u8>, hnd: TraversableHandle) -> Self {
+        Self { buf, hnd }
+    }
+}
+
+impl Traversable for MatrixMotionTransform {
+    fn handle(&self) -> TraversableHandle {
+        self.hnd
+    }
+}
+
+/// Represents an SRT transformation.
+///
+/// An SRT transformation can represent a smooth rotation with fewer motion keys
+/// than a matrix transformation. Each motion key is constructed from elements
+/// taken from a matrix $S$, a quaternion $R$, and a translation $T$.
+///
+/// The scaling matrix,
+/// $$
+/// S=\begin{bmatrix}
+/// sx & a & b & pvx \cr 0 & sy & c & pvy \cr 0 & 0 & sz & pvz
+/// \end{bmatrix}
+/// $$
+///
+/// defines an affine transformation that can include scale, shear, and a translation.
+/// The translation allows defining the pivot point for the subsequent rotation.
+///
+/// The rotation quaternion $R = [qx, qy, qz, qw]$ describes a rotation with angular
+/// component $qw = \cos(\theta / 2)$ and other components
+/// $[qx, qy, qz] = \sin(\theta / 2) \cdot [ax, ay, az]$ where the axis $[ax, ay, az]$
+/// is normalized.
+///
+/// The translation matrix,
+/// $$
+/// T = \begin{bmatrix} 1 & 0 & 0 & tx \cr 0 & 1 & 0 & ty \cr 0 & 0 & 1 & tz \end{bmatrix}
+/// $$
+/// defines another translation that is applied after the rotation. Typically, this
+/// translation includes the inverse translation from the matrix $S$ to reverse the
+/// translation for the pivot point for $R$.
+///
+/// To obtain the effective transformation at time $t$, the elements of the components
+/// of $S$, $R$, and $T$ will be interpolated linearly. The components are then
+/// multiplied to obtain the combined transformation $C = T \cdot R \cdot S$. The
+/// transformation $C$ is the effective object-to-world transformation at time $t$,
+/// and $C^{-1}$ is the effective world-to-object transformation at time $t$.
+#[repr(transparent)]
+#[derive(Copy, Clone, Debug)]
+pub struct SrtData(sys::OptixSRTData);
+
+unsafe impl DeviceCopy for SrtData {}
+
+impl Deref for SrtData {
+    type Target = sys::OptixSRTData;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+/// A scene graph node holding a child node with a motion transform to be applied
+/// during ray traversal, represented as SRT data.
+///
+/// Stores the device memory and the [`TraversableHandle`] for a [`sys::OptixSRTMotionTransform`]
+/// and an arbitrary number of motion keys.
+pub struct SrtMotionTransform {
+    // TODO(RDambrosio016): ask al what this is for :p
+    #[allow(dead_code)]
+    buf: DeviceBuffer<u8>,
+    hnd: TraversableHandle,
+}
+
+impl SrtMotionTransform {
+    /// Create a new SrtMotionTransform from the given child [`TraversableHandle`],
+    /// time range, flags and [`SrtData`].
+    ///
+    /// This method handles all memory allocation and copying the data to the
+    /// device.
+    ///
+    /// # Errors
+    /// * [`Error::TooFewMotionKeys`] - If `srt_data.len() < 2`
+    /// * [`Error::Optix`] - Any internal OptiX error
+    /// * [`Error::Cuda`] - Any internal CUDA error
+    pub fn new<T: Traversable>(
+        ctx: &DeviceContext,
+        child: &T,
+        time_begin: f32,
+        time_end: f32,
+        flags: MotionFlags,
+        srt_data: &[SrtData],
+    ) -> Result<Self> {
+        let num_keys = srt_data.len();
+        if num_keys < 2 {
+            return Err(Error::TooFewMotionKeys(num_keys));
+        }
+
+        let mmt = sys::OptixSRTMotionTransform {
+            child: child.handle().inner,
+            motionOptions: sys::OptixMotionOptions {
+                numKeys: num_keys as u16,
+                timeBegin: time_begin,
+                timeEnd: time_end,
+                flags: flags.bits(),
+            },
+            ..Default::default()
+        };
+
+        let size = size_of::<sys::OptixSRTMotionTransform>()
+            + size_of::<sys::OptixSRTData>() * (num_keys - 2);
+
+        // copy the transform data
+        unsafe {
+            // allocate memory for the transform struct and all the SRT keys
+            let buf = DeviceBuffer::<u8>::uninitialized(size)?;
+
+            // get the offset of the SRT data from the base of the struct
+            let transform_ptr = buf
+                .as_device_ptr()
+                .add(offset_of!(sys::OptixSRTMotionTransform, srtData));
+
+            // copy the transform data.
+            // Note we're writing data for the struct's srtData field here that
+            // we'll just overwrite with the next copy, but it's probably more
+            // efficient to do that than to write each field individually
+            cust::memory::memcpy_htod(
+                buf.as_device_ptr().as_raw(),
+                &mmt as *const _ as *const c_void,
+                size_of::<sys::OptixSRTMotionTransform>(),
+            )?;
+
+            // copy the SRT key data
+            cust::memory::memcpy_htod(
+                transform_ptr.as_raw(),
+                srt_data.as_ptr() as *const c_void,
+                std::mem::size_of::<SrtData>() * num_keys,
+            )?;
+
+            let hnd = convert_pointer_to_traversable_handle(
+                ctx,
+                buf.as_device_ptr().as_raw(),
+                TraversableType::SrtMotionTransform,
+            )?;
+
+            Ok(Self { buf, hnd })
+        }
+    }
+
+    /// Create a new SrtMotionTransform from device memory and a pre-converted
+    /// handle.
+    pub unsafe fn from_raw_parts(buf: DeviceBuffer<u8>, hnd: TraversableHandle) -> Self {
+        Self { buf, hnd }
+    }
+}
+
+impl Traversable for SrtMotionTransform {
+    fn handle(&self) -> TraversableHandle {
+        self.hnd
+    }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum TraversableType {
+    StaticTransform,
+    MatrixMotionTransform,
+    SrtMotionTransform,
+}
+
+impl From<TraversableType> for sys::OptixTraversableType {
+    fn from(t: TraversableType) -> Self {
+        match t {
+            TraversableType::StaticTransform => {
+                sys::OptixTraversableType_OPTIX_TRAVERSABLE_TYPE_STATIC_TRANSFORM
+            }
+            TraversableType::MatrixMotionTransform => {
+                sys::OptixTraversableType_OPTIX_TRAVERSABLE_TYPE_MATRIX_MOTION_TRANSFORM
+            }
+            TraversableType::SrtMotionTransform => {
+                sys::OptixTraversableType_OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM
+            }
+        }
+    }
+}
+
+/// Convert a device pointer into a [`TraversableHandle`].
+///
+/// OptiX transform traversables are managed by the application. Once you have
+/// created your transform and copied it to the device, use this to get a
+/// [`TraversableHandle`] from it.
+pub unsafe fn convert_pointer_to_traversable_handle(
+    ctx: &DeviceContext,
+    ptr: CUdeviceptr,
+    pointer_type: TraversableType,
+) -> Result<TraversableHandle> {
+    let mut inner = 0;
+    Ok(optix_call!(optixConvertPointerToTraversableHandle(
+        ctx.raw,
+        ptr,
+        pointer_type.into(),
+        &mut inner
+    ))
+    .map(|_| TraversableHandle { inner })?)
+}
diff --git a/crates/optix/src/context.md b/crates/optix/src/context.md
new file mode 100644
index 00000000..01fbdb1d
--- /dev/null
+++ b/crates/optix/src/context.md
@@ -0,0 +1,167 @@
+# OptiX Device Context handling.
+
+# Programming Guide...
+<details>
+<summary>Click here to expand programming guide</summary>
+
+A context is created by [`DeviceContext::new()`] and is used to manage a single
+GPU. The NVIDIA OptiX 7 device context is created by specifying the CUDA
+context associated with the device.
+
+```
+# fn doit() -> Result<(), Box<dyn std::error::Error>> {
+use optix::prelude as ox;
+use cust::prelude as cu;
+
+// Initialize cuda and optix
+cust::init(cu::CudaFlags::empty())?;
+ox::init()?;
+
+// Create a cuda context for the first device
+let device = cu::Device::get_device(0)?;
+let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO |
+cu::ContextFlags::MAP_HOST, device)?;
+
+// Create optix device context
+let ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+
+# Ok(())
+# }
+```
+
+A small set of context properties exists for determining sizes and limits. These
+are queried using [`DeviceContext::get_property()`]. Such properties include
+maximum trace depth, maximum traversable graph depth, maximum primitives per
+build input, and maximum number of instances per acceleration structure.
+
+The context may retain ownership of any GPU resources necessary to launch the
+ray tracing kernels. Some API objects will retain host memory. These are defined
+with create/destroy patterns in the API. The context's `Drop` impl will clean
+up any host or device resources associated with the context. If any other API
+objects associated with this context still exist when the context is destroyed,
+they are also destroyed.
+
+An application may combine any mixture of supported GPUs as long as the data
+transfer and synchronization is handled appropriately. Some applications may
+choose to simplify multi-GPU handling by restricting the variety of these blends,
+for example, by mixing only GPUs of the same streaming multiprocessor version
+to simplify data sharing.
+
+## Logging callbacks
+
+A logging callback closure can be specified using [`DeviceContext::set_log_callback`].
+The closure has the signature `F: FnMut(u32, &str, &str) + 'static`.
+
+The first argument is the log level and indicates the severity of the message:
+
+* 0 - disable: Setting the callback level will disable all messages. The
+callback function will not be called in this case.
+* 1 - fatal: A non-recoverable error. The context and/or OptiX itself might
+no longer be in a usable state.
+* 2 - error: A recoverable error, e.g., when passing invalid call parameters.
+* 3 - warning: Hints that OptiX might not behave exactly as requested by
+the user or may perform slower than expected.
+* 4 - print: Status or progress messages.
+
+Higher levels might occur.
+
+The second argument is a message category description (for example, "SCENE STAT").
+The last argument is the message itself.
+
+## Compilation caching
+
+Compilation of input programs will be cached to disk when creating [`Module`](crate::module::Module),
+[`ProgramGroup`](crate::program_group::ProgramGroup), and
+[`Pipeline`](crate::pipeline::Pipeline) objects if caching has been enabled.
+
+Subsequent compilation can reuse the cached data to improve the time to create
+these objects. The cache can be shared between multiple [`DeviceContext`]
+objects, and NVIDIA OptiX 7 will take care of ensuring correct multi-threaded
+access to the cache. If no sharing between [`DeviceContext`] objects is desired,
+the path to the cache can be set differently for each [`DeviceContext`].
+Caching can be disabled entirely by setting the environment variable
+`OPTIX_CACHE_MAXSIZE` to 0.
+Disabling the cache via the environment variable
+will not affect existing cache files or their contents.
+
+The disk cache can be controlled with:
+
+### [`DeviceContext::set_cache_enabled()`]
+The cache database is initialized when the device context is created and when
+enabled through this function call. If the database cannot be initialized when
+the device context is created, caching will be disabled; a message is reported
+to the log callback if caching is enabled. In this case, the call to
+[`DeviceContext::new()`] does not return an error. To ensure that cache
+initialization succeeded on context creation, the status can be queried using
+[`DeviceContext::get_cache_enabled`]. If caching is disabled, the cache can be
+reconfigured and then enabled using [`DeviceContext::set_cache_enabled`]. If
+the cache database cannot be initialized, an error is returned. Garbage
+collection is performed on the next write to the cache database, not when the
+cache is enabled.
+
+### [`DeviceContext::set_cache_location`]
+The disk cache is created in the directory specified by location. The directory
+is created if it does not exist.
+
+The cache database is created immediately if the cache is currently enabled.
+Otherwise the cache database is created later when the cache is enabled. An
+error is returned if it is not possible to create the cache database file at
+the specified location for any reason (for example, if the path is invalid or
+if the directory is not writable) and caching will be disabled. If the disk
+cache is located on a network file share, behavior is undefined.
+
+The location of the disk cache can be overridden with the environment variable
+`OPTIX_CACHE_PATH`. This environment variable takes precedence over the value
+passed to this function when the disk cache is enabled.
+
+The default location of the cache depends on the operating system:
+* Windows - `%LOCALAPPDATA%\NVIDIA\OptixCache`
+* Linux - `/var/tmp/OptixCache_username`, or `/tmp/OptixCache_username` if the
+first choice is not usable. The underscore and username suffix are omitted if
+the username cannot be obtained.
+
+### [`DeviceContext::set_cache_database_sizes()`]
+Parameters `low` and `high` set the low and high water marks for disk cache
+garbage collection. Setting either limit to zero disables garbage collection.
+Garbage collection only happens when the cache database is written. It is
+triggered whenever the cache data size exceeds the high water mark, and proceeds
+until the size reaches the low water mark. Garbage collection always frees enough
+space to allow the insertion of the new entry within the boundary of the low
+water mark. An error is returned if either limit is nonzero and the high water
+mark is lower than the low water mark. If more than one device context accesses
+the same cache database with different high and low water mark values, the device
+context uses its values when writing to the cache database.
+
+The high water mark can be overridden with the environment variable
+`OPTIX_CACHE_MAXSIZE`. Setting `OPTIX_CACHE_MAXSIZE` to 0 will disable the cache.
+Negative and non-integer values will be ignored.
+
+`OPTIX_CACHE_MAXSIZE` takes precedence over the `high` value passed to this
+function. The low water mark will be set to half the value of
+`OPTIX_CACHE_MAXSIZE`.
+
+Corresponding `get_xxx()` functions are supplied to retrieve the current value
+of these cache properties.
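+
+For example, a renderer might install a log callback and point the cache at its
+own directory before compiling any modules. This is a minimal sketch; the path
+and sizes are arbitrary placeholder values:
+
+```
+# fn doit() -> Result<(), Box<dyn std::error::Error>> {
+# use optix::prelude as ox;
+# use cust::prelude as cu;
+# cust::init(cu::CudaFlags::empty())?;
+# ox::init()?;
+# let device = cu::Device::get_device(0)?;
+# let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO, device)?;
+let mut ctx = ox::DeviceContext::new(&cu_ctx, false)?;
+
+// Send OptiX log messages of level <= 4 to stderr.
+ctx.set_log_callback(|level, tag, msg| eprintln!("[{}] {}: {}", level, tag, msg), 4)?;
+
+// Cache compiled modules in a project-local directory, keeping the database
+// between 64 MiB and 128 MiB.
+ctx.set_cache_location("./optix_cache")?;
+ctx.set_cache_database_sizes(64 * 1024 * 1024, 128 * 1024 * 1024)?;
+ctx.set_cache_enabled(true)?;
+assert!(ctx.get_cache_enabled()?);
+# Ok(())
+# }
+```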
+
+## Validation Mode
+The NVIDIA OptiX 7 validation mode can help uncover errors which might otherwise
+go undetected or which occur only intermittently and are difficult to locate.
+Validation mode enables additional tests and settings during application
+execution. This additional processing can reduce performance, so it should only
+be used during debugging or in the final testing phase of a completed application.
+
+Validation mode can be enabled by passing `true` to the `enable_validation`
+parameter of [`DeviceContext::new()`].
+
+[`OptixError::ValidationFailure`](crate::error::OptixError::ValidationFailure)
+will be signalled if an error is caught when validation mode is enabled.
+[`launch()`](crate::launch) will synchronize after the launch and report errors,
+if any.
+
+Among other effects, validation mode implicitly enables all OptiX debug
+exceptions and provides an exception program if none is provided. The first
+non-user exception caught inside an exception program will therefore be reported
+and the launch terminated immediately. This makes exceptions that might
+otherwise be overlooked more visible.
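+
+Enabling validation mode is a one-line change at context creation time (a
+minimal sketch, mirroring the creation example above):
+
+```
+# fn doit() -> Result<(), Box<dyn std::error::Error>> {
+# use optix::prelude as ox;
+# use cust::prelude as cu;
+# cust::init(cu::CudaFlags::empty())?;
+# ox::init()?;
+# let device = cu::Device::get_device(0)?;
+# let cu_ctx = cu::Context::create_and_push(cu::ContextFlags::SCHED_AUTO, device)?;
+// Pass `true` to enable validation mode while debugging.
+let ctx = ox::DeviceContext::new(&cu_ctx, true)?;
+# Ok(())
+# }
+```
+
+</details>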
+
diff --git a/crates/optix/src/context.rs b/crates/optix/src/context.rs
index a95ff086..0ead60f9 100644
--- a/crates/optix/src/context.rs
+++ b/crates/optix/src/context.rs
@@ -1,15 +1,18 @@
-//! OptiX Device Context handling.
-
-use std::{ffi::c_void, mem::MaybeUninit, ptr};
+use std::os::raw::{c_char, c_uint};
+use std::{
+    ffi::{c_void, CStr, CString},
+    mem::MaybeUninit,
+};
 
 use cust::context::ContextHandle;
 
-use crate::{error::OptixResult, optix_call, sys};
+use crate::{error::Error, optix_call, sys};
+type Result<T> = std::result::Result<T, Error>;
 
 /// A certain property belonging to an OptiX device.
 #[non_exhaustive]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum OptixDeviceProperty {
+pub enum DeviceProperty {
     /// The maximum value that can be given to the OptiX pipeline's max trace depth.
     MaxTraceDepth,
     /// The maximum value that can be given to the OptiX pipeline's stack size method's max traversable
@@ -31,11 +34,11 @@ pub enum OptixDeviceProperty {
     MaxSbtOffset,
 }
 
-impl OptixDeviceProperty {
+impl DeviceProperty {
     // we could repr this the same as the sys version, but for better compatibility
     // and safety in the future, we just match.
-    pub fn to_raw(self) -> sys::OptixDeviceProperty {
-        use OptixDeviceProperty::*;
+    pub fn to_raw(self) -> sys::OptixDeviceProperty::Type {
+        use DeviceProperty::*;
         match self {
             MaxTraceDepth => sys::OptixDeviceProperty::OPTIX_DEVICE_PROPERTY_LIMIT_MAX_TRACE_DEPTH,
             MaxTraversableGraphDepth => sys::OptixDeviceProperty::OPTIX_DEVICE_PROPERTY_LIMIT_MAX_TRAVERSABLE_GRAPH_DEPTH,
@@ -51,11 +54,12 @@ impl OptixDeviceProperty {
 }
 
 #[derive(Debug)]
-pub struct OptixContext {
+#[repr(transparent)]
+pub struct DeviceContext {
     pub(crate) raw: sys::OptixDeviceContext,
 }
 
-impl Drop for OptixContext {
+impl Drop for DeviceContext {
     fn drop(&mut self) {
         unsafe {
             sys::optixDeviceContextDestroy(self.raw);
@@ -63,16 +67,28 @@ impl OptixContext {
     }
 }
 
-impl OptixContext {
+impl DeviceContext {
     // TODO(RDambrosio016): expose device context options
 
-    /// Creates a new [`OptixContext`] from a cust CUDA context.
-    pub fn new(cuda_ctx: &impl ContextHandle) -> OptixResult<Self> {
+    /// Creates a new [`DeviceContext`] from a cust CUDA context.
+    ///
+    /// If `enable_validation` is `true`, then additional tests and settings are
+    /// enabled during application execution. This additional processing can reduce
+    /// performance, so it should only be used during debugging or in the final
+    /// testing phase of a completed application.
+    pub fn new(cuda_ctx: &impl ContextHandle, enable_validation: bool) -> Result<Self> {
         let mut raw = MaybeUninit::uninit();
+
+        let mut opt = sys::OptixDeviceContextOptions::default();
+        if enable_validation {
+            opt.validationMode =
+                sys::OptixDeviceContextValidationMode_OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
+        }
+
        unsafe {
             optix_call!(optixDeviceContextCreate(
                 cuda_ctx.get_inner(),
-                ptr::null(),
+                &opt,
                 raw.as_mut_ptr()
             ))?;
             Ok(Self {
@@ -81,7 +97,47 @@ impl OptixContext {
         }
     }
 
-    pub fn get_property(&self, property: OptixDeviceProperty) -> OptixResult<u32> {
+    /// Returns the low and high water marks, respectively, for disk cache garbage collection.
+    /// If the cache has been disabled by setting the environment variable
+    /// OPTIX_CACHE_MAXSIZE=0, this function will return 0 for the low and high water marks.
+    pub fn get_cache_database_sizes(&self) -> Result<(usize, usize)> {
+        let mut low = 0;
+        let mut high = 0;
+        unsafe {
+            Ok(optix_call!(optixDeviceContextGetCacheDatabaseSizes(
+                self.raw, &mut low, &mut high,
+            ))
+            .map(|_| (low as usize, high as usize))?)
+        }
+    }
+
+    /// Indicates whether the disk cache is enabled.
+    pub fn get_cache_enabled(&self) -> Result<bool> {
+        let mut result = 0;
+        unsafe {
+            Ok(
+                optix_call!(optixDeviceContextGetCacheEnabled(self.raw, &mut result,))
+                    .map(|_| result != 0)?,
+            )
+        }
+    }
+
+    /// Returns the location of the disk cache. If the cache has been disabled
+    /// by setting the environment variable OPTIX_CACHE_MAXSIZE=0, this function
+    /// will return an empty string.
+    pub fn get_cache_location(&self) -> Result<String> {
+        let mut buf = [0i8; 1024];
+        unsafe {
+            Ok(optix_call!(optixDeviceContextGetCacheLocation(
+                self.raw,
+                buf.as_mut_ptr(),
+                buf.len(),
+            ))
+            .map(|_| CStr::from_ptr(buf.as_ptr()).to_string_lossy().to_string())?)
+        }
+    }
+
+    /// Query properties of this context.
+    pub fn get_property(&self, property: DeviceProperty) -> Result<u32> {
         let raw_prop = property.to_raw();
         unsafe {
             let mut value = 0u32;
@@ -95,7 +151,158 @@ impl OptixContext {
     }
 
+    /// Sets the low and high water marks for disk cache garbage collection.
+    ///
+    /// Garbage collection is triggered when a new entry is written to the cache
+    /// and the current cache data size plus the size of the cache entry that is
+    /// about to be inserted exceeds the high water mark. Garbage collection proceeds
+    /// until the size reaches the low water mark. Garbage collection will always
+    /// free enough space to insert the new entry without exceeding the low water
+    /// mark. Setting either limit to zero will disable garbage collection. An
+    /// error will be returned if both limits are non-zero and the high water mark
+    /// is smaller than the low water mark.
+    ///
+    /// Note that garbage collection is performed only on writes to the disk cache.
+    /// No garbage collection is triggered on disk cache initialization or immediately
+    /// when calling this function, but on subsequent inserting of data into the
+    /// database.
+    ///
+    /// If the size of a compiled module exceeds the value configured for the high
+    /// water mark and garbage collection is enabled, the module will not be added
+    /// to the cache and a warning will be added to the log.
+    ///
+    /// The high water mark can be overridden with the environment variable
+    /// OPTIX_CACHE_MAXSIZE. The environment variable takes precedence over the
+    /// function parameters. The low water mark will be set to half the value of
+    /// OPTIX_CACHE_MAXSIZE. Setting OPTIX_CACHE_MAXSIZE to 0 will disable the
+    /// disk cache, but will not alter the contents of the cache. Negative and
+    /// non-integer values will be ignored.
+    pub fn set_cache_database_sizes(&mut self, low: usize, high: usize) -> Result<()> {
+        unsafe {
+            Ok(optix_call!(optixDeviceContextSetCacheDatabaseSizes(
+                self.raw, low, high,
+            ))?)
+        }
+    }
+
+    /// Enables or disables the disk cache.
+    ///
+    /// If caching was previously disabled, enabling it will attempt to initialize
+    /// the disk cache database using the currently configured cache location.
+    /// An error will be returned if initialization fails.
+    ///
+    /// Note that no in-memory cache is used, so no caching behavior will be observed
+    /// if the disk cache is disabled.
+    ///
+    /// The cache can be disabled by setting the environment variable
+    /// OPTIX_CACHE_MAXSIZE=0.
+    /// The environment variable takes precedence over this setting. See
+    /// optixDeviceContextSetCacheDatabaseSizes for additional information.
+    ///
+    /// Note that the disk cache can be disabled by the environment variable, but
+    /// it cannot be enabled via the environment if it is disabled via the API.
+    pub fn set_cache_enabled(&mut self, enable: bool) -> Result<()> {
+        unsafe {
+            Ok(optix_call!(optixDeviceContextSetCacheEnabled(
+                self.raw,
+                if enable { 1 } else { 0 }
+            ))?)
+        }
+    }
+
+    /// Sets the location of the disk cache.
+    ///
+    /// The location is specified by a directory. This directory should not be
+    /// used for other purposes and will be created if it does not exist. An
+    /// error will be returned if it is not possible to create the disk cache
+    /// at the specified location for any reason (e.g., the path is invalid or
+    /// the directory is not writable). Caching will be disabled if the disk
+    /// cache cannot be initialized in the new location. If caching is disabled,
+    /// no error will be returned until caching is enabled. If the disk cache
+    /// is located on a network file share, behavior is undefined.
+    ///
+    /// The location of the disk cache can be overridden with the environment
+    /// variable OPTIX_CACHE_PATH. The environment variable takes precedence
+    /// over this setting.
+    ///
+    /// The default location depends on the operating system:
+    ///
+    /// * Windows: `%LOCALAPPDATA%\NVIDIA\OptixCache`
+    /// * Linux: `/var/tmp/OptixCache_<username>` (or `/tmp/OptixCache_<username>`
+    /// if the first choice is not usable), the underscore and username suffix
+    /// are omitted if the username cannot be obtained
+    /// * MacOS X: `/Library/Application Support/NVIDIA/OptixCache`
+    pub fn set_cache_location(&mut self, location: &str) -> Result<()> {
+        let location = CString::new(location).map_err(|_| Error::NulBytesInString)?;
+        unsafe {
+            Ok(optix_call!(optixDeviceContextSetCacheLocation(
+                self.raw,
+                location.as_ptr()
+            ))?)
+        }
+    }
+
+    /// Sets the current log callback method.
+    ///
+    /// The following log levels are defined:
+    /// * 0 - disable: Setting the callback level will disable all messages. The
+    /// callback function will not be called in this case.
+    /// * 1 - fatal: A non-recoverable error. The context and/or OptiX itself
+    /// might no longer be in a usable state.
+    /// * 2 - error: A recoverable error, e.g., when passing invalid call
+    /// parameters.
+    /// * 3 - warning: Hints that OptiX might not behave exactly as requested by
+    /// the user or may perform slower than expected.
+    /// * 4 - print: Status or progress messages.
+    ///
+    /// Higher levels might occur.
+    pub fn set_log_callback<F>(&mut self, cb: F, level: u32) -> Result<()>
+    where
+        F: FnMut(u32, &str, &str) + 'static,
+    {
+        let (closure, trampoline) = unsafe { unpack_closure(cb) };
+        unsafe {
+            Ok(optix_call!(optixDeviceContextSetLogCallback(
+                self.raw,
+                Some(trampoline),
+                closure,
+                level
+            ))?)
+        }
+    }
+
+    /// Get the FFI context representation.
+    pub fn as_raw(&self) -> sys::OptixDeviceContext {
+        self.raw
+    }
+}
+
+type LogCallback = extern "C" fn(c_uint, *const c_char, *const c_char, *mut c_void);
+
+/// Unpack a Rust closure, extracting a `void*` pointer to the data and a
+/// trampoline function which can be used to invoke it.
+///
+/// # Safety
+///
+/// It is the user's responsibility to ensure the closure outlives the returned
+/// `void*` pointer.
+///
+/// Calling the trampoline function with anything except the `void*` pointer
+/// will result in *Undefined Behaviour*.
+unsafe fn unpack_closure<F>(closure: F) -> (*mut c_void, LogCallback)
+where
+    F: FnMut(u32, &str, &str),
+{
+    extern "C" fn trampoline<F>(
+        level: c_uint,
+        tag: *const c_char,
+        msg: *const c_char,
+        data: *mut c_void,
+    ) where
+        F: FnMut(u32, &str, &str),
+    {
+        if let Err(e) = std::panic::catch_unwind(|| {
+            let tag = unsafe { CStr::from_ptr(tag).to_string_lossy().into_owned() };
+            let msg = unsafe { CStr::from_ptr(msg).to_string_lossy().into_owned() };
+            let closure: &mut F = unsafe { &mut *(data as *mut F) };
+
+            (*closure)(level, &tag, &msg);
+        }) {
+            eprintln!("Caught a panic calling log closure: {:?}", e);
+        }
+    }
+
+    let cb = Box::new(closure);
+    let cb = Box::leak(cb);
+
+    (cb as *mut F as *mut c_void, trampoline::<F>)
+}
diff --git a/crates/optix/src/denoiser.md b/crates/optix/src/denoiser.md
new file mode 100644
index 00000000..46124e7b
--- /dev/null
+++ b/crates/optix/src/denoiser.md
@@ -0,0 +1,14 @@
+# NVIDIA AI Denoiser
+
+Image areas that have not yet fully converged during rendering will often exhibit pixel-scale noise due to the insufficient amount of information gathered by the renderer. This grainy appearance in an image may be caused by low iteration counts, especially in scenes with complex lighting environments and material calculations.
+
+The NVIDIA AI Denoiser can estimate the converged image from a partially converged image. Instead of improving image quality through a larger number of path tracing iterations, the denoiser can produce images of acceptable quality with far fewer iterations by post-processing the image.
+
+The denoiser is based on statistical data sets that guide the denoising process. These data, represented by a binary blob called a training model, are produced from a large number of rendered images in different stages of convergence. The images are used as input to an underlying deep learning system. (See the NVIDIA Developer article “Deep Learning” for more information about deep-learning systems.)
+
+Because deep-learning training needs significant computational resources—even obtaining a sufficient number of partially converged images can be difficult—a general-purpose model is included with the OptiX software. This model is suitable for many renderers. However, the model may not yield optimal results when applied to images produced by renderers with very different noise characteristics compared to those used in the original training data.
+
+Post-processing rendered images includes image filters, such as blurring or sharpening, or reconstruction filters, such as box, triangle, or Gaussian filters. Custom post-processing performed on a noisy image can lead to unsatisfactory denoising results. During post-processing, the original high-frequency, per-pixel noise may become smeared across multiple pixels, making it more difficult to detect and be handled by the model. Therefore, post-processing operations should be done after the denoising process, while reconstruction filters should be implemented by using filter importance-sampling.
+
+In general, the pixel color space of an image that is used as input for the denoiser should match the color space of the images on which the denoiser was trained. However, slight variations, such as substituting sRGB with a simple gamma curve, should not have a noticeable impact. Images used for the training model included with the NVIDIA AI Denoiser distribution were output directly as HDR data.
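+
+As a rough sketch of the API in this crate, a denoiser is created from a device
+context, its state is set up for a given image size, and it is then invoked on
+an input image. The snippet below is illustrative only: `ctx`, `stream`,
+`noisy_buffer`, and `out_buffer` are assumed to already exist, and the image
+format and parameter values are hypothetical.
+
+```
+// Create an HDR denoiser with default options.
+let mut denoiser = Denoiser::new(&ctx, DenoiserModelKind::Hdr, DenoiserOptions::default())?;
+
+// Allocate internal state and scratch memory for 1920x1080 images, no tiling.
+denoiser.setup_state(&stream, 1920, 1080, false)?;
+
+// Denoise the noisy beauty image into `out_buffer`.
+let input = Image::new(&noisy_buffer, ImageFormat::Float4, 1920, 1080);
+denoiser.invoke(&stream, input, DenoiserParams::default(), &mut out_buffer)?;
+```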
+
diff --git a/crates/optix/src/denoiser.rs b/crates/optix/src/denoiser.rs
index 8611f70d..4d60281c 100644
--- a/crates/optix/src/denoiser.rs
+++ b/crates/optix/src/denoiser.rs
@@ -8,13 +8,12 @@ use std::{
 
 use cust::{
     error::CudaResult,
-    memory::{
-        DeviceBox, DeviceBuffer, DeviceCopy, DevicePointer, GpuBox, GpuBuffer, UnifiedBuffer,
-    },
+    memory::{DeviceBox, DeviceBuffer, DeviceCopy, DevicePointer, GpuBuffer, UnifiedBuffer},
     prelude::Stream,
 };
 
-use crate::{context::OptixContext, error::OptixResult, optix_call, sys};
+use crate::{context::DeviceContext, error::Error, optix_call, sys};
+type Result<T, E = Error> = std::result::Result<T, E>;
 
 // can't zero initialize, OptixPixelFormat is not zero-initializable.
 fn null_optix_image() -> sys::OptixImage2D {
@@ -44,7 +43,7 @@ pub enum DenoiserModelKind {
 
 impl DenoiserModelKind {
     /// Converts this model kind to its raw counterpart.
-    pub fn to_raw(self) -> sys::OptixDenoiserModelKind {
+    pub fn to_raw(self) -> sys::OptixDenoiserModelKind::Type {
         match self {
             Self::Ldr => sys::OptixDenoiserModelKind::OPTIX_DENOISER_MODEL_KIND_LDR,
             Self::Hdr => sys::OptixDenoiserModelKind::OPTIX_DENOISER_MODEL_KIND_HDR,
@@ -131,10 +130,10 @@ impl Drop for Denoiser {
 impl Denoiser {
     /// Create a new [`Denoiser`] with a model kind and some options.
     pub fn new(
-        ctx: &OptixContext,
+        ctx: &DeviceContext,
         kind: DenoiserModelKind,
         options: DenoiserOptions,
-    ) -> OptixResult<Denoiser> {
+    ) -> Result<Denoiser> {
         let mut raw = MaybeUninit::uninit();
         unsafe {
             let ctx = ctx.raw;
@@ -159,7 +158,7 @@ impl Denoiser {
     ///
     /// If tiling is being used, `width` and `height` should not contain the overlap size. Tiling requires
     /// extra overlap areas which is why there is scratch memory with and without tiling requirements.
-    pub fn required_gpu_memory(&self, width: u32, height: u32) -> OptixResult<DenoiserSizes> {
+    pub fn required_gpu_memory(&self, width: u32, height: u32) -> Result<DenoiserSizes> {
         let mut sizes = MaybeUninit::uninit();
         unsafe {
             optix_call!(optixDenoiserComputeMemoryResources(
@@ -192,7 +191,7 @@ impl Denoiser {
         mut width: u32,
         mut height: u32,
         tiled: bool,
-    ) -> OptixResult<()> {
+    ) -> Result<()> {
         // first, find out how much memory we need to allocate
         let sizes = self.required_gpu_memory(width, height)?;
         let original_width = width;
@@ -264,7 +263,7 @@ impl Denoiser {
         input_image: Image,
         parameters: DenoiserParams,
         out_buffer: &mut impl GpuBuffer<u8>,
-    ) -> OptixResult<()> {
+    ) -> Result<()> {
         let state_lock = self.state.lock().unwrap();
         let state = state_lock.as_ref().expect(
             "State was not initialized before invoking the denoiser, call Denoiser::setup_state first"
@@ -374,7 +373,7 @@ impl Denoiser {
         let raw_params = parameters.to_raw();
 
         let mut out = input_image.to_raw();
-        out.data = out_buffer.as_device_ptr().as_raw_mut() as u64;
+        out.data = out_buffer.as_device_ptr().as_raw() as u64;
 
         let layer = sys::OptixDenoiserLayer {
             input: input_image.to_raw(),
@@ -389,14 +388,14 @@ impl Denoiser {
                 self.raw,
                 stream.as_inner(),
                 &raw_params as *const _,
-                state.state.as_device_ptr().as_raw_mut() as u64,
+                state.state.as_device_ptr().as_raw() as u64,
                 state.state.len(),
                 &cloned as *const _,
                 &layer as *const _,
                 1, // num-layers
                 0, // offsetX
                 0, // offsetY
-                state.scratch.as_device_ptr().as_raw_mut() as u64,
+                state.scratch.as_device_ptr().as_raw() as u64,
                 state.scratch.len()
             ))?;
         }
@@ -501,7 +500,7 @@ pub enum ImageFormat {
 }
 
 impl ImageFormat {
-    pub fn to_raw(self) -> sys::OptixPixelFormat {
+    pub fn to_raw(self) -> sys::OptixPixelFormat::Type {
         use ImageFormat::*;
 
         match self {
@@ -560,11 +559,10 @@ impl<'a> Image<'a> {
         Self::validate_buf(buffer, format, width, height);
         Self {
-            buffer: unsafe {
+            buffer:
                 // SAFETY: this buffer is never written to for the duration of this image being alive.
                 // And we know the buffer is large enough to be reinterpreted as a buffer of bytes.
-                DevicePointer::wrap(buffer.as_device_ptr().as_raw_mut() as *mut u8)
-            },
+                DevicePointer::from_raw(buffer.as_device_ptr().as_raw()),
             buffer_size: buffer.len() * std::mem::size_of::<T>(),
             format,
             width,
diff --git a/crates/optix/src/error.rs b/crates/optix/src/error.rs
index 80165baf..96121a33 100644
--- a/crates/optix/src/error.rs
+++ b/crates/optix/src/error.rs
@@ -131,14 +131,14 @@ impl Display for OptixError {
 
 impl std::error::Error for OptixError {}
 
-pub type OptixResult<T> = Result<T, OptixError>;
+// pub type OptixResult<T> = Result<T, OptixError>;
 
 pub trait ToResult {
-    fn to_result(self) -> OptixResult<()>;
+    fn to_result(self) -> Result<(), OptixError>;
 }
 
 impl ToResult for sys::OptixResult {
-    fn to_result(self) -> OptixResult<()> {
+    fn to_result(self) -> Result<(), OptixError> {
         use OptixError::*;
 
         Err(match self {
@@ -183,6 +183,63 @@ impl ToResult for sys::OptixResult {
             sys::OptixResult::OPTIX_ERROR_CUDA_ERROR => CudaError,
             sys::OptixResult::OPTIX_ERROR_INTERNAL_ERROR => InternalError,
             sys::OptixResult::OPTIX_ERROR_UNKNOWN => Unknown,
+            value => panic!("Unhandled OptixResult value {:?}", value),
         })
     }
 }
+
+#[derive(Debug)]
+pub enum Error {
+    Optix(OptixError),
+    Cuda(CudaError),
+    ModuleCreation { source: OptixError, log: String },
+    ProgramGroupCreation { source: OptixError, log: String },
+    PipelineCreation { source: OptixError, log: String },
+    AccelUpdateMismatch,
+    NulBytesInString,
+    TooFewMotionKeys(usize),
+}
+
+impl From<OptixError> for Error {
+    fn from(o: OptixError) -> Self {
+        Self::Optix(o)
+    }
+}
+
+impl From<CudaError> for Error {
+    fn from(e: CudaError) -> Self {
+        Self::Cuda(e)
+    }
+}
+
+impl std::error::Error for Error {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            Self::Optix(e) => Some(e),
+            Self::Cuda(e) => Some(e),
+            Self::ModuleCreation { source, .. } => Some(source),
+            Self::ProgramGroupCreation { source, .. } => Some(source),
+            Self::PipelineCreation { source, .. } => Some(source),
+            Self::AccelUpdateMismatch => None,
+            Self::NulBytesInString => None,
+            Self::TooFewMotionKeys(_) => None,
+        }
+    }
+}
+
+impl Display for Error {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Optix(_) => write!(f, "OptiX error"),
+            Self::Cuda(_) => write!(f, "CUDA error"),
+            Self::ModuleCreation { log, .. } => write!(f, "Module creation error: {}", log),
+            Self::ProgramGroupCreation { log, .. } => {
+                write!(f, "Program group creation error: {}", log)
+            }
+            Self::PipelineCreation { log, .. } => write!(f, "Pipeline creation error: {}", log),
+            Self::AccelUpdateMismatch => write!(f, "Build inputs passed to DynamicAccel::update do not match the structure of those used to build the accel"),
+            Self::NulBytesInString => write!(f, "The provided string contained nul bytes"),
+            Self::TooFewMotionKeys(num) => write!(f, "Provided too few motion keys ({}) for transform. Must provide at least 2", num),
+        }
+    }
+}
diff --git a/crates/optix/src/impl_glam.rs b/crates/optix/src/impl_glam.rs
new file mode 100644
index 00000000..d6e5d683
--- /dev/null
+++ b/crates/optix/src/impl_glam.rs
@@ -0,0 +1,9 @@
+use crate::acceleration::{IndexTriple, IndicesFormat, Vertex, VertexFormat};
+
+impl Vertex for glam::Vec3 {
+    const FORMAT: VertexFormat = VertexFormat::Float3;
+}
+
+impl IndexTriple for glam::IVec3 {
+    const FORMAT: IndicesFormat = IndicesFormat::Int3;
+}
diff --git a/crates/optix/src/introduction.md b/crates/optix/src/introduction.md
new file mode 100644
index 00000000..8f3bff92
--- /dev/null
+++ b/crates/optix/src/introduction.md
@@ -0,0 +1,310 @@
+# Overview of OptiX
+
+NVIDIA OptiX 7 is intended for ray tracing applications that use NVIDIA® CUDA®
+technology, such as:
+
+* Film and television visual effects
+* Computer-aided design for engineering and manufacturing
+* Light maps generated by path tracing
+* High-performance computing
+* LIDAR simulation
+
+NVIDIA OptiX 7 also includes support for motion blur and multi-level transforms,
+features required by ray-tracing applications designed for production-quality
+rendering.
+
+## Terms used in this documentation
+
+OptiX uses a shorthand to describe some common program components and data
+structures that are worth memorizing, as they crop up a lot.
+
+### Program Types
+* **`RG`** - Ray generation - This is the entry point into the OptiX programming
+  model and is generally responsible for creating and tracing rays.
+* **`IS`** - Intersection - Run to provide intersections with custom,
+  user-defined primitives (as opposed to built-in triangles).
+* **`AH`** - Any-hit - Run during ray traversal for each potential intersection.
+  Reports to OptiX whether the intersection should be considered valid and
+  whether to stop traversal.
+* **`CH`** - Closest-hit - Run only for the closest hit found during ray
+  traversal. Can inspect and interpolate properties of the intersected
+  primitive.
+* **`MS`** - Miss - Run whenever a ray exits the scene without hitting anything.
+* **`EX`** - Exception - Run whenever an exception condition is found.
+* **`DC`** - Direct callable - Can be called manually from another program.
+  May not itself continue ray traversal (i.e. may not call `optixTrace`).
+* **`CC`** - Continuation callable - Can be called manually from another
+  program and may continue ray traversal.
+
+### Acceleration structures
+* geometry-AS/GAS/BLAS - Geometry/Bottom-level acceleration structure. An
+  acceleration structure built over geometric primitives such as triangles or
+  curves
+* instance-AS/IAS/TLAS - Instance/Top-level acceleration structure built
+  over other acceleration structures and/or transform nodes in order to
+  compose more complex scenes and implement instancing and rigid
+  transformations.
+
+In this document and in the names of API elements, the “host” is the processor
+that begins execution of an application. The “device” is the GPU with which
+the host interacts. A “build” is the creation of an acceleration structure on
+the device as initiated by the host.
+
+## Overview
+The NVIDIA OptiX 7 API is a CUDA-centric API that is invoked by a CUDA-based
+application. The API is designed to be stateless, multi-threaded and
+asynchronous, providing explicit control over performance-sensitive operations
+like memory management and shader compilation.
+
+It supports a lightweight scene representation that can express instancing,
+vertex- and transform-based motion blur, built-in triangles, built-in swept
+curves, and user-defined primitives. The API also includes highly-tuned kernels
+and neural networks for machine-learning-based denoising.
+
+An NVIDIA OptiX 7 context controls a single GPU. The context does not hold bulk
+CPU allocations, but like CUDA, may allocate resources on the device necessary
+to invoke the launch. It can hold a small number of handle objects that are used
+to manage expensive host-based state. These handle objects are automatically
+released when the context is destroyed. Handle objects, where they do exist,
+consume a small amount of host memory (typically less than 100 kilobytes) and
+are independent of the size of the GPU resources being used. For exceptions to
+this rule, see “Program pipeline creation”.
+
+The application invokes the creation of acceleration structures (called builds),
+compilation, and host-device memory transfers. All API functions employ CUDA
+streams and invoke GPU functions asynchronously, where applicable. If more than
+one stream is used, the application must ensure that required dependencies are
+satisfied by using CUDA events to avoid race conditions on the GPU.
+
+Applications can specify multi-GPU capabilities with a few different recipes.
+Multi-GPU features such as efficient load balancing or the sharing of GPU memory
+via NVLINK must be handled by the application developer.
+
+For efficiency and coherence, the NVIDIA OptiX 7 runtime—unlike CUDA kernels—
+allows the execution of one task, such as a single ray, to be moved at any point
+in time to a different lane, warp or streaming multiprocessor (SM).
+(See section “Kernel Focus” in the CUDA Toolkit Documentation.) Consequently,
+applications cannot use shared memory, synchronization, barriers, or other
+SM-thread-specific programming constructs in their programs supplied to OptiX.
+
+The NVIDIA OptiX 7 programming model provides an API that future-proofs
+applications: as new NVIDIA hardware features are released, existing programs
+can use them. For example, software-based ray tracing algorithms can be mapped
+to hardware when support is added or mapped to software when the underlying
+algorithms or hardware support such changes.
+
+## Basic concepts and definitions
+
+### Program
+In NVIDIA OptiX 7, a program is a block of executable code on the GPU that
+represents a particular shading operation. This is called a shader in DXR and
+Vulkan. For consistency with prior versions of NVIDIA OptiX, the term program
+is used in the current documentation. This term also serves as a reminder that
+these blocks of executable code are programmable components in the system that
+can do more than shading. See “Program input”.
+
+## Program and Data Model
+NVIDIA OptiX 7 implements a single-ray programming model with ray generation,
+any-hit, closest-hit, miss and intersection programs.
+
+The ray tracing pipeline provided by NVIDIA OptiX 7 is implemented by eight types
+of programs:
+
+### Ray generation (RG)
+The entry point into the ray tracing pipeline, invoked by the system in parallel
+for each pixel, sample, or other user-defined work assignment. See
+“Ray generation launches”.
+
+### Intersection (IS)
+Implements a ray-primitive intersection test, invoked during traversal. See
+“Traversing the scene graph” and “Ray information”.
+
+### Any-hit (AH)
+Called when a traced ray finds a new, potentially closest, intersection point,
+such as for shadow computation. See “Ray information”.
+
+### Closest-hit (CH)
+Called when a traced ray finds the closest intersection point, such as for
+material shading. See “Constructing a path tracer”.
+
+### Miss (MS)
+Called when a traced ray misses all scene geometry. See “Constructing a path
+tracer”.
+
+### Exception (EX)
+Exception handler, invoked for conditions such as stack overflow and other errors.
+See “Exceptions”.
+
+### Direct callables (DC)
+Similar to a regular CUDA function call, direct callables are called immediately.
+See “Callables”.
+
+### Continuation callables (CC)
+Unlike direct callables, continuation callables are executed by the scheduler.
+See “Callables”.
+
+The ray-tracing “pipeline” is based on the interconnected calling structure of
+the eight programs and their relationship to the search through the geometric
+data in the scene, called a traversal. Figure 2.1 is a diagram of these
+relationships:
+
+![Figure 2.1 - OptiX Programs][optix_programs]
+
+### Shader Binding Table
+The shader binding table connects geometric data to programs and their
+parameters. A record is a component of the shader binding table that is selected
+during execution by using offsets specified when acceleration structures are
+created and at runtime. A record contains two data regions, header and data.
+SBT record packing is handled automatically by using the
+[`SbtRecord`](shader_binding_table::SbtRecord) generic struct:
+
+```no_run
+use cust::prelude as cu;
+use optix::prelude as ox;
+
+#[derive(Copy, Clone, Default, cu::DeviceCopy)]
+struct HitgroupSbtData {
+    object_id: u32,
+}
+
+type HitgroupRecord = ox::SbtRecord<HitgroupSbtData>;
+let rec_hitgroup: Vec<_> = (0..num_objects)
+    .map(|i| {
+        let object_type = 0;
+        let rec = HitgroupRecord::pack(
+            HitgroupSbtData { object_id: i },
+            &pg_hitgroup[object_type],
+        )
+        .expect("failed to pack hitgroup record");
+        rec
+    })
+    .collect();
+```
+
+### Ray payload
+The ray payload is used to pass data between `optixTrace` and the programs
+invoked during ray traversal. Payload values are passed to and returned from
+`optixTrace`, and follow a copy-in/copy-out semantic. There is a limited number
+of payload values, but one or more of these values can also be a pointer to
+stack-based local memory, or application-managed global memory.
+
+### Primitive attributes
+Attributes are used to pass data from intersection programs to the any-hit
+and closest-hit programs. Triangle intersection provides two predefined
+attributes for the barycentric coordinates (U,V). User-defined intersections
+can define a limited number of other attributes that are specific to those
+primitives.
+
+### Buffer
+NVIDIA OptiX 7 represents GPU information with a pointer to GPU memory.
+References to the term “buffer” in this document refer to this GPU memory pointer
+and the associated memory contents. Unlike NVIDIA OptiX 6, the allocation and
+transfer of buffers is explicitly controlled by user code.
+
+## Acceleration Structures
+NVIDIA OptiX 7 acceleration structures are opaque data structures built on the
+device. Typically, they are based on the bounding volume hierarchy model, but
+implementations and the data layout of these structures may vary from one GPU
+architecture to another.
+
+NVIDIA OptiX 7 provides two basic types of acceleration structures:
+
+* Geometry acceleration structures - Built over primitives (triangles, curves,
+  or user-defined primitives)
+* Instance acceleration structures - Built over other objects such as
+  acceleration structures (either type) or motion transform nodes. Allow
+  for instancing with a per-instance static transform
+
+## Traversing the Scene Graph
+To determine the intersection of geometric data by a ray, NVIDIA OptiX 7 searches
+a graph of nodes composed of acceleration structures and transformations. This
+search is called a traversal; the nodes in the graph are called traversable
+objects or traversables.
+
+The following types of traversable objects exist:
+
+* An instance acceleration structure
+* A geometry acceleration structure (as the root of a graph containing a single
+  geometry acceleration structure; see “Traversal of a single geometry
+  acceleration structure”)
+* Static transform
+* Matrix motion transform
+* Scaling, rotation, translation (SRT) motion transform
+
+For transformation traversables, the corresponding transformation applies to
+all descendant child traversables (the subgraph spanned by the child of the
+transformation traversable). Transformation traversables should only be used
+when motion is involved, because applying transformations to geometry is order
+dependent and motion transformations are time dependent. Static transformations
+are available because they cannot be merged with any motion transformation due
+to that time dependency, but they should be merged with instance transformations
+(if desired as the child of an instance) or any other static transformation
+(i.e., there should be at most one static transformation following a motion
+transformation). For example, Figure 2.2 combines both types:
+
+![Figure 2.2 - Traversables graph][traversables_graph]
+
+OptiX uses handles as references to traversable objects. These traversable
+handles are 64-bit opaque values that are generated from device memory pointers
+for the graph nodes. The handles identify the connectivity of these objects.
+All calls to `optixTrace` begin at a traversable handle.
+
+## Ray tracing with NVIDIA OptiX 7
+
+A functional ray tracing system is implemented by combining four components as
+described in the following steps (a host-side sketch follows the list):
+
+1. Create one or more acceleration structures over one or many geometry meshes
+   and instances of these meshes in the scene. See
+   [Acceleration structures](crate::acceleration).
+2. Create a pipeline of programs that contains all programs that will be invoked
+   during a ray tracing launch. See “Program pipeline creation”.
+3. Create a shader binding table that includes references to these programs and
+   their parameters and choose a data layout that matches the implicit shader
+   binding table record selection of the instances and geometries in the
+   acceleration structures. See “Shader binding table”.
+4. Launch a device-side kernel that will invoke a ray generation program with a
+   multitude of threads calling `optixTrace` to begin traversal and the execution
+   of the other programs. See “Ray generation launches”. Device-side
+   functionality is described in “Device-side functions”.
+
+Ray tracing work can be interleaved with other CUDA work to generate data, move
+data to and from the device, and move data to other graphics APIs. It is the
+application's responsibility to coordinate all work on the GPU. NVIDIA OptiX 7
+does not synchronize with any other work.
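+
+As a sketch, the host-side skeleton of these four steps looks like the
+following with the wrappers in this crate (acceleration-structure, pipeline,
+and SBT construction are elided; `pipeline`, `sbt`, `launch_params`, and
+`stream` are assumed to have been built as described in the linked modules):
+
+```ignore
+// One-time setup: load the OptiX function table from the driver.
+optix::init()?;
+
+// 1. Build acceleration structures     -> a traversable handle
+// 2. Compile Modules and ProgramGroups -> link them into a Pipeline
+// 3. Pack SbtRecords                   -> a ShaderBindingTable
+// 4. Launch the ray generation program over a width x height grid.
+unsafe {
+    optix::launch(&pipeline, &stream, &launch_params, &sbt, width, height, 1)?;
+}
+
+// Launches are asynchronous; synchronize the stream before reading results.
+stream.synchronize()?;
+```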
+
+# Implementation Principles
+
+## Error Handling
+All OptiX functions return a return code, which is converted to a Rust
+`Result`. You can also set a logging callback with
+[`DeviceContext::set_log_callback`](crate::context::DeviceContext::set_log_callback)
+to have OptiX report additional information.
+
+Functions that compile also return a `String` containing additional messages
+for warnings and errors.
+
+## Stateless Model
+Given the same input, the same output should be generated. GPU state is not held
+by NVIDIA OptiX 7 internally.
+
+In NVIDIA OptiX 7 functions, a [CUDA Stream](cust::stream::Stream) is associated
+with the [CUDA Context](cust::context::Context) used to create the
+[`DeviceContext`](context::DeviceContext). Some API functions take a
+[Stream](cust::stream::Stream) as an argument. These functions incur work on
+the device and require that the [CUDA Context](cust::context::Context) associated
+with the [`DeviceContext`](context::DeviceContext) is the current context when
+they are called. Applications can expect the
+[CUDA Context](cust::context::Context) to remain the same after invoking NVIDIA
+OptiX 7 functions.
+
+## Asynchronous Execution
+Work performed on the device is issued on an application-supplied
+[CUDA Stream](cust::stream::Stream) using asynchronous CUDA methods. The host
+function blocks execution until all work has been issued on the stream, but does
+not do any synchronization or blocking on the stream itself.
+
+## Function Table Initialization
+
+You must call [`optix::init()`](crate::init) in order to load the function symbols from
+the OptiX library in the driver before calling any other functions.
+
diff --git a/crates/optix/src/lib.rs b/crates/optix/src/lib.rs
index 7f47e537..f497eaa1 100644
--- a/crates/optix/src/lib.rs
+++ b/crates/optix/src/lib.rs
@@ -1,14 +1,80 @@
+//! # OptiX
+//!
+//! <div class="warning">
+//!
+//! You must call optix::init() before calling any of the functions
+//! in this crate in order to load the necessary symbols from the driver.
+//!
+//! </div>
+//!
+//! Rust bindings for NVIDIA's OptiX GPU raytracing library.
+//!
+//! NVIDIA OptiX 7 is intended for ray tracing applications that use NVIDIA® CUDA®
+//! technology, such as:
+//!
+//! * Film and television visual effects
+//! * Computer-aided design for engineering and manufacturing
+//! * Light maps generated by path tracing
+//! * High-performance computing
+//! * LIDAR simulation
+//!
+//! NVIDIA OptiX 7 also includes support for motion blur and multi-level transforms,
+//! features required by ray-tracing applications designed for production-quality
+//! rendering.
+//!
+//! # Programming Guide
+//!
+//! For high-level documentation please see the
+//! [introduction](crate::introduction) module documentation and subsequent documentation in the
+//! modules listed below. Each module has an expandable "Programming Guide" section that will
+//! display the docs when clicked.
+//!
+//! * [1. Introduction](introduction)
+//! * [2. Context](context)
+//! * [3. Acceleration Structures](acceleration)
+//! * [4. Program Pipeline Creation](pipeline)
+//! * [5. Shader Binding Table](shader_binding_table)
+//! * [6. Ray Generation Launches](launch)
+
+#[doc = ::embed_doc_image::embed_image!("optix_programs", "images/optix_programs.jpg")]
+#[doc = ::embed_doc_image::embed_image!("traversables_graph", "images/traversables_graph.jpg")]
+#[doc = include_str!("introduction.md")]
+pub mod introduction {}
+
+#[doc = include_str!("acceleration.md")]
+pub mod acceleration;
+
+#[doc = include_str!("context.md")]
 pub mod context;
+
+#[doc = include_str!("denoiser.md")]
 pub mod denoiser;
+
+/// Error handling
 pub mod error;
+#[doc = include_str!("pipeline.md")]
+pub mod pipeline;
+pub mod prelude;
+
+#[doc = ::embed_doc_image::embed_image!("example_sbt", "images/example_sbt.png")]
+#[doc = ::embed_doc_image::embed_image!("scene_graph", "images/scene_graph.png")]
+#[doc = include_str!("shader_binding_table.md")]
+pub mod shader_binding_table;
+use shader_binding_table::ShaderBindingTable;
+
+pub mod sys;
+
 pub use cust;
 
-use error::{OptixResult, ToResult};
-pub use optix_sys as sys;
+use cust::memory::DeviceMemory;
+use error::{Error, ToResult};
+type Result<T, E = Error> = std::result::Result<T, E>;
 
 /// Initializes the OptiX library. This must be called before using any OptiX function. It may
 /// be called before or after initializing CUDA.
-pub fn init() -> OptixResult<()> {
+pub fn init() -> Result<()> {
     // avoid initializing multiple times because that will try to load the dll every time.
     if !optix_is_initialized() {
         init_cold()
@@ -19,13 +85,14 @@ pub fn init() -> OptixResult<()> {
 
 #[cold]
 #[inline(never)]
-fn init_cold() -> OptixResult<()> {
-    unsafe { sys::optixInit().to_result() }
+fn init_cold() -> Result<()> {
+    unsafe { Ok(sys::optixInit().to_result()?) }
 }
 
 /// Whether OptiX is initialized. If you are calling raw [`sys`] functions you must make sure
 /// this is true, otherwise OptiX will segfault. In the safe wrapper it is done automatically and OptiX not
 /// being initialized will return an error result.
+#[doc(hidden)]
 pub fn optix_is_initialized() -> bool {
     // SAFETY: C globals are explicitly defined to be zero-initialized, and the sys version uses
     // Option for each field, and None is explicitly defined to be represented as a nullptr for Option,
@@ -40,6 +107,7 @@ extern "C" {
 
 /// Call a raw OptiX sys function, making sure that OptiX is initialized. Returning
 /// an OptixNotInitialized error if it is not initialized. See [`optix_is_initialized`].
+#[doc(hidden)]
 #[macro_export]
 macro_rules! optix_call {
     ($name:ident($($param:expr),* $(,)?)) => {{
@@ -50,3 +118,91 @@ macro_rules! optix_call {
         }
     }};
 }
+
+/// Launch the given [`Pipeline`](pipeline::Pipeline) on the given [`Stream`](cust::stream::Stream).
+///
+/// A ray generation launch is the primary workhorse of the NVIDIA OptiX API. A
+/// launch invokes a 1D, 2D or 3D array of threads on the device and invokes ray
+/// generation programs for each thread. When the ray generation program invokes
+/// `optixTrace`, other programs are invoked to execute traversal, intersection,
+/// any-hit, closest-hit, miss and exception programs until the invocations are
+/// complete.
+///
+/// A pipeline requires device-side memory for each launch. This space is allocated
+/// and managed by the API. Because launch resources may be shared between pipelines,
+/// they are only guaranteed to be freed when the [`DeviceContext`] is destroyed.
+///
+/// All launches are asynchronous, using [`CUDA stream`]s. When it is necessary
+/// to implement synchronization, use the mechanisms provided by CUDA streams and
+/// events.
+///
+/// In addition to the pipeline object, the CUDA stream, and the launch state, it
+/// is necessary to provide information about the SBT layout using the
+/// [`ShaderBindingTable`](crate::shader_binding_table::ShaderBindingTable) struct
+/// (see [Shader Binding Table](crate::shader_binding_table)).
+///
+/// The value of the pipeline launch parameter is specified by the
+/// `pipeline_launch_params_variable_name` field of the
+/// [`PipelineCompileOptions`](crate::pipeline::PipelineCompileOptions) struct.
+/// It is determined at launch with a [`DevicePointer`](cust::memory::DevicePointer)
+/// parameter, named `pipeline_params`. This must be the same size as that passed
+/// to the module compilation or an error will occur.
+///
+/// The kernel creates a copy of `pipeline_params` before the launch, so the kernel
+/// is allowed to modify `pipeline_params` values during the launch. This means
+/// that subsequent launches can run with modified pipeline parameter values. Users
+/// cannot synchronize with this copy between the invocation of `launch()` and
+/// the start of the kernel.
+///
+/// # Safety
+/// You must ensure that:
+/// - Any device memory referenced in `pipeline_params` points to valid,
+///   correctly aligned memory
+/// - Any [`SbtRecord`](shader_binding_table::SbtRecord)s and associated data
+///   referenced by the
+///   [`ShaderBindingTable`](shader_binding_table::ShaderBindingTable) are alive
+///   and valid
+///
+/// [`CUDA stream`]: cust::stream::Stream
+/// [`DeviceContext`]: crate::context::DeviceContext
+pub unsafe fn launch<M: DeviceMemory>(
+    pipeline: &crate::pipeline::Pipeline,
+    stream: &cust::stream::Stream,
+    pipeline_params: &M,
+    sbt: &ShaderBindingTable,
+    width: u32,
+    height: u32,
+    depth: u32,
+) -> Result<()> {
+    Ok(optix_call!(optixLaunch(
+        pipeline.raw,
+        stream.as_inner(),
+        pipeline_params.as_raw_ptr() as u64,
+        pipeline_params.size_in_bytes(),
+        &sbt.0,
+        width,
+        height,
+        depth,
+    ))?)
+}
+
+#[cfg(feature = "glam")]
+mod impl_glam;
+
+macro_rules! const_assert {
+    ($x:expr $(,)?) => {
+        #[allow(unknown_lints, clippy::eq_op)]
+        const _: [(); 0 - !{
+            const ASSERT: bool = $x;
+            ASSERT
+        } as usize] = [];
+    };
+}
+pub(crate) use const_assert;
+
+macro_rules! const_assert_eq {
+    ($x:expr, $y:expr $(,)?) => {
+        const_assert!($x == $y);
+    };
+}
+pub(crate) use const_assert_eq;
diff --git a/crates/optix/src/optix_wrapper.h b/crates/optix/src/optix_wrapper.h
new file mode 100644
index 00000000..bd9a4d7a
--- /dev/null
+++ b/crates/optix/src/optix_wrapper.h
@@ -0,0 +1,28 @@
+#include <optix.h>
+#include <optix_stubs.h>
+
+static const size_t OptixSbtRecordHeaderSize = OPTIX_SBT_RECORD_HEADER_SIZE;
+static const size_t OptixSbtRecordAlignment = OPTIX_SBT_RECORD_ALIGNMENT;
+static const size_t OptixAccelBufferByteAlignment =
+    OPTIX_ACCEL_BUFFER_BYTE_ALIGNMENT;
+static const size_t OptixInstanceByteAlignment = OPTIX_INSTANCE_BYTE_ALIGNMENT;
+static const size_t OptixAabbBufferByteAlignment =
+    OPTIX_AABB_BUFFER_BYTE_ALIGNMENT;
+static const size_t OptixGeometryTransformByteAlignment =
+    OPTIX_GEOMETRY_TRANSFORM_BYTE_ALIGNMENT;
+static const size_t OptixTransformByteAlignment =
+    OPTIX_TRANSFORM_BYTE_ALIGNMENT;
+
+static const size_t OptixVersion = OPTIX_VERSION;
+
+static const size_t OptixBuildInputSize = sizeof(OptixBuildInput);
+static const size_t OptixShaderBindingTableSize = sizeof(OptixShaderBindingTable);
+
+/**
+ * <div rustbindgen replaces="OptixGeometryFlags"></div>
+ */
+enum GeometryFlags {
+    None = OPTIX_GEOMETRY_FLAG_NONE,
+    DisableAnyHit = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT,
+    RequireSingleAnyHitCall = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL
+};
diff --git a/crates/optix/src/pipeline.md b/crates/optix/src/pipeline.md
new file mode 100644
index 00000000..8175bc65
--- /dev/null
+++ b/crates/optix/src/pipeline.md
@@ -0,0 +1,293 @@
+
+# Program Pipeline Creation
+
+# Programming Guide...
+<details>
+<summary>Click here to expand programming guide</summary>
+
+# Contents
+- [Program Input](#program-input)
+- [Programming Model](#programming-model)
+- [Module Creation](#module-creation)
+- [Pipeline Launch Parameter](#pipeline-launch-parameter)
+  - [Parameter Specialization](#parameter-specialization)
+- [Program Group Creation](#program-group-creation)
+- [Pipeline Linking](#pipeline-linking)
+- [Pipeline Stack Size](#pipeline-stack-size)
+  - [Constructing a Path Tracer](#constructing-a-path-tracer)
+- [Compilation Cache](#compilation-cache)
+
+Programs are first compiled into modules of type [`Module`]. One or more modules are combined to create a program group of type [`ProgramGroup`]. Those program groups are then linked into a [`Pipeline`] on the GPU. This is similar to the compile and link process commonly found in software development. The program groups are also used to initialize the header of the SBT record associated with those programs.
+
+The constructors for [`Module`], [`ProgramGroup`], and [`Pipeline`] return a log string. This string is used to report information about any compilation that may have occurred, such as compile errors or verbose information about the compilation result. If an error occurred, the information that would be reported in the log string is also reported by the device context log callback, when provided (see [`DeviceContext::set_log_callback()`](crate::context::DeviceContext::set_log_callback)).
+
+Both mechanisms are provided so that compilation errors from parallel creation operations can conveniently be pulled out without having to determine which output from the logger corresponds to which API invocation.
+
+Symbols in [`Module`] objects may be unresolved and contain extern references to variables and `__device__` functions.
+
+These symbols can be resolved during pipeline creation using the symbols defined in the pipeline modules. Duplicate symbols will trigger an error.
+
+A pipeline contains all programs that are required for a particular ray-tracing launch. An application may use a different pipeline for each launch, or may combine multiple ray-generation programs into a single pipeline.
+
+Most NVIDIA OptiX 7 API functions do not own any significant GPU state; Streaming Assembly (SASS) instructions, which define the executable binary programs in a pipeline, are an exception. The [`Pipeline`] owns the CUDA resource associated with the compiled SASS and it is held until the pipeline is destroyed. This allocation is proportional to the amount of compiled code in the pipeline, typically tens of kilobytes to a few megabytes. However, it is possible to create complex pipelines that require substantially more memory, especially if large static initializers are used. Wherever possible, exercise caution in the number and size of the pipelines.
+
+## Program Input
+NVIDIA OptiX 7 programs are encoded in the parallel thread execution instruction set (PTX) language. To create PTX programs, compile CUDA source files using the NVIDIA `nvcc` offline compiler or `nvrtc` JIT compiler. The CUDA code includes PTX device headers used during compilation.
+
+See the `build.rs` files in the examples in this crate for code to compile PTX
+as part of the cargo build.
+
+```bash
+nvcc -ptx -Ipath-to-optix-sdk/include --use_fast_math myprogram.cu -o myprogram.ptx
+```
+
+The nvcc command-line options are explained in more detail as part of the usage description of the compiler options displayed with `nvcc --help`.
+Note the following requirements for nvcc and nvrtc compilation:
+
+* The streaming multiprocessor (SM) target of the input PTX program must be less than or equal to the SM version of the GPU for which the module is compiled.
+* To generate code for the minimum supported GPU (Maxwell), use architecture targets for SM 5.0, for example, `--gpu-architecture=compute_50`. Because OptiX rewrites the code internally, those targets will work on any newer GPU as well.
+* CUDA Toolkits 10.2 and newer throw deprecation warnings for SM 5.0 targets. These can be suppressed with the compiler option `-Wno-deprecated-gpu-targets`.
+  If support for Maxwell GPUs is not required, you can use the next higher GPU architecture target SM 6.0 (Pascal) to suppress these warnings.
+* Use `--machine=64` (`-m64`). Only 64-bit code is supported in OptiX.
+* Define the output type with `--ptx`. Do not compile to obj or cubin.
+* Do not use debug flags `-g` and `-G`. OptiX might not handle all debugging instrumentation. This is important when using the Microsoft Visual Studio CUDA integration, which sets these flags as default in the Debug target.
+* Enable `--relocatable-device-code=true` (`-rdc`). The `nvcc` command can also use the option `--keep-device-functions`, which is not supported by `nvrtc`. These flags prevent the CUDA compiler from eliminating direct or continuation callables as dead code.
+* To get smaller and faster code, enable `--use_fast_math`. This flag enables `.approx` instructions for trigonometric functions and reciprocals, avoiding inadvertent use of slow double-precision floats. For performance reasons, it is recommended that you set this flag; the only exception is use cases that require more precision.
+* To profile your code with Nsight Compute, enable `--generate-line-info` and set `debug_level = CompileDebugLevel::LineInfo` in both the [`ModuleCompileOptions`] and [`PipelineLinkOptions`] in your application host code.
+
+## Programming Model
+The NVIDIA OptiX 7 programming model supports the multiple instruction, multiple data (MIMD) subset of CUDA. Execution must be independent of other threads. For this reason, shared memory usage and warp-wide or block-wide synchronization—such as barriers—are not allowed in the input PTX code. All other GPU instructions are allowed, including math, texture, atomic operations, control flow, and loading data to memory. Special warp-wide instructions like vote and ballot are allowed, but can yield unexpected results as the locality of threads is not guaranteed and neighboring threads can change during execution, unlike in the full CUDA programming model. Still, warp-wide instructions can be used safely when the algorithm in question is independent of locality by, for example, implementing warp-aggregated atomic adds.
+
+The memory model is consistent only within the execution of a single launch index, which starts at the ray-generation invocation and only with subsequent programs reached from any `optixTrace` or callable program. This includes writes to stack allocated variables. Writes from other launch indices may not be available until after the launch is complete. If needed, atomic operations may be used to share data between launch indices, as long as an ordering between launch indices is not required. Memory fences are not supported.
+
+The input PTX should include one or more NVIDIA OptiX 7 programs. The type of program affects how the program can be used during the execution of the pipeline.
+These program types are specified by prefixing the program's name with the following:
+
+| Program Type          | Function Name Prefix        |
+|-----------------------|-----------------------------|
+| Ray Generation        | `__raygen__`                |
+| Intersection          | `__intersection__`          |
+| Any-Hit               | `__anyhit__`                |
+| Closest-Hit           | `__closesthit__`            |
+| Miss                  | `__miss__`                  |
+| Direct Callable       | `__direct_callable__`       |
+| Continuation Callable | `__continuation_callable__` |
+| Exception             | `__exception__`             |
+
+If a particular function needs to be used with more than one type, then multiple copies with corresponding program prefixes should be generated.
+
+In addition, each program may call a specific set of device-side intrinsics that implement the actual ray-tracing-specific features. (See “Device-side functions”.)
+
+## Module Creation
+
+A module may include multiple programs of any program type. Two option structs control the parameters of the compilation process:
+
+* [`PipelineCompileOptions`] - Must be identical for all modules used to create program groups linked in a single pipeline.
+* [`ModuleCompileOptions`] - May vary across the modules within the same pipeline.
+
+These options control general compilation settings, for example, the level of optimization. [`PipelineCompileOptions`] controls features of the API such as the usage of custom any-hit programs, curve primitives, motion blur, exceptions, and the number of 32-bit values usable in ray payload and primitive attributes. For example:
+
+```
+let module_compile_options = ModuleCompileOptions {
+    opt_level: CompileOptimizationLevel::Default,
+    debug_level: CompileDebugLevel::LineInfo,
+    ..Default::default()
+};
+
+let pipeline_compile_options = PipelineCompileOptions::new()
+    .uses_motion_blur(false)
+    .num_attribute_values(2)
+    .num_payload_values(2)
+    .pipeline_launch_params_variable_name("PARAMS")
+    .exception_flags(ExceptionFlags::NONE);
+
+let (module, log) = Module::new(
+    &mut ctx,
+    &module_compile_options,
+    &pipeline_compile_options,
+    &ptx_string,
+)?;
+```
+
+The `num_attribute_values` field of [`PipelineCompileOptions`] defines the number of 32-bit words that are reserved to store the attributes. This corresponds to the attribute definition in `optixReportIntersection`. See “Reporting intersections and attribute access”.
+
+<div class="warning">
+
+For best performance when your scene contains nothing but triangles, set
+`uses_primitive_type_flags` to `PrimitiveTypeFlags::TRIANGLE`.
+
+</div>
+
+## Pipeline Launch Parameter
+
+You specify launch-varying parameters or values that must be accessible from any module through a user-defined variable named in [`PipelineCompileOptions`]. In each module that needs access, declare this variable with `extern` or `extern "C"` linkage and the `__constant__` memory specifier. The size of the variable must match across all modules in a pipeline. Variables of equal size but differing types may trigger undefined behavior.
+
+For example, the following header file defines the variable to share, named `PARAMS`, as an instance of the `Params` struct:
+```text
+struct Params {
+    float* image;
+    unsigned int image_width;
+};
+
+extern "C" __constant__ Params PARAMS;
+```
+
+You must match the layout of this struct with an equivalent Rust struct. Take care that CUDA vector types have specific alignment requirements which you must match in the Rust struct or you will trigger invalid memory accesses or undefined behaviour.
+
+```
+#[repr(C)]
+struct Params {
+    image: *mut f32,
+    image_width: u32,
+}
+```
+
+You may also wish to use bindgen to automatically create the equivalent Rust struct from a C/C++ header to ensure they stay in sync.
+
+### Parameter Specialization
+
+Not currently implemented.
+
+## Program Group Creation
+[`ProgramGroup`] objects are created from one to three [`Module`] objects and are used to fill the header of the SBT records (see [Shader Binding Table](crate::shader_binding_table)). There are five types of program groups: Raygen, Miss, Exception, Hitgroup and Callable.
+
+Modules can contain more than one program. The program in the module is designated by its entry function name as part of the [`ProgramGroupDesc`] struct passed to [`ProgramGroup::new()`](crate::pipeline::ProgramGroup::new). Four of the program group types can contain only a single program; only the hitgroup can designate up to three programs, for the closest-hit, any-hit, and intersection programs.
+
+Programs from modules can be used in any number of [`ProgramGroup`] objects. The resulting program groups can be used to fill in any number of SBT records. Program groups can also be used across pipelines as long as the compilation options match.
+
+A hit group specifies the intersection program used to test whether a ray intersects a primitive, together with the hit shaders to be executed when a ray does intersect the primitive. For built-in primitive types, a built-in intersection program should be obtained from [`Module::builtin_is_module_get()`](crate::pipeline::Module::builtin_is_module_get) and used in the hit group. As a special case, the intersection program is not required – and is ignored – for triangle primitives.
+
+```
+let (module, _log) = Module::new(
+    &mut ctx,
+    &module_compile_options,
+    &pipeline_compile_options,
+    ptx,
+)?;
+
+let pgdesc_hitgroup = ProgramGroupDesc::hitgroup(
+    Some((&module, "__closesthit__radiance")),
+    Some((&module, "__anyhit__radiance")),
+    None,
+);
+
+let (pg_hitgroup, _log) = ProgramGroup::new(&mut ctx, &[pgdesc_hitgroup])?;
+```
+
+## Pipeline Linking
+
+After all program groups of a pipeline are defined, they must be linked into a [`Pipeline`]. The resulting [`Pipeline`] object is then used to invoke a ray-generation launch.
+
+When the [`Pipeline`] is linked, some fixed function components may be selected based on [`PipelineLinkOptions`] and [`PipelineCompileOptions`]. These options were previously used to compile the modules in the pipeline.
+The link options consist of the maximum recursion depth setting for recursive ray tracing, along with pipeline level settings for debugging. However, the value for the maximum recursion depth has an upper limit that overrides any limit set by the link options. (See “Limits”.)
+
+For example, the following code creates and links a [`Pipeline`]:
+```
+let program_groups = [pg_raygen, pg_miss, pg_hitgroup];
+
+let pipeline_link_options = PipelineLinkOptions {
+    max_trace_depth: 2,
+    debug_level: CompileDebugLevel::LineInfo,
+};
+
+let (pipeline, _log) = Pipeline::new(
+    &mut ctx,
+    &pipeline_compile_options,
+    pipeline_link_options,
+    &program_groups,
+)?;
+```
+
+After [`Pipeline::new()`](crate::pipeline::Pipeline::new) completes, the fully linked module is loaded into the driver.
+
+NVIDIA OptiX 7 uses a small amount of GPU memory per pipeline. This memory is released when the pipeline or device context is destroyed.
+
+## Pipeline Stack Size
+
+The programs in a module may consume two types of stack structure: a direct stack and a continuation stack. The resulting stack needed for launching a pipeline depends on the resulting call graph, so the pipeline must be configured with the appropriate stack size. These sizes can be determined by the compiler for each program group. A pipeline may be reused for different call graphs as long as the set of programs is the same. For this reason, the pipeline stack size is configured separately from the pipeline compilation options.
+
+The direct stack requirements resulting from ray-generation, miss, exception, closest-hit, any-hit and intersection programs and the continuation stack requirements resulting from exception programs are calculated internally and do not need to be configured. The direct stack requirements resulting from direct-callable programs, as well as the continuation stack requirements resulting from ray-generation, miss, closest-hit, any-hit, intersection, and continuation-callable programs need to be configured. If these are not configured explicitly, an internal default implementation is used. When the maximum depth of call trees of continuation-callable and direct-callable programs is two or less, the default implementation is correct (but not necessarily optimal). Even in cases where the default implementation is correct, users can always provide more precise stack requirements based on their knowledge of a particular call graph structure.
+
+To query individual program groups for their stack requirements, use [`ProgramGroup::get_stack_size`](crate::pipeline::ProgramGroup::get_stack_size). Use this information to calculate the total required stack sizes for a particular call graph of NVIDIA OptiX 7 programs. To set the stack sizes for a particular pipeline, use [`Pipeline::set_stack_size`](crate::pipeline::Pipeline::set_stack_size). For other parameters, helper functions are available to implement these calculations. The following is an explanation of how to compute the stack size for [`Pipeline::set_stack_size()`](crate::pipeline::Pipeline::set_stack_size), starting from a very conservative approach, and refining the estimates step by step.
+
+Let `css_rg` denote the maximum continuation stack size of all ray-generation programs; similarly for miss, closest-hit, any-hit, intersection, and continuation-callable programs. Let `dss_dc` denote the maximum direct stack size of all direct callable programs.
+Let `max_trace_depth` denote the maximum trace depth (as in [`PipelineLinkOptions::max_trace_depth`](crate::pipeline::PipelineLinkOptions)), and let `max_cc_depth` and `max_dc_depth` denote the maximum depth of call trees of continuation-callable and direct-callable programs, respectively. Then a simple, conservative approach to compute the three parameters of [`Pipeline::set_stack_size`](crate::pipeline::Pipeline::set_stack_size) is:
+
+```
+let direct_callable_stack_size_from_traversable = max_dc_depth * dss_dc;
+let direct_callable_stack_size_from_state = max_dc_depth * dss_dc;
+
+// Upper bound on continuation stack used by call trees of continuation callables
+let css_cc_tree = max_cc_depth * css_cc;
+
+// Upper bound on continuation stack used by closest-hit or miss programs, including
+// the call tree of continuation-callable programs
+let css_ch_or_ms_plus_cc_tree = css_ch.max(css_ms) + css_cc_tree;
+
+let continuation_stack_size =
+    css_rg
+    + css_cc_tree
+    + max_trace_depth * css_ch_or_ms_plus_cc_tree
+    + css_is
+    + css_ah;
+```
+
+This computation can be improved in several ways. For the computation of `continuation_stack_size`, the stack sizes `css_is` and `css_ah` are not used on top of the other summands, but can be offset against one level of `css_ch_or_ms_plus_cc_tree`. This gives a more complex but better estimate:
+
+```
+let continuation_stack_size =
+    css_rg
+    + css_cc_tree
+    + (max_trace_depth - 1).max(1) * css_ch_or_ms_plus_cc_tree
+    + max_trace_depth.min(1) * css_ch_or_ms_plus_cc_tree.max(css_is + css_ah);
+```
+
+The computation of the first two terms can be improved if the call trees of direct callable programs are analyzed separately based on the semantic type of their call site. In this context, call sites in any-hit and intersection programs count as traversal, whereas call sites in ray-generation, miss, and closest-hit programs count as state.
+
+```
+let direct_callable_stack_size_from_traversable =
+    max_dc_depth_from_traversal * dss_dc_from_traversal;
+let direct_callable_stack_size_from_state =
+    max_dc_depth_from_state * dss_dc_from_state;
+```
+
+Depending on the scenario, these estimates can be improved further, sometimes substantially. For example, imagine there are two call trees of continuation-callable programs. One call tree is deep, but the involved continuation-callable programs need only a small continuation stack. The other call tree is shallow, but the involved continuation-callable programs need a quite large continuation stack. The estimate of `css_cc_tree` can be improved as follows:
+
+```
+let css_cc_tree = (max_cc_depth1 * css_cc1).max(max_cc_depth2 * css_cc2);
+```
+
+Similar improvements might be possible for all expressions involving `max_trace_depth` if the ray types are considered separately, for example, camera rays and shadow rays.
+
+### Constructing a Path Tracer
+
+A simple path tracer can be constructed from two ray types: camera rays and shadow rays. The path tracer will consist only of ray-generation, miss, and closest-hit programs, and will not use any-hit, intersection, continuation-callable, or direct-callable programs. The camera rays will invoke only the miss and closest-hit programs `ms1` and `ch1`, respectively. `ch1` might trace shadow rays, which invoke only the miss and closest-hit programs `ms2` and `ch2`, respectively.
+That is, the maximum trace depth is two and the initial formulas simplify to:
+
+```
+let direct_callable_stack_size_from_traversable = max_dc_depth * dss_dc;
+let direct_callable_stack_size_from_state = max_dc_depth * dss_dc;
+let continuation_stack_size = css_rg + 2 * css_ch1.max(css_ch2).max(css_ms1).max(css_ms2);
+```
+
+However, from the call graph structure it is clear that `ms2` or `ch2` can only be invoked from `ch1`. This restriction allows for the following estimate:
+
+```
+let continuation_stack_size = css_rg + css_ms1.max(css_ch1 + css_ms2.max(css_ch2));
+```
+
+This estimate is never worse than the previous one, but often better, for example, in the case where the closest-hit programs have different stack sizes (and the miss programs do not dominate the expression).
+
+## Compilation Cache
+
+Compilation work is triggered automatically when calling [`Module::new()`](crate::pipeline::Module::new) or [`ProgramGroup::new()`](crate::pipeline::ProgramGroup::new), and also potentially during [`Pipeline::new()`](crate::pipeline::Pipeline::new). This work is automatically cached on disk if enabled on the [`DeviceContext`]. Caching reduces compilation effort for recurring programs and program groups. While it is enabled by default, users can disable it through the use of [`DeviceContext::set_cache_enabled()`](crate::context::DeviceContext::set_cache_enabled). See [Context](crate::context) for other options regarding the compilation cache.
+
+Generally, cache entries are compatible with the same driver version and GPU type only.
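+
+For example, a renderer that wants a private cache directory could configure the
+context before compiling any modules (a sketch using the cache methods on
+[`DeviceContext`]; the path is illustrative):
+
+```ignore
+ctx.set_cache_location("/var/tmp/my_renderer_optix_cache")?;
+ctx.set_cache_enabled(true)?;
+
+// Subsequent module/pipeline compilation now reuses the disk cache.
+let (module, _log) = Module::new(&mut ctx, &module_compile_options, &pipeline_compile_options, ptx)?;
+```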
+
+</details>
+
+[`DeviceContext`]: crate::context::DeviceContext
+[`Module`]: crate::pipeline::Module
+[`ProgramGroup`]: crate::pipeline::ProgramGroup
+[`ProgramGroupDesc`]: crate::pipeline::ProgramGroupDesc
+[`Pipeline`]: crate::pipeline::Pipeline
+[`ModuleCompileOptions`]: crate::pipeline::ModuleCompileOptions
+[`PipelineCompileOptions`]: crate::pipeline::PipelineCompileOptions
+[`PipelineLinkOptions`]: crate::pipeline::PipelineLinkOptions
diff --git a/crates/optix/src/pipeline.rs b/crates/optix/src/pipeline.rs
new file mode 100644
index 00000000..7b612527
--- /dev/null
+++ b/crates/optix/src/pipeline.rs
@@ -0,0 +1,822 @@
+use crate::{context::DeviceContext, error::Error, optix_call, sys};
+type Result<T, E = Error> = std::result::Result<T, E>;
+
+use std::ffi::{CStr, CString};
+
+// Kinda nasty hack to work around the fact that bindgen generates an i32 for enums on windows,
+// but a u32 on linux
+#[cfg(windows)]
+type OptixEnumBaseType = i32;
+#[cfg(unix)]
+type OptixEnumBaseType = u32;
+
+#[repr(transparent)]
+pub struct Pipeline {
+    pub(crate) raw: sys::OptixPipeline,
+}
+
+#[repr(C)]
+#[derive(Debug, Hash, PartialEq, Copy, Clone, Default)]
+pub struct PipelineLinkOptions {
+    pub max_trace_depth: u32,
+    pub debug_level: CompileDebugLevel,
+}
+
+impl From<PipelineLinkOptions> for sys::OptixPipelineLinkOptions {
+    fn from(o: PipelineLinkOptions) -> Self {
+        sys::OptixPipelineLinkOptions {
+            maxTraceDepth: o.max_trace_depth,
+            debugLevel: o.debug_level as _,
+        }
+    }
+}
+
+/// # Creating and destroying `Pipeline`s
+impl Pipeline {
+    pub fn new(
+        ctx: &mut DeviceContext,
+        pipeline_compile_options: &PipelineCompileOptions,
+        link_options: PipelineLinkOptions,
+        program_groups: &[ProgramGroup],
+    ) -> Result<(Pipeline, String)> {
+        let popt = pipeline_compile_options.build();
+
+        let link_options: sys::OptixPipelineLinkOptions = link_options.into();
+
+        let mut log = [0u8; 4096];
+        let mut log_len = log.len();
+
+        let mut raw: sys::OptixPipeline = std::ptr::null_mut();
+
+        let res = unsafe {
+            optix_call!(optixPipelineCreate(
+                ctx.raw,
+                &popt,
+                &link_options,
+                program_groups.as_ptr() as *const _,
+                program_groups.len() as u32,
+                log.as_mut_ptr() as *mut i8,
+                &mut log_len,
+                &mut raw,
+            ))
+        };
+
+        let log = CStr::from_bytes_with_nul(&log[0..log_len])
+            .unwrap()
+            .to_string_lossy()
+            .into_owned();
+
+        match res {
+            Ok(()) => Ok((Pipeline { raw }, log)),
+            Err(source) => Err(Error::PipelineCreation { source, log }),
+        }
+    }
+}
+
+impl Drop for Pipeline {
+    fn drop(&mut self) {
+        unsafe {
+            sys::optixPipelineDestroy(self.raw);
+        }
+    }
+}
+
+impl Pipeline {
+    /// Sets the stack sizes for a pipeline.
+    ///
+    /// Users are encouraged to see the programming guide and the
+    /// implementations of the helper functions to understand how to
+    /// construct the stack sizes based on their particular needs.
+    /// If this method is not used, an internal default implementation is used.
+    /// The default implementation is correct (but not necessarily optimal) as
+    /// long as the maximum depth of call trees of CC and DC programs is at most
+    /// 2 and no motion transforms are used.
+    ///
+    /// The `max_traversable_graph_depth` corresponds to the maximal number of
+    /// traversables visited when calling trace. Every acceleration structure
+    /// and motion transform counts as one level of traversal. E.g., for a simple
+    /// IAS (instance acceleration structure) -> GAS (geometry acceleration
+    /// structure) traversal graph, the `max_traversable_graph_depth` is two. For
+    /// IAS -> MT (motion transform) -> GAS, the `max_traversable_graph_depth` is
+    /// three.
+    /// Note that it does not matter whether an IAS or GAS has motion
+    /// or not, it always counts as one. Launching OptiX with exceptions
+    /// turned on (see OPTIX_EXCEPTION_FLAG_TRACE_DEPTH) will throw an
+    /// exception if the specified `max_traversable_graph_depth` is too small.
+    ///
+    /// # Arguments
+    /// * `direct_callable_stack_size_from_traversable` - The direct stack size
+    ///   requirement for direct callables invoked from IS or AH
+    /// * `direct_callable_stack_size_from_state` - The direct stack size
+    ///   requirement for direct callables invoked from RG, MS, or CH.
+    /// * `continuation_stack_size` - The continuation stack requirement.
+    /// * `max_traversable_graph_depth` - The maximum depth of a traversable
+    ///   graph passed to trace
+    pub fn set_stack_size(
+        &self,
+        direct_callable_stack_size_from_traversable: u32,
+        direct_callable_stack_size_from_state: u32,
+        continuation_stack_size: u32,
+        max_traversable_graph_depth: u32,
+    ) -> Result<()> {
+        unsafe {
+            Ok(optix_call!(optixPipelineSetStackSize(
+                self.raw,
+                direct_callable_stack_size_from_traversable,
+                direct_callable_stack_size_from_state,
+                continuation_stack_size,
+                max_traversable_graph_depth,
+            ))?)
+        }
+    }
+}
+
+#[repr(transparent)]
+pub struct Module {
+    pub(crate) raw: sys::OptixModule,
+}
+
+/// Module compilation optimization level
+#[cfg_attr(windows, repr(i32))]
+#[cfg_attr(unix, repr(u32))]
+#[derive(Debug, Hash, PartialEq, Copy, Clone)]
+pub enum CompileOptimizationLevel {
+    Default = sys::OptixCompileOptimizationLevel::OPTIX_COMPILE_OPTIMIZATION_DEFAULT,
+    Level0 = sys::OptixCompileOptimizationLevel::OPTIX_COMPILE_OPTIMIZATION_LEVEL_0,
+    Level1 = sys::OptixCompileOptimizationLevel::OPTIX_COMPILE_OPTIMIZATION_LEVEL_1,
+    Level2 = sys::OptixCompileOptimizationLevel::OPTIX_COMPILE_OPTIMIZATION_LEVEL_2,
+    Level3 = sys::OptixCompileOptimizationLevel::OPTIX_COMPILE_OPTIMIZATION_LEVEL_3,
+}
+
+impl Default for CompileOptimizationLevel {
+    fn default() -> Self {
+        CompileOptimizationLevel::Default
+    }
+}
+
+/// Module compilation debug level
+#[cfg_attr(windows, repr(i32))]
+#[cfg_attr(unix, repr(u32))]
+#[derive(Debug, Hash, PartialEq, Copy, Clone)]
+pub enum CompileDebugLevel {
+    None = sys::OptixCompileDebugLevel::OPTIX_COMPILE_DEBUG_LEVEL_NONE,
+    LineInfo = sys::OptixCompileDebugLevel::OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO,
+    Full = sys::OptixCompileDebugLevel::OPTIX_COMPILE_DEBUG_LEVEL_FULL,
+}
+
+impl Default for CompileDebugLevel {
+    fn default() -> Self {
+        CompileDebugLevel::None
+    }
+}
+
+cfg_if::cfg_if!
{ + if #[cfg(any(feature="optix72", feature="optix73"))] { + #[repr(C)] + #[derive(Debug, Hash, PartialEq, Copy, Clone)] + pub struct ModuleCompileOptions { + pub max_register_count: i32, + pub opt_level: CompileOptimizationLevel, + pub debug_level: CompileDebugLevel, + } + + impl From<&ModuleCompileOptions> for sys::OptixModuleCompileOptions { + fn from(o: &ModuleCompileOptions) -> sys::OptixModuleCompileOptions { + sys::OptixModuleCompileOptions { + maxRegisterCount: o.max_register_count, + optLevel: o.opt_level as _, + debugLevel: o.debug_level as _, + boundValues: std::ptr::null(), + numBoundValues: 0, + } + } + } + } else { + #[repr(C)] + #[derive(Debug, Hash, PartialEq, Copy, Clone)] + pub struct ModuleCompileOptions { + pub max_register_count: i32, + pub opt_level: CompileOptimizationLevel, + pub debug_level: CompileDebugLevel, + } + + impl From<&ModuleCompileOptions> for sys::OptixModuleCompileOptions { + fn from(o: &ModuleCompileOptions) -> sys::OptixModuleCompileOptions { + sys::OptixModuleCompileOptions { + maxRegisterCount: o.max_register_count, + optLevel: o.opt_level as u32, + debugLevel: o.debug_level as u32, + } + } + } + } +} + +bitflags::bitflags! { + #[derive(Default)] + pub struct TraversableGraphFlags: OptixEnumBaseType { + const ALLOW_ANY = sys::OptixTraversableGraphFlags::OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; + const ALLOW_SINGLE_GAS = sys::OptixTraversableGraphFlags::OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS; + const ALLOW_SINGLE_LEVEL_INSTANCING = sys::OptixTraversableGraphFlags::OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; + } +} + +bitflags::bitflags! { + #[derive(Default)] + pub struct ExceptionFlags: OptixEnumBaseType { + const NONE = sys::OptixExceptionFlags::OPTIX_EXCEPTION_FLAG_NONE; + const STACK_OVERFLOW = sys::OptixExceptionFlags::OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW; + const TRACE_DEPTH = sys::OptixExceptionFlags::OPTIX_EXCEPTION_FLAG_TRACE_DEPTH; + const USER = sys::OptixExceptionFlags::OPTIX_EXCEPTION_FLAG_USER; + const DEBUG = sys::OptixExceptionFlags::OPTIX_EXCEPTION_FLAG_DEBUG; + } +} + +bitflags::bitflags! 
{
+    #[derive(Default)]
+    pub struct PrimitiveTypeFlags: i32 {
+        const DEFAULT = 0;
+        const CUSTOM = sys::OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+        const ROUND_QUADRATIC_BSPLINE = sys::OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE;
+        const ROUND_CUBIC_BSPLINE = sys::OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+        const ROUND_LINEAR = sys::OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_LINEAR;
+        const TRIANGLE = sys::OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+    }
+}
+
+#[repr(u32)]
+pub enum PrimitiveType {
+    RoundQuadraticBspline =
+        sys::OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE as u32,
+    RoundCubicBspline =
+        sys::OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE as u32,
+    RoundLinear = sys::OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_LINEAR as u32,
+    Triangle = sys::OptixPrimitiveTypeFlags_OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE as u32,
+}
+
+#[derive(Debug, Hash, PartialEq, Clone, Default)]
+pub struct PipelineCompileOptions {
+    uses_motion_blur: bool,
+    traversable_graph_flags: TraversableGraphFlags,
+    num_payload_values: i32,
+    num_attribute_values: i32,
+    exception_flags: ExceptionFlags,
+    pipeline_launch_params_variable_name: Option<CString>,
+    primitive_type_flags: PrimitiveTypeFlags,
+}
+
+impl PipelineCompileOptions {
+    pub fn new() -> PipelineCompileOptions {
+        PipelineCompileOptions {
+            uses_motion_blur: false,
+            traversable_graph_flags: TraversableGraphFlags::ALLOW_ANY,
+            num_payload_values: 0,
+            num_attribute_values: 0,
+            exception_flags: ExceptionFlags::NONE,
+            pipeline_launch_params_variable_name: None,
+            primitive_type_flags: PrimitiveTypeFlags::DEFAULT,
+        }
+    }
+
+    pub fn build(&self) -> sys::OptixPipelineCompileOptions {
+        cfg_if::cfg_if!
{ + if #[cfg(feature="optix73")] { + sys::OptixPipelineCompileOptions { + usesMotionBlur: if self.uses_motion_blur { 1 } else { 0 }, + traversableGraphFlags: self.traversable_graph_flags.bits() as _, + numPayloadValues: self.num_payload_values, + numAttributeValues: self.num_attribute_values, + exceptionFlags: self.exception_flags.bits() as _, + pipelineLaunchParamsVariableName: if let Some(ref name) = self + .pipeline_launch_params_variable_name { + name.as_ptr() + } else { + std::ptr::null() + }, + usesPrimitiveTypeFlags: self.primitive_type_flags.bits() as u32, + reserved: 0, + reserved2: 0, + } + } else { + sys::OptixPipelineCompileOptions { + usesMotionBlur: if self.uses_motion_blur { 1 } else { 0 }, + traversableGraphFlags: self.traversable_graph_flags.bits(), + numPayloadValues: self.num_payload_values, + numAttributeValues: self.num_attribute_values, + exceptionFlags: self.exception_flags.bits(), + pipelineLaunchParamsVariableName: if let Some(ref name) = self + .pipeline_launch_params_variable_name { + name.as_ptr() + } else { + std::ptr::null() + }, + usesPrimitiveTypeFlags: self.primitive_type_flags.bits() as u32, + } + } + } + } + + pub fn uses_motion_blur(mut self, umb: bool) -> Self { + self.uses_motion_blur = umb; + self + } + + pub fn traversable_graph_flags(mut self, tgf: TraversableGraphFlags) -> Self { + self.traversable_graph_flags = tgf; + self + } + + pub fn num_payload_values(mut self, npv: i32) -> Self { + self.num_payload_values = npv; + self + } + + pub fn num_attribute_values(mut self, nav: i32) -> Self { + self.num_attribute_values = nav; + self + } + + pub fn exception_flags(mut self, ef: ExceptionFlags) -> Self { + self.exception_flags = ef; + self + } + + pub fn pipeline_launch_params_variable_name(mut self, name: &str) -> Self { + self.pipeline_launch_params_variable_name = Some( + CString::new(name).expect("pipeline launch params variable name contains nul bytes"), + ); + self + } +} + +/// # Creating and destroying `Module`s +impl Module { + pub fn new( + ctx: &mut DeviceContext, + module_compile_options: &ModuleCompileOptions, + pipeline_compile_options: &PipelineCompileOptions, + ptx: &str, + ) -> Result<(Module, String)> { + let cptx = CString::new(ptx).unwrap(); + let mut log = [0u8; 4096]; + let mut log_len = log.len(); + + let mopt = module_compile_options.into(); + let popt = pipeline_compile_options.build(); + + let mut raw = std::ptr::null_mut(); + let res = unsafe { + optix_call!(optixModuleCreateFromPTX( + ctx.raw, + &mopt as *const _, + &popt, + cptx.as_ptr(), + cptx.as_bytes().len(), + log.as_mut_ptr() as *mut i8, + &mut log_len, + &mut raw, + )) + }; + + let log = CStr::from_bytes_with_nul(&log[0..log_len]) + .unwrap() + .to_string_lossy() + .into_owned(); + + match res { + Ok(()) => Ok((Module { raw }, log)), + Err(source) => Err(Error::ModuleCreation { source, log }), + } + } + + /// Returns a module containing the intersection program for the built-in + /// primitive type specified by the builtinISOptions. This module must be used + /// as the moduleIS for the OptixProgramGroupHitgroup in any SBT record for + /// that primitive type. 
+    pub fn builtin_is_module_get(
+        ctx: &mut DeviceContext,
+        module_compile_options: &ModuleCompileOptions,
+        pipeline_compile_options: &PipelineCompileOptions,
+        builtin_is_module_type: PrimitiveType,
+        uses_motion_blur: bool,
+    ) -> Result<Module> {
+        let is_options = sys::OptixBuiltinISOptions {
+            builtinISModuleType: builtin_is_module_type as _,
+            usesMotionBlur: if uses_motion_blur { 1 } else { 0 },
+        };
+
+        let mut raw = std::ptr::null_mut();
+
+        unsafe {
+            optix_call!(optixBuiltinISModuleGet(
+                ctx.raw,
+                module_compile_options as *const _ as *const _,
+                pipeline_compile_options as *const _ as *const _,
+                &is_options as *const _,
+                &mut raw,
+            ))
+            .map(|_| Module { raw })
+            .map_err(Error::from)
+        }
+    }
+}
+
+impl Drop for Module {
+    fn drop(&mut self) {
+        unsafe {
+            sys::optixModuleDestroy(self.raw);
+        }
+    }
+}
+
+#[derive(Clone)]
+pub struct ProgramGroupModule<'m> {
+    pub module: &'m Module,
+    pub entry_function_name: CString,
+}
+
+pub enum ProgramGroupDesc<'m> {
+    Raygen(ProgramGroupModule<'m>),
+    Miss(ProgramGroupModule<'m>),
+    Exception(ProgramGroupModule<'m>),
+    Hitgroup {
+        ch: Option<ProgramGroupModule<'m>>,
+        ah: Option<ProgramGroupModule<'m>>,
+        is: Option<ProgramGroupModule<'m>>,
+    },
+    Callables {
+        dc: Option<ProgramGroupModule<'m>>,
+        cc: Option<ProgramGroupModule<'m>>,
+    },
+}
+
+impl<'m> ProgramGroupDesc<'m> {
+    pub fn raygen(module: &'m Module, entry_function_name: &str) -> ProgramGroupDesc<'m> {
+        ProgramGroupDesc::Raygen(ProgramGroupModule {
+            module,
+            entry_function_name: CString::new(entry_function_name).expect("Invalid string"),
+        })
+    }
+
+    pub fn miss(module: &'m Module, entry_function_name: &str) -> ProgramGroupDesc<'m> {
+        ProgramGroupDesc::Miss(ProgramGroupModule {
+            module,
+            entry_function_name: CString::new(entry_function_name).expect("Invalid string"),
+        })
+    }
+
+    pub fn exception(module: &'m Module, entry_function_name: &str) -> ProgramGroupDesc<'m> {
+        ProgramGroupDesc::Exception(ProgramGroupModule {
+            module,
+            entry_function_name: CString::new(entry_function_name).expect("Invalid string"),
+        })
+    }
+
+    pub fn hitgroup(
+        ch: Option<(&'m Module, &str)>,
+        ah: Option<(&'m Module, &str)>,
+        is: Option<(&'m Module, &str)>,
+    ) -> ProgramGroupDesc<'m> {
+        ProgramGroupDesc::Hitgroup {
+            ch: ch.map(|(module, entry_function_name)| ProgramGroupModule {
+                module,
+                entry_function_name: CString::new(entry_function_name).expect("Invalid string"),
+            }),
+            ah: ah.map(|(module, entry_function_name)| ProgramGroupModule {
+                module,
+                entry_function_name: CString::new(entry_function_name).expect("Invalid string"),
+            }),
+            is: is.map(|(module, entry_function_name)| ProgramGroupModule {
+                module,
+                entry_function_name: CString::new(entry_function_name).expect("Invalid string"),
+            }),
+        }
+    }
+}
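+
+// For example, a hitgroup descriptor with a closest-hit and an intersection
+// program but no any-hit (the module and entry point names here are
+// illustrative):
+//
+//     let desc = ProgramGroupDesc::hitgroup(
+//         Some((&module, "__closesthit__ch")),
+//         None,
+//         Some((&module, "__intersection__is")),
+//     );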
+
+/// A group of programs to be associated with an SBT record.
+///
+/// Modules can contain more than one program. The program in the module is
+/// designated by its entry function name as part of the [ProgramGroupDesc]
+/// struct passed to [`ProgramGroup::new()`] and
+/// [`ProgramGroup::new_single()`], or specified directly in the
+/// case of [`ProgramGroup::raygen()`],
+/// [`ProgramGroup::miss()`] and
+/// [`ProgramGroup::hitgroup()`].
+///
+/// Four of the program group kinds can contain only a single program; only
+/// hitgroups can designate up to three programs for the closest-hit, any-hit,
+/// and intersection programs.
+///
+/// Programs from modules can be used in any number of [ProgramGroup] objects.
+/// The resulting program groups can be used to fill in any number of
+/// SBT records. Program groups can also be used across pipelines as long as the
+/// compilation options match.
+///
+/// A hit group specifies the intersection program used to test whether a ray
+/// intersects a primitive, together with the hit shaders to be executed when a
+/// ray does intersect the primitive. For built-in primitive types, a built-in
+/// intersection program should be obtained from
+/// [Module::builtin_is_module_get()] and used in the hit group. As a
+/// special case, the intersection program is not required – and is ignored –
+/// for triangle primitives.
+///
+/// # Safety
+/// The lifetime of a module must extend to the lifetime of any
+/// ProgramGroup that references that module.
+/// FIXME (AL): make this sound by storing module lifetimes here
+#[repr(transparent)]
+pub struct ProgramGroup {
+    pub(crate) raw: sys::OptixProgramGroup,
+}
+
+impl ProgramGroup {
+    /// Use this information to calculate the total required stack sizes for a
+    /// particular call graph of NVIDIA OptiX programs.
+    ///
+    /// To set the stack sizes for a particular pipeline, use
+    /// [Pipeline::set_stack_size()](crate::pipeline::Pipeline::set_stack_size()).
+    pub fn get_stack_size(&self) -> Result<StackSizes> {
+        let mut stack_sizes = StackSizes::default();
+        unsafe {
+            Ok(optix_call!(optixProgramGroupGetStackSize(
+                self.raw,
+                &mut stack_sizes as *mut _ as *mut _
+            ))
+            .map(|_| stack_sizes)?)
+        }
+    }
+}
+
+impl PartialEq for ProgramGroup {
+    fn eq(&self, rhs: &ProgramGroup) -> bool {
+        self.raw == rhs.raw
+    }
+}
+
+/// # Creating and destroying `ProgramGroup`s
+impl ProgramGroup {
+    /// Create a [ProgramGroup] for each of the [ProgramGroupDesc] objects in
+    /// `desc`.
+    pub fn new(
+        ctx: &mut DeviceContext,
+        desc: &[ProgramGroupDesc],
+    ) -> Result<(Vec<ProgramGroup>, String)> {
+        cfg_if::cfg_if! {
+            if #[cfg(any(feature="optix73"))] {
+                let pg_options = sys::OptixProgramGroupOptions { reserved: 0 };
+            } else {
+                let pg_options = sys::OptixProgramGroupOptions { placeholder: 0 };
+            }
+        }
+
+        let mut log = [0u8; 4096];
+        let mut log_len = log.len();
+
+        let pg_desc: Vec<sys::OptixProgramGroupDesc> = desc.iter().map(|d| d.into()).collect();
+
+        let mut raws = vec![std::ptr::null_mut(); pg_desc.len()];
+
+        let res = unsafe {
+            optix_call!(optixProgramGroupCreate(
+                ctx.raw,
+                pg_desc.as_ptr(),
+                pg_desc.len() as u32,
+                &pg_options,
+                log.as_mut_ptr() as *mut i8,
+                &mut log_len,
+                raws.as_mut_ptr(),
+            ))
+        };
+
+        let log = CStr::from_bytes_with_nul(&log[0..log_len])
+            .unwrap()
+            .to_string_lossy()
+            .into_owned();
+
+        match res {
+            Ok(()) => Ok((
+                raws.iter().map(|raw| ProgramGroup { raw: *raw }).collect(),
+                log,
+            )),
+            Err(source) => Err(Error::ProgramGroupCreation { source, log }),
+        }
+    }
+
+    /// Create a single [ProgramGroup] specified by `desc`.
+    pub fn new_single(
+        ctx: &mut DeviceContext,
+        desc: &ProgramGroupDesc,
+    ) -> Result<(ProgramGroup, String)> {
+        cfg_if::cfg_if! {
+            if #[cfg(any(feature="optix73"))] {
+                let pg_options = sys::OptixProgramGroupOptions { reserved: 0 };
+            } else {
+                let pg_options = sys::OptixProgramGroupOptions { placeholder: 0 };
+            }
+        }
+
+        let mut log = [0u8; 4096];
+        let mut log_len = log.len();
+
+        let pg_desc: sys::OptixProgramGroupDesc = desc.into();
+
+        let mut raw = std::ptr::null_mut();
+
+        let res = unsafe {
+            optix_call!(optixProgramGroupCreate(
+                ctx.raw,
+                &pg_desc,
+                1,
+                &pg_options,
+                log.as_mut_ptr() as *mut i8,
+                &mut log_len,
+                &mut raw,
+            ))
+        };
+
+        let log = CStr::from_bytes_with_nul(&log[0..log_len])
+            .unwrap()
+            .to_string_lossy()
+            .into_owned();
+
+        match res {
+            Ok(()) => Ok((ProgramGroup { raw }, log)),
+            Err(source) => Err(Error::ProgramGroupCreation { source, log }),
+        }
+    }
+
+    /// Create a raygen [ProgramGroup] from `entry_function_name` in `module`.
+    pub fn raygen(
+        ctx: &mut DeviceContext,
+        module: &Module,
+        entry_function_name: &str,
+    ) -> Result<ProgramGroup> {
+        let desc = ProgramGroupDesc::raygen(module, entry_function_name);
+        Ok(ProgramGroup::new_single(ctx, &desc)?.0)
+    }
+
+    /// Create a miss [ProgramGroup] from `entry_function_name` in `module`.
+    pub fn miss(
+        ctx: &mut DeviceContext,
+        module: &Module,
+        entry_function_name: &str,
+    ) -> Result<ProgramGroup> {
+        let desc = ProgramGroupDesc::miss(module, entry_function_name);
+        Ok(ProgramGroup::new_single(ctx, &desc)?.0)
+    }
+
+    /// Create an exception [ProgramGroup] from `entry_function_name` in `module`.
+    pub fn exception(
+        ctx: &mut DeviceContext,
+        module: &Module,
+        entry_function_name: &str,
+    ) -> Result<ProgramGroup> {
+        let desc = ProgramGroupDesc::exception(module, entry_function_name);
+        Ok(ProgramGroup::new_single(ctx, &desc)?.0)
+    }
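+
+    // For example, creating a raygen program group from a loaded module
+    // (illustrative entry point name, following the OptiX naming convention):
+    //
+    //     let pg_raygen = ProgramGroup::raygen(&mut ctx, &module, "__raygen__rg")?;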
+
+    /// Create a hitgroup [ProgramGroup] from any combination of
+    /// `(module, entry_function_name)` pairs.
+    pub fn hitgroup(
+        ctx: &mut DeviceContext,
+        closest_hit: Option<(&Module, &str)>,
+        any_hit: Option<(&Module, &str)>,
+        intersection: Option<(&Module, &str)>,
+    ) -> Result<ProgramGroup> {
+        let desc = ProgramGroupDesc::hitgroup(closest_hit, any_hit, intersection);
+        Ok(ProgramGroup::new_single(ctx, &desc)?.0)
+    }
+}
+
+impl Drop for ProgramGroup {
+    fn drop(&mut self) {
+        unsafe {
+            sys::optixProgramGroupDestroy(self.raw);
+        }
+    }
+}
+
+impl<'m> From<&ProgramGroupDesc<'m>> for sys::OptixProgramGroupDesc {
+    fn from(desc: &ProgramGroupDesc<'m>) -> sys::OptixProgramGroupDesc {
+        match &desc {
+            ProgramGroupDesc::Raygen(ProgramGroupModule {
+                module,
+                entry_function_name,
+            }) => sys::OptixProgramGroupDesc {
+                kind: sys::OptixProgramGroupKind::OPTIX_PROGRAM_GROUP_KIND_RAYGEN,
+                __bindgen_anon_1: sys::OptixProgramGroupDesc__bindgen_ty_1 {
+                    raygen: sys::OptixProgramGroupSingleModule {
+                        module: module.raw,
+                        entryFunctionName: entry_function_name.as_ptr(),
+                    },
+                },
+                flags: 0,
+            },
+            ProgramGroupDesc::Miss(ProgramGroupModule {
+                module,
+                entry_function_name,
+            }) => sys::OptixProgramGroupDesc {
+                kind: sys::OptixProgramGroupKind::OPTIX_PROGRAM_GROUP_KIND_MISS,
+                __bindgen_anon_1: sys::OptixProgramGroupDesc__bindgen_ty_1 {
+                    miss: sys::OptixProgramGroupSingleModule {
+                        module: module.raw,
+                        entryFunctionName: entry_function_name.as_ptr(),
+                    },
+                },
+                flags: 0,
+            },
+            ProgramGroupDesc::Exception(ProgramGroupModule {
+                module,
+                entry_function_name,
+            }) => sys::OptixProgramGroupDesc {
+                kind: sys::OptixProgramGroupKind::OPTIX_PROGRAM_GROUP_KIND_EXCEPTION,
+                __bindgen_anon_1: sys::OptixProgramGroupDesc__bindgen_ty_1 {
+                    exception: sys::OptixProgramGroupSingleModule {
+                        module: module.raw,
+                        entryFunctionName: entry_function_name.as_ptr(),
+                    },
+                },
+                flags: 0,
+            },
+            ProgramGroupDesc::Hitgroup { ch, ah, is } => {
+                let mut efn_ch_ptr = std::ptr::null();
+                let mut efn_ah_ptr = std::ptr::null();
+                let mut efn_is_ptr = std::ptr::null();
+
+                let module_ch = if let Some(pg_ch) = &ch {
+                    efn_ch_ptr = pg_ch.entry_function_name.as_ptr();
+                    pg_ch.module.raw
+                } else {
+                    std::ptr::null_mut()
+                };
+
+                let module_ah = if let Some(pg_ah) = &ah {
+                    efn_ah_ptr = pg_ah.entry_function_name.as_ptr();
+                    pg_ah.module.raw
+                } else {
+                    std::ptr::null_mut()
+                };
+
+                let module_is = if let Some(pg_is) = &is {
+                    efn_is_ptr = pg_is.entry_function_name.as_ptr();
+                    pg_is.module.raw
+                } else {
+                    std::ptr::null_mut()
+                };
+
+                sys::OptixProgramGroupDesc {
+                    kind: sys::OptixProgramGroupKind::OPTIX_PROGRAM_GROUP_KIND_HITGROUP,
+                    __bindgen_anon_1: sys::OptixProgramGroupDesc__bindgen_ty_1 {
+                        hitgroup: sys::OptixProgramGroupHitgroup {
+                            moduleCH: module_ch,
+                            entryFunctionNameCH: efn_ch_ptr,
+                            moduleAH: module_ah,
+                            entryFunctionNameAH: efn_ah_ptr,
+                            moduleIS: module_is,
+                            entryFunctionNameIS: efn_is_ptr,
+                        },
+                    },
+                    flags: 0,
+                }
+            }
+            ProgramGroupDesc::Callables { dc, cc } => {
+                let (module_dc, efn_dc) = if let Some(pg_dc) = &dc {
+                    (pg_dc.module.raw, pg_dc.entry_function_name.as_ptr())
+                } else {
+                    (std::ptr::null_mut(), std::ptr::null())
+                };
+
+                let (module_cc, efn_cc) = if let Some(pg_cc) = &cc {
+                    (pg_cc.module.raw, pg_cc.entry_function_name.as_ptr())
+                } else {
+                    (std::ptr::null_mut(), std::ptr::null())
+                };
+
+                sys::OptixProgramGroupDesc {
+                    kind: sys::OptixProgramGroupKind::OPTIX_PROGRAM_GROUP_KIND_CALLABLES,
+                    __bindgen_anon_1: sys::OptixProgramGroupDesc__bindgen_ty_1 {
+                        callables: sys::OptixProgramGroupCallables {
+                            moduleDC: module_dc,
+                            entryFunctionNameDC: efn_dc,
+                            moduleCC: module_cc,
+                            entryFunctionNameCC: efn_cc,
+                        },
+                    },
+                    flags: 0,
+                }
+            }
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct StackSizes {
+    pub css_rg: u32,
+    pub css_mg: u32,
+    pub css_ch: u32,
+    pub css_ah: u32,
+    pub css_is: u32,
+    pub css_cc: u32,
+    pub css_dc: u32,
+}
diff --git a/crates/optix/src/prelude.rs b/crates/optix/src/prelude.rs
new file mode 100644
index 00000000..d51f977c
--- /dev/null
+++ b/crates/optix/src/prelude.rs
@@ -0,0 +1,16 @@
+pub use crate::{
+    acceleration::{
+        Aabb, Accel, AccelBufferSizes, AccelBuildOptions, AccelEmitDesc, AccelRelocationInfo,
+        BuildFlags, BuildOperation, CurveArray, CurveType, CustomPrimitiveArray, DynamicAccel,
+        GeometryFlags, IndexTriple, IndexedTriangleArray, Instance, InstanceArray, InstanceFlags,
+        InstancePointerArray, TraversableHandle, TriangleArray, Vertex,
+    },
+    context::{DeviceContext, DeviceProperty},
+    init, launch,
+    pipeline::{
+        CompileDebugLevel, CompileOptimizationLevel, ExceptionFlags, Module, ModuleCompileOptions,
+        Pipeline, PipelineCompileOptions, PipelineLinkOptions, PrimitiveType, PrimitiveTypeFlags,
+        ProgramGroup, ProgramGroupDesc, ProgramGroupModule, StackSizes, TraversableGraphFlags,
+    },
+    shader_binding_table::{SbtRecord, ShaderBindingTable},
+};
diff --git a/crates/optix/src/shader_binding_table.md b/crates/optix/src/shader_binding_table.md
new file mode 100644
index 00000000..7818e5cb
--- /dev/null
+++ b/crates/optix/src/shader_binding_table.md
@@ -0,0 +1,248 @@
+# Shader Binding Table
+
+# Programming Guide...
+<details>
+<summary>Click here to expand programming guide</summary>
+
+# Contents
+
+- [Records](#records)
+- [Layout](#layout)
+- [Acceleration Structures](#acceleration-structures)
+  - [SBT Instance Offset](#sbt-instance-offset)
+  - [SBT Geometry-AS Index](#sbt-geometry-as-index)
+  - [SBT Trace Offset](#sbt-trace-offset)
+  - [SBT Trace Stride](#sbt-trace-stride)
+  - [Example SBT For a Scene](#example-sbt-for-a-scene)
+- [SBT Record Access on Device](#sbt-record-access-on-device)
+
+The shader binding table (SBT) is an array that contains information about the location of programs and their parameters. The SBT resides in device memory and is managed by the application.
+
+The shader binding table can be complex to get your head around. In addition to this documentation, you might also enjoy reading Will Usher's [*The RTX Shader Binding Table Three Ways*](https://www.willusher.io/graphics/2019/11/20/the-sbt-three-ways).
+
+## Records
+A record is an array element of the SBT that consists of a header and a data block. The header content is opaque to the application, containing information accessed by traversal execution to identify and invoke programs.
+
+Rather than pack the records manually as in the C API, the Rust API instead gives you a generic [`SbtRecord`] type that you can specialize to supply your data to the SBT:
+
+```
+#[derive(Copy, Clone, Default, DeviceCopy)]
+struct HitgroupSbtData {
+    object_id: u32,
+}
+type HitgroupRecord = SbtRecord<HitgroupSbtData>;
+
+// Pack the object ids into the record for each object. In a real application
+// you would supply pointers to device memory containing vertex attributes
+// such as smoothed normals, texture coordinates etc.
+let rec_hitgroup: Vec<_> = (0..num_objects)
+    .map(|i| {
+        let object_type = 0;
+        let rec = HitgroupRecord::pack(
+            HitgroupSbtData { object_id: i },
+            &pg_hitgroup[object_type],
+        )
+        .expect("failed to pack hitgroup record");
+        rec
+    })
+    .collect();
+```
+
+The data section of an [`SbtRecord`] can be accessed on the device using the `optixGetSbtDataPointer()` device function.
+
+## Layout
+
+A shader binding table is split into five sections, where each section represents a unique program group type:
+
+| Group          | Program Types in Group                 |
+|----------------|----------------------------------------|
+| Ray Generation | ray-generation                         |
+| Exception      | exception                              |
+| Miss           | miss                                   |
+| Hit            | closest-hit, any-hit, intersection     |
+| Callable       | direct-callable, continuation-callable |
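+
+Each of these sections has a corresponding builder method on
+[`ShaderBindingTable`]. As a sketch, an SBT that fills in every section might
+look like the following, assuming an [`SbtRecord`] buffer has already been
+packed and uploaded for each section (the `buf_*` names are illustrative):
+
+```
+let sbt = ShaderBindingTable::new(&buf_raygen)
+    .exception(&buf_exception)
+    .miss(&buf_miss)
+    .hitgroup(&buf_hitgroup)
+    .callables(&buf_callables);
+```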
+
+See also [Program Group Creation](crate::pipeline)
+
+The [`ShaderBindingTable`] is created by passing [`DeviceBuffer`]s of [`SbtRecord`]s to the constructor:
+
+```
+let mut buf_raygen = DeviceBuffer::from_slice(&rec_raygen)?;
+let mut buf_miss = DeviceBuffer::from_slice(&rec_miss)?;
+let mut buf_hitgroup = DeviceBuffer::from_slice(&rec_hitgroup)?;
+
+let sbt = ShaderBindingTable::new(&mut buf_raygen)
+    .miss(&mut buf_miss)
+    .hitgroup(&mut buf_hitgroup);
+```
+
+The [`SbtRecord`] buffers are assumed to be densely packed, and each [`SbtRecord`] must be correctly aligned to 16 bytes.
+
+The index to records in the shader binding table is used in different ways for the miss, hit, and callables groups:
+
+* *Miss* - Miss programs are selected for every `optixTrace` call using the `missSBTIndex` parameter.
+* *Callables* - Callables take the index as a parameter and call the direct-callable when invoking `optixDirectCall` and continuation-callable when invoking `optixContinuationCall`.
+* *Any-hit, closest-hit, intersection* - The computation of the index for the hit group (intersection, any-hit, closest-hit) is done during traversal. See [Acceleration structures](#acceleration-structures) for more detail.
+
+## Acceleration Structures
+
+The selection of the SBT hit group record for the instance is slightly more involved to allow for a number of use cases such as the implementation of different ray types. The SBT record index `sbt_index` is determined by the following index calculation during traversal:
+
+```text
+sbt-index =
+    sbt-instance-offset
+    + (sbt-geometry-acceleration-structure-index * sbt-stride-from-trace-call)
+    + sbt-offset-from-trace-call
+```
+
+The index calculation depends upon the following SBT indices and offsets:
+
+* Instance offset
+* Geometry acceleration structure index
+* Trace offset
+* Trace stride
+
+### SBT Instance Offset
+
+Instance acceleration structure instances (type [`Instance`]) store an SBT offset that is applied during traversal. This is zero for a single geometry-AS traversable because there is no corresponding instance-AS to hold the value. (See “Traversal of a single geometry acceleration structure”.) This value is limited to 24 bits.
+
+### SBT Geometry-AS Index
+
+Each geometry acceleration structure build input references at least one SBT record. The first SBT geometry acceleration structure index for each geometry acceleration structure build input is the prefix sum of the number of SBT records referenced by the preceding build inputs. Therefore, the computed SBT geometry acceleration structure index is dependent on the order of the build inputs.
+
+The following example demonstrates a geometry acceleration structure with three build inputs. Each build input references one SBT record by specifying `num_sbt_records=1`. When intersecting geometry at trace time, the SBT geometry acceleration structure index used to compute the `sbt_index` to select the hit group record will be organized as follows:
+
+| SBT Geometry-AS Index   | 0                | 1                | 2                |
+|-------------------------|------------------|------------------|------------------|
+| Geometry-AS build input | `build_input[0]` | `build_input[1]` | `build_input[2]` |
+
+In this simple example, the index for the build input equals the SBT geometry acceleration structure index. Hence, whenever a primitive from `build_input[1]` is intersected, the SBT geometry acceleration structure index is one.
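+
+Written out in code, the full index calculation from above is plain integer
+arithmetic; a small sketch (the function and parameter names are illustrative,
+not part of the `optix` API):
+
+```
+fn sbt_index(
+    instance_offset: u32,        // from the Instance, limited to 24 bits
+    gas_index: u32,              // SBT geometry-AS index
+    sbt_stride_from_trace: u32,  // SBTstride argument to optixTrace
+    sbt_offset_from_trace: u32,  // SBToffset argument to optixTrace
+) -> u32 {
+    instance_offset + gas_index * sbt_stride_from_trace + sbt_offset_from_trace
+}
+```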
+
+When a single build input references multiple SBT records (for example, to support multiple materials per geometry), the mapping corresponds to the prefix sum over the number of referenced SBT records.
+
+For example, consider three build inputs where the first build input references four SBT records, the second references one SBT record, and the last references two SBT records:
+
+| SBT Geometry-AS Index | Geometry-AS build input                |
+|-----------------------|----------------------------------------|
+| 0–3                   | `build_input[0]` (`num_sbt_records=4`) |
+| 4                     | `build_input[1]` (`num_sbt_records=1`) |
+| 5–6                   | `build_input[2]` (`num_sbt_records=2`) |
+
+These three build inputs result in the following possible SBT geometry acceleration structure indices when intersecting the corresponding geometry acceleration structure build input:
+
+* One index in the range of [0,3] if a primitive from `build_input[0]` is intersected
+* The index 4 if a primitive from `build_input[1]` is intersected
+* One index in the range of [5,6] if a primitive from `build_input[2]` is intersected
+
+The per-primitive SBT index offsets, as specified by using `sbt_index_offset_buffer`, are local to the build input. Hence, per-primitive offsets in the range [0,3] for the first build input and in the range [0,1] for the last build input map to the SBT geometry acceleration structure index as follows:
+
+| SBT Geometry-AS Index | Per-primitive `sbt_index_offset`         |
+|-----------------------|------------------------------------------|
+| 0                     | `build_input[0].sbt_index_offset: [0]`   |
+| 1                     | `build_input[0].sbt_index_offset: [1]`   |
+| 2                     | `build_input[0].sbt_index_offset: [2]`   |
+| 3                     | `build_input[0].sbt_index_offset: [3]`   |
+| 4                     | `build_input[1].sbt_index_offset = None` |
+| 5                     | `build_input[2].sbt_index_offset: [0]`   |
+| 6                     | `build_input[2].sbt_index_offset: [1]`   |
+
+Because `build_input[1]` references a single SBT record, a `sbt_index_offset_buffer` does not need to be specified for the geometry acceleration structure build. See “Acceleration structures”.
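+
+The prefix-sum mapping from build inputs to starting SBT geometry-AS indices
+is easy to compute by hand; a small sketch for the example above (illustrative
+code, not part of the `optix` API):
+
+```
+// Number of SBT records referenced by each build input.
+let num_sbt_records = [4u32, 1, 2];
+
+// The starting SBT geometry-AS index of each build input is the prefix sum
+// of the counts of all preceding build inputs: [0, 4, 5].
+let mut start_indices = Vec::new();
+let mut sum = 0;
+for n in num_sbt_records {
+    start_indices.push(sum);
+    sum += n;
+}
+assert_eq!(start_indices, [0, 4, 5]);
+```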
+
+### SBT Trace Offset
+
+The `optixTrace` function takes the parameter `SBToffset`, allowing for an SBT access shift for this specific ray. It is required to implement different ray types, i.e. the offset is the index of the ray type.
+
+### SBT Trace Stride
+
+The parameter `SBTstride`, defined as an index offset, is multiplied with the SBT geometry acceleration structure index by `optixTrace` during traversal. It is required to implement different ray types, i.e. the stride is the number of ray types.
+
+### Example SBT For a Scene
+
+In this example, a shader binding table implements the program selection for a simple scene containing one instance acceleration structure and two instances of the same geometry acceleration structure, where the geometry acceleration structure has two build inputs:
+
+![Structure of a simple scene](scene_graph)
+
+The first build input references a single SBT record, while the second one references two SBT records. There are two ray types: one for forward path tracing and one for shadow rays (next event estimation). The two instances of the geometry acceleration structure have different transforms and SBT offsets to allow for material variation in each instance of the same geometry acceleration structure. Therefore, the SBT needs to hold two miss records and 12 hit group records (three for the geometry acceleration structure, ×2 for the ray types, ×2 for the two instances in the instance acceleration structure).
+
+![Example SBT](example_sbt)
+
+To trace a ray of type 0 (for example, for path tracing):
+
+```text
+optixTrace(IAS_handle,
+           ray_org, ray_dir,
+           tmin, tmax, time,
+           visMask, rayFlags,
+           0, // sbtOffset
+           2, // sbtStride
+           0, // missSBTIndex
+           rayPayload0, ...);
+```
+
+Shadow rays need to pass in an adjusted `sbtOffset` as well as `missSBTIndex`:
+
+```text
+optixTrace(IAS_handle,
+           ray_org, ray_dir,
+           tmin, tmax, time,
+           visMask, rayFlags,
+           1, // sbtOffset
+           2, // sbtStride
+           1, // missSBTIndex
+           rayPayload0, ...);
+```
+
+Program groups of different types (ray generation, miss, intersection, and so on) do not need to be adjacent to each other as shown in the example. The pointer to the first SBT record of each program group type is passed to [`launch()`](crate::launch), as described previously, which allows for arbitrary spacing in the SBT between the records of different program group types.
+
+## SBT Record Access on Device
+
+To access the SBT data section of the currently running program, request its pointer by using an API function:
+
+```text
+CUdeviceptr optixGetSbtDataPointer();
+```
+
+Typically, this pointer is cast to a pointer that represents the layout of the data section. For example, for a closest hit program, the application gets access to the data associated with the SBT record that was used to invoke that closest hit program:
+
+```text
+struct CHData {
+    int meshIdx;       // Triangle mesh build input index
+    float3 base_color;
+};
+
+CHData* material_info = (CHData*)optixGetSbtDataPointer();
+```
+
+The program is encouraged to rely on the alignment constraints of the SBT data section to read this data efficiently.
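+
+There is no safe wrapper for this in the `optix_device` crate yet, but a
+hypothetical Rust device-side equivalent could bind the underlying intrinsic
+directly. The `CHData` layout and the binding shown here are illustrative
+assumptions, not an existing API (`optixGetSbtDataPointer()` lowers to the
+`_optix_get_sbt_data_ptr_64` intrinsic):
+
+```text
+#[repr(C)]
+struct CHData {
+    mesh_idx: i32,         // Triangle mesh build input index
+    base_color: [f32; 3],
+}
+
+let ptr: u64;
+unsafe {
+    // Read the pointer to the data section of the current SBT record.
+    asm!("call ({0}), _optix_get_sbt_data_ptr_64, ();", out(reg64) ptr);
+}
+let material_info = unsafe { &*(ptr as *const CHData) };
+```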
+
+</details>
+
+[`Instance`]: crate::acceleration::Instance
+[`SbtRecord`]: crate::shader_binding_table::SbtRecord
+[`ShaderBindingTable`]: crate::shader_binding_table::ShaderBindingTable
+[`DeviceBuffer`]: cust::memory::DeviceBuffer
diff --git a/crates/optix/src/shader_binding_table.rs b/crates/optix/src/shader_binding_table.rs
new file mode 100644
index 00000000..f0edd95a
--- /dev/null
+++ b/crates/optix/src/shader_binding_table.rs
@@ -0,0 +1,118 @@
+use crate::{const_assert, const_assert_eq};
+use crate::{error::Error, optix_call, pipeline::ProgramGroup, sys};
+use cust::memory::{DeviceCopy, DeviceSlice};
+
+type Result<T, E = Error> = std::result::Result<T, E>;
+
+#[repr(C)]
+#[repr(align(16))]
+#[derive(Copy, Clone)]
+pub struct SbtRecord<T>
+where
+    T: Copy,
+{
+    header: sys::SbtRecordHeader,
+    data: T,
+}
+
+// impl<T> Copy for SbtRecord<T> where T: Copy {}
+
+impl<T> SbtRecord<T>
+where
+    T: Copy,
+{
+    pub fn pack(data: T, program_group: &ProgramGroup) -> Result<SbtRecord<T>> {
+        let mut rec = SbtRecord {
+            header: sys::SbtRecordHeader::default(),
+            data,
+        };
+
+        unsafe {
+            Ok(optix_call!(optixSbtRecordPackHeader(
+                program_group.raw,
+                &mut rec as *mut _ as *mut std::os::raw::c_void,
+            ))
+            .map(|_| rec)?)
+        }
+    }
+}
+
+unsafe impl<T: Copy> DeviceCopy for SbtRecord<T> {}
+
+#[repr(transparent)]
+pub struct ShaderBindingTable(pub(crate) sys::OptixShaderBindingTable);
+
+impl ShaderBindingTable {
+    pub fn new<T: Copy>(buf_raygen_record: &DeviceSlice<SbtRecord<T>>) -> Self {
+        let raygen_record = buf_raygen_record.as_device_ptr().as_raw();
+        ShaderBindingTable(sys::OptixShaderBindingTable {
+            raygenRecord: raygen_record,
+            exceptionRecord: 0,
+            missRecordBase: 0,
+            missRecordStrideInBytes: 0,
+            missRecordCount: 0,
+            hitgroupRecordBase: 0,
+            hitgroupRecordStrideInBytes: 0,
+            hitgroupRecordCount: 0,
+            callablesRecordBase: 0,
+            callablesRecordStrideInBytes: 0,
+            callablesRecordCount: 0,
+        })
+    }
+
+    pub fn exception<T: Copy>(
+        mut self,
+        buf_exception_record: &DeviceSlice<SbtRecord<T>>,
+    ) -> Self {
+        if buf_exception_record.len() != 1 {
+            panic!("SBT not passed single exception record",);
+        }
+        self.0.exceptionRecord = buf_exception_record.as_device_ptr().as_raw();
+        self
+    }
+
+    pub fn miss<T: Copy>(mut self, buf_miss_records: &DeviceSlice<SbtRecord<T>>) -> Self {
+        if buf_miss_records.is_empty() {
+            panic!("SBT passed empty miss records");
+        }
+        self.0.missRecordBase = buf_miss_records.as_device_ptr().as_raw();
+        self.0.missRecordStrideInBytes = std::mem::size_of::<SbtRecord<T>>() as u32;
+        self.0.missRecordCount = buf_miss_records.len() as u32;
+        self
+    }
+
+    pub fn hitgroup<T: Copy>(
+        mut self,
+        buf_hitgroup_records: &DeviceSlice<SbtRecord<T>>,
+    ) -> Self {
+        if buf_hitgroup_records.is_empty() {
+            panic!("SBT passed empty hitgroup records");
+        }
+        self.0.hitgroupRecordBase = buf_hitgroup_records.as_device_ptr().as_raw();
+        self.0.hitgroupRecordStrideInBytes = std::mem::size_of::<SbtRecord<T>>() as u32;
+        self.0.hitgroupRecordCount = buf_hitgroup_records.len() as u32;
+        self
+    }
+
+    pub fn callables<T: Copy>(
+        mut self,
+        buf_callables_records: &DeviceSlice<SbtRecord<T>>,
+    ) -> Self {
+        if buf_callables_records.is_empty() {
+            panic!("SBT passed empty callables records");
+        }
+        self.0.callablesRecordBase = buf_callables_records.as_device_ptr().as_raw();
+        self.0.callablesRecordStrideInBytes = std::mem::size_of::<SbtRecord<T>>() as u32;
+        self.0.callablesRecordCount = buf_callables_records.len() as u32;
+        self
+    }
+}
+
+const_assert_eq!(
+    std::mem::align_of::<ShaderBindingTable>(),
+    std::mem::align_of::<sys::OptixShaderBindingTable>(),
+);
+const_assert_eq!(
+    std::mem::size_of::<ShaderBindingTable>(),
+    std::mem::size_of::<sys::OptixShaderBindingTable>()
+);
diff --git a/crates/optix/src/sys.rs b/crates/optix/src/sys.rs
new file mode 100644
index 00000000..7243625a
--- /dev/null
+++ b/crates/optix/src/sys.rs
@@ -0,0 +1,59 @@
+#![allow(warnings)]
+
+use cust_raw::*;
+
+use std::mem::ManuallyDrop;
+
+type size_t = usize;
+
+include!(concat!(env!("OUT_DIR"), "/optix_wrapper.rs"));
+
+extern "C" {
+    pub fn optixInit() -> OptixResult;
+}
+
+// The SBT record header is an opaque blob used by optix
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+pub struct SbtRecordHeader {
+    header: [u8; OptixSbtRecordHeaderSize as usize],
+}
+
+impl SbtRecordHeader {
+    pub fn as_mut_ptr(&mut self) -> *mut std::os::raw::c_void {
+        self.header.as_mut_ptr() as *mut std::os::raw::c_void
+    }
+}
+
+// Manually define the build input union as the bindgen is pretty nasty
+#[repr(C)]
+pub union OptixBuildInputUnion {
+    pub triangle_array: ManuallyDrop<OptixBuildInputTriangleArray>,
+    pub curve_array: ManuallyDrop<OptixBuildInputCurveArray>,
+    pub custom_primitive_array: ManuallyDrop<OptixBuildInputCustomPrimitiveArray>,
+    pub instance_array: ManuallyDrop<OptixBuildInputInstanceArray>,
+    pad: [std::os::raw::c_char; 1024],
+}
+
+impl Default for OptixBuildInputUnion {
+    fn default() -> OptixBuildInputUnion {
+        OptixBuildInputUnion { pad: [0i8; 1024] }
+    }
+}
+
+#[repr(C)]
+pub struct OptixBuildInput {
+    pub type_: OptixBuildInputType,
+    pub input: OptixBuildInputUnion,
+}
+
+// Sanity check that the size of this union we're defining matches the one in
+// optix header so we don't get any nasty surprises
+fn _size_check() {
+    unsafe {
+        std::mem::transmute::<OptixBuildInput, [u8; 1024 + 8]>(OptixBuildInput {
+            type_: OptixBuildInputType_OPTIX_BUILD_INPUT_TYPE_TRIANGLES,
+            input: { OptixBuildInputUnion { pad: [0; 1024] } },
+        });
+    }
+}
diff --git a/crates/optix_device/Cargo.toml b/crates/optix_device/Cargo.toml
new file mode 100644
index 00000000..c731edfd
--- /dev/null
+++ b/crates/optix_device/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "optix_device"
+version = "0.1.0"
+edition = "2021"
+authors = ["Anders Langlands <anderslanglands@gmail.com>", "Riccardo D'Ambrosio <rdambrosio016@gmail.com>"]
+
+[dependencies]
+cuda_std = { version = "0.2", path = "../cuda_std" }
+glam = { version = "0.20", features=["cuda", "libm"], default-features=false }
diff --git a/crates/optix_device/src/lib.rs b/crates/optix_device/src/lib.rs
new file mode 100644
index 00000000..f8ce4778
--- /dev/null
+++ b/crates/optix_device/src/lib.rs
@@ -0,0 +1,47 @@
+#![cfg_attr(
+    target_arch = "nvptx64",
+    no_std,
+    feature(register_attr, asm),
+    register_attr(nvvm_internal)
+)]
+
+extern crate alloc;
+
+use cuda_std::*;
+use glam::UVec3;
+
+extern "C" {
+    pub fn vprintf(format: *const u8, valist: *const core::ffi::c_void) -> i32;
+}
+
+#[gpu_only]
+#[inline(always)]
+pub fn get_launch_index() -> UVec3 {
+    let x: u32;
+    let y: u32;
+    let z: u32;
+
+    unsafe {
+        asm!("call ({0}), _optix_get_launch_index_x, ();", out(reg32) x);
+        asm!("call ({0}), _optix_get_launch_index_y, ();", out(reg32) y);
+        asm!("call ({0}), _optix_get_launch_index_z, ();", out(reg32) z);
+    }
+
+    UVec3::new(x, y, z)
+}
+
+#[gpu_only]
+#[inline(always)]
+pub fn get_launch_dimensions() -> UVec3 {
+    let x: u32;
+    let y: u32;
+    let z: u32;
+
+    unsafe {
+        asm!("call ({0}), _optix_get_launch_dimension_x, ();", out(reg32) x);
+        asm!("call ({0}), _optix_get_launch_dimension_y, ();", out(reg32) y);
+        asm!("call ({0}), _optix_get_launch_dimension_z, ();", out(reg32) z);
+    }
+
+    UVec3::new(x, y, z)
+}
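+
+// A sketch of how these might be used from a raygen kernel to compute a
+// flat pixel index (the kernel itself is illustrative):
+//
+//     let idx = get_launch_index();
+//     let dim = get_launch_dimensions();
+//     let pixel = (idx.y * dim.x + idx.x) as usize;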
"https://github.com/Rust-GPU/Rust-CUDA" -readme = "../../README.md" - -[dependencies] -cust_raw = { version = "0.11.2", path = "../cust_raw" } - -[build-dependencies] -cc = "1.0.71" -find_cuda_helper = { version = "0.2", path = "../find_cuda_helper" } diff --git a/crates/optix_sys/build.rs b/crates/optix_sys/build.rs deleted file mode 100644 index 39068666..00000000 --- a/crates/optix_sys/build.rs +++ /dev/null @@ -1,33 +0,0 @@ -use find_cuda_helper::{find_cuda_root, find_optix_root}; -use std::env; - -// OptiX is a bit exotic in how it provides its functions. It uses a function table -// approach, a function table struct holds function pointers to every optix function. Then -// the Optix driver dll is loaded at runtime and the function table is loaded from that. -// OptiX provides this logic inside optix_stubs.h in the include dir, so we need to compile that -// to a lib and link it in so that we have the initialization and C function logic. -fn main() { - let out_dir = env::var("OUT_DIR").unwrap(); - let mut optix_include = find_optix_root().expect( - "Unable to find the OptiX SDK, make sure you installed it and - that OPTIX_ROOT or OPTIX_ROOT_DIR are set", - ); - - optix_include = optix_include.join("include"); - - let mut cuda_include = find_cuda_root().expect( - "Unable to find the CUDA Toolkit, make sure you installed it and - that CUDA_ROOT, CUDA_PATH or CUDA_TOOLKIT_ROOT_DIR are set", - ); - cuda_include = cuda_include.join("include"); - - cc::Build::new() - .file("./optix_stubs.c") - .include(optix_include) - .include(cuda_include) - .cpp(false) - .compile("optix_stubs"); - - println!("cargo:rustc-link-search=native={}", out_dir); - println!("cargo:rustc-link-lib=static=optix_stubs"); -} diff --git a/crates/optix_sys/src/lib.rs b/crates/optix_sys/src/lib.rs deleted file mode 100644 index 088cb334..00000000 --- a/crates/optix_sys/src/lib.rs +++ /dev/null @@ -1,10 +0,0 @@ -//! Raw bindings to the OptiX 7.3 SDK. - -#![allow(warnings)] - -use cust_raw::*; -include!("../optix.rs"); - -extern "C" { - pub fn optixInit() -> OptixResult; -} diff --git a/examples/cuda/cpu/add/src/main.rs b/examples/cuda/cpu/add/src/main.rs index 5d5deb96..8ced6476 100644 --- a/examples/cuda/cpu/add/src/main.rs +++ b/examples/cuda/cpu/add/src/main.rs @@ -29,13 +29,13 @@ fn main() -> Result<(), Box> { let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?; // allocate the GPU memory needed to house our numbers and copy them over. - let mut lhs_gpu = lhs.as_slice().as_dbuf()?; - let mut rhs_gpu = rhs.as_slice().as_dbuf()?; + let lhs_gpu = lhs.as_slice().as_dbuf()?; + let rhs_gpu = rhs.as_slice().as_dbuf()?; // allocate our output buffer. You could also use DeviceBuffer::uninitialized() to avoid the // cost of the copy, but you need to be careful not to read from the buffer. let mut out = vec![0.0f32; NUMBERS_LEN]; - let mut out_buf = out.as_slice().as_dbuf()?; + let out_buf = out.as_slice().as_dbuf()?; // retrieve the add kernel from the module so we can calculate the right launch config. 
let func = module.get_function("add")?; diff --git a/examples/cuda/cpu/path_tracer/Cargo.toml b/examples/cuda/cpu/path_tracer/Cargo.toml index 5851fee3..d4aac270 100644 --- a/examples/cuda/cpu/path_tracer/Cargo.toml +++ b/examples/cuda/cpu/path_tracer/Cargo.toml @@ -4,8 +4,9 @@ version = "0.1.0" edition = "2018" [dependencies] +vek = "0.15" bytemuck = { version = "1.7.2", features = ["derive"] } -cust = { version = "0.2", path = "../../../../crates/cust", features = ["vek"] } +cust = { version = "0.2", path = "../../../../crates/cust", features = ["impl_vek"] } image = "0.23.14" path_tracer_gpu = { path = "../../gpu/path_tracer_gpu" } gpu_rand = { version = "0.1", path = "../../../../crates/gpu_rand" } diff --git a/examples/cuda/cpu/path_tracer/src/common.rs b/examples/cuda/cpu/path_tracer/src/common.rs index d875b9b7..7f342492 100644 --- a/examples/cuda/cpu/path_tracer/src/common.rs +++ b/examples/cuda/cpu/path_tracer/src/common.rs @@ -1,8 +1,8 @@ -use cust::vek::{Vec2, Vec3}; use glutin::event::{ ElementState, Event, MouseButton, MouseScrollDelta, VirtualKeyCode, WindowEvent, }; use path_tracer_gpu::Viewport; +use vek::{Vec2, Vec3}; #[derive(Debug, Clone, Copy, PartialEq)] pub struct Camera { diff --git a/examples/cuda/cpu/path_tracer/src/cpu/mod.rs b/examples/cuda/cpu/path_tracer/src/cpu/mod.rs index ba7d982b..3b67efeb 100644 --- a/examples/cuda/cpu/path_tracer/src/cpu/mod.rs +++ b/examples/cuda/cpu/path_tracer/src/cpu/mod.rs @@ -1,6 +1,5 @@ use std::time::Duration; -use cust::vek::{Clamp, Vec2, Vec3}; use gpu_rand::{DefaultRand, GpuRand}; use imgui::Ui; use path_tracer_gpu::{ @@ -8,6 +7,7 @@ use path_tracer_gpu::{ }; use rayon::prelude::*; use sysinfo::{ProcessorExt, System, SystemExt}; +use vek::{Clamp, Vec2, Vec3}; use crate::{common::Camera, cuda::SEED}; diff --git a/examples/cuda/cpu/path_tracer/src/cuda/data.rs b/examples/cuda/cpu/path_tracer/src/cuda/data.rs index 79353091..d7698498 100644 --- a/examples/cuda/cpu/path_tracer/src/cuda/data.rs +++ b/examples/cuda/cpu/path_tracer/src/cuda/data.rs @@ -3,10 +3,10 @@ use cust::{ error::CudaResult, memory::{DeviceBuffer, DeviceCopy, UnifiedBuffer}, util::SliceExt, - vek::{num_traits::Zero, Vec2, Vec3}, }; use gpu_rand::DefaultRand; use path_tracer_gpu::{material::MaterialKind, scene::Scene, Object, Viewport}; +use vek::{num_traits::Zero, Vec2, Vec3}; use super::SEED; diff --git a/examples/cuda/cpu/path_tracer/src/cuda/mod.rs b/examples/cuda/cpu/path_tracer/src/cuda/mod.rs index e83dc610..a80ee2a2 100644 --- a/examples/cuda/cpu/path_tracer/src/cuda/mod.rs +++ b/examples/cuda/cpu/path_tracer/src/cuda/mod.rs @@ -11,13 +11,13 @@ use cust::{ event::{Event, EventFlags}, function::{BlockSize, GridSize}, prelude::*, - vek::{Vec2, Vec3}, }; use optix::{ - context::OptixContext, + context::DeviceContext, denoiser::{Denoiser, DenoiserModelKind, Image, ImageFormat}, }; use path_tracer_gpu::scene::Scene; +use vek::{Vec2, Vec3}; /// Seed for the random states pub const SEED: u64 = 932174513921034; @@ -33,7 +33,7 @@ pub struct CudaRenderer { stream: Stream, module: Module, denoiser: Denoiser, - _optix_context: OptixContext, + _optix_context: DeviceContext, _context: Context, buffers: CudaRendererBuffers, @@ -45,7 +45,7 @@ impl CudaRenderer { let context = cust::quick_init()?; optix::init().unwrap(); - let optix_context = OptixContext::new(&context).unwrap(); + let optix_context = DeviceContext::new(&context, false).unwrap(); let module = Module::from_ptx(PTX, &[])?; let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?; @@ -91,9 +91,10 @@ impl 
CudaRenderer { self.buffers.resize(new_size)?; self.cpu_image.resize(new_size.product(), Vec3::zero()); - Ok(self - .denoiser - .setup_state(&self.stream, new_size.x as u32, new_size.y as u32, false)?) + self.denoiser + .setup_state(&self.stream, new_size.x as u32, new_size.y as u32, false) + .unwrap(); + Ok(()) } /// calculate an optimal launch configuration for an image kernel @@ -144,13 +145,15 @@ impl CudaRenderer { height, ); - self.denoiser.invoke( - stream, - Default::default(), - input_image, - Default::default(), - &mut self.buffers.denoised_buffer, - )?; + self.denoiser + .invoke( + stream, + Default::default(), + input_image, + Default::default(), + &mut self.buffers.denoised_buffer, + ) + .unwrap(); self.buffers.denoised_buffer.as_device_ptr() } else { @@ -187,7 +190,7 @@ impl CudaRenderer { let (blocks, threads) = self.launch_dimensions(); - let mut scene = Scene { + let scene = Scene { objects: &self.buffers.objects, materials: &self.buffers.materials, } diff --git a/examples/cuda/cpu/path_tracer/src/main.rs b/examples/cuda/cpu/path_tracer/src/main.rs index e7a6d3fd..58b2f7b9 100644 --- a/examples/cuda/cpu/path_tracer/src/main.rs +++ b/examples/cuda/cpu/path_tracer/src/main.rs @@ -5,7 +5,6 @@ pub mod renderer; pub mod viewer; use common::Camera; -use cust::vek::Vec3; use path_tracer_gpu::{ material::{DiffuseMaterial, MaterialKind, MetallicMaterial}, scene::Scene, @@ -13,6 +12,7 @@ use path_tracer_gpu::{ Object, }; use std::error::Error; +use vek::Vec3; pub const WIDTH: u32 = 1920; pub const HEIGHT: u32 = 1080; diff --git a/examples/cuda/cpu/path_tracer/src/renderer.rs b/examples/cuda/cpu/path_tracer/src/renderer.rs index 0b86c83e..5ec9c512 100644 --- a/examples/cuda/cpu/path_tracer/src/renderer.rs +++ b/examples/cuda/cpu/path_tracer/src/renderer.rs @@ -1,8 +1,8 @@ -use cust::vek::Vec2; use glutin::{event::Event, event_loop::ControlFlow}; use imgui::Ui; use path_tracer_gpu::scene::Scene; use sysinfo::{System, SystemExt}; +use vek::Vec2; use crate::{ common::{Camera, CameraController}, diff --git a/examples/cuda/cpu/path_tracer/src/viewer.rs b/examples/cuda/cpu/path_tracer/src/viewer.rs index 2c74f37c..4943fc55 100644 --- a/examples/cuda/cpu/path_tracer/src/viewer.rs +++ b/examples/cuda/cpu/path_tracer/src/viewer.rs @@ -1,4 +1,3 @@ -use cust::vek::Vec2; use glium::{ implement_vertex, index::{NoIndices, PrimitiveType}, @@ -16,6 +15,7 @@ use imgui::Condition; use imgui_winit_support::{HiDpiMode, WinitPlatform}; use path_tracer_gpu::scene::Scene; use std::time::Instant; +use vek::Vec2; use crate::{common::Camera, renderer::Renderer, HEIGHT, WIDTH}; diff --git a/examples/optix/denoiser/Cargo.toml b/examples/optix/denoiser/Cargo.toml index 105b8dde..d0f26164 100644 --- a/examples/optix/denoiser/Cargo.toml +++ b/examples/optix/denoiser/Cargo.toml @@ -6,5 +6,6 @@ edition = "2021" [dependencies] optix = { version = "0.1", path = "../../../crates/optix" } structopt = "0.3" -cust = { version = "0.2", path = "../../../crates/cust", features = ["vek"] } +cust = { version = "0.2", path = "../../../crates/cust", features = ["impl_vek"] } image = "0.23.14" +vek = { version = "0.15.1" } diff --git a/examples/optix/denoiser/src/main.rs b/examples/optix/denoiser/src/main.rs index 59c860f8..b6ca9f5e 100644 --- a/examples/optix/denoiser/src/main.rs +++ b/examples/optix/denoiser/src/main.rs @@ -1,13 +1,13 @@ use cust::memory::DeviceBuffer; use cust::prelude::{Stream, StreamFlags}; use cust::util::SliceExt; -use cust::vek::{Clamp, Vec3}; use image::io::Reader; -use optix::context::OptixContext; +use 
optix::context::DeviceContext;
 use optix::denoiser::{Denoiser, DenoiserModelKind, DenoiserParams, Image, ImageFormat};
 use std::error::Error;
 use std::path::PathBuf;
 use structopt::StructOpt;
+use vek::{Clamp, Vec3};
 
 #[derive(StructOpt)]
 #[structopt(
@@ -46,7 +46,7 @@ fn main() -> Result<(), Box<dyn Error>> {
     // set up CUDA and OptiX then make the needed structs/contexts.
     let cuda_ctx = cust::quick_init()?;
     optix::init()?;
-    let optix_ctx = OptixContext::new(&cuda_ctx)?;
+    let optix_ctx = DeviceContext::new(&cuda_ctx, false)?;
 
     let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;