Skip to content

Commit 9ea4bf1

Browse files
committed
examples: Add dot product of two vectors example
- An example of a scalar product of two vectors on a GPU has been added. - In the example, a benchmark of three implementation options is performed: through a cuBLAS call, using native CUDA code, and using the Rust-CUDA variant.
1 parent 6dd6736 commit 9ea4bf1

File tree

9 files changed

+423
-0
lines changed

9 files changed

+423
-0
lines changed

Cargo.lock

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ members = [
4747
"examples/gemm/kernels",
4848
"examples/i128_demo",
4949
"examples/i128_demo/kernels",
50+
"examples/sdot",
51+
"examples/sdot/kernels",
5052
"examples/sha2_crates_io",
5153
"examples/sha2_crates_io/kernels",
5254
"examples/vecadd",

examples/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,6 @@ which runs on CPU or GPU, with the additional option of running OptiX denoising.
2121

2222
The Path Tracer uses cuda_builder to compile the core path tracer for the GPU and GPU (hardware raytracing), and uses the core path tracer as a normal crate
2323
for CPU rendering and sharing structures.
24+
25+
### [sdot](sdot)
26+
Example of computes the dot product of two single-precision (f32) vectors

examples/sdot/Cargo.toml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[package]
2+
name = "sdot"
3+
version = "0.1.0"
4+
edition = "2024"
5+
description = "Example of computes the dot product of two f32 vectors"
6+
7+
[dependencies]
8+
cust = { path = "../../crates/cust" }
9+
blastoff = { path = "../../crates/blastoff" }
10+
rand = "0.9.*"
11+
12+
[build-dependencies]
13+
cuda_builder = { workspace = true, default-features = false }

examples/sdot/build.rs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
use std::env;
2+
use std::path;
3+
use std::process::{Command, Stdio};
4+
5+
use cuda_builder::CudaBuilder;
6+
7+
fn main() {
8+
println!("cargo::rerun-if-changed=build.rs");
9+
println!("cargo::rerun-if-changed=kernels");
10+
11+
let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
12+
let kernels_path = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()).join("kernels");
13+
14+
CudaBuilder::new(kernels_path.as_path())
15+
.copy_to(out_path.join("kernels.ptx"))
16+
.build()
17+
.unwrap();
18+
19+
// Generate PTX from native CUDA kernels
20+
let cuda_kernel_path = kernels_path.join("cuda/sdot.cu");
21+
22+
println!("cargo::rerun-if-changed={}", cuda_kernel_path.display());
23+
24+
let cuda_ptx = out_path.join("kernels_cuda_mangles.ptx");
25+
let mut nvcc = Command::new("nvcc");
26+
nvcc.arg("--ptx")
27+
.args(["--Werror", "all-warnings"])
28+
.args(["--output-directory", out_path.as_os_str().to_str().unwrap()])
29+
.args(["-o", cuda_ptx.as_os_str().to_str().unwrap()])
30+
.arg(cuda_kernel_path.as_path());
31+
32+
let build = nvcc
33+
.stderr(Stdio::inherit())
34+
.output()
35+
.expect("failed to execute nvcc kernel build");
36+
37+
assert!(build.status.success());
38+
39+
// Decodes (demangles) low-level identifiers
40+
let cat_out = Command::new("cat")
41+
.arg(cuda_ptx)
42+
.stdout(Stdio::piped())
43+
.stderr(Stdio::inherit())
44+
.spawn()
45+
.expect("Failed to start cat process")
46+
.stdout
47+
.expect("Failed to open cat stdout");
48+
49+
let outputs = std::fs::File::create(out_path.join("kernels_cuda.ptx"))
50+
.expect("Can not open output ptc kernel file");
51+
52+
let filt_out = Command::new("cu++filt")
53+
.arg("-p")
54+
.stdin(Stdio::from(cat_out))
55+
.stdout(Stdio::from(outputs))
56+
.stderr(Stdio::inherit())
57+
.spawn()
58+
.expect("Failed to start cu++filt process")
59+
.wait_with_output()
60+
.expect("Failed to wait on cu++filt");
61+
62+
assert!(filt_out.status.success());
63+
}

examples/sdot/kernels/Cargo.toml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[package]
2+
name = "sdot_kernels"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
cuda_std = { path = "../../../crates/cuda_std" }
8+
9+
[lib]
10+
crate-type = ["cdylib", "rlib"]

examples/sdot/kernels/cuda/sdot.cu

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Here we use `unsigned long` to match the Rust version `usize`.
2+
__global__ void sdot(const float *x, unsigned long x_n, const float *y, unsigned long y_n, float *out)
3+
{
4+
5+
extern __shared__ float shared_sum[];
6+
unsigned int i;
7+
8+
unsigned int num_threads = gridDim.x * blockDim.x;
9+
unsigned int start_ind = blockDim.x * blockIdx.x;
10+
unsigned int tid = threadIdx.x;
11+
12+
float sum = 0.0f;
13+
for (i = start_ind + tid; i < x_n; i += num_threads)
14+
{
15+
// Rust checks emulation
16+
if (i >= y_n)
17+
__trap();
18+
19+
sum += x[i] * y[i];
20+
}
21+
shared_sum[tid] = sum;
22+
23+
for (i = blockDim.x >> 1; i > 0; i >>= 1)
24+
{
25+
__syncthreads();
26+
if (tid < i)
27+
{
28+
shared_sum[tid] += shared_sum[tid + i];
29+
}
30+
}
31+
32+
if (tid == 0)
33+
{
34+
out[blockIdx.x] = shared_sum[tid];
35+
}
36+
}

examples/sdot/kernels/src/lib.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
use cuda_std::{kernel, shared, thread};
2+
3+
#[kernel]
4+
#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
5+
pub unsafe fn sdot(x: &[f32], y: &[f32], out: *mut f32) {
6+
let shared_sum = shared::dynamic_shared_mem::<f32>();
7+
8+
let num_threads = (thread::grid_dim_x() as usize) * (thread::block_dim_x() as usize);
9+
let start_ind = (thread::block_dim_x() as usize) * (thread::block_idx_x() as usize);
10+
let tid = thread::thread_idx_x() as usize;
11+
12+
let mut sum = 0f32;
13+
for i in ((start_ind + tid)..x.len()).step_by(num_threads) {
14+
sum += x[i] * y[i];
15+
}
16+
unsafe {
17+
*shared_sum.add(tid) = sum;
18+
}
19+
20+
let mut i = (thread::block_dim_x() >> 1) as usize;
21+
while i > 0 {
22+
thread::sync_threads();
23+
if tid < i {
24+
unsafe {
25+
*shared_sum.add(tid) += *shared_sum.add(tid + i);
26+
}
27+
}
28+
29+
i >>= 1;
30+
}
31+
32+
if tid == 0 {
33+
unsafe {
34+
*out.add(thread::block_idx_x() as usize) = *shared_sum.add(tid);
35+
}
36+
}
37+
}

0 commit comments

Comments
 (0)