Skip to content

Commit 9848354

Browse files
committed
Merge branch 'feat/cuda-support' of https://github.com/rust-dd/stochastic-rs into feat/cuda-support
2 parents cce5e68 + e1234c7 commit 9848354

File tree

10 files changed

+222
-40
lines changed

10 files changed

+222
-40
lines changed

Cargo.toml

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,16 @@ candle-datasets = "0.8.1"
2020
candle-nn = "0.8.1"
2121
candle-transformers = "0.8.1"
2222
chrono = "0.4.38"
23-
cudarc = { version = "0.13.9", optional = true, features = ["cuda-12080"] }
23+
cudarc = { version = "0.13.9", optional = true, features = ["cuda-12080", "cuda-version-from-build-system"] }
2424
flate2 = "1.0.34"
2525
gauss-quad = "0.2.1"
2626
impl-new-derive = "0.1.2"
2727
implied-vol = "1.0.0"
2828
indicatif = "0.17.8"
29-
itransformer = "1.0.1"
29+
# itransformer = "1.0.1"
3030
kendalls = "0.2.2"
3131
levenberg-marquardt = "0.14.0"
32+
libloading = { version = "0.8.6", optional = true }
3233
linreg = "0.2.0"
3334
mimalloc = { version = "0.1.43", optional = true }
3435
nalgebra = "0.33.2"
@@ -37,7 +38,7 @@ ndarray = { version = "0.16.1", features = [
3738
"matrixmultiply-threading",
3839
"blas",
3940
] }
40-
ndarray-linalg = { version = "0.17.0", features = ["openblas-static"] }
41+
ndarray-linalg = { version = "0.17.0"}
4142
ndarray-npy = "0.9.1"
4243
ndarray-rand = "0.15.0"
4344
ndarray-stats = "0.6.0"
@@ -68,8 +69,8 @@ yahoo_finance_api = { version = "2.3.0", optional = true }
6869
[dev-dependencies]
6970

7071
[features]
71-
cuda = ["dep:cudarc"]
72-
default = ["jemalloc"]
72+
cuda = ["dep:cudarc", "dep:libloading"]
73+
default = ["cuda"]
7374
jemalloc = ["dep:tikv-jemallocator"]
7475
malliavin = []
7576
mimalloc = ["dep:mimalloc"]
@@ -86,5 +87,17 @@ debug = false
8687
codegen-units = 1
8788
lto = true
8889

90+
[target.'cfg(target_os = "macos")'.dependencies]
91+
ndarray-linalg = { version = "0.17.0", features = ["openblas-static"] }
92+
93+
[target.'cfg(target_os = "macos")'.features]
94+
default = ["jemalloc"]
95+
96+
[target.'cfg(target_os = "linux")'.features]
97+
default = ["jemalloc"]
98+
99+
[target.'cfg(target_os = "windows")'.features]
100+
default = ["mimalloc"]
101+
89102
# [package.metadata.docs.rs]
90103
# all-features = true

src/ai.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use candle_core::Tensor;
2-
pub use itransformer::ITransformer;
2+
// pub use itransformer::ITransformer;
33

44
pub mod fou;
55
pub mod utils;

src/stochastic/cuda/fgn.cu

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#include <stdio.h>
2+
#include <cuda_runtime.h>
3+
#include <curand_kernel.h>
4+
#include <cufft.h>
5+
#include <cuComplex.h>
6+
#include <math.h>
7+
8+
#ifdef _WIN32
9+
#define EXPORT __declspec(dllexport)
10+
#else
11+
#define EXPORT
12+
#endif
13+
14+
__global__ void fill_random_with_eigs(
15+
cuComplex *d_data,
16+
const cuComplex *d_sqrt_eigs,
17+
int traj_size,
18+
int m,
19+
unsigned long seed)
20+
{
21+
int traj_id = blockIdx.x;
22+
if (traj_id >= m)
23+
return;
24+
25+
int idx = threadIdx.x;
26+
if (idx >= traj_size)
27+
return;
28+
29+
int data_idx = traj_id * traj_size + idx;
30+
31+
curandState state;
32+
curand_init(seed + traj_id, idx, 0, &state);
33+
34+
float re = curand_normal(&state);
35+
float im = curand_normal(&state);
36+
cuComplex noise = make_cuComplex(re, im);
37+
38+
d_data[data_idx] = cuCmulf(noise, d_sqrt_eigs[idx]);
39+
}
40+
41+
__global__ void scale_and_copy_to_output(
42+
const cuComplex *d_data,
43+
float *d_output,
44+
int n,
45+
int m,
46+
int offset,
47+
float hurst,
48+
float t)
49+
{
50+
int traj_id = blockIdx.x;
51+
if (traj_id >= m)
52+
return;
53+
54+
int idx = threadIdx.x;
55+
int out_size = n - offset;
56+
if (idx >= out_size)
57+
return;
58+
59+
int data_idx = traj_id * (2 * n) + (idx + 1);
60+
float scale = powf((float)n, -hurst) * powf(t, hurst);
61+
62+
int out_idx = traj_id * out_size + idx;
63+
d_output[out_idx] = d_data[data_idx].x * scale;
64+
}
65+
66+
extern "C" EXPORT void fgn_kernel(
67+
const cuComplex *d_sqrt_eigs,
68+
float *d_output,
69+
int n,
70+
int m,
71+
int offset,
72+
float hurst,
73+
float t,
74+
unsigned long seed)
75+
{
76+
int traj_size = 2 * n;
77+
78+
cuComplex *d_data = nullptr;
79+
cudaMalloc(&d_data, (size_t)m * traj_size * sizeof(cuComplex));
80+
81+
{
82+
dim3 gridDim(m);
83+
dim3 blockDim(traj_size);
84+
fill_random_with_eigs<<<gridDim, blockDim>>>(
85+
d_data, d_sqrt_eigs, traj_size, m, seed);
86+
cudaDeviceSynchronize();
87+
}
88+
89+
{
90+
cufftHandle plan;
91+
cufftPlan1d(&plan, traj_size, CUFFT_C2C, m);
92+
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
93+
cudaDeviceSynchronize();
94+
cufftDestroy(plan);
95+
}
96+
97+
{
98+
dim3 gridDim(m);
99+
dim3 blockDim(n);
100+
scale_and_copy_to_output<<<gridDim, blockDim>>>(
101+
d_data, d_output, n, m, offset, hurst, t);
102+
cudaDeviceSynchronize();
103+
}
104+
105+
cudaFree(d_data);
106+
}
827 Bytes
Binary file not shown.
1.84 KB
Binary file not shown.
837 KB
Binary file not shown.
837 KB
Binary file not shown.
822 Bytes
Binary file not shown.
1.8 KB
Binary file not shown.

src/stochastic/noise/fgn.rs

Lines changed: 97 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -92,58 +92,91 @@ impl Sampling<f64> for FGN {
9292

9393
#[cfg(feature = "cuda")]
9494
fn sample_cuda(&self) -> Result<Array2<f64>, Box<dyn Error>> {
95-
use cudarc::driver::{CudaDevice, DeviceRepr, LaunchAsync, LaunchConfig};
96-
use cudarc::nvrtc::Ptx;
95+
// nvcc -shared -Xcompiler -fPIC fgn.cu -o libfgn.so -lcufft
96+
// nvcc -shared fgn.cu -o fgn.dll -lcufft
97+
use std::ffi::c_void;
9798

98-
let m = self.m.unwrap_or(1);
99-
let device = CudaDevice::new(0)?;
100-
device.load_ptx(Ptx::from_file("fgn.ptx"), "fgn", &["fgn_kernel"])?;
101-
let f = device.get_func("fgn", "fgn_kernel").unwrap();
99+
use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut, DeviceRepr};
100+
101+
use libloading::{Library, Symbol};
102102

103103
#[repr(C)]
104-
#[derive(Debug, Default, Copy, Clone, PartialOrd, PartialEq)]
104+
#[derive(Debug, Default, Copy, Clone)]
105105
pub struct cuComplex {
106-
pub x: f64,
107-
pub y: f64,
106+
pub x: f32,
107+
pub y: f32,
108108
}
109109

110110
unsafe impl DeviceRepr for cuComplex {
111-
fn as_kernel_param(&self) -> *mut std::ffi::c_void {
111+
fn as_kernel_param(&self) -> *mut c_void {
112112
self as *const Self as *mut _
113113
}
114114
}
115115

116-
let host_sqrt_eigenvalues = self
117-
.sqrt_eigenvalues
118-
.map(|v| cuComplex { x: v.re, y: v.im })
119-
.to_vec();
120-
let d_sqrt_eigenvalues = device.htod_copy(host_sqrt_eigenvalues)?;
121-
let mut d_output = device.alloc_zeros::<f64>(m * self.n)?;
116+
type FgnKernelFn = unsafe extern "C" fn(
117+
/* d_sqrt_eigs: */ *const cuComplex,
118+
/* d_output: */ *mut f32,
119+
/* n: */ i32,
120+
/* m: */ i32,
121+
/* offset: */ i32,
122+
/* hurst: */ f32,
123+
/* t: */ f32,
124+
/* seed: */ u64,
125+
);
126+
127+
let lib = unsafe {
128+
#[cfg(target_os = "windows")]
129+
{
130+
Library::new("src/stochastic/cuda/fgn_windows/fgn.dll")
131+
}
132+
#[cfg(target_os = "linux")]
133+
{
134+
Library::new("src/stochastic/cuda/fgn_linux/libfgn.so")
135+
}
136+
}?;
137+
138+
let fgn_kernel: Symbol<FgnKernelFn> = unsafe { lib.get(b"fgn_kernel") }?;
139+
let device = CudaDevice::new(0)?;
140+
141+
let m = self.m.unwrap_or(1);
142+
let n = self.n;
143+
let offset = self.offset;
144+
let hurst = self.hurst;
145+
let t = self.t.unwrap_or(1.0);
122146
let seed = 42u64;
123147

148+
let host_sqrt_eigs: Vec<cuComplex> = self
149+
.sqrt_eigenvalues
150+
.iter()
151+
.map(|z| cuComplex {
152+
x: z.re as f32,
153+
y: z.im as f32,
154+
})
155+
.collect();
156+
let d_sqrt_eigs = device.htod_copy(host_sqrt_eigs)?;
157+
let mut d_output = device.alloc_zeros::<f32>(m * (n - offset))?;
158+
124159
unsafe {
125-
f.launch(
126-
LaunchConfig::for_num_elems(self.n as u32),
127-
(
128-
&d_sqrt_eigenvalues,
129-
&mut d_output,
130-
self.n as u32,
131-
m as u32,
132-
self.offset,
133-
self.hurst,
134-
self.t.unwrap_or(1.0),
135-
seed,
136-
),
137-
)
138-
}?;
160+
fgn_kernel(
161+
(*d_sqrt_eigs.device_ptr()) as *const cuComplex,
162+
(*d_output.device_ptr_mut()) as *mut f32,
163+
n as i32,
164+
m as i32,
165+
offset as i32,
166+
hurst as f32,
167+
t as f32,
168+
seed,
169+
);
170+
}
139171

140-
let host_output = device.sync_reclaim(d_output)?;
141-
let mut fgn = Array2::zeros((m, self.n - self.offset));
172+
let host_output: Vec<f32> = device.sync_reclaim(d_output)?;
173+
let mut fgn = Array2::<f64>::zeros((m, n - offset));
142174
for i in 0..m {
143-
for j in 0..self.n - self.offset {
144-
fgn[[i, j]] = host_output[i * self.n + j];
175+
for j in 0..(n - offset) {
176+
fgn[[i, j]] = host_output[i * (n - offset) + j] as f64;
145177
}
146178
}
179+
147180
Ok(fgn)
148181
}
149182

@@ -218,6 +251,36 @@ mod tests {
218251
plot_1d!(fbm.sample(), "Fractional Brownian Motion (H = 0.7)");
219252
}
220253

254+
#[test]
255+
#[tracing_test::traced_test]
256+
#[cfg(feature = "cuda")]
257+
fn fgn_cuda() {
258+
let fbm = FGN::new(0.7, 500, Some(1.0), None);
259+
let fgn = fbm.sample_cuda().unwrap();
260+
let fgn = fgn.row(0);
261+
tracing::info!("{:?}", fgn);
262+
// plot_1d!(fgn, "Fractional Brownian Motion (H = 0.7)");
263+
let mut path = Array1::<f64>::zeros(500);
264+
for i in 1..500 {
265+
path[i] += fgn[i - 1];
266+
}
267+
plot_1d!(path, "Fractional Brownian Motion (H = 0.7)");
268+
269+
let start = std::time::Instant::now();
270+
for i in 0..1_000_000 {
271+
let _ = fbm.sample_cuda();
272+
}
273+
let end = start.elapsed().as_secs();
274+
tracing::info!("1000 fgn generated on cuda in: {end}");
275+
276+
let start = std::time::Instant::now();
277+
for i in 0..1_000_000 {
278+
let _ = fbm.sample();
279+
}
280+
let end = start.elapsed().as_secs();
281+
tracing::info!("1000 fgn generated on cuda in: {end}");
282+
}
283+
221284
#[test]
222285
#[ignore = "Not implemented"]
223286
#[cfg(feature = "malliavin")]

0 commit comments

Comments
 (0)