Skip to content

Commit 74a8814

Browse files
committed
fix: fgn cuda compiler errors
1 parent 0e99f3e commit 74a8814

File tree

5 files changed

+66
-61
lines changed

5 files changed

+66
-61
lines changed

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ cudarc = { version = "0.13.9", optional = true, features = [
2727
either = "1.15.0"
2828
flate2 = "1.0.34"
2929
gauss-quad = "0.2.1"
30-
impl-new-derive = "0.1.2"
30+
impl-new-derive = "0.1.3"
3131
implied-vol = "1.0.0"
3232
indicatif = "0.17.8"
3333
# itransformer = "1.0.1"
@@ -74,7 +74,7 @@ yahoo_finance_api = { version = "2.3.0", optional = true }
7474

7575
[features]
7676
cuda = ["dep:cudarc", "dep:libloading"]
77-
default = []
77+
default = ["cuda"]
7878
jemalloc = ["dep:tikv-jemallocator"]
7979
malliavin = []
8080
mimalloc = ["dep:mimalloc"]

src/stochastic.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,12 @@ pub mod noise;
2626
pub mod process;
2727
pub mod volatility;
2828

29-
use std::error::Error;
3029
use std::sync::{Arc, Mutex};
3130

3231
#[cfg(feature = "cuda")]
3332
use either::Either;
33+
#[cfg(feature = "cuda")]
34+
use anyhow::Result;
3435

3536
use ndarray::parallel::prelude::*;
3637
use ndarray::{Array1, Array2, Axis};
@@ -48,7 +49,7 @@ pub trait Sampling<T: Clone + Send + Sync + Zero>: Send + Sync {
4849

4950
/// Sample the process with CUDA support
5051
#[cfg(feature = "cuda")]
51-
fn sample_cuda(&self) -> Result<Either<Array1<T>, Array2<T>>, Box<dyn Error>> {
52+
fn sample_cuda(&self) -> Result<Either<Array1<T>, Array2<T>>> {
5253
unimplemented!()
5354
}
5455

src/stochastic/cuda/fgn.cu

Lines changed: 56 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,85 +1,90 @@
1-
#include <cuComplex.h>
1+
#include <stdio.h>
22
#include <cuda_runtime.h>
3-
#include <cufft.h>
43
#include <curand_kernel.h>
4+
#include <cufft.h>
5+
#include <cuComplex.h>
56
#include <math.h>
6-
#include <stdio.h>
77

88
#ifdef _WIN32
99
#define EXPORT __declspec(dllexport)
1010
#else
1111
#define EXPORT
1212
#endif
1313

14-
__global__ void fill_random_with_eigs(cuComplex *d_data,
15-
const cuComplex *d_sqrt_eigs,
16-
int traj_size, int m,
17-
unsigned long seed) {
14+
__global__ void fill_random_with_eigs(
15+
cuComplex *d_data,
16+
const cuComplex *d_sqrt_eigs,
17+
int traj_size,
18+
int m,
19+
unsigned long seed)
20+
{
1821
int tid = blockIdx.x * blockDim.x + threadIdx.x;
1922
if (tid >= m * traj_size)
2023
return;
21-
2224
int traj_id = tid / traj_size;
2325
int idx = tid % traj_size;
24-
25-
__shared__ curandState state[32];
26-
int lane_id = threadIdx.x % 32;
27-
28-
if (lane_id == 0) {
29-
curand_init(seed + traj_id, blockIdx.x, 0, &state[lane_id]);
30-
}
31-
__syncthreads();
32-
33-
float re = curand_normal(&state[lane_id]);
26+
curandState state;
27+
curand_init(seed + traj_id, idx, 0, &state);
28+
float re = curand_normal(&state);
3429
float im = curand_normal(&state);
3530
cuComplex noise = make_cuComplex(re, im);
3631
d_data[tid] = cuCmulf(noise, d_sqrt_eigs[idx]);
3732
}
3833

39-
__global__ void scale_and_copy_to_output(const cuComplex *d_data,
40-
float *d_output, int n, int m,
41-
int offset, float scale) {
34+
__global__ void scale_and_copy_to_output(
35+
const cuComplex *d_data,
36+
float *d_output,
37+
int n,
38+
int m,
39+
int offset,
40+
float hurst,
41+
float t)
42+
{
4243
int out_size = n - offset;
4344
int tid = blockIdx.x * blockDim.x + threadIdx.x;
4445
if (tid >= m * out_size)
4546
return;
46-
4747
int traj_id = tid / out_size;
4848
int idx = tid % out_size;
4949
int data_idx = traj_id * (2 * n) + (idx + 1);
50-
50+
float scale = powf((float)n, -hurst) * powf(t, hurst);
5151
d_output[tid] = d_data[data_idx].x * scale;
5252
}
5353

54-
extern "C" EXPORT void fgn_kernel(const cuComplex *d_sqrt_eigs, float *d_output,
55-
int n, int m, int offset, float hurst,
56-
float t, unsigned long seed) {
54+
extern "C" EXPORT void fgn_kernel(
55+
const cuComplex *d_sqrt_eigs,
56+
float *d_output,
57+
int n,
58+
int m,
59+
int offset,
60+
float hurst,
61+
float t,
62+
unsigned long seed)
63+
{
5764
int traj_size = 2 * n;
5865
cuComplex *d_data = nullptr;
5966
cudaMalloc(&d_data, (size_t)m * traj_size * sizeof(cuComplex));
60-
61-
int block_size = 512;
62-
int grid_size = (m * traj_size + block_size - 1) / block_size;
63-
64-
cudaStream_t stream;
65-
cudaStreamCreate(&stream);
66-
67-
fill_random_with_eigs<<<gridSize, blockSize, 0, stream>>>(d_data, d_sqrt_eigs,
68-
traj_size, m, seed);
69-
70-
cufftHandle plan;
71-
cufftPlan1d(&plan, traj_size, CUFFT_C2C, m);
72-
cufftSetStream(plan, stream);
73-
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
74-
cufftDestroy(plan);
75-
76-
int out_size = n - offset;
77-
grid_size = (m * out_size + block_size - 1) / block_size;
78-
float scale = powf((float)n, -hurst) * powf(t, hurst);
79-
scale_and_copy_to_output<<<gridSize, blockSize, 0, stream>>>(
80-
d_data, d_output, n, m, offset, scale);
81-
82-
cudaStreamSynchronize(stream);
83-
cudaStreamDestroy(stream);
67+
{
68+
int totalThreads = m * traj_size;
69+
int blockSize = 512;
70+
int gridSize = (totalThreads + blockSize - 1) / blockSize;
71+
fill_random_with_eigs<<<gridSize, blockSize>>>(d_data, d_sqrt_eigs, traj_size, m, seed);
72+
cudaDeviceSynchronize();
73+
}
74+
{
75+
cufftHandle plan;
76+
cufftPlan1d(&plan, traj_size, CUFFT_C2C, m);
77+
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
78+
cudaDeviceSynchronize();
79+
cufftDestroy(plan);
80+
}
81+
{
82+
int out_size = n - offset;
83+
int totalThreads = m * out_size;
84+
int blockSize = 512;
85+
int gridSize = (totalThreads + blockSize - 1) / blockSize;
86+
scale_and_copy_to_output<<<gridSize, blockSize>>>(d_data, d_output, n, m, offset, hurst, t);
87+
cudaDeviceSynchronize();
88+
}
8489
cudaFree(d_data);
85-
}
90+
}
0 Bytes
Binary file not shown.

src/stochastic/noise/fgn.rs

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use std::sync::{Arc, RwLock};
33
#[cfg(feature = "cuda")]
44
use either::Either;
55
#[cfg(feature = "cuda")]
6-
use std::error::Error;
6+
use anyhow::Result;
77

88
use ndarray::parallel::prelude::*;
99
use ndarray::{concatenate, prelude::*};
@@ -99,13 +99,12 @@ impl Sampling<f64> for FGN {
9999
}
100100

101101
#[cfg(feature = "cuda")]
102-
fn sample_cuda(&self) -> Result<Either<Array1<f64>, Array2<f64>>, Box<dyn Error>> {
102+
fn sample_cuda(&self) -> Result<Either<Array1<f64>, Array2<f64>>> {
103103
// nvcc -shared -Xcompiler -fPIC fgn.cu -o libfgn.so -lcufft // ELF header error
104104
// nvcc -shared -o libfgn.so fgn.cu -Xcompiler -fPIC
105105
// nvcc -shared fgn.cu -o fgn.dll -lcufft
106106
use std::ffi::c_void;
107107

108-
use anyhow::Ok;
109108
use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut, DeviceRepr};
110109

111110
use libloading::{Library, Symbol};
@@ -183,7 +182,7 @@ impl Sampling<f64> for FGN {
183182
}
184183

185184
if m == 1 {
186-
let fgn = fgn.row(0);
185+
let fgn = fgn.row(0).to_owned();
187186
return Ok(Either::Left(fgn));
188187
}
189188

@@ -267,9 +266,9 @@ mod tests {
267266
#[tracing_test::traced_test]
268267
#[cfg(feature = "cuda")]
269268
fn fgn_cuda() {
270-
let fbm = FGN::new(0.7, 10_000, Some(1.0), Some(20000));
269+
let fbm = FGN::new(0.7, 500, Some(1.0), Some(1));
271270
let fgn = fbm.sample_cuda().unwrap();
272-
let fgn = fgn.row(0);
271+
let fgn = fgn.left().unwrap();
273272
plot_1d!(fgn, "Fractional Brownian Motion (H = 0.7)");
274273
let mut path = Array1::<f64>::zeros(500);
275274
for i in 1..500 {

0 commit comments

Comments
 (0)