fix: fgn cuda compiler errors

dancixx · dancixx · commit 74a881466da6 · 2025-03-14T10:06:18.000+01:00
diff --git a/Cargo.toml b/Cargo.toml
@@ -27,7 +27,7 @@ cudarc = { version = "0.13.9", optional = true, features = [
 either = "1.15.0"
 flate2 = "1.0.34"
 gauss-quad = "0.2.1"
-impl-new-derive = "0.1.2"
+impl-new-derive = "0.1.3"
 implied-vol = "1.0.0"
 indicatif = "0.17.8"
 # itransformer = "1.0.1"
@@ -74,7 +74,7 @@ yahoo_finance_api = { version = "2.3.0", optional = true }
 
 [features]
 cuda = ["dep:cudarc", "dep:libloading"]
-default = []
+default = ["cuda"]
 jemalloc = ["dep:tikv-jemallocator"]
 malliavin = []
 mimalloc = ["dep:mimalloc"]
diff --git a/src/stochastic.rs b/src/stochastic.rs
@@ -26,11 +26,12 @@ pub mod noise;
 pub mod process;
 pub mod volatility;
 
-use std::error::Error;
 use std::sync::{Arc, Mutex};
 
 #[cfg(feature = "cuda")]
 use either::Either;
+#[cfg(feature = "cuda")]
+use anyhow::Result;
 
 use ndarray::parallel::prelude::*;
 use ndarray::{Array1, Array2, Axis};
@@ -48,7 +49,7 @@ pub trait Sampling<T: Clone + Send + Sync + Zero>: Send + Sync {
 
   /// Sample the process with CUDA support
   #[cfg(feature = "cuda")]
-  fn sample_cuda(&self) -> Result<Either<Array1<T>, Array2<T>>, Box<dyn Error>> {
+  fn sample_cuda(&self) -> Result<Either<Array1<T>, Array2<T>>> {
     unimplemented!()
   }
 
diff --git a/src/stochastic/cuda/fgn.cu b/src/stochastic/cuda/fgn.cu
@@ -1,85 +1,148 @@
-#include <cuComplex.h>
+#include <limits.h>
+#include <stdio.h>
 #include <cuda_runtime.h>
-#include <cufft.h>
 #include <curand_kernel.h>
+#include <cufft.h>
+#include <cuComplex.h>
 #include <math.h>
-#include <stdio.h>
 
 #ifdef _WIN32
 #define EXPORT __declspec(dllexport)
 #else
 #define EXPORT
 #endif
 
-__global__ void fill_random_with_eigs(cuComplex *d_data,
-                                      const cuComplex *d_sqrt_eigs,
-                                      int traj_size, int m,
-                                      unsigned long seed) {
+// Fills d_data (m trajectories of traj_size complex entries, stored back to
+// back) with complex Gaussian noise multiplied by d_sqrt_eigs.
+// Launch: 1-D grid with at least m * traj_size threads, one per entry; surplus
+// threads return immediately.
+__global__ void fill_random_with_eigs(
+    cuComplex *d_data,
+    const cuComplex *d_sqrt_eigs,
+    int traj_size,
+    int m,
+    unsigned long seed)
+{
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid >= m * traj_size)
     return;
-
   int traj_id = tid / traj_size;
   int idx = tid % traj_size;
-
-  __shared__ curandState state[32];
-  int lane_id = threadIdx.x % 32;
-
-  if (lane_id == 0) {
-    curand_init(seed + traj_id, blockIdx.x, 0, &state[lane_id]);
-  }
-  __syncthreads();
-
-  float re = curand_normal(&state[lane_id]);
+  // Each thread owns its generator: seeded per trajectory, with the entry index
+  // as the cuRAND sequence number.
+  curandState state;
+  curand_init(seed + traj_id, idx, 0, &state);
+  float re = curand_normal(&state);
   float im = curand_normal(&state);
   cuComplex noise = make_cuComplex(re, im);
   d_data[tid] = cuCmulf(noise, d_sqrt_eigs[idx]);
 }
 
-__global__ void scale_and_copy_to_output(const cuComplex *d_data,
-                                         float *d_output, int n, int m,
-                                         int offset, float scale) {
+// Writes the scaled real part of the transformed data to d_output
+// (m rows of n - offset floats, stored back to back).
+// Output sample idx of a trajectory is taken from entry idx + 1 of that
+// trajectory's 2n-entry block in d_data and multiplied by n^(-hurst) * t^hurst.
+// Launch: 1-D grid with at least m * (n - offset) threads; surplus threads
+// return immediately.
+__global__ void scale_and_copy_to_output(
+    const cuComplex *d_data,
+    float *d_output,
+    int n,
+    int m,
+    int offset,
+    float hurst,
+    float t)
+{
   int out_size = n - offset;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid >= m * out_size)
     return;
-
   int traj_id = tid / out_size;
   int idx = tid % out_size;
   int data_idx = traj_id * (2 * n) + (idx + 1);
-
+  float scale = powf((float)n, -hurst) * powf(t, hurst);
   d_output[tid] = d_data[data_idx].x * scale;
 }
 
-extern "C" EXPORT void fgn_kernel(const cuComplex *d_sqrt_eigs, float *d_output,
-                                  int n, int m, int offset, float hurst,
-                                  float t, unsigned long seed) {
+// Reports a failed CUDA runtime call on stderr. Returns true when err is cudaSuccess.
+static bool fgn_cuda_ok(cudaError_t err, const char *what)
+{
+  if (err == cudaSuccess)
+    return true;
+  fprintf(stderr, "fgn_kernel: %s failed: %s\n", what, cudaGetErrorString(err));
+  return false;
+}
+
+// Host entry point: generates m noise trajectories on the GPU and writes
+// m * (n - offset) floats to d_output.
+// Pipeline: random fill of a temporary m x 2n complex buffer, batched in-place
+// forward FFT, then real-part extraction and scaling.
+// d_sqrt_eigs (2n entries) and d_output are handed straight to the kernels, so
+// both must be device-accessible pointers. Failures are reported on stderr; the
+// function returns void, so d_output must not be trusted after a reported error.
+extern "C" EXPORT void fgn_kernel(
+    const cuComplex *d_sqrt_eigs,
+    float *d_output,
+    int n,
+    int m,
+    int offset,
+    float hurst,
+    float t,
+    unsigned long seed)
+{
+  if (d_sqrt_eigs == nullptr || d_output == nullptr || n <= 0 || m <= 0 || n > INT_MAX / 2)
+  {
+    fprintf(stderr, "fgn_kernel: invalid arguments (n=%d, m=%d)\n", n, m);
+    return;
+  }
   int traj_size = 2 * n;
+  // Output sample idx is read from entry idx + 1 of its trajectory, so
+  // n - offset must lie in [1, traj_size - 1]. The kernels index with int, so
+  // the largest launch (m * traj_size threads) must fit in one as well.
+  if (offset >= n || offset <= -n || (long long)m * traj_size > INT_MAX)
+  {
+    fprintf(stderr, "fgn_kernel: invalid sizes (n=%d, m=%d, offset=%d)\n", n, m, offset);
+    return;
+  }
   cuComplex *d_data = nullptr;
-  cudaMalloc(&d_data, (size_t)m * traj_size * sizeof(cuComplex));
-
-  int block_size = 512;
-  int grid_size = (m * traj_size + block_size - 1) / block_size;
-
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
-
-  fill_random_with_eigs<<<gridSize, blockSize, 0, stream>>>(d_data, d_sqrt_eigs,
-                                                            traj_size, m, seed);
-
-  cufftHandle plan;
-  cufftPlan1d(&plan, traj_size, CUFFT_C2C, m);
-  cufftSetStream(plan, stream);
-  cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
-  cufftDestroy(plan);
-
-  int out_size = n - offset;
-  grid_size = (m * out_size + block_size - 1) / block_size;
-  float scale = powf((float)n, -hurst) * powf(t, hurst);
-  scale_and_copy_to_output<<<gridSize, blockSize, 0, stream>>>(
-      d_data, d_output, n, m, offset, scale);
-
-  cudaStreamSynchronize(stream);
-  cudaStreamDestroy(stream);
+  if (!fgn_cuda_ok(cudaMalloc(&d_data, (size_t)m * traj_size * sizeof(cuComplex)), "cudaMalloc"))
+    return;
+  bool ok = true;
+  {
+    int totalThreads = m * traj_size;
+    int blockSize = 512;
+    int gridSize = (int)(((long long)totalThreads + blockSize - 1) / blockSize);
+    fill_random_with_eigs<<<gridSize, blockSize>>>(d_data, d_sqrt_eigs, traj_size, m, seed);
+    // Bad launch configurations show up in cudaGetLastError, execution faults at the sync.
+    ok = fgn_cuda_ok(cudaGetLastError(), "fill_random_with_eigs launch") &&
+         fgn_cuda_ok(cudaDeviceSynchronize(), "fill_random_with_eigs");
+  }
+  if (ok)
+  {
+    cufftHandle plan;
+    cufftResult fft = cufftPlan1d(&plan, traj_size, CUFFT_C2C, m);
+    if (fft == CUFFT_SUCCESS)
+    {
+      fft = cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
+      // Wait for the transform to finish before the plan is destroyed.
+      ok = fgn_cuda_ok(cudaDeviceSynchronize(), "cufftExecC2C");
+      cufftDestroy(plan);
+    }
+    if (fft != CUFFT_SUCCESS)
+    {
+      fprintf(stderr, "fgn_kernel: cuFFT failed with status %d\n", (int)fft);
+      ok = false;
+    }
+  }
+  if (ok)
+  {
+    int out_size = n - offset;
+    int totalThreads = m * out_size;
+    int blockSize = 512;
+    int gridSize = (int)(((long long)totalThreads + blockSize - 1) / blockSize);
+    scale_and_copy_to_output<<<gridSize, blockSize>>>(d_data, d_output, n, m, offset, hurst, t);
+    if (fgn_cuda_ok(cudaGetLastError(), "scale_and_copy_to_output launch"))
+      fgn_cuda_ok(cudaDeviceSynchronize(), "scale_and_copy_to_output");
+  }
   cudaFree(d_data);
-}
+}
diff --git a/src/stochastic/cuda/fgn_windows/fgn.dll b/src/stochastic/cuda/fgn_windows/fgn.dll
diff --git a/src/stochastic/noise/fgn.rs b/src/stochastic/noise/fgn.rs
@@ -3,7 +3,7 @@ use std::sync::{Arc, RwLock};
 #[cfg(feature = "cuda")]
 use either::Either;
 #[cfg(feature = "cuda")]
-use std::error::Error;
+use anyhow::Result;
 
 use ndarray::parallel::prelude::*;
 use ndarray::{concatenate, prelude::*};
@@ -99,13 +99,12 @@ impl Sampling<f64> for FGN {
   }
 
   #[cfg(feature = "cuda")]
-  fn sample_cuda(&self) -> Result<Either<Array1<f64>, Array2<f64>>, Box<dyn Error>> {
+  fn sample_cuda(&self) -> Result<Either<Array1<f64>, Array2<f64>>> {
     // nvcc -shared -Xcompiler -fPIC fgn.cu -o libfgn.so -lcufft // ELF header error
     // nvcc -shared -o libfgn.so fgn.cu -Xcompiler -fPIC
     // nvcc -shared fgn.cu -o fgn.dll -lcufft
     use std::ffi::c_void;
 
-    use anyhow::Ok;
     use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut, DeviceRepr};
 
     use libloading::{Library, Symbol};
@@ -183,7 +182,7 @@ impl Sampling<f64> for FGN {
     }
 
     if m == 1 {
-      let fgn = fgn.row(0);
+      let fgn = fgn.row(0).to_owned();
       return Ok(Either::Left(fgn));
     }
 
@@ -267,9 +266,9 @@ mod tests {
   #[tracing_test::traced_test]
   #[cfg(feature = "cuda")]
   fn fgn_cuda() {
-    let fbm = FGN::new(0.7, 10_000, Some(1.0), Some(20000));
+    let fbm = FGN::new(0.7, 500, Some(1.0), Some(1));
     let fgn = fbm.sample_cuda().unwrap();
-    let fgn = fgn.row(0);
+    let fgn = fgn.left().unwrap();
     plot_1d!(fgn, "Fractional Brownian Motion (H = 0.7)");
     let mut path = Array1::<f64>::zeros(500);
     for i in 1..500 {