Skip to content

Commit 2e778e9

Browse files
committed
fix: fgn generation and improve cuda run
1 parent 7eb8063 commit 2e778e9

File tree

3 files changed

+93
-96
lines changed

3 files changed

+93
-96
lines changed

src/main.rs

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
use ndarray::Array1;
22
use prettytable::{format, row, Cell, Row, Table};
3-
use stochastic_rs::plot_1d;
4-
use stochastic_rs::stochastic::noise::fgn::FGN;
5-
use stochastic_rs::stochastic::Sampling;
63
use std::error::Error;
74
use std::fs::File;
85
use std::io::{BufRead, BufReader};
96
use std::time::Instant;
7+
use stochastic_rs::plot_1d;
108
use stochastic_rs::stochastic::noise::fgn::FGN;
119
use stochastic_rs::stochastic::Sampling;
10+
// use stochastic_rs::stochastic::noise::fgn::FGN;
11+
// use stochastic_rs::stochastic::Sampling;
1212

1313
use stochastic_rs::stats::fd::FractalDim;
1414
use stochastic_rs::stats::fou_estimator::{
@@ -19,25 +19,25 @@ use stochastic_rs::stats::fou_estimator::{
1919
// use your_crate::{FOUParameterEstimationV1, FOUParameterEstimationV2, FilterType};
2020
const N: usize = 10000;
2121
fn main() -> Result<(), Box<dyn Error>> {
22-
let fbm = FGN::new(0.7, 10_000, Some(1.0), Some(10000));
23-
let fgn = fbm.sample_cuda().unwrap();
24-
let fgn = fgn.row(0);
25-
plot_1d!(fgn, "Fractional Brownian Motion (H = 0.7)");
26-
let mut path = Array1::<f64>::zeros(500);
27-
for i in 1..500 {
28-
path[i] += path[i-1] + fgn[i];
29-
}
30-
plot_1d!(path, "Fractional Brownian Motion (H = 0.7)");
22+
// let fbm = FGN::new(0.7, 10_000, Some(1.0), Some(10000));
23+
// let fgn = fbm.sample_cuda().unwrap();
24+
// let fgn = fgn.row(0);
25+
// plot_1d!(fgn, "Fractional Brownian Motion (H = 0.7)");
26+
// let mut path = Array1::<f64>::zeros(500);
27+
// for i in 1..500 {
28+
// path[i] += path[i-1] + fgn[i];
29+
// }
30+
// plot_1d!(path, "Fractional Brownian Motion (H = 0.7)");
3131

32-
let start = std::time::Instant::now();
33-
let _ = fbm.sample_cuda();
34-
let end = start.elapsed().as_millis();
35-
println!("20000 fgn generated on cuda in: {end}");
32+
// let start = std::time::Instant::now();
33+
// let _ = fbm.sample_cuda();
34+
// let end = start.elapsed().as_millis();
35+
// println!("20000 fgn generated on cuda in: {end}");
3636

37-
let start = std::time::Instant::now();
38-
let _ = fbm.sample_par();
39-
let end = start.elapsed().as_millis();
40-
println!("20000 fgn generated on cuda in: {end}");
37+
// let start = std::time::Instant::now();
38+
// let _ = fbm.sample_par();
39+
// let end = start.elapsed().as_millis();
40+
// println!("20000 fgn generated on cuda in: {end}");
4141
// File paths
4242
// let paths = vec![
4343
// "./test/kecskekut_original.txt",

src/stochastic/cuda/fgn.cu

Lines changed: 70 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,88 +1,85 @@
1-
#include <stdio.h>
1+
#include <cuComplex.h>
22
#include <cuda_runtime.h>
3-
#include <curand_kernel.h>
43
#include <cufft.h>
5-
#include <cuComplex.h>
4+
#include <curand_kernel.h>
65
#include <math.h>
6+
#include <stdio.h>
77

88
#ifdef _WIN32
99
#define EXPORT __declspec(dllexport)
1010
#else
1111
#define EXPORT
1212
#endif
1313

14-
__global__ void fill_random_with_eigs(
15-
cuComplex* d_data,
16-
const cuComplex* d_sqrt_eigs,
17-
int traj_size,
18-
int m,
19-
unsigned long seed)
20-
{
21-
int tid = blockIdx.x * blockDim.x + threadIdx.x;
22-
if (tid >= m * traj_size) return;
23-
int traj_id = tid / traj_size;
24-
int idx = tid % traj_size;
25-
curandState state;
26-
curand_init(seed + traj_id, idx, 0, &state);
27-
float re = curand_normal(&state);
28-
float im = curand_normal(&state);
29-
cuComplex noise = make_cuComplex(re, im);
30-
d_data[tid] = cuCmulf(noise, d_sqrt_eigs[idx]);
14+
__global__ void fill_random_with_eigs(cuComplex *d_data,
15+
const cuComplex *d_sqrt_eigs,
16+
int traj_size, int m,
17+
unsigned long seed) {
18+
int tid = blockIdx.x * blockDim.x + threadIdx.x;
19+
if (tid >= m * traj_size)
20+
return;
21+
22+
int traj_id = tid / traj_size;
23+
int idx = tid % traj_size;
24+
25+
__shared__ curandState state[32];
26+
int lane_id = threadIdx.x % 32;
27+
28+
if (lane_id == 0) {
29+
curand_init(seed + traj_id, blockIdx.x, 0, &state[lane_id]);
30+
}
31+
__syncthreads();
32+
33+
float re = curand_normal(&state[lane_id]);
34+
float im = curand_normal(&state);
35+
cuComplex noise = make_cuComplex(re, im);
36+
d_data[tid] = cuCmulf(noise, d_sqrt_eigs[idx]);
3137
}
3238

33-
__global__ void scale_and_copy_to_output(
34-
const cuComplex* d_data,
35-
float* d_output,
36-
int n,
37-
int m,
38-
int offset,
39-
float hurst,
40-
float t)
41-
{
42-
int out_size = n - offset;
43-
int tid = blockIdx.x * blockDim.x + threadIdx.x;
44-
if (tid >= m * out_size) return;
45-
int traj_id = tid / out_size;
46-
int idx = tid % out_size;
47-
int data_idx = traj_id * (2 * n) + (idx + 1);
48-
float scale = powf((float)n, -hurst) * powf(t, hurst);
49-
d_output[tid] = d_data[data_idx].x * scale;
39+
__global__ void scale_and_copy_to_output(const cuComplex *d_data,
40+
float *d_output, int n, int m,
41+
int offset, float scale) {
42+
int out_size = n - offset;
43+
int tid = blockIdx.x * blockDim.x + threadIdx.x;
44+
if (tid >= m * out_size)
45+
return;
46+
47+
int traj_id = tid / out_size;
48+
int idx = tid % out_size;
49+
int data_idx = traj_id * (2 * n) + (idx + 1);
50+
51+
d_output[tid] = d_data[data_idx].x * scale;
5052
}
5153

52-
extern "C" EXPORT void fgn_kernel(
53-
const cuComplex* d_sqrt_eigs,
54-
float* d_output,
55-
int n,
56-
int m,
57-
int offset,
58-
float hurst,
59-
float t,
60-
unsigned long seed)
61-
{
62-
int traj_size = 2 * n;
63-
cuComplex* d_data = nullptr;
64-
cudaMalloc(&d_data, (size_t)m * traj_size * sizeof(cuComplex));
65-
{
66-
int totalThreads = m * traj_size;
67-
int blockSize = 512;
68-
int gridSize = (totalThreads + blockSize - 1) / blockSize;
69-
fill_random_with_eigs<<<gridSize, blockSize>>>(d_data, d_sqrt_eigs, traj_size, m, seed);
70-
cudaDeviceSynchronize();
71-
}
72-
{
73-
cufftHandle plan;
74-
cufftPlan1d(&plan, traj_size, CUFFT_C2C, m);
75-
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
76-
cudaDeviceSynchronize();
77-
cufftDestroy(plan);
78-
}
79-
{
80-
int out_size = n - offset;
81-
int totalThreads = m * out_size;
82-
int blockSize = 512;
83-
int gridSize = (totalThreads + blockSize - 1) / blockSize;
84-
scale_and_copy_to_output<<<gridSize, blockSize>>>(d_data, d_output, n, m, offset, hurst, t);
85-
cudaDeviceSynchronize();
86-
}
87-
cudaFree(d_data);
54+
extern "C" EXPORT void fgn_kernel(const cuComplex *d_sqrt_eigs, float *d_output,
55+
int n, int m, int offset, float hurst,
56+
float t, unsigned long seed) {
57+
int traj_size = 2 * n;
58+
cuComplex *d_data = nullptr;
59+
cudaMalloc(&d_data, (size_t)m * traj_size * sizeof(cuComplex));
60+
61+
int block_size = 512;
62+
int grid_size = (m * traj_size + block_size - 1) / block_size;
63+
64+
cudaStream_t stream;
65+
cudaStreamCreate(&stream);
66+
67+
fill_random_with_eigs<<<gridSize, blockSize, 0, stream>>>(d_data, d_sqrt_eigs,
68+
traj_size, m, seed);
69+
70+
cufftHandle plan;
71+
cufftPlan1d(&plan, traj_size, CUFFT_C2C, m);
72+
cufftSetStream(plan, stream);
73+
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
74+
cufftDestroy(plan);
75+
76+
int out_size = n - offset;
77+
grid_size = (m * out_size + block_size - 1) / block_size;
78+
float scale = powf((float)n, -hurst) * powf(t, hurst);
79+
scale_and_copy_to_output<<<gridSize, blockSize, 0, stream>>>(
80+
d_data, d_output, n, m, offset, scale);
81+
82+
cudaStreamSynchronize(stream);
83+
cudaStreamDestroy(stream);
84+
cudaFree(d_data);
8885
}

src/stochastic/noise/fgn.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ impl Sampling<f64> for FGN {
8484
.assign(&chunk);
8585
});
8686

87-
let fgn = &*self.sqrt_eigenvalues * &rnd;
87+
let fgn = &*self.sqrt_eigenvalues * &*rnd.read().unwrap();
8888
let mut fgn_fft = Array1::<Complex<f64>>::zeros(2 * self.n);
8989
ndfft(&fgn, &mut fgn_fft, &*self.fft_handler, 0);
9090
let scale = (self.n as f64).powf(-self.hurst) * self.t.unwrap_or(1.0).powf(self.hurst);
@@ -265,7 +265,7 @@ mod tests {
265265
plot_1d!(fgn, "Fractional Brownian Motion (H = 0.7)");
266266
let mut path = Array1::<f64>::zeros(500);
267267
for i in 1..500 {
268-
path[i] += path[i-1] + fgn[i];
268+
path[i] += path[i - 1] + fgn[i];
269269
}
270270
plot_1d!(path, "Fractional Brownian Motion (H = 0.7)");
271271

@@ -275,7 +275,7 @@ mod tests {
275275
tracing::info!("10000 fgn generated on cuda in: {end}");
276276

277277
let start = std::time::Instant::now();
278-
let _ = fbm.sample_par();
278+
let _ = fbm.sample_par();
279279
let end = start.elapsed().as_millis();
280280
tracing::info!("10000 fgn generated on cuda in: {end}");
281281
}

0 commit comments

Comments
 (0)