1- #include < cuComplex .h>
1+ #include < stdio .h>
22#include < cuda_runtime.h>
3- #include < cufft.h>
43#include < curand_kernel.h>
4+ #include < cufft.h>
5+ #include < cuComplex.h>
56#include < math.h>
6- #include < stdio.h>
77
88#ifdef _WIN32
99#define EXPORT __declspec (dllexport)
1010#else
1111#define EXPORT
1212#endif
1313
14- __global__ void fill_random_with_eigs (cuComplex *d_data,
15- const cuComplex *d_sqrt_eigs,
16- int traj_size, int m,
17- unsigned long seed) {
14+ __global__ void fill_random_with_eigs (
15+ cuComplex *d_data,
16+ const cuComplex *d_sqrt_eigs,
17+ int traj_size,
18+ int m,
19+ unsigned long seed)
20+ {
1821 int tid = blockIdx .x * blockDim .x + threadIdx .x ;
1922 if (tid >= m * traj_size)
2023 return ;
21-
2224 int traj_id = tid / traj_size;
2325 int idx = tid % traj_size;
24-
25- __shared__ curandState state[32 ];
26- int lane_id = threadIdx .x % 32 ;
27-
28- if (lane_id == 0 ) {
29- curand_init (seed + traj_id, blockIdx .x , 0 , &state[lane_id]);
30- }
31- __syncthreads ();
32-
33- float re = curand_normal (&state[lane_id]);
26+ curandState state;
27+ curand_init (seed + traj_id, idx, 0 , &state);
28+ float re = curand_normal (&state);
3429 float im = curand_normal (&state);
3530 cuComplex noise = make_cuComplex (re, im);
3631 d_data[tid] = cuCmulf (noise, d_sqrt_eigs[idx]);
3732}
3833
39- __global__ void scale_and_copy_to_output (const cuComplex *d_data,
40- float *d_output, int n, int m,
41- int offset, float scale) {
34+ __global__ void scale_and_copy_to_output (
35+ const cuComplex *d_data,
36+ float *d_output,
37+ int n,
38+ int m,
39+ int offset,
40+ float hurst,
41+ float t)
42+ {
4243 int out_size = n - offset;
4344 int tid = blockIdx .x * blockDim .x + threadIdx .x ;
4445 if (tid >= m * out_size)
4546 return ;
46-
4747 int traj_id = tid / out_size;
4848 int idx = tid % out_size;
4949 int data_idx = traj_id * (2 * n) + (idx + 1 );
50-
50+ float scale = powf (( float )n, -hurst) * powf (t, hurst);
5151 d_output[tid] = d_data[data_idx].x * scale;
5252}
5353
54- extern " C" EXPORT void fgn_kernel (const cuComplex *d_sqrt_eigs, float *d_output,
55- int n, int m, int offset, float hurst,
56- float t, unsigned long seed) {
54+ extern " C" EXPORT void fgn_kernel (
55+ const cuComplex *d_sqrt_eigs,
56+ float *d_output,
57+ int n,
58+ int m,
59+ int offset,
60+ float hurst,
61+ float t,
62+ unsigned long seed)
63+ {
5764 int traj_size = 2 * n;
5865 cuComplex *d_data = nullptr ;
5966 cudaMalloc (&d_data, (size_t )m * traj_size * sizeof (cuComplex));
60-
61- int block_size = 512 ;
62- int grid_size = (m * traj_size + block_size - 1 ) / block_size;
63-
64- cudaStream_t stream;
65- cudaStreamCreate (&stream);
66-
67- fill_random_with_eigs<<<gridSize, blockSize, 0 , stream>>> (d_data, d_sqrt_eigs,
68- traj_size, m, seed);
69-
70- cufftHandle plan;
71- cufftPlan1d (&plan, traj_size, CUFFT_C2C, m);
72- cufftSetStream (plan, stream);
73- cufftExecC2C (plan, d_data, d_data, CUFFT_FORWARD);
74- cufftDestroy (plan);
75-
76- int out_size = n - offset;
77- grid_size = (m * out_size + block_size - 1 ) / block_size;
78- float scale = powf ((float )n, -hurst) * powf (t, hurst);
79- scale_and_copy_to_output<<<gridSize, blockSize, 0 , stream>>> (
80- d_data, d_output, n, m, offset, scale);
81-
82- cudaStreamSynchronize (stream);
83- cudaStreamDestroy (stream);
67+ {
68+ int totalThreads = m * traj_size;
69+ int blockSize = 512 ;
70+ int gridSize = (totalThreads + blockSize - 1 ) / blockSize;
71+ fill_random_with_eigs<<<gridSize, blockSize>>> (d_data, d_sqrt_eigs, traj_size, m, seed);
72+ cudaDeviceSynchronize ();
73+ }
74+ {
75+ cufftHandle plan;
76+ cufftPlan1d (&plan, traj_size, CUFFT_C2C, m);
77+ cufftExecC2C (plan, d_data, d_data, CUFFT_FORWARD);
78+ cudaDeviceSynchronize ();
79+ cufftDestroy (plan);
80+ }
81+ {
82+ int out_size = n - offset;
83+ int totalThreads = m * out_size;
84+ int blockSize = 512 ;
85+ int gridSize = (totalThreads + blockSize - 1 ) / blockSize;
86+ scale_and_copy_to_output<<<gridSize, blockSize>>> (d_data, d_output, n, m, offset, hurst, t);
87+ cudaDeviceSynchronize ();
88+ }
8489 cudaFree (d_data);
85- }
90+ }
0 commit comments