|
1 | | -#include <stdio.h> |
| 1 | +#include <cuComplex.h> |
2 | 2 | #include <cuda_runtime.h> |
3 | | -#include <curand_kernel.h> |
4 | 3 | #include <cufft.h> |
5 | | -#include <cuComplex.h> |
| 4 | +#include <curand_kernel.h> |
6 | 5 | #include <math.h> |
| 6 | +#include <stdio.h> |
7 | 7 |
|
8 | 8 | #ifdef _WIN32 |
9 | 9 | #define EXPORT __declspec(dllexport) |
10 | 10 | #else |
11 | 11 | #define EXPORT |
12 | 12 | #endif |
13 | 13 |
|
14 | | -__global__ void fill_random_with_eigs( |
15 | | - cuComplex* d_data, |
16 | | - const cuComplex* d_sqrt_eigs, |
17 | | - int traj_size, |
18 | | - int m, |
19 | | - unsigned long seed) |
20 | | -{ |
21 | | - int tid = blockIdx.x * blockDim.x + threadIdx.x; |
22 | | - if (tid >= m * traj_size) return; |
23 | | - int traj_id = tid / traj_size; |
24 | | - int idx = tid % traj_size; |
25 | | - curandState state; |
26 | | - curand_init(seed + traj_id, idx, 0, &state); |
27 | | - float re = curand_normal(&state); |
28 | | - float im = curand_normal(&state); |
29 | | - cuComplex noise = make_cuComplex(re, im); |
30 | | - d_data[tid] = cuCmulf(noise, d_sqrt_eigs[idx]); |
| 14 | +__global__ void fill_random_with_eigs(cuComplex *d_data, |
| 15 | + const cuComplex *d_sqrt_eigs, |
| 16 | + int traj_size, int m, |
| 17 | + unsigned long seed) { |
| 18 | + int tid = blockIdx.x * blockDim.x + threadIdx.x; |
| 19 | + if (tid >= m * traj_size) |
| 20 | + return; |
| 21 | + |
| 22 | + int traj_id = tid / traj_size; |
| 23 | + int idx = tid % traj_size; |
| 24 | + |
| 25 | + __shared__ curandState state[32]; |
| 26 | + int lane_id = threadIdx.x % 32; |
| 27 | + |
| 28 | + if (lane_id == 0) { |
| 29 | + curand_init(seed + traj_id, blockIdx.x, 0, &state[lane_id]); |
| 30 | + } |
| 31 | + __syncthreads(); |
| 32 | + |
| 33 | + float re = curand_normal(&state[lane_id]); |
| 34 | + float im = curand_normal(&state); |
| 35 | + cuComplex noise = make_cuComplex(re, im); |
| 36 | + d_data[tid] = cuCmulf(noise, d_sqrt_eigs[idx]); |
31 | 37 | } |
32 | 38 |
|
33 | | -__global__ void scale_and_copy_to_output( |
34 | | - const cuComplex* d_data, |
35 | | - float* d_output, |
36 | | - int n, |
37 | | - int m, |
38 | | - int offset, |
39 | | - float hurst, |
40 | | - float t) |
41 | | -{ |
42 | | - int out_size = n - offset; |
43 | | - int tid = blockIdx.x * blockDim.x + threadIdx.x; |
44 | | - if (tid >= m * out_size) return; |
45 | | - int traj_id = tid / out_size; |
46 | | - int idx = tid % out_size; |
47 | | - int data_idx = traj_id * (2 * n) + (idx + 1); |
48 | | - float scale = powf((float)n, -hurst) * powf(t, hurst); |
49 | | - d_output[tid] = d_data[data_idx].x * scale; |
| 39 | +__global__ void scale_and_copy_to_output(const cuComplex *d_data, |
| 40 | + float *d_output, int n, int m, |
| 41 | + int offset, float scale) { |
| 42 | + int out_size = n - offset; |
| 43 | + int tid = blockIdx.x * blockDim.x + threadIdx.x; |
| 44 | + if (tid >= m * out_size) |
| 45 | + return; |
| 46 | + |
| 47 | + int traj_id = tid / out_size; |
| 48 | + int idx = tid % out_size; |
| 49 | + int data_idx = traj_id * (2 * n) + (idx + 1); |
| 50 | + |
| 51 | + d_output[tid] = d_data[data_idx].x * scale; |
50 | 52 | } |
51 | 53 |
|
52 | | -extern "C" EXPORT void fgn_kernel( |
53 | | - const cuComplex* d_sqrt_eigs, |
54 | | - float* d_output, |
55 | | - int n, |
56 | | - int m, |
57 | | - int offset, |
58 | | - float hurst, |
59 | | - float t, |
60 | | - unsigned long seed) |
61 | | -{ |
62 | | - int traj_size = 2 * n; |
63 | | - cuComplex* d_data = nullptr; |
64 | | - cudaMalloc(&d_data, (size_t)m * traj_size * sizeof(cuComplex)); |
65 | | - { |
66 | | - int totalThreads = m * traj_size; |
67 | | - int blockSize = 512; |
68 | | - int gridSize = (totalThreads + blockSize - 1) / blockSize; |
69 | | - fill_random_with_eigs<<<gridSize, blockSize>>>(d_data, d_sqrt_eigs, traj_size, m, seed); |
70 | | - cudaDeviceSynchronize(); |
71 | | - } |
72 | | - { |
73 | | - cufftHandle plan; |
74 | | - cufftPlan1d(&plan, traj_size, CUFFT_C2C, m); |
75 | | - cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD); |
76 | | - cudaDeviceSynchronize(); |
77 | | - cufftDestroy(plan); |
78 | | - } |
79 | | - { |
80 | | - int out_size = n - offset; |
81 | | - int totalThreads = m * out_size; |
82 | | - int blockSize = 512; |
83 | | - int gridSize = (totalThreads + blockSize - 1) / blockSize; |
84 | | - scale_and_copy_to_output<<<gridSize, blockSize>>>(d_data, d_output, n, m, offset, hurst, t); |
85 | | - cudaDeviceSynchronize(); |
86 | | - } |
87 | | - cudaFree(d_data); |
| 54 | +extern "C" EXPORT void fgn_kernel(const cuComplex *d_sqrt_eigs, float *d_output, |
| 55 | + int n, int m, int offset, float hurst, |
| 56 | + float t, unsigned long seed) { |
| 57 | + int traj_size = 2 * n; |
| 58 | + cuComplex *d_data = nullptr; |
| 59 | + cudaMalloc(&d_data, (size_t)m * traj_size * sizeof(cuComplex)); |
| 60 | + |
| 61 | + int block_size = 512; |
| 62 | + int grid_size = (m * traj_size + block_size - 1) / block_size; |
| 63 | + |
| 64 | + cudaStream_t stream; |
| 65 | + cudaStreamCreate(&stream); |
| 66 | + |
| 67 | + fill_random_with_eigs<<<gridSize, blockSize, 0, stream>>>(d_data, d_sqrt_eigs, |
| 68 | + traj_size, m, seed); |
| 69 | + |
| 70 | + cufftHandle plan; |
| 71 | + cufftPlan1d(&plan, traj_size, CUFFT_C2C, m); |
| 72 | + cufftSetStream(plan, stream); |
| 73 | + cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD); |
| 74 | + cufftDestroy(plan); |
| 75 | + |
| 76 | + int out_size = n - offset; |
| 77 | + grid_size = (m * out_size + block_size - 1) / block_size; |
| 78 | + float scale = powf((float)n, -hurst) * powf(t, hurst); |
| 79 | + scale_and_copy_to_output<<<gridSize, blockSize, 0, stream>>>( |
| 80 | + d_data, d_output, n, m, offset, scale); |
| 81 | + |
| 82 | + cudaStreamSynchronize(stream); |
| 83 | + cudaStreamDestroy(stream); |
| 84 | + cudaFree(d_data); |
88 | 85 | } |
0 commit comments