|
7 | 7 |
|
8 | 8 | #ifdef USE_CUDA |
9 | 9 |
|
10 | | -#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__) |
11 | | -// ROCm doesn't have __shfl_down_sync, only __shfl_down without mask. |
| 10 | +#if defined(__HIP_PLATFORM_AMD__) |
| 11 | + |
| 12 | +// ROCm doesn't have atomicAdd_block, but it should be semantically the same as atomicAdd |
| 13 | +#define atomicAdd_block atomicAdd |
| 14 | + |
| 15 | +// hipify: map CUDA runtime API names onto their HIP equivalents |
| 16 | +#include <hip/hip_runtime.h> |
| 17 | +#define cudaDeviceProp hipDeviceProp_t |
| 18 | +#define cudaDeviceSynchronize hipDeviceSynchronize |
| 19 | +#define cudaError_t hipError_t |
| 20 | +#define cudaFree hipFree |
| 21 | +#define cudaFreeHost hipFreeHost |
| 22 | +#define cudaGetDevice hipGetDevice |
| 23 | +#define cudaGetDeviceProperties hipGetDeviceProperties |
| 24 | +#define cudaGetErrorName hipGetErrorName |
| 25 | +#define cudaGetErrorString hipGetErrorString |
| 26 | +#define cudaGetLastError hipGetLastError |
| 27 | +#define cudaHostAlloc hipHostAlloc |
| 28 | +#define cudaHostAllocPortable hipHostAllocPortable |
| 29 | +#define cudaMalloc hipMalloc |
| 30 | +#define cudaMemcpy hipMemcpy |
| 31 | +#define cudaMemcpyAsync hipMemcpyAsync |
| 32 | +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice |
| 33 | +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost |
| 34 | +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice |
| 35 | +#define cudaMemoryTypeHost hipMemoryTypeHost |
| 36 | +#define cudaMemset hipMemset |
| 37 | +#define cudaPointerAttributes hipPointerAttribute_t |
| 38 | +#define cudaPointerGetAttributes hipPointerGetAttributes |
| 39 | +#define cudaSetDevice hipSetDevice |
| 40 | +#define cudaStreamCreate hipStreamCreate |
| 41 | +#define cudaStreamDestroy hipStreamDestroy |
| 42 | +#define cudaStream_t hipStream_t |
| 43 | +#define cudaSuccess hipSuccess |
| 44 | + |
| 45 | +// ROCm 7.0 did add __shfl_down_sync et al, but the following hack still works. |
12 | 46 | // Since mask is full 0xffffffff, we can use __shfl_down instead. |
13 | 47 | #define __shfl_down_sync(mask, val, offset) __shfl_down(val, offset) |
14 | 48 | #define __shfl_up_sync(mask, val, offset) __shfl_up(val, offset) |
15 | | -// ROCm warpSize is constexpr and is either 32 or 64 depending on gfx arch. |
16 | | -#define WARPSIZE warpSize |
17 | | -// ROCm doesn't have atomicAdd_block, but it should be semantically the same as atomicAdd |
18 | | -#define atomicAdd_block atomicAdd |
19 | | -#else |
| 49 | + |
| 50 | +// warpSize is only allowed for device code. |
| 51 | +// HIP header used to define warpSize as a constexpr that was either 32 or 64 |
| 52 | +// depending on the target device, and then always set it to 64 for host code. |
| 53 | +static inline constexpr int WARP_SIZE_INTERNAL() { |
| 54 | +#if defined(__GFX9__) |
| 55 | + return 64; |
| 56 | +#else // __GFX9__ |
| 57 | + return 32; |
| 58 | +#endif // __GFX9__ |
| 59 | +} |
| 60 | +#define WARPSIZE (WARP_SIZE_INTERNAL()) |
| 61 | + |
| 62 | +#else // __HIP_PLATFORM_AMD__ |
20 | 63 | // CUDA warpSize is not a constexpr, but always 32 |
21 | 64 | #define WARPSIZE 32 |
22 | 65 | #endif // defined(__HIP_PLATFORM_AMD__) |
|
0 commit comments