 #define GGML_CUDA_CC_QY1 210
 #define GGML_CUDA_CC_QY2 220
 
+#ifdef __CUDA_ARCH_LIST__
+constexpr bool ggml_cuda_has_arch_impl(int) {
+    return false;
+}
+
+template<class ... Archs>
+constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
+    return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
+}
+
+constexpr bool ggml_cuda_has_arch(const int arch) {
+    return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
+}
+
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur) {
+    if (cur == 0) {
+        GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
+    }
+    return cur;
+}
+
+template<class ... Archs>
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
+    if (first <= arch && first > cur) {
+        return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...);
+    } else {
+        return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...);
+    }
+}
+
+constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
+    return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__);
+}
+#else
+static int ggml_cuda_highest_compiled_arch(const int arch) {
+    return arch;
+}
+#endif // __CUDA_ARCH_LIST__
+
+// ---------------------------------------------------------------------------------------------------------
+
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #if defined(_MSC_VER)
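The variadic recursion added above scans __CUDA_ARCH_LIST__, the comma-separated list of compute capabilities that nvcc defines for the architectures a binary was built for (e.g. 610,700,750). For readers who want to see the mechanics in isolation, here is a minimal host-side sketch of the same scan; the shortened names and the hard-coded list are assumptions for illustration, and the base case returns 0 where the real code calls GGML_ABORT:

// Standalone sketch (illustrative, not part of the patch): the same
// compile-time scan with the arch list hard-coded to 610,700,750.
#include <cstdio>

constexpr bool has_arch_impl(int) {
    return false; // end of list: arch was not found
}

template<class ... Archs>
constexpr bool has_arch_impl(const int arch, const int first, Archs... rest) {
    return arch == first || has_arch_impl(arch, rest...);
}

constexpr bool has_arch(const int arch) {
    return has_arch_impl(arch, 610, 700, 750); // real code expands __CUDA_ARCH_LIST__ here
}

constexpr int highest_impl(const int /*arch*/, const int cur) {
    return cur; // real code aborts via GGML_ABORT when cur == 0
}

template<class ... Archs>
constexpr int highest_impl(const int arch, const int cur, const int first, Archs... rest) {
    // Keep "first" only if it fits the device (<= arch) and beats the best so far.
    return (first <= arch && first > cur) ? highest_impl(arch, first, rest...)
                                          : highest_impl(arch, cur, rest...);
}

constexpr int highest_compiled_arch(const int arch) {
    return highest_impl(arch, 0, 610, 700, 750);
}

int main() {
    static_assert( has_arch(700), "700 is in the list");
    static_assert(!has_arch(860), "860 is not in the list");
    // For a device newer than anything compiled in, the best fitting
    // binary (highest arch not exceeding the device) is selected:
    static_assert(highest_compiled_arch(860) == 750, "closest arch <= 860");
    std::printf("highest compiled arch for cc 890: %d\n", highest_compiled_arch(890));
    return 0;
}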
@@ -162,18 +203,32 @@ typedef float2 dfloat2;
 #define FLASH_ATTN_AVAILABLE
 #endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 
-static constexpr bool fast_fp16_available(const int cc) {
+static bool fp16_available(const int cc) {
+    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
+}
+
+static bool fast_fp16_available(const int cc) {
+    return fp16_available(cc) && cc != 610;
+}
+
+// To be used for feature selection of external libraries, e.g. cuBLAS.
+static bool fast_fp16_hardware_available(const int cc) {
     return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
 }
 
-// Any FP16 tensor cores are available.
-static constexpr bool fp16_mma_available(const int cc) {
+// Any FP16 tensor core instructions are available for ggml code.
+static bool fp16_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
+}
+
+// To be used for feature selection of external libraries, e.g. cuBLAS.
+static bool fp16_mma_hardware_available(const int cc) {
     return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
-static constexpr bool new_mma_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
+static bool new_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }
 
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
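The available/hardware_available pairs above answer two different questions. fp16_mma_available() asks whether this binary contains ggml kernels that can use the feature, which depends on the compiled arch list; fp16_mma_hardware_available() asks whether the physical device supports it, which is all an external library such as cuBLAS needs, since it ships its own kernels. A self-contained sketch of how the two can disagree, assuming a build that only compiled sm_75 and using simplified stand-ins for the ggml constants and helpers (all names and values below are illustrative):

// Illustrative stand-ins, not the patch: a binary built only for sm_75
// running on a Volta (cc 700) device.
#include <cstdio>

constexpr int CC_VOLTA      = 700;
constexpr int CC_OFFSET_AMD = 1000000; // AMD ccs live above this offset

// Stand-in for ggml_cuda_highest_compiled_arch() with the compiled list
// fixed to {750}; the real function aborts instead of returning 0.
constexpr int highest_compiled_arch(const int arch) {
    return arch >= 750 ? 750 : 0;
}

// Can ggml's own kernels use FP16 tensor cores? Requires that a fitting
// arch (>= Volta, <= device cc) was actually compiled into this binary.
static bool fp16_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && highest_compiled_arch(cc) >= CC_VOLTA;
}

// Can the physical device use FP16 tensor cores? Enough for cuBLAS,
// which brings its own kernels for every architecture.
static bool fp16_mma_hardware_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}

int main() {
    const int cc = 700; // Volta device, but only sm_75 code in the binary
    std::printf("ggml kernels can use fp16 mma: %d\n", fp16_mma_available(cc));          // 0
    std::printf("hardware supports fp16 mma:    %d\n", fp16_mma_hardware_available(cc)); // 1
    return 0;
}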