
Commit b9ab0a4

CUDA: use arch list for compatibility check (#11775)

* CUDA: use arch list for feature availability check

Co-authored-by: Diego Devesa <[email protected]>

1 parent 7b891bd · commit b9ab0a4

File tree: 6 files changed, +80 -24 lines changed
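For context: recent nvcc versions (CUDA 11.5 and newer) predefine __CUDA_ARCH_LIST__ as the comma-separated list of compute capabilities a binary is being compiled for. The helpers added in ggml-cuda/common.cuh below expand that list at compile time, so availability checks can ask "was a kernel actually compiled for an arch this device can run?" rather than only "does the hardware have the feature?". A hypothetical illustration of the macro's shape (the exact list depends on the build, e.g. on CMAKE_CUDA_ARCHITECTURES):

// Hypothetical build configured with CMAKE_CUDA_ARCHITECTURES="61;70;75".
// nvcc then predefines something like:
//   #define __CUDA_ARCH_LIST__ 610,700,750
// so a call such as
//   ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__)
// expands to
//   ggml_cuda_has_arch_impl(arch, 610, 700, 750)
// and the template recursion added below walks the pack entry by entry.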

ggml/src/ggml-common.h

Lines changed: 0 additions & 2 deletions

@@ -473,7 +473,6 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
     240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
 GGML_TABLE_END()

-//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
 GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
     0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
@@ -508,7 +507,6 @@ GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
     0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
 GGML_TABLE_END()
-//#endif


 GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)

ggml/src/ggml-cuda/common.cuh

Lines changed: 60 additions & 5 deletions

@@ -71,6 +71,47 @@
 #define GGML_CUDA_CC_QY1 210
 #define GGML_CUDA_CC_QY2 220

+#ifdef __CUDA_ARCH_LIST__
+constexpr bool ggml_cuda_has_arch_impl(int) {
+    return false;
+}
+
+template<class ... Archs>
+constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
+    return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
+}
+
+constexpr bool ggml_cuda_has_arch(const int arch) {
+    return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
+}
+
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur) {
+    if (cur == 0) {
+        GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
+    }
+    return cur;
+}
+
+template<class ... Archs>
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
+    if (first <= arch && first > cur) {
+        return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...);
+    } else {
+        return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...);
+    }
+}
+
+constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
+    return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__);
+}
+#else
+static int ggml_cuda_highest_compiled_arch(const int arch) {
+    return arch;
+}
+#endif // __CUDA_ARCH_LIST__
+
+// ---------------------------------------------------------------------------------------------------------
+
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

 #if defined(_MSC_VER)
@@ -162,18 +203,32 @@ typedef float2 dfloat2;
 #define FLASH_ATTN_AVAILABLE
 #endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)

-static constexpr bool fast_fp16_available(const int cc) {
+static bool fp16_available(const int cc) {
+    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
+}
+
+static bool fast_fp16_available(const int cc) {
+    return fp16_available(cc) && cc != 610;
+}
+
+// To be used for feature selection of external libraries, e.g. cuBLAS.
+static bool fast_fp16_hardware_available(const int cc) {
     return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
 }

-// Any FP16 tensor cores are available.
-static constexpr bool fp16_mma_available(const int cc) {
+// Any FP16 tensor core instructions are available for ggml code.
+static bool fp16_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
+}
+
+// To be used for feature selection of external libraries, e.g. cuBLAS.
+static bool fp16_mma_hardware_available(const int cc) {
     return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
 }

 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
-static constexpr bool new_mma_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
+static bool new_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }

 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
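The recursion in these helpers is easiest to see with concrete numbers. A minimal host-only sketch, under stated assumptions: FAKE_ARCH_LIST is made up and stands in for the compiler-provided __CUDA_ARCH_LIST__, and the GGML_ABORT base case is simplified to returning 0 so the functions stay usable inside static_assert.

// sketch.cpp: not part of the commit. Simulates the helpers above with a
// hypothetical arch list; compile with: g++ -std=c++17 sketch.cpp
#include <cstdio>

#define FAKE_ARCH_LIST 610, 700, 750 // stand-in for __CUDA_ARCH_LIST__

constexpr bool has_arch_impl(int) { return false; }

template <class... Archs>
constexpr bool has_arch_impl(const int arch, const int first, Archs... rest) {
    // linear scan over the pack: true as soon as one entry matches
    return arch == first || has_arch_impl(arch, rest...);
}

constexpr bool has_arch(const int arch) {
    return has_arch_impl(arch, FAKE_ARCH_LIST);
}

// base case simplified vs. the real code, which calls GGML_ABORT when cur == 0
constexpr int highest_compiled_arch_impl(const int /*arch*/, const int cur) {
    return cur;
}

template <class... Archs>
constexpr int highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
    // keep the largest list entry that does not exceed the device arch
    return (first <= arch && first > cur) ? highest_compiled_arch_impl(arch, first, rest...)
                                          : highest_compiled_arch_impl(arch, cur, rest...);
}

constexpr int highest_compiled_arch(const int arch) {
    return highest_compiled_arch_impl(arch, 0, FAKE_ARCH_LIST);
}

int main() {
    static_assert( has_arch(700), "700 is in the list");
    static_assert(!has_arch(860), "860 is not in the list");
    // an Ada GPU (cc 890) would run the sm_75 build, the best one available
    static_assert(highest_compiled_arch(890) == 750, "");
    // a Maxwell GPU (cc 520) predates every compiled arch -> 0 (real code aborts)
    static_assert(highest_compiled_arch(520) == 0, "");
    std::printf("highest compiled arch for cc 890: %d\n", highest_compiled_arch(890));
    return 0;
}

In effect, ggml_cuda_highest_compiled_arch answers "which compiled kernel variant will this device actually run?", and that is what the reworked *_available checks in the rest of the commit consume.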

ggml/src/ggml-cuda/convert.cu

Lines changed: 1 addition & 1 deletion

@@ -599,7 +599,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) {
+            if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) {
                 return dequantize_block_q8_0_f16_cuda;
             }
             return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
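Why the change matters here: kernel selection now tracks what was compiled rather than only what the device supports. A worked example under assumed values (GGML_CUDA_CC_PASCAL = 600 in ggml's cc numbering; the arch list is hypothetical):

// Assumed scenario: a cc 6.1 device (cc = 610) running a binary built with
// __CUDA_ARCH_LIST__ = 520 only.
//
// old check: cc >= GGML_CUDA_CC_PASCAL, i.e. 610 >= 600  -> true,
//            so dequantize_block_q8_0_f16_cuda would be selected even though
//            no Pascal-level code exists in the binary
// new check: fp16_available(610) = (highest_compiled_arch(610) >= 600)
//                                = (520 >= 600)          -> false,
//            so the generic dequantize_block_cuda fallback is selected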

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 6 additions & 6 deletions

@@ -1867,14 +1867,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor

             const int cc = ggml_cuda_info().devices[id].cc;
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
-            any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
+            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
+            any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
         }
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
-        any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
+        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
+        any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
     }

     // debug helpers
@@ -3205,8 +3205,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
                 return true;
             }
-            const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
-            return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
+            return fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc) &&
+                op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
         }
         case GGML_OP_CROSS_ENTROPY_LOSS:
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
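This hunk shows why the commit splits each predicate into an *_available and a *_hardware_available variant: the mul_mat heuristics feed external libraries (cuBLAS ships its own kernels, so the raw hardware check is the right question there), while supports_op must not advertise an op whose ggml kernel was never compiled. A worked example under assumed values (GGML_CUDA_CC_VOLTA = 700; the arch list is hypothetical):

// Assumed scenario: a Volta GPU (cc = 700) running a binary built with
// __CUDA_ARCH_LIST__ = 520,610 only.
//
// fp16_mma_hardware_available(700): 700 >= 700                    -> true
//   -> cuBLAS can still use the tensor cores, so treating FP16 as fast
//      in ggml_cuda_mul_mat remains correct
// fp16_mma_available(700): highest_compiled_arch(700) = 610 < 700 -> false
//   -> the flash-attention branch of supports_op correctly declines instead
//      of promising a tensor-core kernel that is not in the binary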

ggml/src/ggml-cuda/mmq.cu

Lines changed: 5 additions & 4 deletions

@@ -18,7 +18,7 @@ void ggml_cuda_op_mul_mat_q(
     const int64_t stride00 = ne00 / ggml_blck_size(src0->type);

     int id = ggml_cuda_get_device();
-    const int compute_capability = ggml_cuda_info().devices[id].cc;
+    const int cc = ggml_cuda_info().devices[id].cc;

     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the kernel writes into
@@ -27,7 +27,8 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
+    const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA &&
+        cc < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
     const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};

     switch (src0->type) {
@@ -136,7 +137,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return true;
     }

-    if (cc < GGML_CUDA_CC_DP4A) {
+    if (ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_DP4A) {
         return false;
     }

@@ -145,7 +146,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
 #endif //GGML_CUDA_FORCE_MMQ

     if (cc < GGML_CUDA_CC_OFFSET_AMD) {
-        return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+        return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }

     return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc) && !GGML_CUDA_CC_IS_GCN(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 8 additions & 6 deletions

@@ -86,12 +86,13 @@ struct tile_x_sizes {
     int sc;
 };

-static constexpr int get_mmq_x_max_host(const int cc) {
+static int get_mmq_x_max_host(const int cc) {
     return new_mma_available(cc) ? 128 :
+        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ?
 #ifdef GGML_CUDA_FORCE_MMQ
-        cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;
+            128 : 64;
 #else
-        cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
+            MMQ_DP4A_MAX_BATCH_SIZE : 64;
 #endif // GGML_CUDA_FORCE_MMQ
 }

@@ -119,8 +120,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
 #endif // NEW_MMA_AVAILABLE
 }

-static constexpr int get_mmq_y_host(const int cc) {
-    return cc >= GGML_CUDA_CC_OFFSET_AMD ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
+static int get_mmq_y_host(const int cc) {
+    return cc >= GGML_CUDA_CC_OFFSET_AMD ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
+        (ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ? 128 : 64);
 }

 static constexpr __device__ int get_mmq_y_device() {
@@ -2828,7 +2830,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
     const int mmq_x_max = get_mmq_x_max_host(cc);
     const int mmq_y = get_mmq_y_host(cc);
     const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
-    const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
+    const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;

     int mmq_x_best = 0;
     int nparts_best = INT_MAX;
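All of these checks hinge on what __CUDA_ARCH_LIST__ actually contained at build time. A small sketch for inspecting it (the ARCH_STR stringify helpers are made up for illustration and are not part of ggml; nvcc defines the macro for both host and device compilation):

// arch_list_dump.cu: hypothetical debugging aid, not part of this commit.
// Build with nvcc so the macro is defined, e.g.: nvcc arch_list_dump.cu
#include <cstdio>

#define ARCH_STR_IMPL(x) #x
#define ARCH_STR(x)      ARCH_STR_IMPL(x) // expand the macro, then stringify

int main() {
#ifdef __CUDA_ARCH_LIST__
    // prints e.g. "520,610,700,750" for a typical multi-arch build
    std::printf("compiled arch list: %s\n", ARCH_STR(__CUDA_ARCH_LIST__));
#else
    std::printf("this compiler did not provide __CUDA_ARCH_LIST__\n");
#endif
    return 0;
}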
