 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
-#include "ggml-cuda/mmf.cuh"
 #include "ggml-cuda/mmq.cuh"
-#include "ggml-cuda/mmvf.cuh"
+#include "ggml-cuda/mmv.cuh"
 #include "ggml-cuda/mmvq.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
@@ -2009,9 +2008,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
         && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
 
-    bool use_mul_mat_vec_f = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-    bool use_mul_mat_f = !ggml_is_quantized(src0->type)
+    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
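
Read on its own, the merged gate admits only float weight types with F32 activations and F32 output. A self-contained restatement with the ggml enum stubbed, for illustration only:

    // Illustrative restatement of the use_mul_mat_vec gate (ggml_type stubbed).
    enum ggml_type_stub { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q4_0 };

    static bool gate_mul_mat_vec(ggml_type_stub t0, ggml_type_stub t1, ggml_type_stub td) {
        return (t0 == GGML_TYPE_F32 || t0 == GGML_TYPE_F16 || t0 == GGML_TYPE_BF16)
            && t1 == GGML_TYPE_F32 && td == GGML_TYPE_F32;
    }
    // gate_mul_mat_vec(GGML_TYPE_F16,  GGML_TYPE_F32, GGML_TYPE_F32) -> true
    // gate_mul_mat_vec(GGML_TYPE_Q4_0, GGML_TYPE_F32, GGML_TYPE_F32) -> false (quantized weights go to mmvq/mmq)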
@@ -2031,18 +2028,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
             }
 
             const int cc = ggml_cuda_info().devices[id].cc;
-            const int warp_size = ggml_cuda_info().devices[id].warp_size;
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
-            use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+            use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         }
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
-        const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
-        use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+        use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }
 
@@ -2055,17 +2048,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
     // TODO update for generic tensor parallelism
-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
     bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
     bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
 
-    if (!split && use_mul_mat_vec_f) {
+    if (!split && use_mul_mat_vec) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
-        ggml_cuda_mul_mat_vec_f(ctx, src0, src1, nullptr, dst);
-    } else if (!split && use_mul_mat_f) {
-        ggml_cuda_mul_mat_f(ctx, src0, src1, nullptr, dst);
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
     } else if (!split && use_mul_mat_vec_q) {
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && use_mul_mat_q) {
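
Together with the next hunk, the non-split dispatch in ggml_cuda_mul_mat reduces after this change to the ladder below. This is a simplified orientation sketch, not verbatim source; use_batched_cublas is a stand-in name abbreviating the f16/bf16/f32 and shape conditions above.

    // Simplified dispatch order after this change (sketch; fragment of the surrounding function):
    if (!split && use_mul_mat_vec) {
        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);    // custom F32/F16/BF16 GEMV
    } else if (!split && use_mul_mat_vec_q) {
        ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);  // quantized GEMV, q8_1 activations
    } else if (!split && use_mul_mat_q) {
        ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);      // quantized GEMM
    } else if (!split && use_batched_cublas) {                   // stand-in for the cuBLAS conditions
        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);  // multi-batch KQ/KQV without FlashAttention
    } else {
        // split-tensor and per-row fallbacks via ggml_cuda_op_mul_mat(), next hunk
    }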
@@ -2074,8 +2065,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
         && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
-    } else if (use_mul_mat_vec_f) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_f, nullptr);
+    } else if (use_mul_mat_vec) {
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
     } else if (use_mul_mat_vec_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
     } else if (use_mul_mat_q) {
@@ -2103,7 +2094,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         if (ggml_is_quantized(src0->type)) {
             ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
         } else {
-            ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
+            ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
         }
         return;
     }
@@ -3525,7 +3516,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
 #endif // FLASH_ATTN_AVAILABLE
             if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
                 const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
-                if (!turing_mma_available(cc)) {
+                if (!new_mma_available(cc)) {
                     return false;
                 }
                 const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
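
For context on the capability gate restored above: as the check implies, the FlashAttention path with differing K and V head sizes (op->src[1]->ne[0] != op->src[2]->ne[0]) is only served by the mma-based kernel, so devices without those instructions must report the op as unsupported. A hedged sketch of the predicate's assumed semantics, matching the turing_mma_available() it replaces here:

    // Assumed semantics of new_mma_available() (sketch, not the real implementation):
    // tensor-core mma instructions are available from Turing on, NVIDIA devices only.
    static bool new_mma_available_sketch(int cc) {
        const int GGML_CUDA_CC_TURING = 750;  // ggml's cc encoding for NVIDIA sm_75
        return cc >= GGML_CUDA_CC_TURING;     // real check also excludes AMD/Intel cc encodings
    }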