Skip to content

Commit e660942

Browse files
committed
vulkan: support q4_0/q8_0 KV in scalar FA
1 parent 989bfb1 commit e660942

File tree

3 files changed

+66
-8
lines changed

3 files changed

+66
-8
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1939,6 +1939,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
19391939
CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 256)
19401940

19411941
CREATE_FA(GGML_TYPE_F16, f16, true, )
1942+
CREATE_FA(GGML_TYPE_Q4_0, q4_0, true, )
1943+
CREATE_FA(GGML_TYPE_Q8_0, q8_0, true, )
19421944
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
19431945
if (device->coopmat2) {
19441946
CREATE_FA(GGML_TYPE_F16, f16, false, _cm2)
@@ -9603,10 +9605,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
96039605
switch (op->src[1]->type) {
96049606
case GGML_TYPE_F16:
96059607
case GGML_TYPE_Q4_0:
9608+
case GGML_TYPE_Q8_0:
9609+
// supported in scalar and coopmat2 paths
9610+
break;
96069611
case GGML_TYPE_Q4_1:
96079612
case GGML_TYPE_Q5_0:
96089613
case GGML_TYPE_Q5_1:
9609-
case GGML_TYPE_Q8_0:
96109614
// K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently
96119615
//case GGML_TYPE_Q2_K:
96129616
//case GGML_TYPE_Q3_K:
@@ -9622,13 +9626,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
96229626
//case GGML_TYPE_IQ3_S:
96239627
//case GGML_TYPE_IQ4_XS:
96249628
case GGML_TYPE_IQ4_NL:
9629+
// currently supported only in coopmat2 path
9630+
if (!coopmat2) {
9631+
return false;
9632+
}
96259633
break;
96269634
default:
96279635
return false;
96289636
}
9629-
if (!coopmat2 && op->src[1]->type != GGML_TYPE_F16) {
9630-
return false;
9631-
}
96329637
if (!coopmat2 && !device->subgroup_shuffle) {
96339638
// scalar FA uses subgroupShuffle
96349639
return false;

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,36 @@ layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
7272
layout (binding = 3) readonly buffer M {float16_t data_m[];};
7373
layout (binding = 4) writeonly buffer O {D_TYPE data_o[];};
7474

75+
// Extra view of the K/V buffers for quantized types, compiled only when the
// build defines A_TYPE_PACKED16. kv_packed is a two-element array of readonly
// buffer blocks; BINDING_IDX_K / BINDING_IDX_V are the array indices that
// dequantize4() receives as binding_idx to pick K or V.
// NOTE(review): the array is declared at binding = 1 while the VV4 block above
// uses binding = 2 - confirm that kv_packed[1] really resolves to the V buffer.
#if defined(A_TYPE_PACKED16)
76+
#define BINDING_IDX_K 0
77+
#define BINDING_IDX_V 1
78+
layout (binding = 1) readonly buffer KV_PACKED16 {A_TYPE_PACKED16 data_packed16[];} kv_packed[2];
79+
#endif
80+
81+
// Q4_0 variant of the K/V dequantizer used by the scalar flash-attention path.
#if defined(DATA_A_Q4_0)
82+
// Divisor main() applies to the K/V byte offsets to obtain the block index passed
// to dequantize4() as a_offset. Presumably the byte size of one Q4_0 block
// (fp16 scale + 16 bytes of nibbles) - TODO confirm against the block definition.
#define BLOCK_BYTE_SIZE 18
83+
84+
// Dequantizes four values from one quantized block.
//   ib          - block index relative to a_offset (callers pass coord / BLOCK_SIZE)
//   iqs         - element index inside the block (callers pass coord % BLOCK_SIZE;
//                 with the coord formula used in main() this looks like a multiple of 4)
//   a_offset    - base block index of the current K or V slice
//   binding_idx - BINDING_IDX_K or BINDING_IDX_V, selects the kv_packed element
// Returns d * (q - 8) for the four selected 4-bit quants.
vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
85+
// Two consecutive qs entries; only the low four bits of iqs pick the entry.
uint vui_lo = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
86+
uint vui_hi = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
87+
// Bit 4 of iqs (iqs >= 16) yields a shift of 4, i.e. the high nibble of each byte;
// otherwise the shift is 0 and the low nibble is used.
uint shift = (iqs & 0x10) >> 2;
88+
vui_lo >>= shift;
89+
vui_hi >>= shift;
90+
91+
// One nibble from bits 0-3 and one from bits 8-11 of each entry, re-centred by -8
// and scaled by the block's d.
return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
92+
}
93+
#endif
94+
95+
// Q8_0 variant of the K/V dequantizer used by the scalar flash-attention path.
#if defined(DATA_A_Q8_0)
96+
// Divisor main() applies to the K/V byte offsets to obtain the block index passed
// to dequantize4() as a_offset. Presumably the byte size of one Q8_0 block
// (fp16 scale + 32 int8 quants) - TODO confirm against the block definition.
#define BLOCK_BYTE_SIZE 34
97+
// Dequantizes four values from one quantized block.
//   ib          - block index relative to a_offset (callers pass coord / BLOCK_SIZE)
//   iqs         - element index inside the block (callers pass coord % BLOCK_SIZE)
//   a_offset    - base block index of the current K or V slice
//   binding_idx - BINDING_IDX_K or BINDING_IDX_V, selects the kv_packed element
// Returns d * q for the four selected quants.
vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
98+
// Each qs entry is widened to int32 and unpacked into four int8 lanes; only the
// first two lanes (.xy) are kept, so two consecutive entries give four quants.
const i8vec2 v0 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
99+
const i8vec2 v1 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
100+
101+
// Scale the four signed quants by the block's d.
return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
102+
}
103+
#endif
104+
75105
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
76106

77107
// Store the output when doing grouped query attention.
@@ -208,6 +238,14 @@ void main() {
208238
}
209239
}
210240

241+
#if BLOCK_SIZE > 1
242+
uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / BLOCK_BYTE_SIZE;
243+
uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / BLOCK_BYTE_SIZE;
244+
#else
245+
uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / 2;
246+
uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2;
247+
#endif
248+
211249
[[dont_unroll]]
212250
for (uint32_t j = start_j; j < end_j; ++j) {
213251

@@ -218,11 +256,17 @@ void main() {
218256
}
219257
}
220258

221-
uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / 2;
222259

223260
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
224261
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
262+
#if BLOCK_SIZE > 1
263+
uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
264+
uint ib = coord / BLOCK_SIZE;
265+
uint iqs = (coord % BLOCK_SIZE);
266+
vec4 K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
267+
#else
225268
vec4 K_Tf = vec4(data_kv4[k_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * k_stride / 4 + d * D_split + d_tid]);
269+
#endif
226270
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
227271
Sf[r][c] += dot(Qf[r][d * D_split + d_tid], K_Tf);
228272
}
@@ -297,11 +341,16 @@ void main() {
297341
}
298342
}
299343

300-
uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2;
301-
302344
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
303345
[[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
346+
#if BLOCK_SIZE > 1
347+
uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
348+
uint ib = coord / BLOCK_SIZE;
349+
uint iqs = (coord % BLOCK_SIZE);
350+
vec4 Vf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
351+
#else
304352
vec4 Vf = vec4(data_vv4[v_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * v_stride / 4 + d * D_split + d_tid]);
353+
#endif
305354
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
306355
Of[r][d] += Pf[r][c] * Vf;
307356
}

ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -444,7 +444,11 @@ void process_shaders() {
444444
if (tname == "f16") {
445445
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
446446
merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, false, false, f16acc);
447-
} // quants not supported yet
447+
} else if (tname == "q4_0" || tname == "q8_0") {
448+
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
449+
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
450+
merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, false, false, f16acc);
451+
}
448452
}
449453
}
450454

0 commit comments

Comments
 (0)