From 221946777c845c4665b1cb4d5055a39978f8c06f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Sun, 16 Apr 2023 00:37:16 +0200
Subject: [PATCH 1/8] test-quantize: fix for q8_0 intermediates

---
 tests/test-quantize-perf.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index d5514455db11d..e0be681cf0b0d 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -228,7 +228,7 @@ int main(int argc, char * argv[]) {
         if (qfns.quantize_row_q && qfns.dequantize_row_q) {
             printf("%s\n", ggml_type_name(type));
 
-            if (params.op_quantize_row_q_reference) {
+            if (params.op_quantize_row_q_reference && qfns.quantize_row_q_reference) {
                 printf("  quantize_row_q_reference\n");
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
@@ -242,7 +242,7 @@ int main(int argc, char * argv[]) {
                 printf("\n");
             }
 
-            if (params.op_quantize_row_q) {
+            if (params.op_quantize_row_q && qfns.quantize_row_q) {
                 printf("  quantize_row_q\n");
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
@@ -256,7 +256,7 @@ int main(int argc, char * argv[]) {
                 printf("\n");
             }
 
-            if (params.op_dequantize_row_q) {
+            if (params.op_dequantize_row_q && qfns.dequantize_row_q) {
                 printf("  dequantize_row_q\n");
                 qfns.quantize_row_q(test_data1, test_q1, largest);
                 for (size_t size : params.test_sizes) {
@@ -271,7 +271,7 @@ int main(int argc, char * argv[]) {
                 printf("\n");
             }
 
-            if (params.op_quantize_row_q_dot) {
+            if (params.op_quantize_row_q_dot && qfns.quantize_row_q_dot) {
                 printf("  quantize_row_q_dot\n");
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
@@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
                 printf("\n");
             }
 
-            if (params.op_vec_dot_q) {
+            if (params.op_vec_dot_q && qfns.vec_dot_q) {
                 printf("  vec_dot_q\n");
                 qfns.quantize_row_q(test_data1, test_q1, largest);
                 qfns.quantize_row_q(test_data2, test_q2, largest);
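
Note for readers: the next patch documents the new layout only with two terse comments (see its "pseudostruct" TODO). As a reading aid, here is a standalone sketch (illustrative only, not part of the patch) of how the two sections of a q4_0c row are addressed, assuming the patch's constants QK4_0 = 32 and QK4_0C = 4*32:

    #include <stddef.h>
    #include <stdint.h>

    #define QK4_0  32
    #define QK4_0C (4*32)

    // A q4_0c row of k values (k % QK4_0C == 0) stores all packed nibbles
    // first and all block scales after them, so SIMD code can issue large
    // contiguous loads from each section.
    typedef struct {
        const uint8_t * qs; // k/2 bytes of packed nibbles
        const float   * ds; // k/QK4_0 block scales
    } q4_0c_row;

    static q4_0c_row q4_0c_sections(const void * row, int k) {
        const int nsb = k / QK4_0C; // number of four-block superblocks
        q4_0c_row r;
        r.qs = (const uint8_t *) row;
        r.ds = (const float *) (r.qs + (size_t) QK4_0C/2 * nsb);
        return r;
    }

This mirrors the qs/ds pointer setup in quantize_row_q4_0c_reference and dequantize_row_q4_0c below; q8_0c is analogous with one byte per value.
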
From a1e6fb92814d01cdabee9a61dcae8dbff281ad1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Mon, 17 Apr 2023 23:36:29 +0200
Subject: [PATCH 2/8] q4_0c continuous row layout

Introduce alternative quantized formats q4_0c and q8_0c, corresponding
exactly to q4_0 and q8_0, except that quantized values and scales are
laid out continuously in memory, and the nibbles in q4_0 are
rearranged. This should simplify SIMD implementations, at the expense
of slightly more complex scalar implementations.
---
 ggml.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 ggml.h |   2 +
 2 files changed, 207 insertions(+), 4 deletions(-)

diff --git a/ggml.c b/ggml.c
index bce7a7a57e939..84dd55100d92d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -772,6 +772,14 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 
+#define QK4_0C (4*32)
+#define QK4_0C_MUL (QK4_0C / QK4_0)
+// TODO: nicer description - pseudostruct?
+// q4_0c : (uint8_t[QK4_0C/2]) qs[nb] || float d[n]
+
+#define QK8_0C 32
+// q8_0c : uint8_t qs[n] || float d[n]
+
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
     assert(k % QK4_0 == 0);
@@ -1117,6 +1125,57 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
 #endif
 }
 
+static void quantize_row_q4_0c_reference(const float * restrict x, uint8_t * restrict y, int k) {
+    assert(k % QK4_0C == 0);
+    const int nb = k / QK4_0;
+    const int nsb = k / QK4_0C;
+
+    // Split y into nibbles section and scales section
+    uint8_t * restrict qs = y;
+    float * restrict ds = (float *) (y + QK4_0C/2 * nsb);
+
+    for (int i = 0; i < nb/2; i++) {
+        // Interleave two output blocks in low and high nibbles
+        const int src0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
+        const int src1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
+        const float * xb[2] = {
+            x + QK4_0 * src0, // block in low nibbles
+            x + QK4_0 * src1, // block in high nibbles
+        };
+
+        // Find multiplier for each block
+        float d[2];
+        float id[2];
+        for (int j = 0; j < 2; j++) {
+            float amax = 0.0f; // absolute max
+
+            for (int l = 0; l < QK4_0; l++) {
+                const float v = xb[j][l];
+                amax = MAX(amax, fabsf(v));
+            }
+
+            d[j]  = amax / ((1 << 3) - 1);
+            id[j] = d[j] ? 1.0f/d[j] : 0.0f;
+        }
+
+        ds[src0] = d[0];
+        ds[src1] = d[1];
+
+        for (int l = 0; l < QK4_0; l++) {
+            const float v0 = xb[0][l]*id[0];
+            const uint8_t vi0 = (int8_t)roundf(v0) + 8;
+
+            const float v1 = xb[1][l]*id[1];
+            const uint8_t vi1 = (int8_t)roundf(v1) + 8;
+
+            assert(vi0 < 16);
+            assert(vi1 < 16);
+
+            qs[i*QK4_0 + l] = vi0 | (vi1 << 4);
+        }
+    }
+}
+
 static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
@@ -1658,6 +1717,40 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
 #endif
 }
 
+// reference implementation for deterministic creation of model files
+static void quantize_row_q8_0c_reference(const float * restrict x, void * restrict y, int k) {
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    uint8_t * restrict qs = y;
+    float * restrict ds = (float *) ((uint8_t *) y + QK8_0C * nb);
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int l = 0; l < QK8_0; l++) {
+            const float v = x[i*QK8_0 + l];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        ds[i] = d;
+
+        for (int l = 0; l < QK8_0; ++l) {
+            const float v = x[i*QK8_0 + l]*id;
+            qs[i*QK8_0 + l] = roundf(v);
+        }
+    }
+}
+
+static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int k) {
+    assert(k % QK8_0 == 0);
+
+    quantize_row_q8_0c_reference(x, vy, k);
+}
+
 static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
     assert(k % QK4_0 == 0);
     const int nb = k / QK4_0;
@@ -1776,6 +1869,41 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
 #endif
 }
 
+static void dequantize_row_q4_0c(const void * restrict vx, float * restrict y, int k) {
+    assert(k % QK4_0C == 0);
+    const int nb = k / QK4_0;
+    const int nsb = k / QK4_0C;
+
+    // Split vx into nibbles section and scales section
+    const uint8_t * restrict qs = vx;
+    const float * restrict ds = (const float *) ((const uint8_t *) vx + QK4_0C/2 * nsb);
+
+    // scalar
+    for (int i = 0; i < nb/2; i++) {
+        const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
+        const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
+
+        const float d0 = ds[dst0];
+        const float d1 = ds[dst1];
+
+        for (int l = 0; l < QK4_0; l++) {
+            const uint8_t vi = qs[i * QK4_0 + l];
+
+            const int8_t vi0 = vi & 0xf;
+            const int8_t vi1 = vi >> 4;
+
+            const float v0 = (vi0 - 8)*d0;
+            const float v1 = (vi1 - 8)*d1;
+
+            y[dst0*QK4_0 + l] = v0;
+            y[dst1*QK4_0 + l] = v1;
+
+            assert(!isnan(y[dst0*QK4_0 + l]));
+            assert(!isnan(y[dst1*QK4_0 + l]));
+        }
+    }
+}
+
 static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
@@ -2002,6 +2130,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
 }
 
 static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -2017,6 +2146,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_q                = ggml_vec_dot_q4_0_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
     },
+    [GGML_TYPE_Q4_0C] = {
+        .dequantize_row_q         = dequantize_row_q4_0c,
+        //.quantize_row_q           = quantize_row_q4_0c,
+        .quantize_row_q           = (quantize_row_q_t) quantize_row_q4_0c_reference,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0c_reference,
+        .quantize_row_q_dot       = quantize_row_q8_0c,
+        .vec_dot_q                = ggml_vec_dot_q4_0c_q8_0c,
+    },
     [GGML_TYPE_Q4_1] = {
         .dequantize_row_q         = dequantize_row_q4_1,
         .quantize_row_q           = quantize_row_q4_1,
@@ -2065,6 +2202,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_q                = NULL,   // TODO
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
+    [GGML_TYPE_Q8_0C] = {
+        .dequantize_row_q         = NULL,
+        .quantize_row_q           = quantize_row_q8_0c,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0c_reference,
+        .quantize_row_q_dot       = quantize_row_q8_0c,
+        .vec_dot_q                = NULL,
+    },
 };
 
 // For internal test use
@@ -2835,6 +2979,51 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 #endif
 }
 
+static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int nb = n / QK4_0;
+    const int nsb = n / QK4_0C;
+
+    assert(n % QK4_0C == 0);
+
+    // Split into nibbles and scales sections
+    const uint8_t * restrict xqs = vx;
+    const float * restrict xds = (const float *) ((const uint8_t *) vx + nsb*QK4_0C/2);
+    const int8_t * restrict yqs = vy;
+    const float * restrict yds = (const float *) ((const uint8_t *) vy + nb*QK8_0C);
+
+    float sumf = 0.0;
+
+    // scalar
+    for (int i = 0; i < nb/2; i++) {
+        const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
+        const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
+
+        const float dx0 = xds[dst0];
+        const float dx1 = xds[dst1];
+        const float dy0 = yds[dst0];
+        const float dy1 = yds[dst1];
+
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int l = 0; l < QK4_0; l++) {
+            const uint8_t v0 = xqs[i*QK4_0 + l];
+
+            const int i0 = (int8_t) (v0 & 0xf) - 8;
+            const int i1 = (int8_t) (v0 >> 4) - 8;
+
+            const int i2 = yqs[dst0*QK4_0 + l];
+            const int i3 = yqs[dst1*QK4_0 + l];
+
+            sumi0 += i0*i2;
+            sumi1 += i1*i3;
+        }
+        sumf += dx0*dy0*sumi0 + dx1*dy1*sumi1;
+    }
+
+    *s = sumf;
+}
+
 static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const int nb = n / QK8_1;
 
@@ -3885,66 +4074,74 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = 1,
     [GGML_TYPE_F16]  = 1,
     [GGML_TYPE_Q4_0] = QK4_0,
+    [GGML_TYPE_Q4_0C] = QK4_0C,
     [GGML_TYPE_Q4_1] = QK4_1,
     [GGML_TYPE_Q4_2] = QK4_2,
     [GGML_TYPE_Q5_0] = QK5_0,
     [GGML_TYPE_Q5_1] = QK5_1,
     [GGML_TYPE_Q8_0] = QK8_0,
+    [GGML_TYPE_Q8_0C] = QK8_0C,
     [GGML_TYPE_Q8_1] = QK8_1,
     [GGML_TYPE_I8]   = 1,
     [GGML_TYPE_I16]  = 1,
     [GGML_TYPE_I32]  = 1,
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 15, "GGML_BLCK_SIZE is outdated");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = sizeof(float),
     [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
     [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
+    [GGML_TYPE_Q4_0C] = 4*sizeof(block_q4_0),
     [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
     [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
     [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
     [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
+    [GGML_TYPE_Q8_0C] = sizeof(block_q8_0),
     [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
     [GGML_TYPE_I8]   = sizeof(int8_t),
     [GGML_TYPE_I16]  = sizeof(int16_t),
     [GGML_TYPE_I32]  = sizeof(int32_t),
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_SIZE is outdated");
 
 static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = "f32",
     [GGML_TYPE_F16]  = "f16",
     [GGML_TYPE_Q4_0] = "q4_0",
+    [GGML_TYPE_Q4_0C] = "q4_0c",
     [GGML_TYPE_Q4_1] = "q4_1",
     [GGML_TYPE_Q4_2] = "q4_2",
     [GGML_TYPE_Q5_0] = "q5_0",
     [GGML_TYPE_Q5_1] = "q5_1",
     [GGML_TYPE_Q8_0] = "q8_0",
+    [GGML_TYPE_Q8_0C] = "q8_0c",
     [GGML_TYPE_Q8_1] = "q8_1",
     [GGML_TYPE_I8]   = "i8",
     [GGML_TYPE_I16]  = "i16",
     [GGML_TYPE_I32]  = "i32",
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
+static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_NAME is outdated");
 
 static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = false,
     [GGML_TYPE_F16]  = false,
     [GGML_TYPE_Q4_0] = true,
+    [GGML_TYPE_Q4_0C] = true,
     [GGML_TYPE_Q4_1] = true,
     [GGML_TYPE_Q4_2] = true,
     [GGML_TYPE_Q5_0] = true,
     [GGML_TYPE_Q5_1] = true,
     [GGML_TYPE_Q8_0] = true,
+    [GGML_TYPE_Q8_0C] = true,
     [GGML_TYPE_Q8_1] = true,
     [GGML_TYPE_I8]   = false,
     [GGML_TYPE_I16]  = false,
     [GGML_TYPE_I32]  = false,
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
+static_assert(GGML_TYPE_COUNT == 15, "GGML_IS_QUANTIZED is outdated");
 
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",
@@ -8763,11 +8960,13 @@ static void ggml_compute_forward_mul_mat(
         struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_0C:
        case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_0C:
         case GGML_TYPE_Q8_1:
             {
                 ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
@@ -8994,11 +9193,13 @@ static void ggml_compute_forward_get_rows(
         struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_0C:
        case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_0C:
         case GGML_TYPE_Q8_1:
             {
                 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
diff --git a/ggml.h b/ggml.h
index ef5a048c3b7e4..3c1807736e8e3 100644
--- a/ggml.h
+++ b/ggml.h
@@ -237,6 +237,8 @@ extern "C" {
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
         GGML_TYPE_Q8_1 = 9,
+        GGML_TYPE_Q4_0C = 10,
+        GGML_TYPE_Q8_0C = 11,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
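
The subtle part of the format above is the nibble interleaving: byte group i within a row packs block i + i/2*2 in its low nibbles and block i + i/2*2 + 2 in its high nibbles. A small standalone check (illustrative, not part of the patch) makes the pattern visible:

    #include <stdio.h>

    // Within each superblock of four q4_0 blocks, byte group i holds
    // block src0 in its low nibbles and block src1 = src0 + 2 in its
    // high nibbles: pairs {0,2}, {1,3}, {4,6}, {5,7}, {8,10}, ...
    int main(void) {
        for (int i = 0; i < 8; i++) {
            const int src0 = i + i/2*2;
            const int src1 = i + i/2*2 + 2;
            printf("byte group %d: low = block %d, high = block %d\n", i, src0, src1);
        }
        return 0;
    }

Pairing blocks two apart is what lets one 64-byte vector load cover four blocks at once: masking with 0xf yields blocks 0 and 1, a 4-bit right shift yields blocks 2 and 3 (used by the AVX-512 kernel in patch 4).
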
From 4bd781cd2572de9ec022178e9973f79cd1c7b278 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Tue, 18 Apr 2023 00:57:30 +0200
Subject: [PATCH 3/8] q4_0c: quantize support

---
 examples/quantize/quantize.cpp |  1 +
 ggml.c                         | 41 ++++++++++++++++++++++++++++++----
 ggml.h                         |  3 ++-
 llama.cpp                      | 13 +++++++----
 llama.h                        |  1 +
 5 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 198bd5fcb4cf6..bc903d209bd73 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -8,6 +8,7 @@
 static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
     {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+    {"q4_0c", LLAMA_FTYPE_MOSTLY_Q4_0C},
     {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
     {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
     {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
diff --git a/ggml.c b/ggml.c
index 84dd55100d92d..f481774a4128f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -774,11 +774,17 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s
 
 #define QK4_0C (4*32)
 #define QK4_0C_MUL (QK4_0C / QK4_0)
-// TODO: nicer description - pseudostruct?
-// q4_0c : (uint8_t[QK4_0C/2]) qs[nb] || float d[n]
+#define Q4_0C_QSIZE (QK4_0C/2 + 4*sizeof(float))
+// typedef struct {
+//     uint8_t qs[QK4_0C/2][nb];
+//     float   d[nb];
+// } block_q4_0c
 
 #define QK8_0C 32
-// q8_0c : uint8_t qs[n] || float d[n]
+// typedef struct {
+//     uint8_t qs[QK8_0C][nb];
+//     float   d[nb];
+// } block_q8_0c
 
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
@@ -13102,6 +13108,27 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_0*sizeof(block_q4_0));
 }
 
+size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist) {
+    assert(k % QK4_0C == 0);
+    const int nb = k / QK4_0;
+
+    for (int j = 0; j < n; j += k) {
+        uint8_t * restrict y = (uint8_t *)dst + sizeof(block_q4_0)*j/QK4_0;
+
+        quantize_row_q4_0c_reference(src + j, y, k);
+
+        for (int i = 0; i < nb*QK4_0/2; i++) {
+            const uint8_t vi0 = y[i] & 0xF;
+            const uint8_t vi1 = y[i] >> 4;
+
+            hist[vi0]++;
+            hist[vi1]++;
+        }
+    }
+
+    return (n/QK4_0*sizeof(block_q4_0));
+}
+
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
@@ -13229,7 +13256,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK8_0*sizeof(block_q8_0));
 }
 
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist) {
     size_t result = 0;
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -13238,6 +13265,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
                 result = ggml_quantize_q4_0(src + start, block, n, n, hist);
             } break;
+        case GGML_TYPE_Q4_0C:
+            {
+                GGML_ASSERT(start % QK4_0C == 0);
+                uint8_t * dst_off = (uint8_t *) dst + Q4_0C_QSIZE * start / QK4_0C;
+                result = ggml_quantize_q4_0c(src + start, dst_off, n, k, hist);
+            } break;
         case GGML_TYPE_Q4_1:
             {
                 GGML_ASSERT(start % QK4_1 == 0);
diff --git a/ggml.h b/ggml.h
index 3c1807736e8e3..2b502b2fb0753 100644
--- a/ggml.h
+++ b/ggml.h
@@ -871,13 +871,14 @@ extern "C" {
     //
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist);
 
     //
     // system info
diff --git a/llama.cpp b/llama.cpp
index 868a58a8b0b93..59747a16cf502 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -481,6 +481,7 @@ struct llama_file_loader {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
            case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
@@ -557,6 +558,7 @@ struct llama_file_saver {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
@@ -846,6 +848,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: return "mostly Q4_0C";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
@@ -1880,6 +1883,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: quantized_type = GGML_TYPE_Q4_0C; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
@@ -1961,15 +1965,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 new_data = work.addr;
                 std::vector<int64_t> hist_cur(1 << 4, 0);
 
-                int chunk_size = 32 * 512;
+                int row_size = tensor.ne.at(0);
+                int chunk_size = ceil(32 * 512 * 1.0 / row_size) * row_size;
                 const int nchunk = (nelements + chunk_size - 1)/chunk_size;
                 const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
                 if (nthread_use < 2) {
-                    new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+                    new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, row_size, hist_cur.data());
                 } else {
                     size_t counter = 0;
                     new_size = 0;
-                    auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size, row_size] () {
                         std::vector<int64_t> local_hist;
                         size_t local_size = 0;
                         while (true) {
@@ -1985,7 +1990,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                             lock.unlock();
                             size_t last = std::min(nelements, first + chunk_size);
                             if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
-                            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, row_size, local_hist.data());
                         }
                     };
                     if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
diff --git a/llama.h b/llama.h
index 2f6ce8d831e6c..94c3e56b100cf 100644
--- a/llama.h
+++ b/llama.h
@@ -83,6 +83,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0C = 20, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
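
The chunk_size change above is easy to miss: ggml_quantize_chunk now needs the row length because a q4_0c row is quantized as a unit (its scales live at a row-relative offset), so multi-threaded quantization must hand each worker whole rows. A hedged illustration of the rounding, with an assumed row length:

    #include <math.h>
    #include <stdio.h>

    // Round the old 32*512-element chunk up to a whole number of rows,
    // mirroring the expression in llama_model_quantize_internal.
    int main(void) {
        const int row_size   = 4096; // assumed tensor.ne.at(0)
        const int chunk_size = ceil(32 * 512 * 1.0 / row_size) * row_size;
        printf("chunk_size = %d (%d rows)\n", chunk_size, chunk_size / row_size);
        return 0;
    }
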
From ab543dc1a46be9ab561af68a4b0ea4443d0b69b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Tue, 18 Apr 2023 23:07:03 +0200
Subject: [PATCH 4/8] q4_0c: AVX512 vec_dot and quantize impl

---
 ggml.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 124 insertions(+), 17 deletions(-)

diff --git a/ggml.c b/ggml.c
index f481774a4128f..03c9cd462ee05 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1725,8 +1725,8 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
 
 // reference implementation for deterministic creation of model files
 static void quantize_row_q8_0c_reference(const float * restrict x, void * restrict y, int k) {
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
+    assert(k % QK8_0C == 0);
+    const int nb = k / QK8_0C;
 
     uint8_t * restrict qs = y;
     float * restrict ds = (float *) ((uint8_t *) y + QK8_0C * nb);
@@ -1734,8 +1734,8 @@ static void quantize_row_q8_0c_reference(const float * restrict x, void * restri
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
-        for (int l = 0; l < QK8_0; l++) {
-            const float v = x[i*QK8_0 + l];
+        for (int l = 0; l < QK8_0C; l++) {
+            const float v = x[i*QK8_0C + l];
             amax = MAX(amax, fabsf(v));
         }
 
@@ -1744,17 +1744,46 @@ static void quantize_row_q8_0c_reference(const float * restrict x, void * restri
 
         ds[i] = d;
 
-        for (int l = 0; l < QK8_0; ++l) {
-            const float v = x[i*QK8_0 + l]*id;
-            qs[i*QK8_0 + l] = roundf(v);
+        for (int l = 0; l < QK8_0C; ++l) {
+            const float v = x[i*QK8_0C + l]*id;
+            qs[i*QK8_0C + l] = roundf(v);
         }
     }
 }
 
 static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK8_0 == 0);
+    assert(k % QK8_0C == 0);
+    const int nb = k / QK8_0C;
+
+    int8_t * restrict qs = vy;
+    float * restrict ds = (float *) ((uint8_t *) vy + nb*QK8_0C);
+
+#if __AVX512F__
+    for (int i = 0; i < nb; i++) {
+        const __m512 x0 = _mm512_loadu_ps( x + i*QK8_0C );
+        const __m512 x1 = _mm512_loadu_ps( x + i*QK8_0C + QK8_0C/2);
+
+        // Find absolute max
+        const __m512 x0abs = _mm512_abs_ps(x0);
+        const __m512 x1abs = _mm512_abs_ps(x1);
+        const float amax = _mm512_reduce_max_ps(_mm512_max_ps(x0abs, x1abs));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        ds[i] = d;
+
+        const __m512 mul = _mm512_set1_ps( id );
+        const __m512i x0q = _mm512_cvt_roundps_epi32(_mm512_mul_ps(x0, mul), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+        const __m512i x1q = _mm512_cvt_roundps_epi32(_mm512_mul_ps(x1, mul), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+        _mm512_mask_cvtepi32_storeu_epi8(qs + i*QK8_0C, 0xffff, x0q);
+        _mm512_mask_cvtepi32_storeu_epi8(qs + i*QK8_0C + QK8_0C/2, 0xffff, x1q);
+    }
+#else
+    // scalar
     quantize_row_q8_0c_reference(x, vy, k);
+#endif
 }
 
 static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
@@ -2780,6 +2809,73 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
     *s = sumf;
 }
 
+#if __AVX512F__ && QK4_0 == 32
+
+// Dot product of four blocks of q4_0c with four blocks of q8_0c
+static inline __m512 dot_q4_0c_fourblocks_avx512(
+    __m512 acc,
+    const uint8_t * restrict xqs,
+    const float * restrict xds,
+    const int8_t * restrict yqs,
+    const float * restrict yds
+) {
+    // load quantized bytes
+    // TODO: change back to aligned loads
+    const __m512i xqs0123 = _mm512_loadu_epi64( xqs );
+    const __m512i low_nibble_mask = _mm512_set1_epi8( 0xf );
+    const __m512i xqs01 = _mm512_and_si512( low_nibble_mask, xqs0123 );
+    // TODO: try srlv/i?
+    const __m512i xqs23 = _mm512_and_si512( low_nibble_mask, _mm512_srli_epi32( xqs0123, 4 ) );
+    const __m512i yqs01 = _mm512_loadu_epi64( yqs );
+    const __m512i yqs23 = _mm512_loadu_epi64( yqs + 2*QK8_0C );
+
+    // load scales
+    const __m512i scale_mask0 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
+    const __m512i scale_mask1 = _mm512_set_epi32(3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2);
+    const __m128 xyds = _mm_mul_ps(_mm_load_ps(xds), _mm_load_ps(yds));
+    const __m512 xyds0123 = _mm512_broadcast_f32x4(xyds);
+    const __m512 xyds01 = _mm512_permutevar_ps(xyds0123, scale_mask0);
+    const __m512 xyds23 = _mm512_permutevar_ps(xyds0123, scale_mask1);
+
+    // take dot product of x and y bytes
+    const __m512i plus_8 = _mm512_set1_epi8( 8 );
+#ifdef __AVX512VNNI__
+    // We have VPDPBUSDS in AVX512-VNNI, which does exactly what we want, but with a catch:
+    // the *left* operand is supposed to be unsigned, while Q4_0 quantization subtracts 8
+    // from each nibble, so they can be negative. So, instead of `(xqs01 - 8) * yqs01`,
+    // we compute `xqs01 * yqs01 - 8 * yqs01`.
+    const __m512i zero = _mm512_setzero_epi32();
+    const __m512i yqs01_mul8 = _mm512_dpbusds_epi32( zero, plus_8, yqs01 );
+    const __m512i yqs23_mul8 = _mm512_dpbusds_epi32( zero, plus_8, yqs23 );
+    const __m512i xy01 = _mm512_dpbusds_epi32( zero, xqs01, yqs01 );
+    const __m512i xy23 = _mm512_dpbusds_epi32( zero, xqs23, yqs23 );
+    const __m512i res0_int = _mm512_sub_epi32( xy01, yqs01_mul8 );
+    const __m512i res1_int = _mm512_sub_epi32( xy23, yqs23_mul8 );
+#else
+    // As a fallback, we have VPMADDUBSW in AVX512-BW, which uses 16-bit products instead of 32-bit ones.
+    // It has the same catch as VPDPBUSDS: the left operand should be unsigned.
+    // This is essentially the AVX-512 version of the AVX-2 trick used by GH user Const-me
+    // ref: https://gist.github.com/Const-me/4d30e1fc767ab314596e16e90f53b6f4#file-matmultest-cpp-L119
+    const __m512i one = _mm512_set1_epi16( 1 );
+    const __m512i prod_0 = _mm512_maddubs_epi16( xqs01, yqs01 );
+    const __m512i prod_1 = _mm512_maddubs_epi16( plus_8, yqs01 );
+    const __m512i prod_2 = _mm512_maddubs_epi16( xqs23, yqs23 );
+    const __m512i prod_3 = _mm512_maddubs_epi16( plus_8, yqs23 );
+    const __m512i diff0 = _mm512_sub_epi16( prod_0, prod_1 );
+    const __m512i diff1 = _mm512_sub_epi16( prod_2, prod_3 );
+    const __m512i res0_int = _mm512_madd_epi16( diff0, one );
+    const __m512i res1_int = _mm512_madd_epi16( diff1, one );
+#endif
+
+    // Finally, we multiply the permuted scales and the 32-bit dot products, then accumulate.
+    const __m512 res0_float = _mm512_cvtepi32_ps( res0_int );
+    const __m512 res1_float = _mm512_cvtepi32_ps( res1_int );
+
+    return _mm512_fmadd_ps( xyds23, res1_float,
+        _mm512_fmadd_ps( xyds01, res0_float, acc ));
+}
+#endif
+
 inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
     ggml_float sumf = 0.0;
 
@@ -2999,6 +3095,15 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
 
     float sumf = 0.0;
 
+#if __AVX512F__
+    // Initialize accumulator with zeros
+    __m512 acc = _mm512_setzero_ps();
+    for (int i = 0; i < nb; i += 4) {
+        acc = dot_q4_0c_fourblocks_avx512(acc, xqs + i*QK4_0/2, xds + i, yqs + i*QK8_0, yds + i);
+    }
+    // Horizontal sum of all lanes of the accumulator
+    sumf = _mm512_reduce_add_ps( acc );
+#else
     // scalar
     for (int i = 0; i < nb/2; i++) {
         const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
@@ -3009,23 +3114,25 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
         const float dy0 = yds[dst0];
         const float dy1 = yds[dst1];
 
-        int sumi0 = 0;
-        int sumi1 = 0;
+        // NOTE: having these as plain int triggers a bug with AVX512 on GCC 12.2
+        int64_t sumi0 = 0;
+        int64_t sumi1 = 0;
 
         for (int l = 0; l < QK4_0; l++) {
-          const uint8_t v0 = xqs[i*QK4_0 + l];
+            const uint8_t v0 = xqs[i*QK4_0 + l];
 
-          const int i0 = (int8_t) (v0 & 0xf) - 8;
-          const int i1 = (int8_t) (v0 >> 4) - 8;
+            const int i0 = (int) (v0 & 0xf) - 8;
+            const int i1 = (int) (v0 >> 4) - 8;
 
-          const int i2 = yqs[dst0*QK4_0 + l];
-          const int i3 = yqs[dst1*QK4_0 + l];
+            const int i2 = yqs[dst0*QK4_0 + l];
+            const int i3 = yqs[dst1*QK4_0 + l];
 
-          sumi0 += i0*i2;
-          sumi1 += i1*i3;
+            sumi0 += i0*i2;
+            sumi1 += i1*i3;
         }
         sumf += dx0*dy0*sumi0 + dx1*dy1*sumi1;
     }
+#endif
 
     *s = sumf;
 }
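
The VPDPBUSDS trick above rests on a simple identity: the stored nibble is u = w + 8 with w in [-8, 7], and u*y - 8*y == (u - 8)*y == w*y. A scalar sanity check (illustrative, not part of the patch):

    #include <assert.h>
    #include <stdio.h>

    // VPDPBUSDS requires an unsigned left operand, so the kernel feeds it
    // the raw nibbles u = w + 8 and subtracts 8*y afterwards.
    int main(void) {
        for (int u = 0; u < 16; u++) {          // unsigned q4_0c nibble
            for (int y = -128; y < 128; y++) {  // signed q8_0c byte
                assert(u * y - 8 * y == (u - 8) * y);
            }
        }
        printf("ok\n");
        return 0;
    }

The same subtraction shows up in the VPMADDUBSW fallback, where prod_1 and prod_3 compute the 8*y terms.
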
From 1b49d26f8a1a862e0b628c4828e53f7f5315ebf2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Fri, 21 Apr 2023 00:11:49 +0200
Subject: [PATCH 5/8] q4_0c: Arm Neon acceleration

Mostly copied from the q4_0 implementation
---
 ggml.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 94 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index 03c9cd462ee05..78abf324c13f4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1758,7 +1758,37 @@ static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int
     int8_t * restrict qs = vy;
     float * restrict ds = (float *) ((uint8_t *) vy + nb*QK8_0C);
 
-#if __AVX512F__
+#if defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
+        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
+
+        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
+        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
+        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
+
+        const float amax = vmaxvq_f32(amaxv[0]);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        ds[i] = d;
+
+        for (int l = 0; l < 8; l++) {
+            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
+            const int32x4_t  vi = vcvtnq_s32_f32(v);
+
+            qs[i*QK8_0C + 4*l + 0] = vgetq_lane_s32(vi, 0);
+            qs[i*QK8_0C + 4*l + 1] = vgetq_lane_s32(vi, 1);
+            qs[i*QK8_0C + 4*l + 2] = vgetq_lane_s32(vi, 2);
+            qs[i*QK8_0C + 4*l + 3] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#elif defined(__AVX512F__)
     for (int i = 0; i < nb; i++) {
         const __m512 x0 = _mm512_loadu_ps( x + i*QK8_0C );
         const __m512 x1 = _mm512_loadu_ps( x + i*QK8_0C + QK8_0C/2);
@@ -3095,7 +3125,69 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
 
     float sumf = 0.0;
 
-#if __AVX512F__
+#if defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    for (int i = 0; i < nb/2; i++) {
+        const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
+        const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
+
+        const uint8x16_t m4b = vdupq_n_u8(0xf);
+        const int8x16_t  s8b = vdupq_n_s8(0x8);
+
+        const uint8x16_t v0_01l = vld1q_u8(&xqs[i*QK4_0]);
+        const uint8x16_t v0_01h = vld1q_u8(&xqs[i*QK4_0 + QK4_0/2]);
+
+        // 4-bit -> 8-bit
+        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_01l, m4b));
+        const int8x16_t v0_0h = vreinterpretq_s8_u8(vandq_u8  (v0_01h, m4b));
+        const int8x16_t v0_1l = vreinterpretq_s8_u8(vshrq_n_u8(v0_01l, 4));
+        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_01h, 4));
+
+        // sub 8
+        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
+        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
+        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
+        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
+
+        // load y
+        const int8x16_t v1_0l = vld1q_s8(&yqs[dst0*QK8_0C]);
+        const int8x16_t v1_0h = vld1q_s8(&yqs[dst0*QK8_0C + 16]);
+        const int8x16_t v1_1l = vld1q_s8(&yqs[dst1*QK8_0C]);
+        const int8x16_t v1_1h = vld1q_s8(&yqs[dst1*QK8_0C + 16]);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        // dot product into int32x4_t
+        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
+        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), xds[dst0]*yds[dst0]);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), xds[dst1]*yds[dst1]);
+#else
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
+
+        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l));
+        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
+        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h));
+        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
+
+        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
+        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
+        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
+        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), xds[dst0]*yds[dst0]);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), xds[dst1]*yds[dst1]);
+#endif
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+
+#elif defined(__AVX512F__)
     // Initialize accumulator with zeros
     __m512 acc = _mm512_setzero_ps();
     for (int i = 0; i < nb; i += 4) {
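
For readers without the NEON reference handy: vdotq_s32 accumulates, per 32-bit lane, the dot product of the corresponding group of four signed bytes; the vmull_s8/vpaddlq_s16 fallback computes the same lane sums through 16-bit intermediates. A scalar model (illustrative only):

    #include <stdint.h>

    // Scalar model of vdotq_s32(acc, a, b) on 16-byte vectors: lane l of
    // the result accumulates a[4l]*b[4l] + ... + a[4l+3]*b[4l+3].
    static void sdot_model(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
        for (int lane = 0; lane < 4; lane++) {
            for (int j = 0; j < 4; j++) {
                acc[lane] += (int32_t) a[4*lane + j] * b[4*lane + j];
            }
        }
    }

Summing the lanes of p_0 after both vdotq_s32 calls therefore yields the same 32-element block dot product that the scalar path computes with sumi0.
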
From 2949725fea6b43005a5f2374ccde7fb524359026 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Mon, 24 Apr 2023 18:17:31 +0200
Subject: [PATCH 6/8] q4_0c: prefetch on AVX-512 and ARM

Seems significant especially for evaluation time
---
 ggml.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/ggml.c b/ggml.c
index 78abf324c13f4..a70aa4773cfdd 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3126,10 +3126,17 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
     float sumf = 0.0;
 
 #if defined(__ARM_NEON)
+    const int ahead=80;
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
     for (int i = 0; i < nb/2; i++) {
+        __builtin_prefetch(&xqs[i*QK4_0 + 64*ahead]);
+        __builtin_prefetch(&yqs[2*i*QK8_0C + 64*ahead]);
+        __builtin_prefetch(&yqs[2*i*QK8_0C + 64*ahead + 64]);
+        __builtin_prefetch(&xds[2*i + 64/4*ahead]);
+        __builtin_prefetch(&yds[2*i + 64/4*ahead]);
+
         const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
         const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
 
@@ -3188,9 +3195,15 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
 
 #elif defined(__AVX512F__)
+    const int ahead = 64;
     // Initialize accumulator with zeros
     __m512 acc = _mm512_setzero_ps();
     for (int i = 0; i < nb; i += 4) {
+        _mm_prefetch(xqs + i*QK4_0/2 + 64*ahead, _MM_HINT_T0);
+        _mm_prefetch(yqs + i*QK8_0 + 64*ahead, _MM_HINT_T0);
+        _mm_prefetch(yqs + i*QK8_0 + 64*ahead + 64, _MM_HINT_T0);
+        _mm_prefetch(xds + i + 64/4*ahead, _MM_HINT_T0);
+        _mm_prefetch(yds + i + 64/4*ahead, _MM_HINT_T0);
         acc = dot_q4_0c_fourblocks_avx512(acc, xqs + i*QK4_0/2, xds + i, yqs + i*QK8_0, yds + i);
     }
     // Horizontal sum of all lanes of the accumulator
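
The prefetch distance above is expressed in cache lines: every stream is prefetched ahead*64 bytes past its current read position (80 lines, 5 KiB on the NEON path). Since one iteration consumes 32 bytes of xqs, 64 bytes of yqs and 8 bytes of each scale array, the lookahead in iterations differs per stream; the constants look empirically tuned. A consolidated sketch of the arithmetic (illustrative, assumes QK4_0 = 32 and QK8_0C = 32):

    #include <stdint.h>

    // One iteration i reads byte group i of xqs (32 bytes), two y blocks
    // (64 bytes) and two scales from each of xds/yds (8 bytes each);
    // every __builtin_prefetch target is 64*ahead bytes further on.
    static void prefetch_streams(const uint8_t * xqs, const int8_t * yqs,
                                 const float * xds, const float * yds, int i) {
        const int ahead = 80; // cache lines of 64 bytes
        __builtin_prefetch(&xqs[i*32 + 64*ahead]);
        __builtin_prefetch(&yqs[2*i*32 + 64*ahead]);
        __builtin_prefetch(&yqs[2*i*32 + 64*ahead + 64]);
        __builtin_prefetch(&xds[2*i + 64/4*ahead]);
        __builtin_prefetch(&yds[2*i + 64/4*ahead]);
    }
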
From d53f76760d7b067fd0cef67a994a90e662bdfb50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Thu, 27 Apr 2023 22:48:46 +0200
Subject: [PATCH 7/8] q4_0c: disable prefetching on M1

---
 ggml.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index a70aa4773cfdd..2c5c796fd2cc4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1154,13 +1154,17 @@ static void quantize_row_q4_0c_reference(const float * restrict x, uint8_t * res
         float id[2];
         for (int j = 0; j < 2; j++) {
             float amax = 0.0f; // absolute max
+            float max = 0.0f;
 
             for (int l = 0; l < QK4_0; l++) {
                 const float v = xb[j][l];
-                amax = MAX(amax, fabsf(v));
+                if (amax < fabsf(v)) {
+                    amax = fabsf(v);
+                    max = v;
+                }
             }
 
-            d[j]  = amax / ((1 << 3) - 1);
+            d[j]  = max / -8;
             id[j] = d[j] ? 1.0f/d[j] : 0.0f;
         }
 
@@ -1169,10 +1173,10 @@ static void quantize_row_q4_0c_reference(const float * restrict x, uint8_t * res
 
         for (int l = 0; l < QK4_0; l++) {
             const float v0 = xb[0][l]*id[0];
-            const uint8_t vi0 = (int8_t)roundf(v0) + 8;
+            const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8);
 
             const float v1 = xb[1][l]*id[1];
-            const uint8_t vi1 = (int8_t)roundf(v1) + 8;
+            const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8);
 
             assert(vi0 < 16);
             assert(vi1 < 16);
@@ -3126,16 +3130,19 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
     float sumf = 0.0;
 
 #if defined(__ARM_NEON)
-    const int ahead=80;
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
     for (int i = 0; i < nb/2; i++) {
+        // Disable prefetching on M1 for now.
+#ifndef __APPLE__
+        const int ahead=80;
         __builtin_prefetch(&xqs[i*QK4_0 + 64*ahead]);
         __builtin_prefetch(&yqs[2*i*QK8_0C + 64*ahead]);
         __builtin_prefetch(&yqs[2*i*QK8_0C + 64*ahead + 64]);
         __builtin_prefetch(&xds[2*i + 64/4*ahead]);
        __builtin_prefetch(&yds[2*i + 64/4*ahead]);
+#endif
 
         const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
         const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
@@ -9738,11 +9745,13 @@ static void ggml_compute_forward_alibi(
                 ggml_compute_forward_alibi_f32(params, src0, src1, dst);
             } break;
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_0C:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_0C:
         case GGML_TYPE_Q8_1:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:

From 76692c90cdb909065be522910a5b7c60fa3a062b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Thu, 4 May 2023 09:53:55 +0200
Subject: [PATCH 8/8] q4_0c: avoid _mm512_loadu_epi64 instruction

Not supported on some GCC versions
---
 ggml.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml.c b/ggml.c
index 2c5c796fd2cc4..30c790f52389e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2855,13 +2855,13 @@ static inline __m512 dot_q4_0c_fourblocks_avx512(
 ) {
     // load quantized bytes
     // TODO: change back to aligned loads
-    const __m512i xqs0123 = _mm512_loadu_epi64( xqs );
+    const __m512i xqs0123 = _mm512_loadu_si512( xqs );
     const __m512i low_nibble_mask = _mm512_set1_epi8( 0xf );
     const __m512i xqs01 = _mm512_and_si512( low_nibble_mask, xqs0123 );
     // TODO: try srlv/i?
     const __m512i xqs23 = _mm512_and_si512( low_nibble_mask, _mm512_srli_epi32( xqs0123, 4 ) );
-    const __m512i yqs01 = _mm512_loadu_epi64( yqs );
-    const __m512i yqs23 = _mm512_loadu_epi64( yqs + 2*QK8_0C );
+    const __m512i yqs01 = _mm512_loadu_si512( yqs );
+    const __m512i yqs23 = _mm512_loadu_si512( yqs + 2*QK8_0C );
 
     // load scales
     const __m512i scale_mask0 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
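
A closing note on the rounding change in patch 7: deriving d from the signed extreme (d = max / -8) uses the full [-8, 7] integer range, but a value at the opposite extreme then scales to +8, one past the largest nibble, which is why the MIN(15, ...) clamp was added alongside it. Illustrative numbers (not from the patch):

    #include <assert.h>
    #include <math.h>

    int main(void) {
        const float max = -1.0f;     // signed value with largest magnitude
        const float d   = max / -8;  // 0.125f
        const float id  = 1.0f / d;  // 8.0f
        const float v   = 1.0f * id; // opposite extreme scales to +8.0f

        int vi = (int8_t) roundf(v) + 8; // 16: one past the 4-bit range
        vi = vi < 15 ? vi : 15;          // the patch's MIN(15, ...) clamp
        assert(vi == 15);
        return 0;
    }
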