From 221946777c845c4665b1cb4d5055a39978f8c06f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Sun, 16 Apr 2023 00:37:16 +0200
Subject: [PATCH 1/8] test-quantize: fix for q8_0 intermediates

---
 tests/test-quantize-perf.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index d5514455db11d..e0be681cf0b0d 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -228,7 +228,7 @@ int main(int argc, char * argv[]) {
         if (qfns.quantize_row_q && qfns.dequantize_row_q) {
             printf("%s\n", ggml_type_name(type));
 
-            if (params.op_quantize_row_q_reference) {
+            if (params.op_quantize_row_q_reference && qfns.quantize_row_q_reference) {
                 printf("  quantize_row_q_reference\n");
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
@@ -242,7 +242,7 @@ int main(int argc, char * argv[]) {
                 printf("\n");
             }
 
-            if (params.op_quantize_row_q) {
+            if (params.op_quantize_row_q && qfns.quantize_row_q) {
                 printf("  quantize_row_q\n");
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
@@ -256,7 +256,7 @@ int main(int argc, char * argv[]) {
                 printf("\n");
             }
 
-            if (params.op_dequantize_row_q) {
+            if (params.op_dequantize_row_q && qfns.dequantize_row_q) {
                 printf("  dequantize_row_q\n");
                 qfns.quantize_row_q(test_data1, test_q1, largest);
                 for (size_t size : params.test_sizes) {
@@ -271,7 +271,7 @@ int main(int argc, char * argv[]) {
                 printf("\n");
             }
 
-            if (params.op_quantize_row_q_dot) {
+            if (params.op_quantize_row_q_dot && qfns.quantize_row_q_dot) {
                 printf("  quantize_row_q_dot\n");
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
@@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
                 printf("\n");
             }
 
-            if (params.op_vec_dot_q) {
+            if (params.op_vec_dot_q && qfns.vec_dot_q) {
                 printf("  vec_dot_q\n");
                 qfns.quantize_row_q(test_data1, test_q1, largest);
                 qfns.quantize_row_q(test_data2, test_q2, largest);
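
Note for readers: the next patch documents the new layout only with two terse comments (see its "pseudostruct" TODO). As a reading aid, here is a standalone sketch (illustrative only, not part of the patch) of how the two sections of a q4_0c row are addressed, assuming the patch's constants QK4_0 = 32 and QK4_0C = 4*32:

    #include <stddef.h>
    #include <stdint.h>

    #define QK4_0  32
    #define QK4_0C (4*32)

    // A q4_0c row of k values (k % QK4_0C == 0) stores all packed nibbles
    // first and all block scales after them, so SIMD code can issue large
    // contiguous loads from each section.
    typedef struct {
        const uint8_t * qs; // k/2 bytes of packed nibbles
        const float   * ds; // k/QK4_0 block scales
    } q4_0c_row;

    static q4_0c_row q4_0c_sections(const void * row, int k) {
        const int nsb = k / QK4_0C; // number of four-block superblocks
        q4_0c_row r;
        r.qs = (const uint8_t *) row;
        r.ds = (const float *) (r.qs + (size_t) QK4_0C/2 * nsb);
        return r;
    }

This mirrors the qs/ds pointer setup in quantize_row_q4_0c_reference and dequantize_row_q4_0c below; q8_0c is analogous with one byte per value.
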
From a1e6fb92814d01cdabee9a61dcae8dbff281ad1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Mon, 17 Apr 2023 23:36:29 +0200
Subject: [PATCH 2/8] q4_0c continuous row layout

Introduce alternative quantized formats q4_0c and q8_0c, corresponding
exactly to q4_0 and q8_0, except that quantized values and scales are
laid out continuously in memory, and the nibbles in q4_0 are
rearranged. This should simplify SIMD implementations, at the expense
of slightly more complex scalar implementations.
---
 ggml.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 ggml.h |   2 +
 2 files changed, 207 insertions(+), 4 deletions(-)

diff --git a/ggml.c b/ggml.c
index bce7a7a57e939..84dd55100d92d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -772,6 +772,14 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 
+#define QK4_0C (4*32)
+#define QK4_0C_MUL (QK4_0C / QK4_0)
+// TODO: nicer description - pseudostruct?
+// q4_0c : (uint8_t[QK4_0C/2]) qs[nb] || float d[n]
+
+#define QK8_0C 32
+// q8_0c : uint8_t qs[n] || float d[n]
+
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
     assert(k % QK4_0 == 0);
@@ -1117,6 +1125,57 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
 #endif
 }
 
+static void quantize_row_q4_0c_reference(const float * restrict x, uint8_t * restrict y, int k) {
+    assert(k % QK4_0C == 0);
+    const int nb = k / QK4_0;
+    const int nsb = k / QK4_0C;
+
+    // Split y into nibbles section and scales section
+    uint8_t * restrict qs = y;
+    float * restrict ds = (float *) (y + QK4_0C/2 * nsb);
+
+    for (int i = 0; i < nb/2; i++) {
+        // Interleave two output blocks in low and high nibbles
+        const int src0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
+        const int src1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
+        const float * xb[2] = {
+            x + QK4_0 * src0, // block in low nibbles
+            x + QK4_0 * src1, // block in high nibbles
+        };
+
+        // Find multiplier for each block
+        float d[2];
+        float id[2];
+        for (int j = 0; j < 2; j++) {
+            float amax = 0.0f; // absolute max
+
+            for (int l = 0; l < QK4_0; l++) {
+                const float v = xb[j][l];
+                amax = MAX(amax, fabsf(v));
+            }
+
+            d[j]  = amax / ((1 << 3) - 1);
+            id[j] = d[j] ? 1.0f/d[j] : 0.0f;
+        }
+
+        ds[src0] = d[0];
+        ds[src1] = d[1];
+
+        for (int l = 0; l < QK4_0; l++) {
+            const float v0 = xb[0][l]*id[0];
+            const uint8_t vi0 = (int8_t)roundf(v0) + 8;
+
+            const float v1 = xb[1][l]*id[1];
+            const uint8_t vi1 = (int8_t)roundf(v1) + 8;
+
+            assert(vi0 < 16);
+            assert(vi1 < 16);
+
+            qs[i*QK4_0 + l] = vi0 | (vi1 << 4);
+        }
+    }
+}
+
 static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
@@ -1658,6 +1717,40 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
 #endif
 }
 
+// reference implementation for deterministic creation of model files
+static void quantize_row_q8_0c_reference(const float * restrict x, void * restrict y, int k) {
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    uint8_t * restrict qs = y;
+    float * restrict ds = (float *) ((uint8_t *) y + QK8_0C * nb);
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int l = 0; l < QK8_0; l++) {
+            const float v = x[i*QK8_0 + l];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        ds[i] = d;
+
+        for (int l = 0; l < QK8_0; ++l) {
+            const float v = x[i*QK8_0 + l]*id;
+            qs[i*QK8_0 + l] = roundf(v);
+        }
+    }
+}
+
+static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int k) {
+    assert(k % QK8_0 == 0);
+
+    quantize_row_q8_0c_reference(x, vy, k);
+}
+
 static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
     assert(k % QK4_0 == 0);
     const int nb = k / QK4_0;
@@ -1776,6 +1869,41 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
 #endif
 }
 
+static void dequantize_row_q4_0c(const void * restrict vx, float * restrict y, int k) {
+    assert(k % QK4_0C == 0);
+    const int nb = k / QK4_0;
+    const int nsb = k / QK4_0C;
+
+    // Split vx into nibbles section and scales section
+    const uint8_t * restrict qs = vx;
+    const float * restrict ds = (const float *) ((const uint8_t *) vx + QK4_0C/2 * nsb);
+
+    // scalar
+    for (int i = 0; i < nb/2; i++) {
+        const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
+        const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
+
+        const float d0 = ds[dst0];
+        const float d1 = ds[dst1];
+
+        for (int l = 0; l < QK4_0; l++) {
+            const uint8_t vi = qs[i * QK4_0 + l];
+
+            const int8_t vi0 = vi & 0xf;
+            const int8_t vi1 = vi >> 4;
+
+            const float v0 = (vi0 - 8)*d0;
+            const float v1 = (vi1 - 8)*d1;
+
+            y[dst0*QK4_0 + l] = v0;
+            y[dst1*QK4_0 + l] = v1;
+
+            assert(!isnan(y[dst0*QK4_0 + l]));
+            assert(!isnan(y[dst1*QK4_0 + l]));
+        }
+    }
+}
+
 static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
@@ -2002,6 +2130,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
 }
 
 static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -2017,6 +2146,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_q                = ggml_vec_dot_q4_0_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
     },
+    [GGML_TYPE_Q4_0C] = {
+        .dequantize_row_q         = dequantize_row_q4_0c,
+        //.quantize_row_q           = quantize_row_q4_0c,
+        .quantize_row_q           = (quantize_row_q_t) quantize_row_q4_0c_reference,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0c_reference,
+        .quantize_row_q_dot       = quantize_row_q8_0c,
+        .vec_dot_q                = ggml_vec_dot_q4_0c_q8_0c,
+    },
     [GGML_TYPE_Q4_1] = {
         .dequantize_row_q         = dequantize_row_q4_1,
         .quantize_row_q           = quantize_row_q4_1,
@@ -2065,6 +2202,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_q                = NULL,   // TODO
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
+    [GGML_TYPE_Q8_0C] = {
+        .dequantize_row_q         = NULL,
+        .quantize_row_q           = quantize_row_q8_0c,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0c_reference,
+        .quantize_row_q_dot       = quantize_row_q8_0c,
+        .vec_dot_q                = NULL,
+    },
 };
 
 // For internal test use
@@ -2835,6 +2979,51 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 #endif
 }
 
+static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int nb = n / QK4_0;
+    const int nsb = n / QK4_0C;
+
+    assert(n % QK4_0C == 0);
+
+    // Split into nibbles and scales sections
+    const uint8_t * restrict xqs = vx;
+    const float * restrict xds = (const float *) ((const uint8_t *) vx + nsb*QK4_0C/2);
+    const int8_t * restrict yqs = vy;
+    const float * restrict yds = (const float *) ((const uint8_t *) vy + nb*QK8_0C);
+
+    float sumf = 0.0;
+
+    // scalar
+    for (int i = 0; i < nb/2; i++) {
+        const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
+        const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
+
+        const float dx0 = xds[dst0];
+        const float dx1 = xds[dst1];
+        const float dy0 = yds[dst0];
+        const float dy1 = yds[dst1];
+
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int l = 0; l < QK4_0; l++) {
+            const uint8_t v0 = xqs[i*QK4_0 + l];
+
+            const int i0 = (int8_t) (v0 & 0xf) - 8;
+            const int i1 = (int8_t) (v0 >> 4) - 8;
+
+            const int i2 = yqs[dst0*QK4_0 + l];
+            const int i3 = yqs[dst1*QK4_0 + l];
+
+            sumi0 += i0*i2;
+            sumi1 += i1*i3;
+        }
+        sumf += dx0*dy0*sumi0 + dx1*dy1*sumi1;
+    }
+
+    *s = sumf;
+}
+
 static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const int nb = n / QK8_1;
 
@@ -3885,66 +4074,74 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = 1,
     [GGML_TYPE_F16]  = 1,
     [GGML_TYPE_Q4_0] = QK4_0,
+    [GGML_TYPE_Q4_0C] = QK4_0C,
     [GGML_TYPE_Q4_1] = QK4_1,
     [GGML_TYPE_Q4_2] = QK4_2,
     [GGML_TYPE_Q5_0] = QK5_0,
     [GGML_TYPE_Q5_1] = QK5_1,
     [GGML_TYPE_Q8_0] = QK8_0,
+    [GGML_TYPE_Q8_0C] = QK8_0C,
     [GGML_TYPE_Q8_1] = QK8_1,
     [GGML_TYPE_I8]   = 1,
     [GGML_TYPE_I16]  = 1,
     [GGML_TYPE_I32]  = 1,
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 15, "GGML_BLCK_SIZE is outdated");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = sizeof(float),
     [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
     [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
+    [GGML_TYPE_Q4_0C] = 4*sizeof(block_q4_0),
     [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
     [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
     [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
     [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
+    [GGML_TYPE_Q8_0C] = sizeof(block_q8_0),
     [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
     [GGML_TYPE_I8]   = sizeof(int8_t),
     [GGML_TYPE_I16]  = sizeof(int16_t),
     [GGML_TYPE_I32]  = sizeof(int32_t),
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_SIZE is outdated");
 
 static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = "f32",
     [GGML_TYPE_F16]  = "f16",
     [GGML_TYPE_Q4_0] = "q4_0",
+    [GGML_TYPE_Q4_0C] = "q4_0c",
     [GGML_TYPE_Q4_1] = "q4_1",
     [GGML_TYPE_Q4_2] = "q4_2",
     [GGML_TYPE_Q5_0] = "q5_0",
     [GGML_TYPE_Q5_1] = "q5_1",
     [GGML_TYPE_Q8_0] = "q8_0",
+    [GGML_TYPE_Q8_0C] = "q8_0c",
     [GGML_TYPE_Q8_1] = "q8_1",
     [GGML_TYPE_I8]   = "i8",
     [GGML_TYPE_I16]  = "i16",
     [GGML_TYPE_I32]  = "i32",
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
+static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_NAME is outdated");
 
 static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = false,
     [GGML_TYPE_F16]  = false,
     [GGML_TYPE_Q4_0] = true,
+    [GGML_TYPE_Q4_0C] = true,
     [GGML_TYPE_Q4_1] = true,
     [GGML_TYPE_Q4_2] = true,
     [GGML_TYPE_Q5_0] = true,
     [GGML_TYPE_Q5_1] = true,
     [GGML_TYPE_Q8_0] = true,
+    [GGML_TYPE_Q8_0C] = true,
     [GGML_TYPE_Q8_1] = true,
     [GGML_TYPE_I8]   = false,
     [GGML_TYPE_I16]  = false,
     [GGML_TYPE_I32]  = false,
 };
-static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
+static_assert(GGML_TYPE_COUNT == 15, "GGML_IS_QUANTIZED is outdated");
 
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",
@@ -8763,11 +8960,13 @@ static void ggml_compute_forward_mul_mat(
         struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_0C:
        case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_0C:
         case GGML_TYPE_Q8_1:
             {
                 ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
@@ -8994,11 +9193,13 @@ static void ggml_compute_forward_get_rows(
         struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_0C:
        case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_0C:
         case GGML_TYPE_Q8_1:
             {
                 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
diff --git a/ggml.h b/ggml.h
index ef5a048c3b7e4..3c1807736e8e3 100644
--- a/ggml.h
+++ b/ggml.h
@@ -237,6 +237,8 @@ extern "C" {
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
         GGML_TYPE_Q8_1 = 9,
+        GGML_TYPE_Q4_0C = 10,
+        GGML_TYPE_Q8_0C = 11,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
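
The subtle part of the format above is the nibble interleaving: byte group i within a row packs block i + i/2*2 in its low nibbles and block i + i/2*2 + 2 in its high nibbles. A small standalone check (illustrative, not part of the patch) makes the pattern visible:

    #include <stdio.h>

    // Within each superblock of four q4_0 blocks, byte group i holds
    // block src0 in its low nibbles and block src1 = src0 + 2 in its
    // high nibbles: pairs {0,2}, {1,3}, {4,6}, {5,7}, {8,10}, ...
    int main(void) {
        for (int i = 0; i < 8; i++) {
            const int src0 = i + i/2*2;
            const int src1 = i + i/2*2 + 2;
            printf("byte group %d: low = block %d, high = block %d\n", i, src0, src1);
        }
        return 0;
    }

Pairing blocks two apart is what lets one 64-byte vector load cover four blocks at once: masking with 0xf yields blocks 0 and 1, a 4-bit right shift yields blocks 2 and 3 (used by the AVX-512 kernel in patch 4).
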
From 4bd781cd2572de9ec022178e9973f79cd1c7b278 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Tue, 18 Apr 2023 00:57:30 +0200
Subject: [PATCH 3/8] q4_0c: quantize support

---
 examples/quantize/quantize.cpp |  1 +
 ggml.c                         | 41 ++++++++++++++++++++++++++++++----
 ggml.h                         |  3 ++-
 llama.cpp                      | 13 +++++++----
 llama.h                        |  1 +
 5 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 198bd5fcb4cf6..bc903d209bd73 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -8,6 +8,7 @@
 static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
     {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+    {"q4_0c", LLAMA_FTYPE_MOSTLY_Q4_0C},
     {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
     {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
     {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
diff --git a/ggml.c b/ggml.c
index 84dd55100d92d..f481774a4128f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -774,11 +774,17 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s
 
 #define QK4_0C (4*32)
 #define QK4_0C_MUL (QK4_0C / QK4_0)
-// TODO: nicer description - pseudostruct?
-// q4_0c : (uint8_t[QK4_0C/2]) qs[nb] || float d[n]
+#define Q4_0C_QSIZE (QK4_0C/2 + 4*sizeof(float))
+// typedef struct {
+//     uint8_t qs[QK4_0C/2][nb];
+//     float   d[nb];
+// } block_q4_0c
 
 #define QK8_0C 32
-// q8_0c : uint8_t qs[n] || float d[n]
+// typedef struct {
+//     uint8_t qs[QK8_0C][nb];
+//     float   d[nb];
+// } block_q8_0c
 
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
@@ -13102,6 +13108,27 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_0*sizeof(block_q4_0));
 }
 
+size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist) {
+    assert(k % QK4_0C == 0);
+    const int nb = k / QK4_0;
+
+    for (int j = 0; j < n; j += k) {
+        uint8_t * restrict y = (uint8_t *)dst + sizeof(block_q4_0)*j/QK4_0;
+
+        quantize_row_q4_0c_reference(src + j, y, k);
+
+        for (int i = 0; i < nb*QK4_0/2; i++) {
+            const uint8_t vi0 = y[i] & 0xF;
+            const uint8_t vi1 = y[i] >> 4;
+
+            hist[vi0]++;
+            hist[vi1]++;
+        }
+    }
+
+    return (n/QK4_0*sizeof(block_q4_0));
+}
+
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
@@ -13229,7 +13256,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK8_0*sizeof(block_q8_0));
 }
 
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist) {
     size_t result = 0;
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -13238,6 +13265,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
                 result = ggml_quantize_q4_0(src + start, block, n, n, hist);
             } break;
+        case GGML_TYPE_Q4_0C:
+            {
+                GGML_ASSERT(start % QK4_0C == 0);
+                uint8_t * dst_off = (uint8_t *) dst + Q4_0C_QSIZE * start / QK4_0C;
+                result = ggml_quantize_q4_0c(src + start, dst_off, n, k, hist);
+            } break;
         case GGML_TYPE_Q4_1:
             {
                 GGML_ASSERT(start % QK4_1 == 0);
diff --git a/ggml.h b/ggml.h
index 3c1807736e8e3..2b502b2fb0753 100644
--- a/ggml.h
+++ b/ggml.h
@@ -871,13 +871,14 @@ extern "C" {
     //
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist);
 
     //
     // system info
diff --git a/llama.cpp b/llama.cpp
index 868a58a8b0b93..59747a16cf502 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -481,6 +481,7 @@ struct llama_file_loader {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
            case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
@@ -557,6 +558,7 @@ struct llama_file_saver {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
@@ -846,6 +848,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: return "mostly Q4_0C";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
@@ -1880,6 +1883,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: quantized_type = GGML_TYPE_Q4_0C; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
@@ -1961,15 +1965,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 new_data = work.addr;
                 std::vector<int64_t> hist_cur(1 << 4, 0);
 
-                int chunk_size = 32 * 512;
+                int row_size = tensor.ne.at(0);
+                int chunk_size = ceil(32 * 512 * 1.0 / row_size) * row_size;
                 const int nchunk = (nelements + chunk_size - 1)/chunk_size;
                 const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
                 if (nthread_use < 2) {
-                    new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+                    new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, row_size, hist_cur.data());
                 } else {
                     size_t counter = 0;
                     new_size = 0;
-                    auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size, row_size] () {
                         std::vector<int64_t> local_hist;
                         size_t local_size = 0;
                         while (true) {
@@ -1985,7 +1990,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                             lock.unlock();
                             size_t last = std::min(nelements, first + chunk_size);
                             if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
-                            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, row_size, local_hist.data());
                         }
                     };
                     if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
diff --git a/llama.h b/llama.h
index 2f6ce8d831e6c..94c3e56b100cf 100644
--- a/llama.h
+++ b/llama.h
@@ -83,6 +83,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0C = 20, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
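
The chunk_size change above is easy to miss: ggml_quantize_chunk now needs the row length because a q4_0c row is quantized as a unit (its scales live at a row-relative offset), so multi-threaded quantization must hand each worker whole rows. A hedged illustration of the rounding, with an assumed row length:

    #include <math.h>
    #include <stdio.h>

    // Round the old 32*512-element chunk up to a whole number of rows,
    // mirroring the expression in llama_model_quantize_internal.
    int main(void) {
        const int row_size   = 4096; // assumed tensor.ne.at(0)
        const int chunk_size = ceil(32 * 512 * 1.0 / row_size) * row_size;
        printf("chunk_size = %d (%d rows)\n", chunk_size, chunk_size / row_size);
        return 0;
    }
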
From ab543dc1a46be9ab561af68a4b0ea4443d0b69b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Tue, 18 Apr 2023 23:07:03 +0200
Subject: [PATCH 4/8] q4_0c: AVX512 vec_dot and quantize impl

---
 ggml.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 124 insertions(+), 17 deletions(-)

diff --git a/ggml.c b/ggml.c
index f481774a4128f..03c9cd462ee05 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1725,8 +1725,8 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
 
 // reference implementation for deterministic creation of model files
 static void quantize_row_q8_0c_reference(const float * restrict x, void * restrict y, int k) {
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
+    assert(k % QK8_0C == 0);
+    const int nb = k / QK8_0C;
 
     uint8_t * restrict qs = y;
     float * restrict ds = (float *) ((uint8_t *) y + QK8_0C * nb);
@@ -1734,8 +1734,8 @@ static void quantize_row_q8_0c_reference(const float * restrict x, void * restri
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
-        for (int l = 0; l < QK8_0; l++) {
-            const float v = x[i*QK8_0 + l];
+        for (int l = 0; l < QK8_0C; l++) {
+            const float v = x[i*QK8_0C + l];
             amax = MAX(amax, fabsf(v));
         }
 
@@ -1744,17 +1744,46 @@ static void quantize_row_q8_0c_reference(const float * restrict x, void * restri
 
         ds[i] = d;
 
-        for (int l = 0; l < QK8_0; ++l) {
-            const float v = x[i*QK8_0 + l]*id;
-            qs[i*QK8_0 + l] = roundf(v);
+        for (int l = 0; l < QK8_0C; ++l) {
+            const float v = x[i*QK8_0C + l]*id;
+            qs[i*QK8_0C + l] = roundf(v);
         }
     }
 }
 
 static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK8_0 == 0);
+    assert(k % QK8_0C == 0);
+    const int nb = k / QK8_0C;
+
+    int8_t * restrict qs = vy;
+    float * restrict ds = (float *) ((uint8_t *) vy + nb*QK8_0C);
+
+#if __AVX512F__
+    for (int i = 0; i < nb; i++) {
+        const __m512 x0 = _mm512_loadu_ps( x + i*QK8_0C );
+        const __m512 x1 = _mm512_loadu_ps( x + i*QK8_0C + QK8_0C/2);
+
+        // Find absolute max
+        const __m512 x0abs = _mm512_abs_ps(x0);
+        const __m512 x1abs = _mm512_abs_ps(x1);
+        const float amax = _mm512_reduce_max_ps(_mm512_max_ps(x0abs, x1abs));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        ds[i] = d;
+
+        const __m512 mul = _mm512_set1_ps( id );
+        const __m512i x0q = _mm512_cvt_roundps_epi32(_mm512_mul_ps(x0, mul), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+        const __m512i x1q = _mm512_cvt_roundps_epi32(_mm512_mul_ps(x1, mul), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+        _mm512_mask_cvtepi32_storeu_epi8(qs + i*QK8_0C, 0xffff, x0q);
+        _mm512_mask_cvtepi32_storeu_epi8(qs + i*QK8_0C + QK8_0C/2, 0xffff, x1q);
+    }
+#else
+    // scalar
     quantize_row_q8_0c_reference(x, vy, k);
+#endif
 }
 
 static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
@@ -2780,6 +2809,73 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
     *s = sumf;
 }
 
+#if __AVX512F__ && QK4_0 == 32
+
+// Dot product of four blocks of q4_0c with four blocks of q8_0c
+static inline __m512 dot_q4_0c_fourblocks_avx512(
+    __m512 acc,
+    const uint8_t * restrict xqs,
+    const float * restrict xds,
+    const int8_t * restrict yqs,
+    const float * restrict yds
+) {
+    // load quantized bytes
+    // TODO: change back to aligned loads
+    const __m512i xqs0123 = _mm512_loadu_epi64( xqs );
+    const __m512i low_nibble_mask = _mm512_set1_epi8( 0xf );
+    const __m512i xqs01 = _mm512_and_si512( low_nibble_mask, xqs0123 );
+    // TODO: try srlv/i?
+    const __m512i xqs23 = _mm512_and_si512( low_nibble_mask, _mm512_srli_epi32( xqs0123, 4 ) );
+    const __m512i yqs01 = _mm512_loadu_epi64( yqs );
+    const __m512i yqs23 = _mm512_loadu_epi64( yqs + 2*QK8_0C );
+
+    // load scales
+    const __m512i scale_mask0 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
+    const __m512i scale_mask1 = _mm512_set_epi32(3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2);
+    const __m128 xyds = _mm_mul_ps(_mm_load_ps(xds), _mm_load_ps(yds));
+    const __m512 xyds0123 = _mm512_broadcast_f32x4(xyds);
+    const __m512 xyds01 = _mm512_permutevar_ps(xyds0123, scale_mask0);
+    const __m512 xyds23 = _mm512_permutevar_ps(xyds0123, scale_mask1);
+
+    // take dot product of x and y bytes
+    const __m512i plus_8 = _mm512_set1_epi8( 8 );
+#ifdef __AVX512VNNI__
+    // We have VPDPBUSDS in AVX512-VNNI, which does exactly what we want, but with a catch:
+    // the *left* operand is supposed to be unsigned, while Q4_0 quantization subtracts 8
+    // from each nibble, so they can be negative. So, instead of `(xqs01 - 8) * yqs01`,
+    // we compute `xqs01 * yqs01 - 8 * yqs01`.
+    const __m512i zero = _mm512_setzero_epi32();
+    const __m512i yqs01_mul8 = _mm512_dpbusds_epi32( zero, plus_8, yqs01 );
+    const __m512i yqs23_mul8 = _mm512_dpbusds_epi32( zero, plus_8, yqs23 );
+    const __m512i xy01 = _mm512_dpbusds_epi32( zero, xqs01, yqs01 );
+    const __m512i xy23 = _mm512_dpbusds_epi32( zero, xqs23, yqs23 );
+    const __m512i res0_int = _mm512_sub_epi32( xy01, yqs01_mul8 );
+    const __m512i res1_int = _mm512_sub_epi32( xy23, yqs23_mul8 );
+#else
+    // As a fallback, we have VPMADDUBSW in AVX512-BW, which uses 16-bit products instead of 32-bit ones.
+    // It has the same catch as VPDPBUSDS: the left operand should be unsigned.
+    // This is essentially the AVX-512 version of the AVX-2 trick used by GH user Const-me
+    // ref: https://gist.github.com/Const-me/4d30e1fc767ab314596e16e90f53b6f4#file-matmultest-cpp-L119
+    const __m512i one = _mm512_set1_epi16( 1 );
+    const __m512i prod_0 = _mm512_maddubs_epi16( xqs01, yqs01 );
+    const __m512i prod_1 = _mm512_maddubs_epi16( plus_8, yqs01 );
+    const __m512i prod_2 = _mm512_maddubs_epi16( xqs23, yqs23 );
+    const __m512i prod_3 = _mm512_maddubs_epi16( plus_8, yqs23 );
+    const __m512i diff0 = _mm512_sub_epi16( prod_0, prod_1 );
+    const __m512i diff1 = _mm512_sub_epi16( prod_2, prod_3 );
+    const __m512i res0_int = _mm512_madd_epi16( diff0, one );
+    const __m512i res1_int = _mm512_madd_epi16( diff1, one );
+#endif
+
+    // Finally, we multiply the permuted scales and the 32-bit dot products, then accumulate.
+    const __m512 res0_float = _mm512_cvtepi32_ps( res0_int );
+    const __m512 res1_float = _mm512_cvtepi32_ps( res1_int );
+
+    return _mm512_fmadd_ps( xyds23, res1_float,
+        _mm512_fmadd_ps( xyds01, res0_float, acc ));
+}
+#endif
+
 inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
     ggml_float sumf = 0.0;
 
@@ -2999,6 +3095,15 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
 
     float sumf = 0.0;
 
+#if __AVX512F__
+    // Initialize accumulator with zeros
+    __m512 acc = _mm512_setzero_ps();
+    for (int i = 0; i < nb; i += 4) {
+        acc = dot_q4_0c_fourblocks_avx512(acc, xqs + i*QK4_0/2, xds + i, yqs + i*QK8_0, yds + i);
+    }
+    // Horizontal sum of all lanes of the accumulator
+    sumf = _mm512_reduce_add_ps( acc );
+#else
     // scalar
     for (int i = 0; i < nb/2; i++) {
         const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
@@ -3009,23 +3114,25 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
         const float dy0 = yds[dst0];
         const float dy1 = yds[dst1];
 
-        int sumi0 = 0;
-        int sumi1 = 0;
+        // NOTE: having these as plain int triggers a bug with AVX512 on GCC 12.2
+        int64_t sumi0 = 0;
+        int64_t sumi1 = 0;
 
         for (int l = 0; l < QK4_0; l++) {
-          const uint8_t v0 = xqs[i*QK4_0 + l];
+            const uint8_t v0 = xqs[i*QK4_0 + l];
 
-          const int i0 = (int8_t) (v0 & 0xf) - 8;
-          const int i1 = (int8_t) (v0 >> 4) - 8;
+            const int i0 = (int) (v0 & 0xf) - 8;
+            const int i1 = (int) (v0 >> 4) - 8;
 
-          const int i2 = yqs[dst0*QK4_0 + l];
-          const int i3 = yqs[dst1*QK4_0 + l];
+            const int i2 = yqs[dst0*QK4_0 + l];
+            const int i3 = yqs[dst1*QK4_0 + l];
 
-          sumi0 += i0*i2;
-          sumi1 += i1*i3;
+            sumi0 += i0*i2;
+            sumi1 += i1*i3;
         }
         sumf += dx0*dy0*sumi0 + dx1*dy1*sumi1;
     }
+#endif
 
     *s = sumf;
 }
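
The VPDPBUSDS trick above rests on a simple identity: the stored nibble is u = w + 8 with w in [-8, 7], and u*y - 8*y == (u - 8)*y == w*y. A scalar sanity check (illustrative, not part of the patch):

    #include <assert.h>
    #include <stdio.h>

    // VPDPBUSDS requires an unsigned left operand, so the kernel feeds it
    // the raw nibbles u = w + 8 and subtracts 8*y afterwards.
    int main(void) {
        for (int u = 0; u < 16; u++) {          // unsigned q4_0c nibble
            for (int y = -128; y < 128; y++) {  // signed q8_0c byte
                assert(u * y - 8 * y == (u - 8) * y);
            }
        }
        printf("ok\n");
        return 0;
    }

The same subtraction shows up in the VPMADDUBSW fallback, where prod_1 and prod_3 compute the 8*y terms.
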
From 1b49d26f8a1a862e0b628c4828e53f7f5315ebf2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Fri, 21 Apr 2023 00:11:49 +0200
Subject: [PATCH 5/8] q4_0c: Arm Neon acceleration

Mostly copied from the q4_0 implementation
---
 ggml.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 94 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index 03c9cd462ee05..78abf324c13f4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1758,7 +1758,37 @@ static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int
     int8_t * restrict qs = vy;
     float * restrict ds = (float *) ((uint8_t *) vy + nb*QK8_0C);
 
-#if __AVX512F__
+#if defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
+        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
+
+        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
+        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
+        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
+
+        const float amax = vmaxvq_f32(amaxv[0]);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        ds[i] = d;
+
+        for (int l = 0; l < 8; l++) {
+            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
+            const int32x4_t  vi = vcvtnq_s32_f32(v);
+
+            qs[i*QK8_0C + 4*l + 0] = vgetq_lane_s32(vi, 0);
+            qs[i*QK8_0C + 4*l + 1] = vgetq_lane_s32(vi, 1);
+            qs[i*QK8_0C + 4*l + 2] = vgetq_lane_s32(vi, 2);
+            qs[i*QK8_0C + 4*l + 3] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#elif defined(__AVX512F__)
     for (int i = 0; i < nb; i++) {
         const __m512 x0 = _mm512_loadu_ps( x + i*QK8_0C );
         const __m512 x1 = _mm512_loadu_ps( x + i*QK8_0C + QK8_0C/2);
@@ -3095,7 +3125,69 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
 
     float sumf = 0.0;
 
-#if __AVX512F__
+#if defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    for (int i = 0; i < nb/2; i++) {
+        const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
+        const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
+
+        const uint8x16_t m4b = vdupq_n_u8(0xf);
+        const int8x16_t  s8b = vdupq_n_s8(0x8);
+
+        const uint8x16_t v0_01l = vld1q_u8(&xqs[i*QK4_0]);
+        const uint8x16_t v0_01h = vld1q_u8(&xqs[i*QK4_0 + QK4_0/2]);
+
+        // 4-bit -> 8-bit
+        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_01l, m4b));
+        const int8x16_t v0_0h = vreinterpretq_s8_u8(vandq_u8  (v0_01h, m4b));
+        const int8x16_t v0_1l = vreinterpretq_s8_u8(vshrq_n_u8(v0_01l, 4));
+        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_01h, 4));
+
+        // sub 8
+        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
+        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
+        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
+        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
+
+        // load y
+        const int8x16_t v1_0l = vld1q_s8(&yqs[dst0*QK8_0C]);
+        const int8x16_t v1_0h = vld1q_s8(&yqs[dst0*QK8_0C + 16]);
+        const int8x16_t v1_1l = vld1q_s8(&yqs[dst1*QK8_0C]);
+        const int8x16_t v1_1h = vld1q_s8(&yqs[dst1*QK8_0C + 16]);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        // dot product into int32x4_t
+        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
+        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), xds[dst0]*yds[dst0]);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), xds[dst1]*yds[dst1]);
+#else
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
+
+        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l));
+        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
+        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h));
+        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
+
+        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
+        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
+        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
+        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), xds[dst0]*yds[dst0]);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), xds[dst1]*yds[dst1]);
+#endif
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+
+#elif defined(__AVX512F__)
     // Initialize accumulator with zeros
     __m512 acc = _mm512_setzero_ps();
     for (int i = 0; i < nb; i += 4) {
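
For readers without the NEON reference handy: vdotq_s32 accumulates, per 32-bit lane, the dot product of the corresponding group of four signed bytes; the vmull_s8/vpaddlq_s16 fallback computes the same lane sums through 16-bit intermediates. A scalar model (illustrative only):

    #include <stdint.h>

    // Scalar model of vdotq_s32(acc, a, b) on 16-byte vectors: lane l of
    // the result accumulates a[4l]*b[4l] + ... + a[4l+3]*b[4l+3].
    static void sdot_model(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
        for (int lane = 0; lane < 4; lane++) {
            for (int j = 0; j < 4; j++) {
                acc[lane] += (int32_t) a[4*lane + j] * b[4*lane + j];
            }
        }
    }

Summing the lanes of p_0 after both vdotq_s32 calls therefore yields the same 32-element block dot product that the scalar path computes with sumi0.
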
From 2949725fea6b43005a5f2374ccde7fb524359026 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Mon, 24 Apr 2023 18:17:31 +0200
Subject: [PATCH 6/8] q4_0c: prefetch on AVX-512 and ARM

Seems significant especially for evaluation time
---
 ggml.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/ggml.c b/ggml.c
index 78abf324c13f4..a70aa4773cfdd 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3126,10 +3126,17 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
     float sumf = 0.0;
 
 #if defined(__ARM_NEON)
+    const int ahead=80;
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
     for (int i = 0; i < nb/2; i++) {
+        __builtin_prefetch(&xqs[i*QK4_0 + 64*ahead]);
+        __builtin_prefetch(&yqs[2*i*QK8_0C + 64*ahead]);
+        __builtin_prefetch(&yqs[2*i*QK8_0C + 64*ahead + 64]);
+        __builtin_prefetch(&xds[2*i + 64/4*ahead]);
+        __builtin_prefetch(&yds[2*i + 64/4*ahead]);
+
         const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
         const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
 
@@ -3188,9 +3195,15 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
 
 #elif defined(__AVX512F__)
+    const int ahead = 64;
     // Initialize accumulator with zeros
     __m512 acc = _mm512_setzero_ps();
     for (int i = 0; i < nb; i += 4) {
+        _mm_prefetch(xqs + i*QK4_0/2 + 64*ahead, _MM_HINT_T0);
+        _mm_prefetch(yqs + i*QK8_0 + 64*ahead, _MM_HINT_T0);
+        _mm_prefetch(yqs + i*QK8_0 + 64*ahead + 64, _MM_HINT_T0);
+        _mm_prefetch(xds + i + 64/4*ahead, _MM_HINT_T0);
+        _mm_prefetch(yds + i + 64/4*ahead, _MM_HINT_T0);
         acc = dot_q4_0c_fourblocks_avx512(acc, xqs + i*QK4_0/2, xds + i, yqs + i*QK8_0, yds + i);
     }
     // Horizontal sum of all lanes of the accumulator
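
The prefetch distance above is expressed in cache lines: every stream is prefetched ahead*64 bytes past its current read position (80 lines, 5 KiB on the NEON path). Since one iteration consumes 32 bytes of xqs, 64 bytes of yqs and 8 bytes of each scale array, the lookahead in iterations differs per stream; the constants look empirically tuned. A consolidated sketch of the arithmetic (illustrative, assumes QK4_0 = 32 and QK8_0C = 32):

    #include <stdint.h>

    // One iteration i reads byte group i of xqs (32 bytes), two y blocks
    // (64 bytes) and two scales from each of xds/yds (8 bytes each);
    // every __builtin_prefetch target is 64*ahead bytes further on.
    static void prefetch_streams(const uint8_t * xqs, const int8_t * yqs,
                                 const float * xds, const float * yds, int i) {
        const int ahead = 80; // cache lines of 64 bytes
        __builtin_prefetch(&xqs[i*32 + 64*ahead]);
        __builtin_prefetch(&yqs[2*i*32 + 64*ahead]);
        __builtin_prefetch(&yqs[2*i*32 + 64*ahead + 64]);
        __builtin_prefetch(&xds[2*i + 64/4*ahead]);
        __builtin_prefetch(&yds[2*i + 64/4*ahead]);
    }
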
From d53f76760d7b067fd0cef67a994a90e662bdfb50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Thu, 27 Apr 2023 22:48:46 +0200
Subject: [PATCH 7/8] q4_0c: disable prefetching on M1

---
 ggml.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index a70aa4773cfdd..2c5c796fd2cc4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1154,13 +1154,17 @@ static void quantize_row_q4_0c_reference(const float * restrict x, uint8_t * res
         float id[2];
         for (int j = 0; j < 2; j++) {
             float amax = 0.0f; // absolute max
+            float max = 0.0f;
 
             for (int l = 0; l < QK4_0; l++) {
                 const float v = xb[j][l];
-                amax = MAX(amax, fabsf(v));
+                if (amax < fabsf(v)) {
+                    amax = fabsf(v);
+                    max = v;
+                }
             }
 
-            d[j]  = amax / ((1 << 3) - 1);
+            d[j]  = max / -8;
             id[j] = d[j] ? 1.0f/d[j] : 0.0f;
         }
 
@@ -1169,10 +1173,10 @@ static void quantize_row_q4_0c_reference(const float * restrict x, uint8_t * res
 
         for (int l = 0; l < QK4_0; l++) {
             const float v0 = xb[0][l]*id[0];
-            const uint8_t vi0 = (int8_t)roundf(v0) + 8;
+            const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8);
 
             const float v1 = xb[1][l]*id[1];
-            const uint8_t vi1 = (int8_t)roundf(v1) + 8;
+            const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8);
 
             assert(vi0 < 16);
             assert(vi1 < 16);
@@ -3126,16 +3130,19 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
     float sumf = 0.0;
 
 #if defined(__ARM_NEON)
-    const int ahead=80;
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
     for (int i = 0; i < nb/2; i++) {
+        // Disable prefetching on M1 for now.
+#ifndef __APPLE__
+        const int ahead=80;
         __builtin_prefetch(&xqs[i*QK4_0 + 64*ahead]);
         __builtin_prefetch(&yqs[2*i*QK8_0C + 64*ahead]);
         __builtin_prefetch(&yqs[2*i*QK8_0C + 64*ahead + 64]);
         __builtin_prefetch(&xds[2*i + 64/4*ahead]);
        __builtin_prefetch(&yds[2*i + 64/4*ahead]);
+#endif
 
         const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
         const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
@@ -9738,11 +9745,13 @@ static void ggml_compute_forward_alibi(
                 ggml_compute_forward_alibi_f32(params, src0, src1, dst);
             } break;
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_0C:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_0C:
         case GGML_TYPE_Q8_1:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:

From 76692c90cdb909065be522910a5b7c60fa3a062b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Thu, 4 May 2023 09:53:55 +0200
Subject: [PATCH 8/8] q4_0c: avoid _mm512_loadu_epi64 instruction

Not supported on some GCC versions
---
 ggml.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml.c b/ggml.c
index 2c5c796fd2cc4..30c790f52389e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2855,13 +2855,13 @@ static inline __m512 dot_q4_0c_fourblocks_avx512(
 ) {
     // load quantized bytes
     // TODO: change back to aligned loads
-    const __m512i xqs0123 = _mm512_loadu_epi64( xqs );
+    const __m512i xqs0123 = _mm512_loadu_si512( xqs );
     const __m512i low_nibble_mask = _mm512_set1_epi8( 0xf );
     const __m512i xqs01 = _mm512_and_si512( low_nibble_mask, xqs0123 );
     // TODO: try srlv/i?
     const __m512i xqs23 = _mm512_and_si512( low_nibble_mask, _mm512_srli_epi32( xqs0123, 4 ) );
-    const __m512i yqs01 = _mm512_loadu_epi64( yqs );
-    const __m512i yqs23 = _mm512_loadu_epi64( yqs + 2*QK8_0C );
+    const __m512i yqs01 = _mm512_loadu_si512( yqs );
+    const __m512i yqs23 = _mm512_loadu_si512( yqs + 2*QK8_0C );
 
     // load scales
     const __m512i scale_mask0 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
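
A closing note on the rounding change in patch 7: deriving d from the signed extreme (d = max / -8) uses the full [-8, 7] integer range, but a value at the opposite extreme then scales to +8, one past the largest nibble, which is why the MIN(15, ...) clamp was added alongside it. Illustrative numbers (not from the patch):

    #include <assert.h>
    #include <math.h>

    int main(void) {
        const float max = -1.0f;     // signed value with largest magnitude
        const float d   = max / -8;  // 0.125f
        const float id  = 1.0f / d;  // 8.0f
        const float v   = 1.0f * id; // opposite extreme scales to +8.0f

        int vi = (int8_t) roundf(v) + 8; // 16: one past the 4-bit range
        vi = vi < 15 ? vi : 15;          // the patch's MIN(15, ...) clamp
        assert(vi == 15);
        return 0;
    }
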