From 7b6f3f39700aaa868ab867c7af693c76f2c04dca Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sat, 13 May 2023 22:26:58 +0900
Subject: [PATCH 1/4] ggml : add AVX support based on AVX2 code

---
 ggml.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 170 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 096ccacfb7e08..24c79a9af85ee 100644
--- a/ggml.c
+++ b/ggml.c
@@ -579,7 +579,63 @@ static inline __m128i packNibbles( __m256i bytes )
     return _mm_packus_epi16( r0, r1 );
 #endif
 }
-#else
+#elif defined(__AVX__)
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
+    const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
+    __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
+    __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
+    const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    bytesl = _mm_or_si128(bytesl, bit_mask);
+    bytesh = _mm_or_si128(bytesh, bit_mask);
+    bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
+    bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
+    return _mm256_set_m128i(bytesh, bytesl);
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
+{
+    // Load 16 bytes from memory
+    __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
+    __m128i tmph = _mm_srli_epi16(tmpl, 4);
+    const __m128i lowMask = _mm_set1_epi8(0xF);
+    tmpl = _mm_and_si128(lowMask, tmpl);
+    tmph = _mm_and_si128(lowMask, tmph);
+    return _mm256_set_m128i(tmph, tmpl);
+}
+
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
+    const __m128i ones = _mm_set1_epi16(1);
+    const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
+    const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
+    const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
+    return _mm256_cvtepi32_ps(summed_pairs);
+}
+
+// multiply int8_t, add results pairwise twice and return as float vector
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+    const __m128i xl = _mm256_castsi256_si128(x);
+    const __m128i xh = _mm256_extractf128_si256(x, 1);
+    const __m128i yl = _mm256_castsi256_si128(y);
+    const __m128i yh = _mm256_extractf128_si256(y, 1);
+    // Get absolute values of x vectors
+    const __m128i axl = _mm_sign_epi8(xl, xl);
+    const __m128i axh = _mm_sign_epi8(xh, xh);
+    // Sign the values of the y vectors
+    const __m128i syl = _mm_sign_epi8(yl, xl);
+    const __m128i syh = _mm_sign_epi8(yh, xh);
+    // Perform multiplication and create 16-bit values
+    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
+    const __m128i doth = _mm_maddubs_epi16(axh, syh);
+    return sum_i16_pairs_float(doth, dotl);
+}
+
 static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
 {
     // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
@@ -2250,6 +2306,36 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
         acc = _mm256_fmadd_ps( d0d1, xy, acc );
     }
 
+    *s = hsum_float_8(acc) + summs;
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0;
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        const float * d0 = &x[i].d;
+        const float * d1 = &y[i].d;
+
+        summs += x[i].m * y[i].s;
+
+        const __m256 d0v = _mm256_broadcast_ss( d0 );
+        const __m256 d1v = _mm256_broadcast_ss( d1 );
+
+        // Compute combined scales
+        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
+
+        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
+
+        const __m256 xy = mul_sum_i8_pairs_float(bx, by);
+
+        // Accumulate d0*d1*x*y
+        acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
+    }
+
     *s = hsum_float_8(acc) + summs;
 #else
     // scalar
@@ -2458,6 +2544,37 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         acc = _mm256_fmadd_ps(d, q, acc);
     }
 
+    *s = hsum_float_8(acc);
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    __m128i mask = _mm_set1_epi8((char)0xF0);
+
+    // Main loop
+    for (int i = 0; i < nb; i++) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
+
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
+        __m128i bxhil = _mm256_castsi256_si128(bxhi);
+        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
+        bxhil = _mm_andnot_si128(bxhil, mask);
+        bxhih = _mm_andnot_si128(bxhih, mask);
+        __m128i bxl = _mm256_castsi256_si128(bx);
+        __m128i bxh = _mm256_extractf128_si256(bx, 1);
+        bxl = _mm_or_si128(bxl, bxhil);
+        bxh = _mm_or_si128(bxh, bxhih);
+        bx = _mm256_set_m128i(bxh, bxl);
+
+        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
+    }
+
     *s = hsum_float_8(acc);
 #else
     // scalar
@@ -2686,6 +2803,40 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
     }
 
+    *s = hsum_float_8(acc) + summs;
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    __m128i mask = _mm_set1_epi8(0x10);
+
+    float summs = 0.0f;
+
+    // Main loop
+    for (int i = 0; i < nb; i++) {
+        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
+
+        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
+
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
+        __m128i bxhil = _mm256_castsi256_si128(bxhi);
+        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
+        bxhil = _mm_and_si128(bxhil, mask);
+        bxhih = _mm_and_si128(bxhih, mask);
+        __m128i bxl = _mm256_castsi256_si128(bx);
+        __m128i bxh = _mm256_extractf128_si256(bx, 1);
+        bxl = _mm_or_si128(bxl, bxhil);
+        bxh = _mm_or_si128(bxh, bxhih);
+        bx = _mm256_set_m128i(bxh, bxl);
+
+        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
+        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
+    }
+
     *s = hsum_float_8(acc) + summs;
 #else
     // scalar
@@ -2793,6 +2944,24 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
         acc = _mm256_fmadd_ps( d, q, acc );
     }
 
+    *s = hsum_float_8(acc);
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        // Compute combined scale for the block
+        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
+        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        // Multiply q with scale and accumulate
+        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
+    }
+
     *s = hsum_float_8(acc);
 #else
     // scalar
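
[Note on PATCH 1/4] The AVX path mirrors the AVX2 kernels but does its integer work on 128-bit halves, because the 256-bit integer intrinsics (_mm256_maddubs_epi16 and friends) require AVX2. The one non-obvious step is mul_sum_i8_pairs_float: _mm_maddubs_epi16 multiplies an unsigned byte vector by a signed one, so the helper first takes |x| with _mm_sign_epi8(x, x) and moves the sign of x onto y with _mm_sign_epi8(y, x). A minimal scalar model of what one block contributes (a hypothetical self-check, not part of the patch):

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of mul_sum_i8_pairs_float, before the i32 partial
     * sums are converted to float: the dot product of two int8 blocks. */
    static int32_t dot_i8_model(const int8_t *x, const int8_t *y, int n) {
        int32_t sum = 0;
        for (int j = 0; j < n; j++) {
            int32_t ax = abs(x[j]);                /* _mm_sign_epi8(x, x) */
            int32_t sy = x[j] < 0 ? -y[j]
                       : x[j] > 0 ?  y[j] : 0;     /* _mm_sign_epi8(y, x) */
            sum += ax * sy;                        /* maddubs + madd      */
        }
        return sum;
    }

Since |x| * (sign(x) * y) == x * y, the result equals the plain signed dot product that the scalar fallback computes.
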
From 61a30466300be5ab5b4cc7ef0987de8acf7e4385 Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sun, 14 May 2023 04:59:01 +0900
Subject: [PATCH 2/4] ggml : add AVX support to quantize_row_q5_0, quantize_row_q5_1

---
 ggml.c | 239 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 239 insertions(+)

diff --git a/ggml.c b/ggml.c
index 24c79a9af85ee..57bd3755968e9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -908,7 +908,135 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
 }
 
 static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
+    static const int qk = QK5_0;
+
+    assert(k % qk == 0);
+
+#if defined(__AVX__)
+    const int nb = k / qk;
+
+    block_q5_0 * restrict yy = y;
+
+    const __m256 signBit8 = _mm256_set1_ps( -0.0f );
+    const __m128 signBit4 = _mm_set1_ps( -0.0f );
+    const __m256 base = _mm256_set1_ps( 16.5f );
+    const __m128i n31 = _mm_set1_epi8( 31 );
+    const __m128i lowmask = _mm_set1_epi8( 0xF );
+    const __m128i bit5mask = _mm_set1_epi8( 0x10 );
+
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Find the value with the largest magnitude in the block, keeping its sign
+        __m256 abs0 = _mm256_andnot_ps( signBit8, v0 );
+        __m256 abs1 = _mm256_andnot_ps( signBit8, v1 );
+        __m256 mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
+        __m256 max01 = _mm256_blendv_ps( v0, v1, mask8 );
+
+        abs0 = _mm256_andnot_ps( signBit8, v2 );
+        abs1 = _mm256_andnot_ps( signBit8, v3 );
+        mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
+        __m256 max23 = _mm256_blendv_ps( v2, v3, mask8 );
+
+        abs0 = _mm256_andnot_ps( signBit8, max01 );
+        abs1 = _mm256_andnot_ps( signBit8, max23 );
+        mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
+        max01 = _mm256_blendv_ps( max01, max23, mask8 );
+
+        __m128 lo = _mm256_castps256_ps128( max01 );
+        __m128 hi = _mm256_extractf128_ps( max01, 1 );
+        __m128 abslo = _mm_andnot_ps( signBit4, lo );
+        __m128 abshi = _mm_andnot_ps( signBit4, hi );
+        __m128 mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
+        __m128 maxhl = _mm_blendv_ps( lo, hi, mask4 );
+
+        hi = _mm_movehl_ps( maxhl, maxhl );
+        abslo = _mm_andnot_ps( signBit4, maxhl );
+        abshi = _mm_andnot_ps( signBit4, hi );
+        mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
+        maxhl = _mm_blendv_ps( maxhl, hi, mask4 );
+
+        hi = _mm_movehdup_ps( maxhl );
+        abslo = _mm_andnot_ps( signBit4, maxhl );
+        abshi = _mm_andnot_ps( signBit4, hi );
+        mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
+        maxhl = _mm_blendv_ps( maxhl, hi, mask4 );
+        const float max = _mm_cvtss_f32( maxhl );
+
+
+        const float d = max / -16;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        yy[i].d = GGML_FP32_TO_FP16(d);
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Add 16.5f
+        v0 = _mm256_add_ps( v0, base );
+        v1 = _mm256_add_ps( v1, base );
+        v2 = _mm256_add_ps( v2, base );
+        v3 = _mm256_add_ps( v3, base );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+        // Since AVX lacks the required 256-bit integer operations,
+        // we split the registers in half and call the AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1 );
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1 );
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1 );
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1 );
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        ni0 = _mm_min_epi8( n31, ni0 );
+        ni4 = _mm_min_epi8( n31, ni4 );
+
+        // y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+        ni1 = _mm_and_si128( lowmask, ni0 );
+        ni5 = _mm_and_si128( lowmask, ni4 );
+        ni5 = _mm_slli_epi16( ni5, 4 );
+        ni1 = _mm_or_si128( ni1, ni5 );
+        _mm_storeu_si128((__m128i *)(yy[i].qs + 0), ni1);
+
+        // get the 5-th bit and store it in qh at the right position
+        // qh |= ((xi0 & 0x10) >> 4) << (j + 0);
+        // qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
+        ni0 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni0 ), 3 );
+        ni4 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni4 ), 3 );
+        uint16_t qhl = _mm_movemask_epi8( ni0 );
+        uint16_t qhh = _mm_movemask_epi8( ni4 );
+        memcpy(&yy[i].qh[0], &qhl, sizeof(qhl));
+        memcpy(&yy[i].qh[2], &qhh, sizeof(qhh));
+    }
+#else
     quantize_row_q5_0_reference(x, y, k);
+#endif
 }
 
 static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
@@ -956,7 +1084,118 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
 }
 
 static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
+    const int qk = QK5_1;
+
+    assert(k % qk == 0);
+
+#if defined(__AVX__)
+    const int nb = k / qk;
+
+    block_q5_1 * restrict yy = y;
+
+    const __m256 base = _mm256_set1_ps( 0.5f );
+    const __m128i lowmask = _mm_set1_epi8( 0xF );
+    const __m128i bit5mask = _mm_set1_epi8( 0x10 );
+
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max,min
+        __m256 max8 = _mm256_max_ps( v0, v1 );
+        max8 = _mm256_max_ps( max8, v2 );
+        max8 = _mm256_max_ps( max8, v3 );
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max8, 1 ), _mm256_castps256_ps128( max8 ) );
+        max4 = _mm_max_ps( _mm_movehl_ps( max4, max4 ), max4 );
+        max4 = _mm_max_ss( _mm_movehdup_ps( max4 ), max4 );
+        const float max = _mm_cvtss_f32( max4 );
+
+        __m256 min8 = _mm256_min_ps( v0, v1 );
+        min8 = _mm256_min_ps( min8, v2 );
+        min8 = _mm256_min_ps( min8, v3 );
+        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min8, 1 ), _mm256_castps256_ps128( min8 ) );
+        min4 = _mm_min_ps( _mm_movehl_ps( min4, min4 ), min4 );
+        min4 = _mm_min_ss( _mm_movehdup_ps( min4 ), min4 );
+        const float min = _mm_cvtss_f32( min4 );
+
+        const float d = (max - min) / ((1 << 5) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        yy[i].d = GGML_FP32_TO_FP16(d);
+        yy[i].m = GGML_FP32_TO_FP16(min);
+
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Subtract min
+        min8 = _mm256_set1_ps( min );
+        v0 = _mm256_sub_ps( v0, min8 );
+        v1 = _mm256_sub_ps( v1, min8 );
+        v2 = _mm256_sub_ps( v2, min8 );
+        v3 = _mm256_sub_ps( v3, min8 );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Add 0.5f
+        v0 = _mm256_add_ps( v0, base );
+        v1 = _mm256_add_ps( v1, base );
+        v2 = _mm256_add_ps( v2, base );
+        v3 = _mm256_add_ps( v3, base );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+        // Since AVX lacks the required 256-bit integer operations,
+        // we split the registers in half and call the AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1 );
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1 );
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1 );
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1 );
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        // y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+        ni1 = _mm_and_si128( lowmask, ni0 );
+        ni5 = _mm_and_si128( lowmask, ni4 );
+        ni5 = _mm_slli_epi16( ni5, 4 );
+        ni1 = _mm_or_si128( ni1, ni5 );
+        _mm_storeu_si128((__m128i *)(yy[i].qs + 0), ni1);
+
+        // get the 5-th bit and store it in qh at the right position
+        // qh |= ((xi0 & 0x10) >> 4) << (j + 0);
+        // qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
+        ni0 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni0 ), 3 );
+        ni4 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni4 ), 3 );
+        uint16_t qhl = _mm_movemask_epi8( ni0 );
+        uint16_t qhh = _mm_movemask_epi8( ni4 );
+        memcpy(&yy[i].qh[0], &qhl, sizeof(qhl));
+        memcpy(&yy[i].qh[2], &qhh, sizeof(qhh));
+    }
+#else
     quantize_row_q5_1_reference(x, y, k);
+#endif
 }
 
 // reference implementation for deterministic creation of model files
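
[Note on PATCH 2/4] After quantization the 32 values sit one per byte in two SSE registers, and bit 4 of each byte (the fifth quantization bit) has to end up in the 4-byte qh bitfield. The patch masks that bit, shifts it left by 3 so it becomes each byte's MSB (the masked input has no other bits set, so nothing leaks across byte boundaries inside the 16-bit shift lanes), and lets _mm_movemask_epi8 gather the 16 MSBs into one word. A scalar model of that step (hypothetical, for illustration only):

    #include <stdint.h>

    /* collect_bit5: what _mm_slli_epi16(v, 3) + _mm_movemask_epi8 do with
     * the masked bytes; lane j lands in bit j of qh, exactly like the
     * reference loop's  qh |= ((q[j] & 0x10) >> 4) << j. */
    static uint16_t collect_bit5(const uint8_t q[16]) {
        uint16_t qh = 0;
        for (int j = 0; j < 16; j++) {
            uint8_t msb = (uint8_t)((q[j] & 0x10) << 3);   /* bit 4 -> bit 7 */
            qh |= (uint16_t)(msb >> 7) << j;               /* movemask model */
        }
        return qh;
    }

The low register covers qs indices 0..15 and is stored at qh[0], the high register covers 16..31 at qh[2], matching the (j + qk/2) split in the reference code.
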
From 81b65da7aa54f63bcf93e48f35240a9a0ffb32eb Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sun, 14 May 2023 06:23:56 +0900
Subject: [PATCH 3/4] ggml : merge AVX2/AVX code in ggml_vec_dot_q4_1_q8_1, ggml_vec_dot_q8_0_q8_0

---
 ggml.c | 56 ++++++++------------------------------------------------
 1 file changed, 8 insertions(+), 48 deletions(-)

diff --git a/ggml.c b/ggml.c
index 57bd3755968e9..d1b4cbdca5440 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2516,7 +2516,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) || defined(__AVX__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
@@ -2542,37 +2542,11 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
         const __m256 xy = mul_sum_i8_pairs_float(bx, by);
 
         // Accumulate d0*d1*x*y
+#if defined(__AVX2__)
         acc = _mm256_fmadd_ps( d0d1, xy, acc );
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        const float * d0 = &x[i].d;
-        const float * d1 = &y[i].d;
-
-        summs += x[i].m * y[i].s;
-
-        const __m256 d0v = _mm256_broadcast_ss( d0 );
-        const __m256 d1v = _mm256_broadcast_ss( d1 );
-
-        // Compute combined scales
-        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
-
-        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
-
-        const __m256 xy = mul_sum_i8_pairs_float(bx, by);
-
-        // Accumulate d0*d1*x*y
+#else
         acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
+#endif
     }
 
     *s = hsum_float_8(acc) + summs;
@@ -3166,7 +3140,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) || defined(__AVX__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
@@ -3180,25 +3154,11 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
         const __m256 q = mul_sum_i8_pairs_float(bx, by);
 
         // Multiply q with scale and accumulate
+#if defined(__AVX2__)
         acc = _mm256_fmadd_ps( d, q, acc );
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        // Multiply q with scale and accumulate
+#else
         acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
+#endif
     }
 
     *s = hsum_float_8(acc);
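
[Note on PATCH 3/4] The AVX2 and AVX loop bodies are now identical except for the accumulate step, so only that one instruction is switched on __AVX2__. Strictly, _mm256_fmadd_ps needs the FMA extension rather than AVX2 itself; keying on __AVX2__ works here on the assumption that AVX2 builds of llama.cpp also pass -mfma, which this diff does not itself enforce. The pattern as a standalone helper (mad256 is a hypothetical name, not in the patch):

    #include <immintrin.h>

    /* Accumulate a*b into c: fused multiply-add when FMA is available,
     * separate multiply and add on plain AVX. */
    static inline __m256 mad256(__m256 a, __m256 b, __m256 c) {
    #if defined(__AVX2__) && defined(__FMA__)
        return _mm256_fmadd_ps(a, b, c);
    #else
        return _mm256_add_ps(_mm256_mul_ps(a, b), c);
    #endif
    }

Besides saving an instruction, the fused form rounds once instead of twice, so the two paths can differ in the last bit of the accumulator.
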
From 262a757989517e7794ee2ee67090da1a42ce3355 Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sun, 14 May 2023 07:02:59 +0900
Subject: [PATCH 4/4] ggml : delete SIMD optimizations for the quantization of the Q5 format

---
 ggml.c | 239 ---------------------------------------------------------
 1 file changed, 239 deletions(-)

diff --git a/ggml.c b/ggml.c
index d1b4cbdca5440..b4dac6223163b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -908,135 +908,7 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
 }
 
 static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
-    static const int qk = QK5_0;
-
-    assert(k % qk == 0);
-
-#if defined(__AVX__)
-    const int nb = k / qk;
-
-    block_q5_0 * restrict yy = y;
-
-    const __m256 signBit8 = _mm256_set1_ps( -0.0f );
-    const __m128 signBit4 = _mm_set1_ps( -0.0f );
-    const __m256 base = _mm256_set1_ps( 16.5f );
-    const __m128i n31 = _mm_set1_epi8( 31 );
-    const __m128i lowmask = _mm_set1_epi8( 0xF );
-    const __m128i bit5mask = _mm_set1_epi8( 0x10 );
-
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Find the value with the largest magnitude in the block, keeping its sign
-        __m256 abs0 = _mm256_andnot_ps( signBit8, v0 );
-        __m256 abs1 = _mm256_andnot_ps( signBit8, v1 );
-        __m256 mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
-        __m256 max01 = _mm256_blendv_ps( v0, v1, mask8 );
-
-        abs0 = _mm256_andnot_ps( signBit8, v2 );
-        abs1 = _mm256_andnot_ps( signBit8, v3 );
-        mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
-        __m256 max23 = _mm256_blendv_ps( v2, v3, mask8 );
-
-        abs0 = _mm256_andnot_ps( signBit8, max01 );
-        abs1 = _mm256_andnot_ps( signBit8, max23 );
-        mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
-        max01 = _mm256_blendv_ps( max01, max23, mask8 );
-
-        __m128 lo = _mm256_castps256_ps128( max01 );
-        __m128 hi = _mm256_extractf128_ps( max01, 1 );
-        __m128 abslo = _mm_andnot_ps( signBit4, lo );
-        __m128 abshi = _mm_andnot_ps( signBit4, hi );
-        __m128 mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
-        __m128 maxhl = _mm_blendv_ps( lo, hi, mask4 );
-
-        hi = _mm_movehl_ps( maxhl, maxhl );
-        abslo = _mm_andnot_ps( signBit4, maxhl );
-        abshi = _mm_andnot_ps( signBit4, hi );
-        mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
-        maxhl = _mm_blendv_ps( maxhl, hi, mask4 );
-
-        hi = _mm_movehdup_ps( maxhl );
-        abslo = _mm_andnot_ps( signBit4, maxhl );
-        abshi = _mm_andnot_ps( signBit4, hi );
-        mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
-        maxhl = _mm_blendv_ps( maxhl, hi, mask4 );
-        const float max = _mm_cvtss_f32( maxhl );
-
-
-        const float d = max / -16;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        yy[i].d = GGML_FP32_TO_FP16(d);
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Add 16.5f
-        v0 = _mm256_add_ps( v0, base );
-        v1 = _mm256_add_ps( v1, base );
-        v2 = _mm256_add_ps( v2, base );
-        v3 = _mm256_add_ps( v3, base );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-        // Since AVX lacks the required 256-bit integer operations,
-        // we split the registers in half and call the AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1 );
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1 );
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1 );
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1 );
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        ni0 = _mm_min_epi8( n31, ni0 );
-        ni4 = _mm_min_epi8( n31, ni4 );
-
-        // y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-        ni1 = _mm_and_si128( lowmask, ni0 );
-        ni5 = _mm_and_si128( lowmask, ni4 );
-        ni5 = _mm_slli_epi16( ni5, 4 );
-        ni1 = _mm_or_si128( ni1, ni5 );
-        _mm_storeu_si128((__m128i *)(yy[i].qs + 0), ni1);
-
-        // get the 5-th bit and store it in qh at the right position
-        // qh |= ((xi0 & 0x10) >> 4) << (j + 0);
-        // qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
-        ni0 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni0 ), 3 );
-        ni4 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni4 ), 3 );
-        uint16_t qhl = _mm_movemask_epi8( ni0 );
-        uint16_t qhh = _mm_movemask_epi8( ni4 );
-        memcpy(&yy[i].qh[0], &qhl, sizeof(qhl));
-        memcpy(&yy[i].qh[2], &qhh, sizeof(qhh));
-    }
-#else
     quantize_row_q5_0_reference(x, y, k);
-#endif
 }
 
 static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
@@ -1084,118 +956,7 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
 }
 
 static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
-    const int qk = QK5_1;
-
-    assert(k % qk == 0);
-
-#if defined(__AVX__)
-    const int nb = k / qk;
-
-    block_q5_1 * restrict yy = y;
-
-    const __m256 base = _mm256_set1_ps( 0.5f );
-    const __m128i lowmask = _mm_set1_epi8( 0xF );
-    const __m128i bit5mask = _mm_set1_epi8( 0x10 );
-
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max,min
-        __m256 max8 = _mm256_max_ps( v0, v1 );
-        max8 = _mm256_max_ps( max8, v2 );
-        max8 = _mm256_max_ps( max8, v3 );
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max8, 1 ), _mm256_castps256_ps128( max8 ) );
-        max4 = _mm_max_ps( _mm_movehl_ps( max4, max4 ), max4 );
-        max4 = _mm_max_ss( _mm_movehdup_ps( max4 ), max4 );
-        const float max = _mm_cvtss_f32( max4 );
-
-        __m256 min8 = _mm256_min_ps( v0, v1 );
-        min8 = _mm256_min_ps( min8, v2 );
-        min8 = _mm256_min_ps( min8, v3 );
-        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min8, 1 ), _mm256_castps256_ps128( min8 ) );
-        min4 = _mm_min_ps( _mm_movehl_ps( min4, min4 ), min4 );
-        min4 = _mm_min_ss( _mm_movehdup_ps( min4 ), min4 );
-        const float min = _mm_cvtss_f32( min4 );
-
-        const float d = (max - min) / ((1 << 5) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        yy[i].d = GGML_FP32_TO_FP16(d);
-        yy[i].m = GGML_FP32_TO_FP16(min);
-
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Subtract min
-        min8 = _mm256_set1_ps( min );
-        v0 = _mm256_sub_ps( v0, min8 );
-        v1 = _mm256_sub_ps( v1, min8 );
-        v2 = _mm256_sub_ps( v2, min8 );
-        v3 = _mm256_sub_ps( v3, min8 );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Add 0.5f
-        v0 = _mm256_add_ps( v0, base );
-        v1 = _mm256_add_ps( v1, base );
-        v2 = _mm256_add_ps( v2, base );
-        v3 = _mm256_add_ps( v3, base );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-        // Since AVX lacks the required 256-bit integer operations,
-        // we split the registers in half and call the AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1 );
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1 );
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1 );
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1 );
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        // y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-        ni1 = _mm_and_si128( lowmask, ni0 );
-        ni5 = _mm_and_si128( lowmask, ni4 );
-        ni5 = _mm_slli_epi16( ni5, 4 );
-        ni1 = _mm_or_si128( ni1, ni5 );
-        _mm_storeu_si128((__m128i *)(yy[i].qs + 0), ni1);
-
-        // get the 5-th bit and store it in qh at the right position
-        // qh |= ((xi0 & 0x10) >> 4) << (j + 0);
-        // qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
-        ni0 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni0 ), 3 );
-        ni4 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni4 ), 3 );
-        uint16_t qhl = _mm_movemask_epi8( ni0 );
-        uint16_t qhh = _mm_movemask_epi8( ni4 );
-        memcpy(&yy[i].qh[0], &qhl, sizeof(qhl));
-        memcpy(&yy[i].qh[2], &qhh, sizeof(qhh));
-    }
-#else
     quantize_row_q5_1_reference(x, y, k);
-#endif
 }
 
 // reference implementation for deterministic creation of model files
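
[Note on PATCH 4/4] The series ends by dropping the Q5 quantization SIMD again, so its net effect is AVX support for the vector dot products plus the merged AVX2/AVX kernels; after this patch the two quantize functions are plain wrappers once more, as in:

    static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
        quantize_row_q5_0_reference(x, y, k);
    }

To exercise the new #elif defined(__AVX__) paths, ggml.c has to be compiled with AVX but not AVX2, e.g. cc -O3 -mavx -c ggml.c with GCC or Clang (a build sketch assuming the usual x86 flags, not something the series itself configures).
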