From 7b6f3f39700aaa868ab867c7af693c76f2c04dca Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sat, 13 May 2023 22:26:58 +0900
Subject: [PATCH 1/4] ggml : add AVX support based on AVX2 code

---
 ggml.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 170 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 096ccacfb7e08..24c79a9af85ee 100644
--- a/ggml.c
+++ b/ggml.c
@@ -579,7 +579,63 @@ static inline __m128i packNibbles( __m256i bytes )
     return _mm_packus_epi16( r0, r1 );
 #endif
 }
-#else
+#elif defined(__AVX__)
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
+    const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
+    __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
+    __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
+    const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    bytesl = _mm_or_si128(bytesl, bit_mask);
+    bytesh = _mm_or_si128(bytesh, bit_mask);
+    bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
+    bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
+    return _mm256_set_m128i(bytesh, bytesl);
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
+{
+    // Load 16 bytes from memory
+    __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
+    __m128i tmph = _mm_srli_epi16(tmpl, 4);
+    const __m128i lowMask = _mm_set1_epi8(0xF);
+    tmpl = _mm_and_si128(lowMask, tmpl);
+    tmph = _mm_and_si128(lowMask, tmph);
+    return _mm256_set_m128i(tmph, tmpl);
+}
+
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
+    const __m128i ones = _mm_set1_epi16(1);
+    const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
+    const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
+    const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
+    return _mm256_cvtepi32_ps(summed_pairs);
+}
+
+// multiply int8_t, add results pairwise twice and return as float vector
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+    const __m128i xl = _mm256_castsi256_si128(x);
+    const __m128i xh = _mm256_extractf128_si256(x, 1);
+    const __m128i yl = _mm256_castsi256_si128(y);
+    const __m128i yh = _mm256_extractf128_si256(y, 1);
+    // Get absolute values of x vectors
+    const __m128i axl = _mm_sign_epi8(xl, xl);
+    const __m128i axh = _mm_sign_epi8(xh, xh);
+    // Sign the values of the y vectors
+    const __m128i syl = _mm_sign_epi8(yl, xl);
+    const __m128i syh = _mm_sign_epi8(yh, xh);
+    // Perform multiplication and create 16-bit values
+    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
+    const __m128i doth = _mm_maddubs_epi16(axh, syh);
+    return sum_i16_pairs_float(doth, dotl);
+}
+
 static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
 {
     // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
@@ -2250,6 +2306,36 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
         acc = _mm256_fmadd_ps( d0d1, xy, acc );
     }
 
+    *s = hsum_float_8(acc) + summs;
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0;
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        const float * d0 = &x[i].d;
+        const float * d1 = &y[i].d;
+
+        summs += x[i].m * y[i].s;
+
+        const __m256 d0v = _mm256_broadcast_ss( d0 );
+        const __m256 d1v = _mm256_broadcast_ss( d1 );
+
+        // Compute combined scales
+        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
+
+        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
+
+        const __m256 xy = mul_sum_i8_pairs_float(bx, by);
+
+        // Accumulate d0*d1*x*y
+        acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
+    }
+
     *s = hsum_float_8(acc) + summs;
 #else
     // scalar
@@ -2458,6 +2544,37 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         acc = _mm256_fmadd_ps(d, q, acc);
     }
 
+    *s = hsum_float_8(acc);
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    __m128i mask = _mm_set1_epi8((char)0xF0);
+
+    // Main loop
+    for (int i = 0; i < nb; i++) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
+
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
+        __m128i bxhil = _mm256_castsi256_si128(bxhi);
+        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
+        bxhil = _mm_andnot_si128(bxhil, mask);
+        bxhih = _mm_andnot_si128(bxhih, mask);
+        __m128i bxl = _mm256_castsi256_si128(bx);
+        __m128i bxh = _mm256_extractf128_si256(bx, 1);
+        bxl = _mm_or_si128(bxl, bxhil);
+        bxh = _mm_or_si128(bxh, bxhih);
+        bx = _mm256_set_m128i(bxh, bxl);
+
+        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
+    }
+
     *s = hsum_float_8(acc);
 #else
     // scalar
@@ -2686,6 +2803,40 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
     }
 
+    *s = hsum_float_8(acc) + summs;
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    __m128i mask = _mm_set1_epi8(0x10);
+
+    float summs = 0.0f;
+
+    // Main loop
+    for (int i = 0; i < nb; i++) {
+        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
+
+        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
+
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
+        __m128i bxhil = _mm256_castsi256_si128(bxhi);
+        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
+        bxhil = _mm_and_si128(bxhil, mask);
+        bxhih = _mm_and_si128(bxhih, mask);
+        __m128i bxl = _mm256_castsi256_si128(bx);
+        __m128i bxh = _mm256_extractf128_si256(bx, 1);
+        bxl = _mm_or_si128(bxl, bxhil);
+        bxh = _mm_or_si128(bxh, bxhih);
+        bx = _mm256_set_m128i(bxh, bxl);
+
+        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
+        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
+    }
+
     *s = hsum_float_8(acc) + summs;
 #else
     // scalar
@@ -2793,6 +2944,24 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
         acc = _mm256_fmadd_ps( d, q, acc );
     }
 
+    *s = hsum_float_8(acc);
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        // Compute combined scale for the block
+        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
+        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        // Multiply q with scale and accumulate
+        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
+    }
+
     *s = hsum_float_8(acc);
 #else
     // scalar
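
[Note on PATCH 1/4] The AVX path mirrors the AVX2 kernels but does its integer work on 128-bit halves, because the 256-bit integer intrinsics (_mm256_maddubs_epi16 and friends) require AVX2. The one non-obvious step is mul_sum_i8_pairs_float: _mm_maddubs_epi16 multiplies an unsigned byte vector by a signed one, so the helper first takes |x| with _mm_sign_epi8(x, x) and moves the sign of x onto y with _mm_sign_epi8(y, x). A minimal scalar model of what one block contributes (a hypothetical self-check, not part of the patch):

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of mul_sum_i8_pairs_float, before the i32 partial
     * sums are converted to float: the dot product of two int8 blocks. */
    static int32_t dot_i8_model(const int8_t *x, const int8_t *y, int n) {
        int32_t sum = 0;
        for (int j = 0; j < n; j++) {
            int32_t ax = abs(x[j]);                /* _mm_sign_epi8(x, x) */
            int32_t sy = x[j] < 0 ? -y[j]
                       : x[j] > 0 ?  y[j] : 0;     /* _mm_sign_epi8(y, x) */
            sum += ax * sy;                        /* maddubs + madd      */
        }
        return sum;
    }

Since |x| * (sign(x) * y) == x * y, the result equals the plain signed dot product that the scalar fallback computes.
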
From 61a30466300be5ab5b4cc7ef0987de8acf7e4385 Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sun, 14 May 2023 04:59:01 +0900
Subject: [PATCH 2/4] ggml : add AVX support to quantize_row_q5_0, quantize_row_q5_1

---
 ggml.c | 239 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 239 insertions(+)

diff --git a/ggml.c b/ggml.c
index 24c79a9af85ee..57bd3755968e9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -908,7 +908,135 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
 }
 
 static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
+    static const int qk = QK5_0;
+
+    assert(k % qk == 0);
+
+#if defined(__AVX__)
+    const int nb = k / qk;
+
+    block_q5_0 * restrict yy = y;
+
+    const __m256 signBit8 = _mm256_set1_ps( -0.0f );
+    const __m128 signBit4 = _mm_set1_ps( -0.0f );
+    const __m256 base = _mm256_set1_ps( 16.5f );
+    const __m128i n31 = _mm_set1_epi8( 31 );
+    const __m128i lowmask = _mm_set1_epi8( 0xF );
+    const __m128i bit5mask = _mm_set1_epi8( 0x10 );
+
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Find the value with the largest magnitude in the block, keeping its sign
+        __m256 abs0 = _mm256_andnot_ps( signBit8, v0 );
+        __m256 abs1 = _mm256_andnot_ps( signBit8, v1 );
+        __m256 mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
+        __m256 max01 = _mm256_blendv_ps( v0, v1, mask8 );
+
+        abs0 = _mm256_andnot_ps( signBit8, v2 );
+        abs1 = _mm256_andnot_ps( signBit8, v3 );
+        mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
+        __m256 max23 = _mm256_blendv_ps( v2, v3, mask8 );
+
+        abs0 = _mm256_andnot_ps( signBit8, max01 );
+        abs1 = _mm256_andnot_ps( signBit8, max23 );
+        mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
+        max01 = _mm256_blendv_ps( max01, max23, mask8 );
+
+        __m128 lo = _mm256_castps256_ps128( max01 );
+        __m128 hi = _mm256_extractf128_ps( max01, 1 );
+        __m128 abslo = _mm_andnot_ps( signBit4, lo );
+        __m128 abshi = _mm_andnot_ps( signBit4, hi );
+        __m128 mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
+        __m128 maxhl = _mm_blendv_ps( lo, hi, mask4 );
+
+        hi = _mm_movehl_ps( maxhl, maxhl );
+        abslo = _mm_andnot_ps( signBit4, maxhl );
+        abshi = _mm_andnot_ps( signBit4, hi );
+        mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
+        maxhl = _mm_blendv_ps( maxhl, hi, mask4 );
+
+        hi = _mm_movehdup_ps( maxhl );
+        abslo = _mm_andnot_ps( signBit4, maxhl );
+        abshi = _mm_andnot_ps( signBit4, hi );
+        mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
+        maxhl = _mm_blendv_ps( maxhl, hi, mask4 );
+        const float max = _mm_cvtss_f32( maxhl );
+
+
+        const float d = max / -16;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        yy[i].d = GGML_FP32_TO_FP16(d);
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Add 16.5f
+        v0 = _mm256_add_ps( v0, base );
+        v1 = _mm256_add_ps( v1, base );
+        v2 = _mm256_add_ps( v2, base );
+        v3 = _mm256_add_ps( v3, base );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+        // Since AVX lacks the required 256-bit integer operations,
+        // we split the registers in half and call the AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1 );
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1 );
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1 );
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1 );
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        ni0 = _mm_min_epi8( n31, ni0 );
+        ni4 = _mm_min_epi8( n31, ni4 );
+
+        // y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+        ni1 = _mm_and_si128( lowmask, ni0 );
+        ni5 = _mm_and_si128( lowmask, ni4 );
+        ni5 = _mm_slli_epi16( ni5, 4 );
+        ni1 = _mm_or_si128( ni1, ni5 );
+        _mm_storeu_si128((__m128i *)(yy[i].qs + 0), ni1);
+
+        // get the 5-th bit and store it in qh at the right position
+        // qh |= ((xi0 & 0x10) >> 4) << (j + 0);
+        // qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
+        ni0 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni0 ), 3 );
+        ni4 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni4 ), 3 );
+        uint16_t qhl = _mm_movemask_epi8( ni0 );
+        uint16_t qhh = _mm_movemask_epi8( ni4 );
+        memcpy(&yy[i].qh[0], &qhl, sizeof(qhl));
+        memcpy(&yy[i].qh[2], &qhh, sizeof(qhh));
+    }
+#else
     quantize_row_q5_0_reference(x, y, k);
+#endif
 }
 
 static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
@@ -956,7 +1084,118 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
 }
 
 static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
+    const int qk = QK5_1;
+
+    assert(k % qk == 0);
+
+#if defined(__AVX__)
+    const int nb = k / qk;
+
+    block_q5_1 * restrict yy = y;
+
+    const __m256 base = _mm256_set1_ps( 0.5f );
+    const __m128i lowmask = _mm_set1_epi8( 0xF );
+    const __m128i bit5mask = _mm_set1_epi8( 0x10 );
+
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max,min
+        __m256 max8 = _mm256_max_ps( v0, v1 );
+        max8 = _mm256_max_ps( max8, v2 );
+        max8 = _mm256_max_ps( max8, v3 );
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max8, 1 ), _mm256_castps256_ps128( max8 ) );
+        max4 = _mm_max_ps( _mm_movehl_ps( max4, max4 ), max4 );
+        max4 = _mm_max_ss( _mm_movehdup_ps( max4 ), max4 );
+        const float max = _mm_cvtss_f32( max4 );
+
+        __m256 min8 = _mm256_min_ps( v0, v1 );
+        min8 = _mm256_min_ps( min8, v2 );
+        min8 = _mm256_min_ps( min8, v3 );
+        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min8, 1 ), _mm256_castps256_ps128( min8 ) );
+        min4 = _mm_min_ps( _mm_movehl_ps( min4, min4 ), min4 );
+        min4 = _mm_min_ss( _mm_movehdup_ps( min4 ), min4 );
+        const float min = _mm_cvtss_f32( min4 );
+
+        const float d = (max - min) / ((1 << 5) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        yy[i].d = GGML_FP32_TO_FP16(d);
+        yy[i].m = GGML_FP32_TO_FP16(min);
+
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Subtract min
+        min8 = _mm256_set1_ps( min );
+        v0 = _mm256_sub_ps( v0, min8 );
+        v1 = _mm256_sub_ps( v1, min8 );
+        v2 = _mm256_sub_ps( v2, min8 );
+        v3 = _mm256_sub_ps( v3, min8 );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Add 0.5f
+        v0 = _mm256_add_ps( v0, base );
+        v1 = _mm256_add_ps( v1, base );
+        v2 = _mm256_add_ps( v2, base );
+        v3 = _mm256_add_ps( v3, base );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+        // Since AVX lacks the required 256-bit integer operations,
+        // we split the registers in half and call the AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1 );
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1 );
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1 );
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1 );
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        // y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+        ni1 = _mm_and_si128( lowmask, ni0 );
+        ni5 = _mm_and_si128( lowmask, ni4 );
+        ni5 = _mm_slli_epi16( ni5, 4 );
+        ni1 = _mm_or_si128( ni1, ni5 );
+        _mm_storeu_si128((__m128i *)(yy[i].qs + 0), ni1);
+
+        // get the 5-th bit and store it in qh at the right position
+        // qh |= ((xi0 & 0x10) >> 4) << (j + 0);
+        // qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
+        ni0 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni0 ), 3 );
+        ni4 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni4 ), 3 );
+        uint16_t qhl = _mm_movemask_epi8( ni0 );
+        uint16_t qhh = _mm_movemask_epi8( ni4 );
+        memcpy(&yy[i].qh[0], &qhl, sizeof(qhl));
+        memcpy(&yy[i].qh[2], &qhh, sizeof(qhh));
+    }
+#else
     quantize_row_q5_1_reference(x, y, k);
+#endif
 }
 
 // reference implementation for deterministic creation of model files
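
[Note on PATCH 2/4] After quantization the 32 values sit one per byte in two SSE registers, and bit 4 of each byte (the fifth quantization bit) has to end up in the 4-byte qh bitfield. The patch masks that bit, shifts it left by 3 so it becomes each byte's MSB (the masked input has no other bits set, so nothing leaks across byte boundaries inside the 16-bit shift lanes), and lets _mm_movemask_epi8 gather the 16 MSBs into one word. A scalar model of that step (hypothetical, for illustration only):

    #include <stdint.h>

    /* collect_bit5: what _mm_slli_epi16(v, 3) + _mm_movemask_epi8 do with
     * the masked bytes; lane j lands in bit j of qh, exactly like the
     * reference loop's  qh |= ((q[j] & 0x10) >> 4) << j. */
    static uint16_t collect_bit5(const uint8_t q[16]) {
        uint16_t qh = 0;
        for (int j = 0; j < 16; j++) {
            uint8_t msb = (uint8_t)((q[j] & 0x10) << 3);   /* bit 4 -> bit 7 */
            qh |= (uint16_t)(msb >> 7) << j;               /* movemask model */
        }
        return qh;
    }

The low register covers qs indices 0..15 and is stored at qh[0], the high register covers 16..31 at qh[2], matching the (j + qk/2) split in the reference code.
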
From 81b65da7aa54f63bcf93e48f35240a9a0ffb32eb Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sun, 14 May 2023 06:23:56 +0900
Subject: [PATCH 3/4] ggml : merge AVX2/AVX code in ggml_vec_dot_q4_1_q8_1, ggml_vec_dot_q8_0_q8_0

---
 ggml.c | 56 ++++++++------------------------------------------------
 1 file changed, 8 insertions(+), 48 deletions(-)

diff --git a/ggml.c b/ggml.c
index 57bd3755968e9..d1b4cbdca5440 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2516,7 +2516,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) || defined(__AVX__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
@@ -2542,37 +2542,11 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
         const __m256 xy = mul_sum_i8_pairs_float(bx, by);
 
         // Accumulate d0*d1*x*y
+#if defined(__AVX2__)
         acc = _mm256_fmadd_ps( d0d1, xy, acc );
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        const float * d0 = &x[i].d;
-        const float * d1 = &y[i].d;
-
-        summs += x[i].m * y[i].s;
-
-        const __m256 d0v = _mm256_broadcast_ss( d0 );
-        const __m256 d1v = _mm256_broadcast_ss( d1 );
-
-        // Compute combined scales
-        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
-
-        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
-
-        const __m256 xy = mul_sum_i8_pairs_float(bx, by);
-
-        // Accumulate d0*d1*x*y
+#else
         acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
+#endif
     }
 
     *s = hsum_float_8(acc) + summs;
@@ -3166,7 +3140,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) || defined(__AVX__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
@@ -3180,25 +3154,11 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
         const __m256 q = mul_sum_i8_pairs_float(bx, by);
 
         // Multiply q with scale and accumulate
+#if defined(__AVX2__)
         acc = _mm256_fmadd_ps( d, q, acc );
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        // Multiply q with scale and accumulate
+#else
         acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
+#endif
     }
 
     *s = hsum_float_8(acc);
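
[Note on PATCH 3/4] The AVX2 and AVX loop bodies are now identical except for the accumulate step, so only that one instruction is switched on __AVX2__. Strictly, _mm256_fmadd_ps needs the FMA extension rather than AVX2 itself; keying on __AVX2__ works here on the assumption that AVX2 builds of llama.cpp also pass -mfma, which this diff does not itself enforce. The pattern as a standalone helper (mad256 is a hypothetical name, not in the patch):

    #include <immintrin.h>

    /* Accumulate a*b into c: fused multiply-add when FMA is available,
     * separate multiply and add on plain AVX. */
    static inline __m256 mad256(__m256 a, __m256 b, __m256 c) {
    #if defined(__AVX2__) && defined(__FMA__)
        return _mm256_fmadd_ps(a, b, c);
    #else
        return _mm256_add_ps(_mm256_mul_ps(a, b), c);
    #endif
    }

Besides saving an instruction, the fused form rounds once instead of twice, so the two paths can differ in the last bit of the accumulator.
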
From 262a757989517e7794ee2ee67090da1a42ce3355 Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sun, 14 May 2023 07:02:59 +0900
Subject: [PATCH 4/4] ggml : delete SIMD optimizations for the quantization of the Q5 format

---
 ggml.c | 239 ---------------------------------------------------------
 1 file changed, 239 deletions(-)

diff --git a/ggml.c b/ggml.c
index d1b4cbdca5440..b4dac6223163b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -908,135 +908,7 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
 }
 
 static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
-    static const int qk = QK5_0;
-
-    assert(k % qk == 0);
-
-#if defined(__AVX__)
-    const int nb = k / qk;
-
-    block_q5_0 * restrict yy = y;
-
-    const __m256 signBit8 = _mm256_set1_ps( -0.0f );
-    const __m128 signBit4 = _mm_set1_ps( -0.0f );
-    const __m256 base = _mm256_set1_ps( 16.5f );
-    const __m128i n31 = _mm_set1_epi8( 31 );
-    const __m128i lowmask = _mm_set1_epi8( 0xF );
-    const __m128i bit5mask = _mm_set1_epi8( 0x10 );
-
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Find the value with the largest magnitude in the block, keeping its sign
-        __m256 abs0 = _mm256_andnot_ps( signBit8, v0 );
-        __m256 abs1 = _mm256_andnot_ps( signBit8, v1 );
-        __m256 mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
-        __m256 max01 = _mm256_blendv_ps( v0, v1, mask8 );
-
-        abs0 = _mm256_andnot_ps( signBit8, v2 );
-        abs1 = _mm256_andnot_ps( signBit8, v3 );
-        mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
-        __m256 max23 = _mm256_blendv_ps( v2, v3, mask8 );
-
-        abs0 = _mm256_andnot_ps( signBit8, max01 );
-        abs1 = _mm256_andnot_ps( signBit8, max23 );
-        mask8 = _mm256_cmp_ps( abs0, abs1, _CMP_LE_OQ);
-        max01 = _mm256_blendv_ps( max01, max23, mask8 );
-
-        __m128 lo = _mm256_castps256_ps128( max01 );
-        __m128 hi = _mm256_extractf128_ps( max01, 1 );
-        __m128 abslo = _mm_andnot_ps( signBit4, lo );
-        __m128 abshi = _mm_andnot_ps( signBit4, hi );
-        __m128 mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
-        __m128 maxhl = _mm_blendv_ps( lo, hi, mask4 );
-
-        hi = _mm_movehl_ps( maxhl, maxhl );
-        abslo = _mm_andnot_ps( signBit4, maxhl );
-        abshi = _mm_andnot_ps( signBit4, hi );
-        mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
-        maxhl = _mm_blendv_ps( maxhl, hi, mask4 );
-
-        hi = _mm_movehdup_ps( maxhl );
-        abslo = _mm_andnot_ps( signBit4, maxhl );
-        abshi = _mm_andnot_ps( signBit4, hi );
-        mask4 = _mm_cmp_ps( abslo, abshi, _CMP_LE_OQ);
-        maxhl = _mm_blendv_ps( maxhl, hi, mask4 );
-        const float max = _mm_cvtss_f32( maxhl );
-
-
-        const float d = max / -16;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        yy[i].d = GGML_FP32_TO_FP16(d);
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Add 16.5f
-        v0 = _mm256_add_ps( v0, base );
-        v1 = _mm256_add_ps( v1, base );
-        v2 = _mm256_add_ps( v2, base );
-        v3 = _mm256_add_ps( v3, base );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-        // Since AVX lacks the required 256-bit integer operations,
-        // we split the registers in half and call the AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1 );
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1 );
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1 );
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1 );
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        ni0 = _mm_min_epi8( n31, ni0 );
-        ni4 = _mm_min_epi8( n31, ni4 );
-
-        // y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-        ni1 = _mm_and_si128( lowmask, ni0 );
-        ni5 = _mm_and_si128( lowmask, ni4 );
-        ni5 = _mm_slli_epi16( ni5, 4 );
-        ni1 = _mm_or_si128( ni1, ni5 );
-        _mm_storeu_si128((__m128i *)(yy[i].qs + 0), ni1);
-
-        // get the 5-th bit and store it in qh at the right position
-        // qh |= ((xi0 & 0x10) >> 4) << (j + 0);
-        // qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
-        ni0 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni0 ), 3 );
-        ni4 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni4 ), 3 );
-        uint16_t qhl = _mm_movemask_epi8( ni0 );
-        uint16_t qhh = _mm_movemask_epi8( ni4 );
-        memcpy(&yy[i].qh[0], &qhl, sizeof(qhl));
-        memcpy(&yy[i].qh[2], &qhh, sizeof(qhh));
-    }
-#else
     quantize_row_q5_0_reference(x, y, k);
-#endif
 }
 
 static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
@@ -1084,118 +956,7 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
 }
 
 static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
-    const int qk = QK5_1;
-
-    assert(k % qk == 0);
-
-#if defined(__AVX__)
-    const int nb = k / qk;
-
-    block_q5_1 * restrict yy = y;
-
-    const __m256 base = _mm256_set1_ps( 0.5f );
-    const __m128i lowmask = _mm_set1_epi8( 0xF );
-    const __m128i bit5mask = _mm_set1_epi8( 0x10 );
-
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max,min
-        __m256 max8 = _mm256_max_ps( v0, v1 );
-        max8 = _mm256_max_ps( max8, v2 );
-        max8 = _mm256_max_ps( max8, v3 );
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max8, 1 ), _mm256_castps256_ps128( max8 ) );
-        max4 = _mm_max_ps( _mm_movehl_ps( max4, max4 ), max4 );
-        max4 = _mm_max_ss( _mm_movehdup_ps( max4 ), max4 );
-        const float max = _mm_cvtss_f32( max4 );
-
-        __m256 min8 = _mm256_min_ps( v0, v1 );
-        min8 = _mm256_min_ps( min8, v2 );
-        min8 = _mm256_min_ps( min8, v3 );
-        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min8, 1 ), _mm256_castps256_ps128( min8 ) );
-        min4 = _mm_min_ps( _mm_movehl_ps( min4, min4 ), min4 );
-        min4 = _mm_min_ss( _mm_movehdup_ps( min4 ), min4 );
-        const float min = _mm_cvtss_f32( min4 );
-
-        const float d = (max - min) / ((1 << 5) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        yy[i].d = GGML_FP32_TO_FP16(d);
-        yy[i].m = GGML_FP32_TO_FP16(min);
-
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Subtract min
-        min8 = _mm256_set1_ps( min );
-        v0 = _mm256_sub_ps( v0, min8 );
-        v1 = _mm256_sub_ps( v1, min8 );
-        v2 = _mm256_sub_ps( v2, min8 );
-        v3 = _mm256_sub_ps( v3, min8 );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Add 0.5f
-        v0 = _mm256_add_ps( v0, base );
-        v1 = _mm256_add_ps( v1, base );
-        v2 = _mm256_add_ps( v2, base );
-        v3 = _mm256_add_ps( v3, base );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-        // Since AVX lacks the required 256-bit integer operations,
-        // we split the registers in half and call the AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1 );
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1 );
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1 );
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1 );
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        // y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-        ni1 = _mm_and_si128( lowmask, ni0 );
-        ni5 = _mm_and_si128( lowmask, ni4 );
-        ni5 = _mm_slli_epi16( ni5, 4 );
-        ni1 = _mm_or_si128( ni1, ni5 );
-        _mm_storeu_si128((__m128i *)(yy[i].qs + 0), ni1);
-
-        // get the 5-th bit and store it in qh at the right position
-        // qh |= ((xi0 & 0x10) >> 4) << (j + 0);
-        // qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
-        ni0 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni0 ), 3 );
-        ni4 = _mm_slli_epi16( _mm_and_si128( bit5mask, ni4 ), 3 );
-        uint16_t qhl = _mm_movemask_epi8( ni0 );
-        uint16_t qhh = _mm_movemask_epi8( ni4 );
-        memcpy(&yy[i].qh[0], &qhl, sizeof(qhl));
-        memcpy(&yy[i].qh[2], &qhh, sizeof(qhh));
-    }
-#else
     quantize_row_q5_1_reference(x, y, k);
-#endif
 }
 
 // reference implementation for deterministic creation of model files
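
[Note on PATCH 4/4] The series ends by dropping the Q5 quantization SIMD again, so its net effect is AVX support for the vector dot products plus the merged AVX2/AVX kernels; after this patch the two quantize functions are plain wrappers once more, as in:

    static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
        quantize_row_q5_0_reference(x, y, k);
    }

To exercise the new #elif defined(__AVX__) paths, ggml.c has to be compiled with AVX but not AVX2, e.g. cc -O3 -mavx -c ggml.c with GCC or Clang (a build sketch assuming the usual x86 flags, not something the series itself configures).
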