Skip to content

Commit 69c9229

Browse files
sw and ggerganov authored
Deduplicate q4 quantization functions (#383)
* Deduplicate q4 quantization functions
* Use const; add basic test
* Re-enable quantization test
* Disable AVX2 flags in CI

---------

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 9794052 commit 69c9229

File tree

5 files changed

+119
-113
lines changed

5 files changed

+119
-113
lines changed

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ jobs:
8989
run: |
9090
mkdir build
9191
cd build
92-
cmake ..
92+
cmake -DLLAMA_AVX2=OFF ..
9393
cmake --build . --config Release
9494
ctest --output-on-failure
9595

ggml.c

Lines changed: 65 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -403,16 +403,63 @@ static inline __m128i packNibbles( __m256i bytes )
403403
// method 5
404404
// blocks of QK elements
405405
// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
406+
407+
// reference implementation for deterministic creation of model files
408+
//
// Scalar q4_0 quantizer.
//
// The k input floats are processed in k/QK blocks of QK elements. Each output
// block occupies sizeof(float) + QK/2 bytes:
//   - the block scale d = max(|x|) / 7, stored as a float
//   - QK 4-bit values round(x/d) + 8, packed two per byte
//     (even element in the low nibble, odd element in the high nibble)
//
// x - input row, k floats
// y - output buffer, (k/QK) * (sizeof(float) + QK/2) bytes are written
// k - number of input elements, must be a multiple of QK
//
static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
    assert(k % QK == 0);
    const int nb = k / QK;

    const size_t bs = sizeof(float) + QK/2; // bytes per output block

    uint8_t * out = (uint8_t *) y;

    for (int i = 0; i < nb; i++) {
        const float * xb = x + i*QK;

        float amax = 0.0f; // absolute max of the block

        for (int l = 0; l < QK; l++) {
            const float v = xb[l];
            amax = MAX(amax, fabsf(v));
        }

        const float d  = amax / ((1 << 3) - 1);
        const float id = d ? 1.0f/d : 0.0f; // an all-zero block quantizes to the zero point

        // copy the scale byte-wise: the block start is only known to be a
        // byte address, so a direct float store could be misaligned
        memcpy(out, &d, sizeof(d));

        uint8_t * qs = out + sizeof(float);

        for (int l = 0; l < QK; l += 2) {
            const float v0 = xb[l + 0]*id;
            const float v1 = xb[l + 1]*id;

            // shift the signed result by 8 so it is stored as an unsigned nibble
            const uint8_t vi0 = ((int8_t) (roundf(v0))) + 8;
            const uint8_t vi1 = ((int8_t) (roundf(v1))) + 8;

            assert(vi0 < 16);
            assert(vi1 < 16);

            qs[l/2] = vi0 | (vi1 << 4);
        }

        out += bs;
    }
}
450+
406451
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
407452
assert(k % QK == 0);
408453

454+
#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
409455
const int nb = k / QK;
410456
const size_t bs = sizeof(float) + QK/2;
411457

412458
uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
413459
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));
414460

415461
uint8_t pp[QK/2];
462+
#endif
416463

417464
#if __ARM_NEON
418465
#if QK == 32
@@ -569,36 +616,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
569616
#endif
570617
#else
571618
// scalar
572-
for (int i = 0; i < nb; i++) {
573-
float amax = 0.0f; // absolute max
574-
575-
for (int l = 0; l < QK; l++) {
576-
const float v = x[i*QK + l];
577-
amax = MAX(amax, fabsf(v));
578-
}
579-
580-
const float d = amax / ((1 << 3) - 1);
581-
const float id = d ? 1.0f/d : 0.0f;
582-
583-
*(float *)pd = d;
584-
pd += bs;
585-
586-
for (int l = 0; l < QK; l += 2) {
587-
const float v0 = x[i*QK + l + 0]*id;
588-
const float v1 = x[i*QK + l + 1]*id;
589-
590-
const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
591-
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
592-
593-
assert(vi0 >= 0 && vi0 < 16);
594-
assert(vi1 >= 0 && vi1 < 16);
595-
596-
pp[l/2] = vi0 | (vi1 << 4);
597-
}
598-
599-
memcpy(pb, pp, sizeof(pp));
600-
pb += bs;
601-
}
619+
quantize_row_q4_0_reference(x, y, k);
602620
#endif
603621
}
604622

@@ -10705,119 +10723,60 @@ enum ggml_opt_result ggml_opt(
1070510723

1070610724
////////////////////////////////////////////////////////////////////////////////
1070710725

10708-
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
10726+
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
1070910727
const int nb = k / qk;
1071010728
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
1071110729
const size_t row_size = nb*bs;
1071210730

1071310731
assert(k % qk == 0);
1071410732

10715-
const size_t pp_size = qk / 2;
10716-
uint8_t * pp = (uint8_t *) alloca(pp_size);
10717-
1071810733
char * pdst = (char *) dst;
1071910734

1072010735
for (int j = 0; j < n; j += k) {
1072110736
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
1072210737
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
1072310738

10724-
for (int i = 0; i < nb; i++) {
10725-
float amax = 0.0f; // absolute max
10726-
10727-
{
10728-
for (int l = 0; l < qk; l++) {
10729-
const float v = src[j + i*qk + l];
10730-
amax = MAX(amax, fabsf(v));
10731-
}
10732-
10733-
const float d = amax / ((1 << 3) - 1);
10734-
const float id = d ? 1.0f/d : 0.0f;
10735-
10736-
*(float *) pd = d;
10737-
pd += bs;
10739+
quantize_row_q4_0_reference(src + j, pd, k);
1073810740

10739-
for (int l = 0; l < qk; l += 2) {
10740-
const float v0 = (src[j + i*qk + l + 0])*id;
10741-
const float v1 = (src[j + i*qk + l + 1])*id;
10742-
10743-
const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
10744-
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
10745-
10746-
assert(vi0 >= 0 && vi0 < 16);
10747-
assert(vi1 >= 0 && vi1 < 16);
10748-
10749-
hist[vi0]++;
10750-
hist[vi1]++;
10751-
10752-
pp[l/2] = vi0 | (vi1 << 4);
10753-
}
10741+
for (int i = 0; i < nb; i++) {
10742+
for (int l = 0; l < qk; l += 2) {
10743+
const uint8_t vi0 = pb[l/2] & 0xF;
10744+
const uint8_t vi1 = pb[l/2] >> 4;
1075410745

10755-
memcpy(pb, pp, pp_size);
10756-
pb += bs;
10746+
hist[vi0]++;
10747+
hist[vi1]++;
1075710748
}
10749+
pb += bs;
1075810750
}
1075910751
}
1076010752

1076110753
return (n/k)*row_size;
1076210754
}
1076310755

10764-
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
10756+
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
1076510757
const int nb = k / qk;
1076610758
const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
1076710759
const size_t row_size = nb*bs;
1076810760

1076910761
assert(k % qk == 0);
1077010762

10771-
const size_t pp_size = qk / 2;
10772-
uint8_t * pp = (uint8_t *) alloca(pp_size);
10773-
1077410763
char * pdst = (char *) dst;
1077510764

1077610765
for (int j = 0; j < n; j += k) {
1077710766
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
10778-
uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
1077910767
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
1078010768

10781-
//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
10769+
quantize_row_q4_1(src + j, pd, k);
1078210770

1078310771
for (int i = 0; i < nb; i++) {
10784-
float min = FLT_MAX;
10785-
float max = -FLT_MAX;
10786-
10787-
{
10788-
for (int l = 0; l < qk; l++) {
10789-
const float v = src[j + i*qk + l];
10790-
if (v < min) min = v;
10791-
if (v > max) max = v;
10792-
}
10793-
10794-
const float d = (max - min) / ((1 << 4) - 1);
10795-
const float id = d ? 1.0f/d : 0.0f;
10796-
10797-
*(float *) pd = d;
10798-
*(float *) pm = min;
10799-
pd += bs;
10800-
pm += bs;
10801-
10802-
for (int l = 0; l < qk; l += 2) {
10803-
const float v0 = (src[j + i*qk + l + 0] - min)*id;
10804-
const float v1 = (src[j + i*qk + l + 1] - min)*id;
10805-
10806-
const uint8_t vi0 = round(v0);
10807-
const uint8_t vi1 = round(v1);
10808-
10809-
assert(vi0 >= 0 && vi0 < 16);
10810-
assert(vi1 >= 0 && vi1 < 16);
10811-
10812-
hist[vi0]++;
10813-
hist[vi1]++;
10814-
10815-
pp[l/2] = vi0 | (vi1 << 4);
10816-
}
10772+
for (int l = 0; l < qk; l += 2) {
10773+
const uint8_t vi0 = pb[l/2] & 0xF;
10774+
const uint8_t vi1 = pb[l/2] >> 4;
1081710775

10818-
memcpy(pb, pp, pp_size);
10819-
pb += bs;
10776+
hist[vi0]++;
10777+
hist[vi1]++;
1082010778
}
10779+
pb += bs;
1082110780
}
1082210781
}
1082310782

ggml.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -745,8 +745,8 @@ enum ggml_opt_result ggml_opt(
745745
// quantization
746746
//
747747

748-
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
749-
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
748+
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
749+
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
750750

751751
//
752752
// system info

tests/CMakeLists.txt

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
set(TEST_TARGET test-tokenizer-0)
2-
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
3-
target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
4-
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
1+
# Builds one test executable from a single source file and registers it with CTest.
#   source - test source file; its name without extension is used as both
#            the executable target name and the test name
#   ARGN   - any extra arguments are passed to the test executable
function(llama_add_test source)
    get_filename_component(test_name ${source} NAME_WE)
    add_executable(${test_name} ${source})
    target_link_libraries(${test_name} PRIVATE llama ggml utils)
    add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}> ${ARGN})
endfunction()
7+
8+
llama_add_test(test-quantize.c)
9+
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)

tests/test-quantize.c

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#include "ggml.h"
2+
#undef NDEBUG
3+
#include <assert.h>
4+
#include <math.h>
5+
6+
// Quantizes a single block holding the values 1..32 with q4_0 and q4_1 and
// checks the returned size, the float header and every packed 4-bit value.
int main(void) {
#define QK 32
    float src[QK];

    // The output starts with one (q4_0) or two (q4_1) floats followed by packed
    // nibbles. The union keeps the buffer float-aligned and lets the header be
    // read as floats without casting a byte pointer.
    union {
        float   f[6];
        uint8_t b[24];
    } dst;

    // the quantize functions only increment the counters, so start from zero
    int64_t hist[16] = {0};

    for (int i = 0; i < QK; i++) {
        src[i] = (float)(i + 1);
    }

    // q4_0: one float scale + QK/2 bytes of nibbles
    size_t size = ggml_quantize_q4_0(src, dst.b, QK, QK, QK, hist);
    assert(size == 20);
    const float scale_result   = dst.f[0];
    const float scale_expected = src[31] / ((1 << 3) - 1);
    assert(scale_result == scale_expected);
    for (int i = 0; i < QK; i++) {
        // odd elements live in the high nibble, even elements in the low nibble
        const uint8_t packed      = dst.b[sizeof(float) + i/2];
        const uint8_t q4_result   = (i % 2) ? (packed >> 4) : (packed & 0xF);
        const uint8_t q4_expected = roundf(src[i] / scale_expected) + 8;
        assert(q4_result == q4_expected);
    }

    // every quantized element must have been counted exactly once
    int64_t total = 0;
    for (int i = 0; i < 16; i++) {
        total += hist[i];
    }
    assert(total == QK);

    // q4_1: float delta + float min + QK/2 bytes of nibbles
    size = ggml_quantize_q4_1(src, dst.b, QK, QK, QK, hist);
    assert(size == 24);
    const float delta_result   = dst.f[0];
    const float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
    assert(delta_result == delta_expected);
    const float min_result   = dst.f[1];
    const float min_expected = src[0];
    assert(min_result == min_expected);
    for (int i = 0; i < QK; i++) {
        const uint8_t packed      = dst.b[sizeof(float)*2 + i/2];
        const uint8_t q4_result   = (i % 2) ? (packed >> 4) : (packed & 0xF);
        const uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
        assert(q4_result == q4_expected);
    }

    // the histogram accumulates across both calls
    total = 0;
    for (int i = 0; i < 16; i++) {
        total += hist[i];
    }
    assert(total == 2*QK);

    return 0;
}

0 commit comments

Comments
 (0)