@@ -403,16 +403,63 @@ static inline __m128i packNibbles( __m256i bytes )
403
403
// method 5
404
404
// blocks of QK elements
405
405
// represented with a single float (delta) and QK/2 8-bit ints (i.e. QK 4-bit signed integer factors)
406
+
407
+ // reference implementation for deterministic creation of model files
408
+ static void quantize_row_q4_0_reference (const float * restrict x , void * restrict y , int k ) {
409
+ assert (k % QK == 0 );
410
+ const int nb = k / QK ;
411
+
412
+ const size_t bs = sizeof (float ) + QK /2 ;
413
+
414
+ uint8_t * restrict pd = ((uint8_t * )y + 0 * bs );
415
+ uint8_t * restrict pb = ((uint8_t * )y + 0 * bs + sizeof (float ));
416
+
417
+ uint8_t pp [QK /2 ];
418
+
419
+ for (int i = 0 ; i < nb ; i ++ ) {
420
+ float amax = 0.0f ; // absolute max
421
+
422
+ for (int l = 0 ; l < QK ; l ++ ) {
423
+ const float v = x [i * QK + l ];
424
+ amax = MAX (amax , fabsf (v ));
425
+ }
426
+
427
+ const float d = amax / ((1 << 3 ) - 1 );
428
+ const float id = d ? 1.0f /d : 0.0f ;
429
+
430
+ * (float * )pd = d ;
431
+ pd += bs ;
432
+
433
+ for (int l = 0 ; l < QK ; l += 2 ) {
434
+ const float v0 = x [i * QK + l + 0 ]* id ;
435
+ const float v1 = x [i * QK + l + 1 ]* id ;
436
+
437
+ const uint8_t vi0 = ((int8_t ) (round (v0 ))) + 8 ;
438
+ const uint8_t vi1 = ((int8_t ) (round (v1 ))) + 8 ;
439
+
440
+ assert (vi0 >= 0 && vi0 < 16 );
441
+ assert (vi1 >= 0 && vi1 < 16 );
442
+
443
+ pp [l /2 ] = vi0 | (vi1 << 4 );
444
+ }
445
+
446
+ memcpy (pb , pp , sizeof (pp ));
447
+ pb += bs ;
448
+ }
449
+ }
450
+
406
451
void quantize_row_q4_0 (const float * restrict x , void * restrict y , int k ) {
407
452
assert (k % QK == 0 );
408
453
454
+ #if __ARM_NEON || defined(__AVX2__ ) || defined(__wasm_simd128__ )
409
455
const int nb = k / QK ;
410
456
const size_t bs = sizeof (float ) + QK /2 ;
411
457
412
458
uint8_t * restrict pd = ((uint8_t * )y + 0 * bs );
413
459
uint8_t * restrict pb = ((uint8_t * )y + 0 * bs + sizeof (float ));
414
460
415
461
uint8_t pp [QK /2 ];
462
+ #endif
416
463
417
464
#if __ARM_NEON
418
465
#if QK == 32
@@ -569,36 +616,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
569
616
#endif
570
617
#else
571
618
// scalar
572
- for (int i = 0 ; i < nb ; i ++ ) {
573
- float amax = 0.0f ; // absolute max
574
-
575
- for (int l = 0 ; l < QK ; l ++ ) {
576
- const float v = x [i * QK + l ];
577
- amax = MAX (amax , fabsf (v ));
578
- }
579
-
580
- const float d = amax / ((1 << 3 ) - 1 );
581
- const float id = d ? 1.0f /d : 0.0f ;
582
-
583
- * (float * )pd = d ;
584
- pd += bs ;
585
-
586
- for (int l = 0 ; l < QK ; l += 2 ) {
587
- const float v0 = x [i * QK + l + 0 ]* id ;
588
- const float v1 = x [i * QK + l + 1 ]* id ;
589
-
590
- const uint8_t vi0 = ((int8_t ) (round (v0 ))) + 8 ;
591
- const uint8_t vi1 = ((int8_t ) (round (v1 ))) + 8 ;
592
-
593
- assert (vi0 >= 0 && vi0 < 16 );
594
- assert (vi1 >= 0 && vi1 < 16 );
595
-
596
- pp [l /2 ] = vi0 | (vi1 << 4 );
597
- }
598
-
599
- memcpy (pb , pp , sizeof (pp ));
600
- pb += bs ;
601
- }
619
+ quantize_row_q4_0_reference (x , y , k );
602
620
#endif
603
621
}
604
622
@@ -10705,119 +10723,60 @@ enum ggml_opt_result ggml_opt(
10705
10723
10706
10724
////////////////////////////////////////////////////////////////////////////////
10707
10725
10708
// Quantizes src into Q4_0 blocks stored in dst, one row of k values at a time,
// and counts how often each 4-bit value occurs.
//
// src:  input values; rows start at offsets 0, k, 2k, ... below n
// dst:  output buffer; row r is written at byte offset r * (k / qk) * block size
// n:    total number of input values
// k:    number of values per row, must be a multiple of qk
// qk:   number of values per quantized block
// hist: counters indexed by 4-bit value (indices 0..15), incremented in place
//
// Returns (n / k) * row size in bytes.
//
// NOTE(review): the row quantizer works with the compile-time QK while the
// histogram walk below uses qk; presumably callers pass qk == QK - confirm.
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
    const int nb = k / qk;
    const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
    const size_t row_size = nb*bs;

    assert(k % qk == 0);

    char * out = (char *) dst;

    for (int j = 0; j < n; j += k) {
        uint8_t * row = (uint8_t *) (out + (j/k)*row_size + 0*bs);

        quantize_row_q4_0_reference(src + j, row, k);

        // packed nibbles of the first block start right after its float scale
        uint8_t * packed = (uint8_t *) (out + (j/k)*row_size + 0*bs + sizeof(float));

        for (int i = 0; i < nb; i++, packed += bs) {
            for (int l = 0; l < qk; l += 2) {
                const uint8_t byte = packed[l/2];

                hist[byte & 0xF]++; // low nibble
                hist[byte >> 4]++;  // high nibble
            }
        }
    }

    return (n/k)*row_size;
}
10763
10755
10764
- size_t ggml_quantize_q4_1 (float * src , void * dst , int n , int k , int qk , int64_t * hist ) {
10756
+ size_t ggml_quantize_q4_1 (const float * src , void * dst , int n , int k , int qk , int64_t * hist ) {
10765
10757
const int nb = k / qk ;
10766
10758
const size_t bs = (2 * sizeof (float ) + sizeof (uint8_t )* qk /2 );
10767
10759
const size_t row_size = nb * bs ;
10768
10760
10769
10761
assert (k % qk == 0 );
10770
10762
10771
- const size_t pp_size = qk / 2 ;
10772
- uint8_t * pp = (uint8_t * ) alloca (pp_size );
10773
-
10774
10763
char * pdst = (char * ) dst ;
10775
10764
10776
10765
for (int j = 0 ; j < n ; j += k ) {
10777
10766
uint8_t * pd = (uint8_t * ) (pdst + (j /k )* row_size + 0 * bs );
10778
- uint8_t * pm = (uint8_t * ) (pdst + (j /k )* row_size + 0 * bs + sizeof (float ));
10779
10767
uint8_t * pb = (uint8_t * ) (pdst + (j /k )* row_size + 0 * bs + 2 * sizeof (float ));
10780
10768
10781
- //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb );
10769
+ quantize_row_q4_1 ( src + j , pd , k );
10782
10770
10783
10771
for (int i = 0 ; i < nb ; i ++ ) {
10784
- float min = FLT_MAX ;
10785
- float max = - FLT_MAX ;
10786
-
10787
- {
10788
- for (int l = 0 ; l < qk ; l ++ ) {
10789
- const float v = src [j + i * qk + l ];
10790
- if (v < min ) min = v ;
10791
- if (v > max ) max = v ;
10792
- }
10793
-
10794
- const float d = (max - min ) / ((1 << 4 ) - 1 );
10795
- const float id = d ? 1.0f /d : 0.0f ;
10796
-
10797
- * (float * ) pd = d ;
10798
- * (float * ) pm = min ;
10799
- pd += bs ;
10800
- pm += bs ;
10801
-
10802
- for (int l = 0 ; l < qk ; l += 2 ) {
10803
- const float v0 = (src [j + i * qk + l + 0 ] - min )* id ;
10804
- const float v1 = (src [j + i * qk + l + 1 ] - min )* id ;
10805
-
10806
- const uint8_t vi0 = round (v0 );
10807
- const uint8_t vi1 = round (v1 );
10808
-
10809
- assert (vi0 >= 0 && vi0 < 16 );
10810
- assert (vi1 >= 0 && vi1 < 16 );
10811
-
10812
- hist [vi0 ]++ ;
10813
- hist [vi1 ]++ ;
10814
-
10815
- pp [l /2 ] = vi0 | (vi1 << 4 );
10816
- }
10772
+ for (int l = 0 ; l < qk ; l += 2 ) {
10773
+ const uint8_t vi0 = pb [l /2 ] & 0xF ;
10774
+ const uint8_t vi1 = pb [l /2 ] >> 4 ;
10817
10775
10818
- memcpy ( pb , pp , pp_size ) ;
10819
- pb += bs ;
10776
+ hist [ vi0 ] ++ ;
10777
+ hist [ vi1 ] ++ ;
10820
10778
}
10779
+ pb += bs ;
10821
10780
}
10822
10781
}
10823
10782
0 commit comments