@@ -1327,6 +1327,7 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
1327
1327
1328
1328
y [i ].qs [l /2 ] = (vi0 & 0x0F ) | ((vi1 & 0x0F ) << 4 );
1329
1329
1330
+ // get the 5-th bit and store it in qh at the right position
1330
1331
y [i ].qh |= ((vi0 & 0x10 ) >> 4 ) << (l + 0 );
1331
1332
y [i ].qh |= ((vi1 & 0x10 ) >> 4 ) << (l + 1 );
1332
1333
}
@@ -1624,7 +1625,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
1624
1625
const uint8x8_t v8 = vld1_u8 (pp + l /2 );
1625
1626
1626
1627
// Expand 4-bit qs to 8-bit bytes
1627
- const uint8x8_t v0 = vand_u8 (v8 , vdup_n_u8 (0x0f ));
1628
+ const uint8x8_t v0 = vand_u8 (v8 , vdup_n_u8 (0x0F ));
1628
1629
const uint8x8_t v1 = vshr_n_u8 (v8 , 4 );
1629
1630
1630
1631
// Convert to signed 8-bit integers
@@ -1674,7 +1675,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
1674
1675
for (int l = 0 ; l < QK4_0 ; l += 2 ) {
1675
1676
const uint8_t vi = pp [l /2 ];
1676
1677
1677
- const int8_t vi0 = vi & 0xf ;
1678
+ const int8_t vi0 = vi & 0x0F ;
1678
1679
const int8_t vi1 = vi >> 4 ;
1679
1680
1680
1681
const float v0 = (vi0 - 8 )* d ;
@@ -1740,7 +1741,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
1740
1741
const uint8x8_t v8 = vld1_u8 (pp + l /2 );
1741
1742
1742
1743
// Expand 4-bit qs to 8-bit bytes
1743
- const uint8x8_t v0 = vand_u8 (v8 , vdup_n_u8 (0x0f ));
1744
+ const uint8x8_t v0 = vand_u8 (v8 , vdup_n_u8 (0x0F ));
1744
1745
const uint8x8_t v1 = vshr_n_u8 (v8 , 4 );
1745
1746
1746
1747
// Interleave and combine
@@ -1782,7 +1783,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
1782
1783
for (int l = 0 ; l < QK4_1 ; l += 2 ) {
1783
1784
const uint8_t vi = pp [l /2 ];
1784
1785
1785
- const int8_t vi0 = vi & 0xf ;
1786
+ const int8_t vi0 = vi & 0x0F ;
1786
1787
const int8_t vi1 = vi >> 4 ;
1787
1788
1788
1789
const float v0 = vi0 * d + m ;
@@ -1812,7 +1813,7 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in
1812
1813
for (int l = 0 ; l < QK4_2 ; l += 2 ) {
1813
1814
const uint8_t vi = pp [l /2 ];
1814
1815
1815
- const int8_t vi0 = vi & 0xf ;
1816
+ const int8_t vi0 = vi & 0x0F ;
1816
1817
const int8_t vi1 = vi >> 4 ;
1817
1818
1818
1819
const float v0 = (vi0 - 8 )* d ;
@@ -1842,7 +1843,7 @@ static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, in
1842
1843
for (int l = 0 ; l < QK4_3 ; l += 2 ) {
1843
1844
const uint8_t vi = pp [l /2 ];
1844
1845
1845
- const int8_t vi0 = vi & 0xf ;
1846
+ const int8_t vi0 = vi & 0x0F ;
1846
1847
const int8_t vi1 = vi >> 4 ;
1847
1848
1848
1849
const float v0 = vi0 * d + m ;
@@ -1874,11 +1875,12 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in
1874
1875
for (int l = 0 ; l < QK5_0 ; l += 2 ) {
1875
1876
const uint8_t vi = pp [l /2 ];
1876
1877
1877
- const int8_t vh0 = ((qh & (1 << (l + 0 ))) >> (l + 0 )) << 4 ;
1878
- const int8_t vh1 = ((qh & (1 << (l + 1 ))) >> (l + 1 )) << 4 ;
1878
+ // extract the 5-th bit from qh
1879
+ const uint8_t vh0 = ((qh & (1 << (l + 0 ))) >> (l + 0 )) << 4 ;
1880
+ const uint8_t vh1 = ((qh & (1 << (l + 1 ))) >> (l + 1 )) << 4 ;
1879
1881
1880
- const int8_t vi0 = (vi & 0xf ) | vh0 ;
1881
- const int8_t vi1 = (vi >> 4 ) | vh1 ;
1882
+ const uint8_t vi0 = (vi & 0x0F ) | vh0 ;
1883
+ const uint8_t vi1 = (vi >> 4 ) | vh1 ;
1882
1884
1883
1885
const float v0 = vi0 * d + m ;
1884
1886
const float v1 = vi1 * d + m ;
@@ -2593,7 +2595,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2593
2595
const block_q8_0 * restrict y0 = & y [i + 0 ];
2594
2596
const block_q8_0 * restrict y1 = & y [i + 1 ];
2595
2597
2596
- const uint8x16_t m4b = vdupq_n_u8 (0xf );
2598
+ const uint8x16_t m4b = vdupq_n_u8 (0x0F );
2597
2599
const int8x16_t s8b = vdupq_n_s8 (0x8 );
2598
2600
2599
2601
const uint8x16_t v0_0 = vld1q_u8 (x0 -> qs );
@@ -2729,8 +2731,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2729
2731
for (int j = 0 ; j < QK8_0 /2 ; j ++ ) {
2730
2732
const uint8_t v0 = p0 [j ];
2731
2733
2732
- const int i0 = (int8_t ) (v0 & 0xf ) - 8 ;
2733
- const int i1 = (int8_t ) (v0 >> 4 ) - 8 ;
2734
+ const int i0 = (int8_t ) (v0 & 0x0F ) - 8 ;
2735
+ const int i1 = (int8_t ) (v0 >> 4 ) - 8 ;
2734
2736
2735
2737
const int i2 = p1 [2 * j + 0 ];
2736
2738
const int i3 = p1 [2 * j + 1 ];
@@ -2767,7 +2769,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2767
2769
2768
2770
summs += x0 -> m * (y0 -> s0 + y0 -> s1 ) + x1 -> m * (y1 -> s0 + y1 -> s1 );
2769
2771
2770
- const uint8x16_t m4b = vdupq_n_u8 (0xf );
2772
+ const uint8x16_t m4b = vdupq_n_u8 (0x0F );
2771
2773
2772
2774
const uint8x16_t v0_0 = vld1q_u8 (x0 -> qs );
2773
2775
const uint8x16_t v0_1 = vld1q_u8 (x1 -> qs );
@@ -2864,8 +2866,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2864
2866
for (int j = 0 ; j < QK8_1 /2 ; j ++ ) {
2865
2867
const uint8_t v0 = p0 [j ];
2866
2868
2867
- const float f0 = d0 * (v0 & 0xf ) + m0 ;
2868
- const float f1 = d0 * (v0 >> 4 ) + m0 ;
2869
+ const float f0 = d0 * (v0 & 0x0F ) + m0 ;
2870
+ const float f1 = d0 * (v0 >> 4 ) + m0 ;
2869
2871
2870
2872
const float f2 = d1 * p1 [2 * j + 0 ];
2871
2873
const float f3 = d1 * p1 [2 * j + 1 ];
@@ -2900,7 +2902,7 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
2900
2902
const block_q8_0 * restrict y0 = & y [i + 0 ];
2901
2903
const block_q8_0 * restrict y1 = & y [i + 1 ];
2902
2904
2903
- const uint8x16_t m4b = vdupq_n_u8 (0xf );
2905
+ const uint8x16_t m4b = vdupq_n_u8 (0x0F );
2904
2906
const int8x16_t s8b = vdupq_n_s8 (0x8 );
2905
2907
2906
2908
const uint8x16_t v0_0 = vcombine_u8 (vld1_u8 (x0_0 -> qs ), vld1_u8 (x0_1 -> qs ));
@@ -3011,11 +3013,11 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
3011
3013
const uint8_t v0 = x0 [j ];
3012
3014
const uint8_t v1 = x1 [j ];
3013
3015
3014
- const int i0_0 = (int8_t ) (v0 & 0xf ) - 8 ;
3015
- const int i1_0 = (int8_t ) (v0 >> 4 ) - 8 ;
3016
+ const int i0_0 = (int8_t ) (v0 & 0x0F ) - 8 ;
3017
+ const int i1_0 = (int8_t ) (v0 >> 4 ) - 8 ;
3016
3018
3017
- const int i0_1 = (int8_t ) (v1 & 0xf ) - 8 ;
3018
- const int i1_1 = (int8_t ) (v1 >> 4 ) - 8 ;
3019
+ const int i0_1 = (int8_t ) (v1 & 0x0F ) - 8 ;
3020
+ const int i1_1 = (int8_t ) (v1 >> 4 ) - 8 ;
3019
3021
3020
3022
const int i2_0 = y0 [2 * j + 0 ];
3021
3023
const int i3_0 = y0 [2 * j + 1 ];
@@ -3063,7 +3065,7 @@ static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void *
3063
3065
const uint8x16_t v0_0 = vcombine_u8 (vld1_u8 (x0_0 -> qs ), vld1_u8 (x0_1 -> qs ));
3064
3066
3065
3067
// 4-bit -> 8-bit
3066
- const int8x16_t v0_0l = vreinterpretq_s8_u8 (vandq_u8 (v0_0 , vdupq_n_u8 (0xf )));
3068
+ const int8x16_t v0_0l = vreinterpretq_s8_u8 (vandq_u8 (v0_0 , vdupq_n_u8 (0x0F )));
3067
3069
const int8x16_t v0_0h = vreinterpretq_s8_u8 (vshrq_n_u8 (v0_0 , 4 ));
3068
3070
3069
3071
// interleave
@@ -3142,10 +3144,10 @@ static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void *
3142
3144
const uint8_t v0 = x0 [j ];
3143
3145
const uint8_t v1 = x1 [j ];
3144
3146
3145
- const int x0_0 = v0 & 0xf ;
3147
+ const int x0_0 = v0 & 0x0F ;
3146
3148
const int x1_0 = v0 >> 4 ;
3147
3149
3148
- const int x0_1 = v1 & 0xf ;
3150
+ const int x0_1 = v1 & 0x0F ;
3149
3151
const int x1_1 = v1 >> 4 ;
3150
3152
3151
3153
const int y0_0 = y0 [2 * j + 0 ];
@@ -3195,7 +3197,7 @@ static void ggml_vec_dot_q5_0_q8_1(const int n, float * restrict s, const void *
3195
3197
const uint8x16_t v0_0 = vcombine_u8 (vld1_u8 (x0_0 -> qs ), vld1_u8 (x0_1 -> qs ));
3196
3198
3197
3199
// 4-bit -> 8-bit
3198
- const int8x16_t v0_0l = vreinterpretq_s8_u8 (vandq_u8 (v0_0 , vdupq_n_u8 (0xf )));
3200
+ const int8x16_t v0_0l = vreinterpretq_s8_u8 (vandq_u8 (v0_0 , vdupq_n_u8 (0x0F )));
3199
3201
const int8x16_t v0_0h = vreinterpretq_s8_u8 (vshrq_n_u8 (v0_0 , 4 ));
3200
3202
3201
3203
// interleave
@@ -3274,10 +3276,10 @@ static void ggml_vec_dot_q5_0_q8_1(const int n, float * restrict s, const void *
3274
3276
const uint8_t v0 = x0 [j ];
3275
3277
const uint8_t v1 = x1 [j ];
3276
3278
3277
- const int x0_0 = v0 & 0xf ;
3279
+ const int x0_0 = v0 & 0x0F ;
3278
3280
const int x1_0 = v0 >> 4 ;
3279
3281
3280
- const int x0_1 = v1 & 0xf ;
3282
+ const int x0_1 = v1 & 0x0F ;
3281
3283
const int x1_1 = v1 >> 4 ;
3282
3284
3283
3285
const int y0_0 = y0 [2 * j + 0 ];
@@ -12500,7 +12502,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
12500
12502
12501
12503
for (int i = 0 ; i < nb ; i ++ ) {
12502
12504
for (int l = 0 ; l < QK4_0 ; l += 2 ) {
12503
- const uint8_t vi0 = y [i ].qs [l /2 ] & 0xF ;
12505
+ const uint8_t vi0 = y [i ].qs [l /2 ] & 0x0F ;
12504
12506
const uint8_t vi1 = y [i ].qs [l /2 ] >> 4 ;
12505
12507
12506
12508
hist [vi0 ]++ ;
@@ -12523,7 +12525,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
12523
12525
12524
12526
for (int i = 0 ; i < nb ; i ++ ) {
12525
12527
for (int l = 0 ; l < QK4_1 ; l += 2 ) {
12526
- const uint8_t vi0 = y [i ].qs [l /2 ] & 0xF ;
12528
+ const uint8_t vi0 = y [i ].qs [l /2 ] & 0x0F ;
12527
12529
const uint8_t vi1 = y [i ].qs [l /2 ] >> 4 ;
12528
12530
12529
12531
hist [vi0 ]++ ;
@@ -12546,7 +12548,7 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
12546
12548
12547
12549
for (int i = 0 ; i < nb ; i ++ ) {
12548
12550
for (int l = 0 ; l < QK4_2 ; l += 2 ) {
12549
- const uint8_t vi0 = y [i ].qs [l /2 ] & 0xF ;
12551
+ const uint8_t vi0 = y [i ].qs [l /2 ] & 0x0F ;
12550
12552
const uint8_t vi1 = y [i ].qs [l /2 ] >> 4 ;
12551
12553
12552
12554
hist [vi0 ]++ ;
@@ -12569,7 +12571,7 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t *
12569
12571
12570
12572
for (int i = 0 ; i < nb ; i ++ ) {
12571
12573
for (int l = 0 ; l < QK4_3 ; l += 2 ) {
12572
- const uint8_t vi0 = y [i ].qs [l /2 ] & 0xF ;
12574
+ const uint8_t vi0 = y [i ].qs [l /2 ] & 0x0F ;
12573
12575
const uint8_t vi1 = y [i ].qs [l /2 ] >> 4 ;
12574
12576
12575
12577
hist [vi0 ]++ ;
@@ -12590,11 +12592,14 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
12590
12592
12591
12593
quantize_row_q5_0_reference (src + j , y , k );
12592
12594
12593
- // TODO: this is wrong
12594
12595
for (int i = 0 ; i < nb ; i ++ ) {
12595
12596
for (int l = 0 ; l < QK5_0 ; l += 2 ) {
12596
- const uint8_t vi0 = y [i ].qs [l /2 ] & 0xF ;
12597
- const uint8_t vi1 = y [i ].qs [l /2 ] >> 4 ;
12597
+ const uint8_t vh0 = ((y [i ].qh & (1 << (l + 0 ))) >> (l + 0 )) << 4 ;
12598
+ const uint8_t vh1 = ((y [i ].qh & (1 << (l + 1 ))) >> (l + 1 )) << 4 ;
12599
+
12600
+ // cast to 16 bins
12601
+ const uint8_t vi0 = ((y [i ].qs [l /2 ] & 0x0F ) | vh0 ) / 2 ;
12602
+ const uint8_t vi1 = ((y [i ].qs [l /2 ] >> 4 ) | vh1 ) / 2 ;
12598
12603
12599
12604
hist [vi0 ]++ ;
12600
12605
hist [vi1 ]++ ;
0 commit comments