8
8
#include <unistd.h>
9
9
#include <string.h>
10
10
#include <assert.h>
11
+ #include <inttypes.h>
11
12
12
13
#include "gguflib.h"
13
14
#include "fp16.h"
15
+ #include "bf16.h"
14
16
15
17
/* ============================ Low level functions ========================= */
16
18
@@ -43,9 +45,21 @@ struct gguf_tensor_type_features {
43
45
{"q5_k" , 256 , 176 },
44
46
{"q6_k" , 256 , 210 },
45
47
{"q8_k" , 256 , 292 },
48
+ {"iq2_xxs" , 256 , 66 },
49
+ {"iq2_xs" , 256 , 74 },
50
+ {"iq3_xxs" , 256 , 98 },
51
+ {"iq1_s" , 256 , 110 },
52
+ {"iq4_nl" , 256 , 50 },
53
+ {"iq3_s" , 256 , 110 },
54
+ {"iq2_s" , 256 , 82 },
55
+ {"iq4_xs" , 256 , 136 },
46
56
{"i8" , 1 , 1 },
47
57
{"i16" , 1 , 2 },
48
58
{"i32" , 1 , 4 },
59
+ {"i64" , 1 , 8 },
60
+ {"f64" , 1 , 8 },
61
+ {"iq1_m" , 256 , 56 },
62
+ {"bf16" , 1 , 2 },
49
63
};
50
64
51
65
/* Return the value type name given the type ID. */
@@ -101,8 +115,8 @@ gguf_ctx *gguf_open(const char *filename) {
101
115
if (fd == -1 ) return NULL ;
102
116
103
117
/* Mapping successful. We can create our context object. */
104
- gguf_ctx * ctx = malloc ( sizeof (* ctx ));
105
- memset ( ctx , 0 , sizeof ( * ctx )) ;
118
+ gguf_ctx * ctx = calloc ( 1 , sizeof (* ctx ));
119
+ if (! ctx ) return NULL ;
106
120
ctx -> fd = fd ;
107
121
ctx -> alignment = 32 ; // Default alignment of GGUF files.
108
122
ctx -> data_off = 0 ; // Set later.
@@ -363,8 +377,8 @@ void gguf_print_value_callback(void *privdata, uint32_t type, union gguf_value *
363
377
struct gguf_print_options * po = privdata ;
364
378
if (po && po -> max_array_items && in_array > po -> max_array_items ) {
365
379
if (in_array - 1 == po -> max_array_items )
366
- printf ("... %llu more items of %llu" , array_len - in_array + 1 ,
367
- array_len );
380
+ printf ("... %" PRIu64 " more items of %" PRIu64 "" ,
381
+ array_len - in_array + 1 , array_len );
368
382
return ;
369
383
}
370
384
@@ -396,9 +410,9 @@ void gguf_print_value_callback(void *privdata, uint32_t type, union gguf_value *
396
410
case GGUF_VALUE_TYPE_STRING :
397
411
printf ("%.*s" , (int )val -> string .len , val -> string .string ); break ;
398
412
case GGUF_VALUE_TYPE_UINT64 :
399
- printf ("%llu " , val -> uint64 ); break ;
413
+ printf ("%" PRIu64 " " , val -> uint64 ); break ;
400
414
case GGUF_VALUE_TYPE_INT64 :
401
- printf ("%lld " , val -> int64 ); break ;
415
+ printf ("%" PRId64 " " , val -> int64 ); break ;
402
416
case GGUF_VALUE_TYPE_FLOAT64 :
403
417
printf ("%lf" , val -> float64 ); break ;
404
418
default :
@@ -516,6 +530,12 @@ void gguf_store_f16_callback(void *dst, uint64_t idx, float f) {
516
530
f16 [idx ] = to_half (f );
517
531
}
518
532
533
/* Store-callback used during dequantization when the destination
 * buffer holds BF16 (bfloat16) values: convert the float weight and
 * write it at position 'idx'. */
void gguf_store_bf16_callback(void *dst, uint64_t idx, float f) {
    uint16_t *bf16 = dst;
    bf16[idx] = to_brain(f);
}
538
+
519
539
/* Q8_0 blocks dequantization to floats.
520
540
* 'dst' is supposed to have enough space for 'count' weights. */
521
541
void gguf_q8_0_to_float (void * weights_data , void * dst , uint64_t count , store_float_callback store_callback ) {
@@ -755,7 +775,7 @@ void gguf_q2_k_to_float(void *weights_data, void *dst, uint64_t count, store_flo
755
775
float scale_of_scales = from_half (* ((uint16_t * )(block + 16 + 64 )));
756
776
float scale_of_mins = from_half (* ((uint16_t * )(block + 16 + 64 + 2 )));
757
777
758
- float scale , min ;
778
+ float scale = 0 , min = 0 ;
759
779
int bn = 0 ; // Block number
760
780
for (uint64_t cluster = 0 ; cluster < 2 ; cluster ++ ) {
761
781
for (uint64_t j = 0 ; j < 128 ; j ++ ) {
@@ -863,7 +883,8 @@ void gguf_q4_1_to_float(void *weights_data, void *dst, uint64_t count, store_flo
863
883
864
884
/* FP16 blocks dequantization to floats.
865
885
* 'y' is supposed to have enough space for 'count' weights. */
866
- void gguf_f16_to_float (void * weights_data , float * dst , uint64_t count , store_float_callback store_callback ) {
886
+ static void gguf_f16_to_float (void * weights_data , void * dst , uint64_t count ,
887
+ store_float_callback store_callback ) {
867
888
float * f = dst ;
868
889
uint64_t i = 0 ; // i-th weight to dequantize.
869
890
uint16_t * w16 = weights_data ;
@@ -877,6 +898,23 @@ void gguf_f16_to_float(void *weights_data, float *dst, uint64_t count, store_flo
877
898
}
878
899
}
879
900
901
+ /* BF16 blocks dequantization to floats.
902
+ * 'y' is supposed to have enough space for 'count' weights. */
903
+ static void gguf_bf16_to_float (void * weights_data , void * dst , uint64_t count ,
904
+ store_float_callback store_callback ) {
905
+ float * f = dst ;
906
+ uint64_t i = 0 ; // i-th weight to dequantize.
907
+ uint16_t * w16 = weights_data ;
908
+ while (i < count ) {
909
+ float weight = from_brain (w16 [i ]);
910
+ if (store_callback )
911
+ store_callback (dst ,i ,weight );
912
+ else
913
+ f [i ] = weight ;
914
+ i ++ ;
915
+ }
916
+ }
917
+
880
918
/* Convert the specified tensor (quantized or not) into an array of
881
919
* floats. The array is allocated with malloc(). If the tensor is already
882
920
* in FP32 floats format, it is just memcpy()-ed to the destination array.
@@ -885,10 +923,13 @@ void gguf_f16_to_float(void *weights_data, float *dst, uint64_t count, store_flo
885
923
* NULL is returned as well, but errno is set to EINVAL. */
886
924
float * gguf_tensor_to_float (gguf_tensor * tensor ) {
887
925
float * f = malloc (tensor -> num_weights * sizeof (float ));
926
+ if (!f ) return NULL ;
888
927
if (tensor -> type == GGUF_TYPE_F32 ) {
889
928
memcpy (f , tensor -> weights_data , tensor -> num_weights * sizeof (float ));
890
929
} else if (tensor -> type == GGUF_TYPE_F16 ) {
891
930
gguf_f16_to_float (tensor -> weights_data , f , tensor -> num_weights , NULL );
931
+ } else if (tensor -> type == GGUF_TYPE_BF16 ) {
932
+ gguf_bf16_to_float (tensor -> weights_data , f , tensor -> num_weights , NULL );
892
933
} else if (tensor -> type == GGUF_TYPE_Q8_0 ) {
893
934
gguf_q8_0_to_float (tensor -> weights_data , f , tensor -> num_weights , NULL );
894
935
} else if (tensor -> type == GGUF_TYPE_Q4_K ) {
@@ -913,12 +954,15 @@ float *gguf_tensor_to_float(gguf_tensor *tensor) {
913
954
* an array of int16_t values. */
914
955
int16_t * gguf_tensor_to_f16 (gguf_tensor * tensor ) {
915
956
int16_t * f16 = malloc (tensor -> num_weights * sizeof (int16_t ));
957
+ if (!f16 ) return NULL ;
916
958
if (tensor -> type == GGUF_TYPE_F32 ) {
917
959
float * f = (float * )tensor -> weights_data ;
918
960
for (uint64_t j = 0 ; j < tensor -> num_weights ; j ++ )
919
961
f16 [j ] = to_half (f [j ]);
920
962
} else if (tensor -> type == GGUF_TYPE_F16 ) {
921
963
memcpy (f16 , tensor -> weights_data , tensor -> num_weights * sizeof (int16_t ));
964
+ } else if (tensor -> type == GGUF_TYPE_BF16 ) {
965
+ gguf_bf16_to_float (tensor -> weights_data , f16 , tensor -> num_weights , gguf_store_f16_callback );
922
966
} else if (tensor -> type == GGUF_TYPE_Q8_0 ) {
923
967
gguf_q8_0_to_float (tensor -> weights_data , f16 , tensor -> num_weights , gguf_store_f16_callback );
924
968
} else if (tensor -> type == GGUF_TYPE_Q4_K ) {
@@ -938,3 +982,36 @@ int16_t *gguf_tensor_to_f16(gguf_tensor *tensor) {
938
982
}
939
983
return f16 ;
940
984
}
985
+
986
+ /* Same as gguf_tensor_to_float() but the result will be an bf16 tensor, that is
987
+ * an array of int16_t values. */
988
+ int16_t * gguf_tensor_to_bf16 (gguf_tensor * tensor ) {
989
+ int16_t * f16 = malloc (tensor -> num_weights * sizeof (int16_t ));
990
+ if (!f16 ) return NULL ;
991
+ if (tensor -> type == GGUF_TYPE_F32 ) {
992
+ float * f = (float * )tensor -> weights_data ;
993
+ for (uint64_t j = 0 ; j < tensor -> num_weights ; j ++ )
994
+ f16 [j ] = to_half (f [j ]);
995
+ } else if (tensor -> type == GGUF_TYPE_F16 ) {
996
+ gguf_f16_to_float (tensor -> weights_data , f16 , tensor -> num_weights , gguf_store_bf16_callback );
997
+ } else if (tensor -> type == GGUF_TYPE_BF16 ) {
998
+ memcpy (f16 , tensor -> weights_data , tensor -> num_weights * sizeof (int16_t ));
999
+ } else if (tensor -> type == GGUF_TYPE_Q8_0 ) {
1000
+ gguf_q8_0_to_float (tensor -> weights_data , f16 , tensor -> num_weights , gguf_store_bf16_callback );
1001
+ } else if (tensor -> type == GGUF_TYPE_Q4_K ) {
1002
+ gguf_q4_k_to_float (tensor -> weights_data , f16 , tensor -> num_weights , gguf_store_bf16_callback );
1003
+ } else if (tensor -> type == GGUF_TYPE_Q6_K ) {
1004
+ gguf_q6_k_to_float (tensor -> weights_data , f16 , tensor -> num_weights , gguf_store_bf16_callback );
1005
+ } else if (tensor -> type == GGUF_TYPE_Q2_K ) {
1006
+ gguf_q2_k_to_float (tensor -> weights_data , f16 , tensor -> num_weights , gguf_store_bf16_callback );
1007
+ } else if (tensor -> type == GGUF_TYPE_Q4_0 ) {
1008
+ gguf_q4_0_to_float (tensor -> weights_data , f16 , tensor -> num_weights , gguf_store_bf16_callback );
1009
+ } else if (tensor -> type == GGUF_TYPE_Q4_1 ) {
1010
+ gguf_q4_1_to_float (tensor -> weights_data , f16 , tensor -> num_weights , gguf_store_bf16_callback );
1011
+ } else {
1012
+ free (f16 );
1013
+ errno = EINVAL ;
1014
+ return NULL ;
1015
+ }
1016
+ return f16 ;
1017
+ }
0 commit comments