@@ -1862,7 +1862,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
1862
1862
1863
1863
std::vector<uint8_t > work (512 );
1864
1864
std::vector<float > conv_buf (512 );
1865
- std::vector<int64_t > hist_all (1 << 4 , 0 );
1866
1865
size_t total_size_org = 0 ;
1867
1866
size_t total_size_new = 0 ;
1868
1867
@@ -1917,48 +1916,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
1917
1916
}
1918
1917
new_data = work.data ();
1919
1918
1920
- std::vector<int64_t > hist_cur (1 << 4 , 0 );
1921
-
1922
- switch (new_type) {
1923
- case GGML_TYPE_Q4_0: {
1924
- new_size = ggml_quantize_q4_0 (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1925
- } break ;
1926
- case GGML_TYPE_Q4_1: {
1927
- new_size = ggml_quantize_q4_1 (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1928
- } break ;
1929
- case GGML_TYPE_Q5_0: {
1930
- new_size = ggml_quantize_q5_0 (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1931
- } break ;
1932
- case GGML_TYPE_Q5_1: {
1933
- new_size = ggml_quantize_q5_1 (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1934
- } break ;
1935
- case GGML_TYPE_Q8_0: {
1936
- new_size = ggml_quantize_q8_0 (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1937
- } break ;
1938
- case GGML_TYPE_Q2_K: {
1939
- new_size = ggml_quantize_q2_K (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1940
- } break ;
1941
- case GGML_TYPE_Q3_K: {
1942
- new_size = ggml_quantize_q3_K (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1943
- } break ;
1944
- case GGML_TYPE_Q4_K: {
1945
- new_size = ggml_quantize_q4_K (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1946
- } break ;
1947
- case GGML_TYPE_Q5_K: {
1948
- new_size = ggml_quantize_q5_K (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1949
- } break ;
1950
- case GGML_TYPE_Q6_K: {
1951
- new_size = ggml_quantize_q6_K (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1952
- } break ;
1953
- default : {
1954
- fprintf (stderr, " %s: unsupported quantization type %d\n " , __func__, new_type);
1955
- return false ;
1956
- }
1957
- }
1958
-
1959
- for (size_t j = 0 ; j < hist_cur.size (); ++j) {
1960
- hist_all[j] += hist_cur[j];
1961
- }
1919
+ new_size = ggml_quantize_chunk (new_type, f32_data, new_data, 0 , n_elms/cur->ne [0 ], cur->ne [0 ], nullptr );
1962
1920
} else {
1963
1921
new_type = cur->type ;
1964
1922
new_data = cur->data ;
@@ -1993,17 +1951,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
1993
1951
{
1994
1952
printf (" %s: original size = %8.2f MB\n " , __func__, total_size_org / 1024.0 / 1024.0 );
1995
1953
printf (" %s: quantized size = %8.2f MB\n " , __func__, total_size_new / 1024.0 / 1024.0 );
1996
-
1997
- int64_t sum_all = 0 ;
1998
- for (size_t i = 0 ; i < hist_all.size (); ++i) {
1999
- sum_all += hist_all[i];
2000
- }
2001
-
2002
- printf (" %s: hist: " , __func__);
2003
- for (size_t i = 0 ; i < hist_all.size (); ++i) {
2004
- printf (" %5.3f " , hist_all[i] / (float )sum_all);
2005
- }
2006
- printf (" \n " );
2007
1954
}
2008
1955
2009
1956
return true ;
0 commit comments