From 6c281ad88f5c5ecb4a1192f84a4df2f26cd1243d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 17 Apr 2025 09:57:48 +0100 Subject: [PATCH 1/6] Move struct declaration to header and update allowed tensor list --- examples/quantize/quantize.cpp | 47 +----------------------- src/llama-quant.cpp | 6 ---- src/llama-quant.h | 65 ++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 52 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 0355311dc5c06..fcbb8a50a3a0d 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -1,5 +1,6 @@ #include "common.h" #include "llama.h" +#include "llama-quant.h" #include #include @@ -248,52 +249,6 @@ static ggml_type parse_ggml_type(const char * arg) { return GGML_TYPE_COUNT; } -// Allowed tensors for arbitrary quantization with --tensor-type option -static const std::vector ALLOWED_TENSOR_TYPE = { - "attn_k", - "attn_kv_a_mqa", - "attn_kv_b", - "attn_o", - "attn_output", - "attn_q", - "attn_q_a", - "attn_q_b", - "attn_qkv", - "attn_v", - "channel_mix_key", - "channel_mix_receptance", - "channel_mix_value", - "cls", - "cls.output", - "cross_attn_k", - "cross_attn_o", - "cross_attn_q", - "cross_attn_v", - "ffn_act", - "ffn_down", - "ffn_down_exps", - "ffn_down_shexp", - "ffn_gate", - "ffn_gate_exps", - "ffn_gate_shexp", - "ffn_up", - "ffn_up_exps", - "ffn_up_shexp", - "ssm_in", - "ssm_out", - "time_mix_gate", - "time_mix_key", - "time_mix_output", - "time_mix_receptance", - "time_mix_value", -}; - -// changes to this struct must be replicated in llama-quant.cpp -struct tensor_quantization { - std::string name; - ggml_type quant = GGML_TYPE_COUNT; -}; - static bool parse_tensor_type(const char * data, std::vector & tensor_type) { const char * sep = strchr(data, '='); if (sep == nullptr) { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7dc5422763118..b2213b346de93 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -48,12 +48,6 @@ struct quantize_state_impl { {} }; -// changes to this struct must be replicated in quantize.cpp -struct tensor_quantization { - std::string name; - ggml_type quant = GGML_TYPE_COUNT; -}; - static void llama_tensor_dequantize_impl( ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread diff --git a/src/llama-quant.h b/src/llama-quant.h index 6f70f09beec22..953b23994dc8c 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1 +1,66 @@ #pragma once + +#include +#include + +#include "ggml.h" + +// Allowed tensors for arbitrary quantization with --tensor-type option +static const std::vector ALLOWED_TENSOR_TYPE = { + "attn_k", + "attn_k_b", + "attn_kv_a_mqa", + "attn_kv_b", + "attn_o", + "attn_output", + "attn_q", + "attn_q_a", + "attn_q_b", + "attn_qkv", + "attn_rel_b", + "attn_v", + "attn_v_b", + "channel_mix_key", + "channel_mix_receptance", + "channel_mix_value", + "cls", + "cls.output", + "conv1", + "conv1d", + "conv2", + "cross_attn_k", + "cross_attn_o", + "cross_attn_q", + "cross_attn_rel_b", + "cross_attn_v", + "dw", + "ffn_down", + "ffn_down_exps", + "ffn_down_shexp", + "ffn_gate", + "ffn_gate_exps", + "ffn_gate_shexp", + "ffn_up", + "ffn_up_exps", + "ffn_up_shexp", + "pw1", + "pw1", + "ssm_a", + "ssm_conv1d", + "ssm_dt", + "ssm_in", + "ssm_out", + "ssm_x", + "time_mix_gate", + "time_mix_key", + "time_mix_output", + "time_mix_receptance", + "time_mix_value", + "token_types" +}; + +// Quantization types +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; From 3e031bc7047dd0d7828ef6e96d6d3f41233eec00 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 17 Apr 2025 09:58:09 +0100 Subject: [PATCH 2/6] Improve regex matching whilst still validating allowed tensors --- examples/quantize/quantize.cpp | 35 ++++++---------------------------- src/llama-quant.cpp | 18 ++++++++++++----- 2 files changed, 19 insertions(+), 34 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index fcbb8a50a3a0d..01e6cc8d21d28 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -261,7 +261,6 @@ static bool parse_tensor_type(const char * data, std::vectortensor_types) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); + const std::string tensor_name(tensor->name); for (const auto & [tname, qtype] : tensor_types) { - if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) { - if (qtype != new_type) { - LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype)); + if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { + for (const auto & allowed : ALLOWED_TENSOR_TYPE) { + if (tensor_name.find(allowed) != std::string::npos) { + if (qtype != new_type) { + LLAMA_LOG_DEBUG("(overriding %s), ", ggml_type_name(new_type)); + new_type = qtype; + break; + } + } } - new_type = qtype; - break; + goto loop_exit; // if two or more types are specified for the tensor, first match wins } } } + loop_exit:; } + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { new_type = params->token_embedding_type; } From 6c13fcbc30a785559f79dd25bd1039df264dc0f5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 17 Apr 2025 10:16:36 +0100 Subject: [PATCH 3/6] Minor cosmetic change to log output --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8a0f33a60aa01..4f60dc818e4c2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -796,7 +796,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: for (const auto & allowed : ALLOWED_TENSOR_TYPE) { if (tensor_name.find(allowed) != std::string::npos) { if (qtype != new_type) { - LLAMA_LOG_DEBUG("(overriding %s), ", ggml_type_name(new_type)); + LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type)); new_type = qtype; break; } From d69391ef71bfff06ae7cbfa36360a9428016d098 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 3 May 2025 08:37:42 +0100 Subject: [PATCH 4/6] Add header search path --- tools/quantize/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index 47e5cbe30cfe3..4cf7cbbb1615d 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -3,4 +3,5 @@ add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) +target_include_directories(${TARGET} PRIVATE ../../src) target_compile_features(${TARGET} PRIVATE cxx_std_17) From 61bb6e2640a63a7656edbef8538c4ebd2ea07543 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 11 May 2025 13:56:01 +0100 Subject: [PATCH 5/6] Remove allowed tensor checks --- src/llama-quant.cpp | 20 ++++++------ src/llama-quant.h | 65 ------------------------------------- tools/quantize/quantize.cpp | 9 +++-- 3 files changed, 17 insertions(+), 77 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4f60dc818e4c2..d9ba2aee56d86 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -14,6 +14,12 @@ #include #include +// Quantization types. Changes to this struct must be replicated in quantize.cpp +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -793,20 +799,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const std::string tensor_name(tensor->name); for (const auto & [tname, qtype] : tensor_types) { if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { - for (const auto & allowed : ALLOWED_TENSOR_TYPE) { - if (tensor_name.find(allowed) != std::string::npos) { - if (qtype != new_type) { - LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type)); - new_type = qtype; - break; - } - } + if (qtype != new_type) { + LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type)); + new_type = qtype; + break; // if two or more types are specified for the tensor, first match wins } - goto loop_exit; // if two or more types are specified for the tensor, first match wins } } } - loop_exit:; } if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { diff --git a/src/llama-quant.h b/src/llama-quant.h index 953b23994dc8c..6f70f09beec22 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,66 +1 @@ #pragma once - -#include -#include - -#include "ggml.h" - -// Allowed tensors for arbitrary quantization with --tensor-type option -static const std::vector ALLOWED_TENSOR_TYPE = { - "attn_k", - "attn_k_b", - "attn_kv_a_mqa", - "attn_kv_b", - "attn_o", - "attn_output", - "attn_q", - "attn_q_a", - "attn_q_b", - "attn_qkv", - "attn_rel_b", - "attn_v", - "attn_v_b", - "channel_mix_key", - "channel_mix_receptance", - "channel_mix_value", - "cls", - "cls.output", - "conv1", - "conv1d", - "conv2", - "cross_attn_k", - "cross_attn_o", - "cross_attn_q", - "cross_attn_rel_b", - "cross_attn_v", - "dw", - "ffn_down", - "ffn_down_exps", - "ffn_down_shexp", - "ffn_gate", - "ffn_gate_exps", - "ffn_gate_shexp", - "ffn_up", - "ffn_up_exps", - "ffn_up_shexp", - "pw1", - "pw1", - "ssm_a", - "ssm_conv1d", - "ssm_dt", - "ssm_in", - "ssm_out", - "ssm_x", - "time_mix_gate", - "time_mix_key", - "time_mix_output", - "time_mix_receptance", - "time_mix_value", - "token_types" -}; - -// Quantization types -struct tensor_quantization { - std::string name; - ggml_type quant = GGML_TYPE_COUNT; -}; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 01e6cc8d21d28..3f54af7c58158 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -1,6 +1,5 @@ #include "common.h" #include "llama.h" -#include "llama-quant.h" #include #include @@ -58,6 +57,12 @@ static const std::vector QUANT_OPTIONS = { { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; +// Quantization types. Changes to this struct must be replicated in llama-quantize.cpp +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset"; static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; @@ -245,7 +250,7 @@ static ggml_type parse_ggml_type(const char * arg) { return type; } } - fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg); + fprintf(stderr, "\n%s: invalid ggml_type '%s'\n\n", __func__, arg); return GGML_TYPE_COUNT; } From 184ef70e3bdbe27106a8fe4717f20da39800a85c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 12 May 2025 18:57:43 +0100 Subject: [PATCH 6/6] Remove unneeded include directory --- tools/quantize/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index 4cf7cbbb1615d..47e5cbe30cfe3 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -3,5 +3,4 @@ add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) -target_include_directories(${TARGET} PRIVATE ../../src) target_compile_features(${TARGET} PRIVATE cxx_std_17)