From 6c281ad88f5c5ecb4a1192f84a4df2f26cd1243d Mon Sep 17 00:00:00 2001
From: Ed Addario <eaddario@hotmail.com>
Date: Thu, 17 Apr 2025 09:57:48 +0100
Subject: [PATCH 1/6] Move struct declaration to header and update allowed
 tensor list

---
 examples/quantize/quantize.cpp | 47 +-----------------------
 src/llama-quant.cpp            |  6 ----
 src/llama-quant.h              | 65 ++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 52 deletions(-)
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 0355311dc5c06..fcbb8a50a3a0d 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,5 +1,6 @@
 #include "common.h"
 #include "llama.h"
+#include "llama-quant.h"
 
 #include <cstdio>
 #include <cstring>
@@ -248,52 +249,6 @@ static ggml_type parse_ggml_type(const char * arg) {
     return GGML_TYPE_COUNT;
 }
 
-// Allowed tensors for arbitrary quantization with --tensor-type option
-static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
-    "attn_k",
-    "attn_kv_a_mqa",
-    "attn_kv_b",
-    "attn_o",
-    "attn_output",
-    "attn_q",
-    "attn_q_a",
-    "attn_q_b",
-    "attn_qkv",
-    "attn_v",
-    "channel_mix_key",
-    "channel_mix_receptance",
-    "channel_mix_value",
-    "cls",
-    "cls.output",
-    "cross_attn_k",
-    "cross_attn_o",
-    "cross_attn_q",
-    "cross_attn_v",
-    "ffn_act",
-    "ffn_down",
-    "ffn_down_exps",
-    "ffn_down_shexp",
-    "ffn_gate",
-    "ffn_gate_exps",
-    "ffn_gate_shexp",
-    "ffn_up",
-    "ffn_up_exps",
-    "ffn_up_shexp",
-    "ssm_in",
-    "ssm_out",
-    "time_mix_gate",
-    "time_mix_key",
-    "time_mix_output",
-    "time_mix_receptance",
-    "time_mix_value",
-};
-
-// changes to this struct must be replicated in llama-quant.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
     const char * sep = strchr(data, '=');
     if (sep == nullptr) {
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7dc5422763118..b2213b346de93 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -48,12 +48,6 @@ struct quantize_state_impl {
         {}
 };
 
-// changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
diff --git a/src/llama-quant.h b/src/llama-quant.h
index 6f70f09beec22..953b23994dc8c 100644
--- a/src/llama-quant.h
+++ b/src/llama-quant.h
@@ -1 +1,66 @@
 #pragma once
+
+#include <string>
+#include <vector>
+
+#include "ggml.h"
+
+// Allowed tensors for arbitrary quantization with --tensor-type option
+static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
+  "attn_k",
+  "attn_k_b",
+  "attn_kv_a_mqa",
+  "attn_kv_b",
+  "attn_o",
+  "attn_output",
+  "attn_q",
+  "attn_q_a",
+  "attn_q_b",
+  "attn_qkv",
+  "attn_rel_b",
+  "attn_v",
+  "attn_v_b",
+  "channel_mix_key",
+  "channel_mix_receptance",
+  "channel_mix_value",
+  "cls",
+  "cls.output",
+  "conv1",
+  "conv1d",
+  "conv2",
+  "cross_attn_k",
+  "cross_attn_o",
+  "cross_attn_q",
+  "cross_attn_rel_b",
+  "cross_attn_v",
+  "dw",
+  "ffn_down",
+  "ffn_down_exps",
+  "ffn_down_shexp",
+  "ffn_gate",
+  "ffn_gate_exps",
+  "ffn_gate_shexp",
+  "ffn_up",
+  "ffn_up_exps",
+  "ffn_up_shexp",
+  "pw1",
+  "pw1",
+  "ssm_a",
+  "ssm_conv1d",
+  "ssm_dt",
+  "ssm_in",
+  "ssm_out",
+  "ssm_x",
+  "time_mix_gate",
+  "time_mix_key",
+  "time_mix_output",
+  "time_mix_receptance",
+  "time_mix_value",
+  "token_types"
+};
+
+// Quantization types
+struct tensor_quantization {
+  std::string name;
+  ggml_type quant = GGML_TYPE_COUNT;
+};

From 3e031bc7047dd0d7828ef6e96d6d3f41233eec00 Mon Sep 17 00:00:00 2001
From: Ed Addario <eaddario@hotmail.com>
Date: Thu, 17 Apr 2025 09:58:09 +0100
Subject: [PATCH 2/6] Improve regex matching whilst still validating allowed
 tensors

---
 examples/quantize/quantize.cpp | 35 ++++++----------------------------
 src/llama-quant.cpp            | 18 ++++++++++++-----
 2 files changed, 19 insertions(+), 34 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index fcbb8a50a3a0d..01e6cc8d21d28 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -261,7 +261,6 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
         printf("\n%s: missing tensor name\n\n", __func__);
         return false;
     }
-
     if (const size_t qt_len = strlen(sep); qt_len == 1) {
         printf("\n%s: missing quantization type\n\n", __func__);
         return false;
@@ -270,37 +269,15 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
     std::string tn(data, tn_len);
     std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
     sep++;
-    const std::string qt(sep);
-
-    bool found = false;
-    for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
-        std::string tensor;
-        tensor = tn.rfind('.') != std::string::npos ? tn.substr(tn.rfind('.') + 1) : tn;
-        // handle special case of cls.output
-        std::string cls_output = "cls.output";
-        if (tn.find(cls_output) != std::string::npos) {
-            tensor = "cls.output";
-        }
-        // check if an allowed tensor exists and it's at the end of the kv string
-        if (tensor == allowed) {
-            found = true;
-            break;
-        }
-    }
-    if (!found) {
-        printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str());
-        return false;
-    }
-
-    if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) {
-        printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str());
-        return false;
-    }
-
     tensor_quantization tqz;
     tqz.name = tn;
-    tqz.quant = parse_ggml_type(qt.c_str());
+    tqz.quant = parse_ggml_type(sep);
+    if (tqz.quant == GGML_TYPE_COUNT) { // validate before storing: rejected entries must never reach tensor_type
+        printf("\n%s: invalid quantization type '%s'\n\n", __func__, sep);
+        return false;
+    }
+
     tensor_type.emplace_back(std::move(tqz));
     return true;
 }
 
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index b2213b346de93..8a0f33a60aa01 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -790,17 +790,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 // unless the user specifies a type
                 if (params->tensor_types) {
                     const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                    const std::string tensor_name(tensor->name);
                     for (const auto & [tname, qtype] : tensor_types) {
-                        if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
-                            if (qtype != new_type) {
-                                LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+                            for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
+                                if (tensor_name.find(allowed) != std::string::npos) {
+                                    if  (qtype != new_type) {
+                                        LLAMA_LOG_DEBUG("(overriding %s), ", ggml_type_name(new_type));
+                                        new_type = qtype;
+                                        break;
+                                    }
+                                }
                             }
-                            new_type = qtype;
-                            break;
+                            goto loop_exit; // if two or more types are specified for the tensor, first match wins
                         }
                     }
                 }
+                loop_exit:;
             }
+
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                 new_type = params->token_embedding_type;
             }

From 6c13fcbc30a785559f79dd25bd1039df264dc0f5 Mon Sep 17 00:00:00 2001
From: Ed Addario <eaddario@hotmail.com>
Date: Thu, 17 Apr 2025 10:16:36 +0100
Subject: [PATCH 3/6] Minor cosmetic change to log output

---
 src/llama-quant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 8a0f33a60aa01..4f60dc818e4c2 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -796,7 +796,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                             for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
                                 if (tensor_name.find(allowed) != std::string::npos) {
                                     if  (qtype != new_type) {
-                                        LLAMA_LOG_DEBUG("(overriding %s), ", ggml_type_name(new_type));
+                                        LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
                                         new_type = qtype;
                                         break;
                                     }

From d69391ef71bfff06ae7cbfa36360a9428016d098 Mon Sep 17 00:00:00 2001
From: Ed Addario <eaddario@hotmail.com>
Date: Sat, 3 May 2025 08:37:42 +0100
Subject: [PATCH 4/6] Add header search path

---
 tools/quantize/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt
index 47e5cbe30cfe3..4cf7cbbb1615d 100644
--- a/tools/quantize/CMakeLists.txt
+++ b/tools/quantize/CMakeLists.txt
@@ -3,4 +3,5 @@ add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
+target_include_directories(${TARGET} PRIVATE ../../src)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

From 61bb6e2640a63a7656edbef8538c4ebd2ea07543 Mon Sep 17 00:00:00 2001
From: Ed Addario <eaddario@hotmail.com>
Date: Sun, 11 May 2025 13:56:01 +0100
Subject: [PATCH 5/6] Remove allowed tensor checks

---
 src/llama-quant.cpp         | 20 ++++++------
 src/llama-quant.h           | 65 -------------------------------------
 tools/quantize/quantize.cpp |  9 +++--
 3 files changed, 17 insertions(+), 77 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 4f60dc818e4c2..d9ba2aee56d86 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -14,6 +14,12 @@
 #include <thread>
 #include <unordered_map>
 
+// Quantization types. Changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -793,20 +799,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                     const std::string tensor_name(tensor->name);
                     for (const auto & [tname, qtype] : tensor_types) {
                         if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
-                            for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
-                                if (tensor_name.find(allowed) != std::string::npos) {
-                                    if  (qtype != new_type) {
-                                        LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
-                                        new_type = qtype;
-                                        break;
-                                    }
-                                }
+                            if  (qtype != new_type) {
+                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                                new_type = qtype;
                             }
-                            goto loop_exit; // if two or more types are specified for the tensor, first match wins
+                            break; // first matching pattern wins, even when it selects the type already chosen
                         }
                     }
                 }
-                loop_exit:;
             }
 
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
diff --git a/src/llama-quant.h b/src/llama-quant.h
index 953b23994dc8c..6f70f09beec22 100644
--- a/src/llama-quant.h
+++ b/src/llama-quant.h
@@ -1,66 +1 @@
 #pragma once
-
-#include <string>
-#include <vector>
-
-#include "ggml.h"
-
-// Allowed tensors for arbitrary quantization with --tensor-type option
-static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
-  "attn_k",
-  "attn_k_b",
-  "attn_kv_a_mqa",
-  "attn_kv_b",
-  "attn_o",
-  "attn_output",
-  "attn_q",
-  "attn_q_a",
-  "attn_q_b",
-  "attn_qkv",
-  "attn_rel_b",
-  "attn_v",
-  "attn_v_b",
-  "channel_mix_key",
-  "channel_mix_receptance",
-  "channel_mix_value",
-  "cls",
-  "cls.output",
-  "conv1",
-  "conv1d",
-  "conv2",
-  "cross_attn_k",
-  "cross_attn_o",
-  "cross_attn_q",
-  "cross_attn_rel_b",
-  "cross_attn_v",
-  "dw",
-  "ffn_down",
-  "ffn_down_exps",
-  "ffn_down_shexp",
-  "ffn_gate",
-  "ffn_gate_exps",
-  "ffn_gate_shexp",
-  "ffn_up",
-  "ffn_up_exps",
-  "ffn_up_shexp",
-  "pw1",
-  "pw1",
-  "ssm_a",
-  "ssm_conv1d",
-  "ssm_dt",
-  "ssm_in",
-  "ssm_out",
-  "ssm_x",
-  "time_mix_gate",
-  "time_mix_key",
-  "time_mix_output",
-  "time_mix_receptance",
-  "time_mix_value",
-  "token_types"
-};
-
-// Quantization types
-struct tensor_quantization {
-  std::string name;
-  ggml_type quant = GGML_TYPE_COUNT;
-};
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 01e6cc8d21d28..3f54af7c58158 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -1,6 +1,5 @@
 #include "common.h"
 #include "llama.h"
-#include "llama-quant.h"
 
 #include <cstdio>
 #include <cstring>
@@ -58,6 +57,12 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
     { "COPY",     LLAMA_FTYPE_ALL_F32,         "only copy tensors, no quantizing",  },
 };
 
+// Quantization types. Changes to this struct must be replicated in llama-quant.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix.dataset";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
@@ -245,7 +250,7 @@ static ggml_type parse_ggml_type(const char * arg) {
             return type;
         }
     }
-    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+    fprintf(stderr, "\n%s: invalid ggml_type '%s'\n\n", __func__, arg);
     return GGML_TYPE_COUNT;
 }
 

From 184ef70e3bdbe27106a8fe4717f20da39800a85c Mon Sep 17 00:00:00 2001
From: Ed Addario <eaddario@hotmail.com>
Date: Mon, 12 May 2025 18:57:43 +0100
Subject: [PATCH 6/6] Remove unneeded include directory

---
 tools/quantize/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt
index 4cf7cbbb1615d..47e5cbe30cfe3 100644
--- a/tools/quantize/CMakeLists.txt
+++ b/tools/quantize/CMakeLists.txt
@@ -3,5 +3,4 @@ add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
-target_include_directories(${TARGET} PRIVATE ../../src)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)