From d8e902ee5dbfd6a6888de7b48b1792882e178a60 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 1 Apr 2025 21:54:52 +0100 Subject: [PATCH 01/35] Add --show-statistics option --- common/arg.cpp | 7 +++++++ common/common.h | 5 +++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 8292adaac655d..851130762c5c0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1925,6 +1925,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(common_arg( + {"--show-statistics"}, + string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"), + [](common_params & params) { + params.show_statistics = true; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-pps"}, string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), diff --git a/common/common.h b/common/common.h index 1c0f199774976..7db669c88b924 100644 --- a/common/common.h +++ b/common/common.h @@ -411,8 +411,9 @@ struct common_params { int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations int32_t i_chunk = 0; // start processing from this chunk - bool process_output = false; // collect data for the output tensor - bool compute_ppl = true; // whether to compute perplexity + bool process_output = false; // collect data for the output tensor + bool compute_ppl = true; // whether to compute perplexity + bool show_statistics = false; // show imatrix statistics per tensor // cvector-generator params int n_pca_batch = 100; From f46693bc69a851b9693a723a2cf2c96f7ab9304f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 1 Apr 2025 21:55:41 +0100 Subject: [PATCH 02/35] Add --show-statistics logic --- examples/imatrix/imatrix.cpp | 65 +++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 8 deletions(-) diff 
--git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 31b675e8f90b9..49d1d395039d5 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -21,10 +21,9 @@ static void print_usage(int, char ** argv) { LOG("\nexample usage:\n"); - LOG("\n %s \\\n" - " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n" - " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" - " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); + LOG("\n %s -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output]\n" + " [--chunk 123] [--output-frequency 10] [--save-frequency 0] [--show-statistics]\n" + " [--no-ppl] [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); LOG("\n"); } @@ -34,13 +33,19 @@ struct Stats { int ncall = 0; }; +struct Tally { + std::string tensor; + float value = 0; + int count = 0; +}; + class IMatrixCollector { public: IMatrixCollector() = default; void set_params(common_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix(int ncall = -1) const; - bool load_imatrix(const char * fname); + bool load_imatrix(const char * fname, std::vector * tally = nullptr); private: std::unordered_map m_stats; common_params m_params; @@ -289,7 +294,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); } -bool IMatrixCollector::load_imatrix(const char * fname) { +bool IMatrixCollector::load_imatrix(const char * fname, std::vector * tally) { std::ifstream in(fname, std::ios::binary); if (!in) { LOG_ERR("%s: failed to open %s\n",__func__, fname); @@ -335,13 +340,22 @@ bool IMatrixCollector::load_imatrix(const char * fname) { return false; } - // Recreate the state as expected by save_imatrix(), and corerct for weighted sum. 
+ // Recreate the state as expected by save_imatrix(), and correct for weighted sum. + float total = 0; for (int i = 0; i < nval; i++) { e.values[i] += tmp[i]; + total += tmp[i]; e.counts[i] += ncall; } e.ncall += ncall; + if (tally) { + tally->emplace_back(); + auto & [tensor, value, count] = (*tally)[i]; + tensor = name_as_vec.data(); + value = total; + count = nval; + } } return true; } @@ -352,7 +366,6 @@ static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_dat return g_collector.collect_imatrix(t, ask, user_data); } - struct results_log_softmax { double log_softmax; float logit; @@ -590,6 +603,42 @@ int main(int argc, char ** argv) { return 1; } + std::vector tallies; + + if (params.show_statistics) { + if (params.in_files.empty() || params.in_files.size() > 1) { + LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n"); + return 1; + } + if (!g_collector.load_imatrix(params.in_files[0].c_str(), & tallies)) { + LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str()); + return 1; + } + if (tallies.empty()) { + LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str()); + return 1; + } + float total = 0; + for (const auto & tallie : tallies) { + total += tallie.value / static_cast(tallie.count); + } + + struct tally_sort { + bool operator()(const Tally& x, const Tally & y) const { + return x.value / static_cast(x.count) > y.value / static_cast(y.count); + } + }; + std::sort(tallies.begin(), tallies.end(), tally_sort()); + LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(tallies.size())); + LOG_INF("\n Tensor Σ(weights) Contribution\n"); + LOG_INF("==========================================================================\n"); + for (const auto & [tensor, value, count] : tallies) { + LOG_INF("%40s\t%10.2f\t%7.4f %%\n", tensor.c_str(), value / count, 100.0f * (value / count / total)); + } + LOG_INF("\n"); + return 
0; + } + common_init(); params.n_batch = std::min(params.n_batch, params.n_ctx); From dc3373e5bb66febcc4e9d0061758c041cefee4d6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 2 Apr 2025 11:43:38 +0100 Subject: [PATCH 03/35] Add tensor name parsing --- examples/imatrix/imatrix.cpp | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 49d1d395039d5..3467f8f937c2e 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -74,6 +74,35 @@ static std::string filter_tensor_name(const char * name) { return wname; } +static void process_tensor_name(const std::string & input, std::string & layer, std::string & tensor) { + std::vector name; + std::istringstream stream(input); + std::string item; + + while (std::getline(stream, item, '.')) { + name.push_back(item); + } + for (size_t i = 0; i < name.size(); ++i) { + if (name[i] == "blk" && i + 1 < name.size()) { + layer = name[i + 1]; + break; + } + } + for (size_t i = 0; i < name.size(); ++i) { + if (name[i] == "weight" && i > 0) { + tensor = name[i - 1]; + break; + } + } + + if (tensor.empty()) { + tensor = input; + } + if (layer.empty()) { + layer = "-"; + } +} + bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { GGML_UNUSED(user_data); From 0589c3ee9ffe936663d4119fd647ba9a785fda9f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 2 Apr 2025 11:44:03 +0100 Subject: [PATCH 04/35] Tidy output format --- examples/imatrix/imatrix.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 3467f8f937c2e..30b9a9a295da5 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -658,11 +658,14 @@ int main(int argc, char ** argv) { } }; std::sort(tallies.begin(), tallies.end(), tally_sort()); + LOG_INF("\nComputing statistics for %s (%d tensors)\n", 
params.in_files[0].c_str(), static_cast(tallies.size())); - LOG_INF("\n Tensor Σ(weights) Contribution\n"); - LOG_INF("==========================================================================\n"); + LOG_INF("\n Layer\t Tensor\t Σ(Importance Scores)\t Contribution\n"); + LOG_INF("================================================================================\n"); for (const auto & [tensor, value, count] : tallies) { - LOG_INF("%40s\t%10.2f\t%7.4f %%\n", tensor.c_str(), value / count, 100.0f * (value / count / total)); + std::string layer, name; + process_tensor_name(tensor, layer, name); + LOG_INF("%5s\t%30s\t%15.2f\t%20.4f %%\n", layer.c_str(), name.c_str(), value / count, 100.0f * (value / count / total)); } LOG_INF("\n"); return 0; From e1fd1af77e9750e4cca7accc5efc20f2a16deecb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 2 Apr 2025 14:13:42 +0100 Subject: [PATCH 05/35] Fix typo in title --- examples/imatrix/imatrix.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 30b9a9a295da5..f1d2febfda621 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -660,7 +660,7 @@ int main(int argc, char ** argv) { std::sort(tallies.begin(), tallies.end(), tally_sort()); LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(tallies.size())); - LOG_INF("\n Layer\t Tensor\t Σ(Importance Scores)\t Contribution\n"); + LOG_INF("\n Layer\t Tensor\t μ(Importance Scores)\t Contribution\n"); LOG_INF("================================================================================\n"); for (const auto & [tensor, value, count] : tallies) { std::string layer, name; From 62ac26833a4b91866d9c93dabb88c625429f2580 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 8 Apr 2025 17:07:16 +0100 Subject: [PATCH 06/35] Improve tensor influence ranking --- examples/imatrix/imatrix.cpp | 25 +++++++++++++------------ 1 file changed, 13 
insertions(+), 12 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index f1d2febfda621..ab31fd7b2f2e6 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -35,7 +35,7 @@ struct Stats { struct Tally { std::string tensor; - float value = 0; + double bias = 0; int count = 0; }; @@ -370,19 +370,20 @@ bool IMatrixCollector::load_imatrix(const char * fname, std::vector * tal } // Recreate the state as expected by save_imatrix(), and correct for weighted sum. - float total = 0; + double total = 0; for (int i = 0; i < nval; i++) { e.values[i] += tmp[i]; - total += tmp[i]; e.counts[i] += ncall; + const double avg_sq = (1.0 * e.values[i]) / e.counts[i]; + total += avg_sq; } e.ncall += ncall; if (tally) { tally->emplace_back(); - auto & [tensor, value, count] = (*tally)[i]; + auto & [tensor, bias, count] = (*tally)[i]; tensor = name_as_vec.data(); - value = total; + bias = total; count = nval; } } @@ -647,25 +648,25 @@ int main(int argc, char ** argv) { LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str()); return 1; } - float total = 0; + double total = 0; for (const auto & tallie : tallies) { - total += tallie.value / static_cast(tallie.count); + total += tallie.bias; } struct tally_sort { bool operator()(const Tally& x, const Tally & y) const { - return x.value / static_cast(x.count) > y.value / static_cast(y.count); + return x.bias > y.bias; } }; std::sort(tallies.begin(), tallies.end(), tally_sort()); LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(tallies.size())); - LOG_INF("\n Layer\t Tensor\t μ(Importance Scores)\t Contribution\n"); - LOG_INF("================================================================================\n"); - for (const auto & [tensor, value, count] : tallies) { + LOG_INF("\n Layer\t Tensor\t Total Bias\tAvg Bias\t Contribution\n"); + 
LOG_INF("===============================================================================================\n"); + for (const auto & [tensor, bias, count] : tallies) { std::string layer, name; process_tensor_name(tensor, layer, name); - LOG_INF("%5s\t%30s\t%15.2f\t%20.4f %%\n", layer.c_str(), name.c_str(), value / count, 100.0f * (value / count / total)); + LOG_INF("%5s\t%30s\t%15.2f\t%15.4f\t%19.4f%%\n", layer.c_str(), name.c_str(), bias, bias / count, 100.0 * bias / total); } LOG_INF("\n"); return 0; From 73d8ecbc42bb0d032a9b63d959db9fff16e02243 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 13 Apr 2025 20:30:36 +0100 Subject: [PATCH 07/35] Add better statistics --- examples/imatrix/imatrix.cpp | 121 +++++++++++++++++++++++------------ 1 file changed, 80 insertions(+), 41 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index ab31fd7b2f2e6..21980635b9e6a 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -1,19 +1,20 @@ -#include "arg.h" -#include "common.h" -#include "log.h" -#include "llama.h" - +#include #include #include #include #include #include -#include -#include -#include #include +#include +#include +#include #include -#include +#include + +#include "arg.h" +#include "common.h" +#include "llama.h" +#include "log.h" #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -33,10 +34,18 @@ struct Stats { int ncall = 0; }; -struct Tally { +struct tensor_statistics { std::string tensor; - double bias = 0; - int count = 0; + float total = 0; + float mean = 0; + float max = 0; + float min = 0; + float stddev = 0; + float cv = 0; + float zd = 0; + float active = 0; + float entropy = 0; + int elements = 0; }; class IMatrixCollector { @@ -45,7 +54,7 @@ class IMatrixCollector { void set_params(common_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix(int ncall = -1) const; - bool 
load_imatrix(const char * fname, std::vector * tally = nullptr); + bool load_imatrix(const char * fname, std::vector * tstats = nullptr); private: std::unordered_map m_stats; common_params m_params; @@ -323,7 +332,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); } -bool IMatrixCollector::load_imatrix(const char * fname, std::vector * tally) { +bool IMatrixCollector::load_imatrix(const char * fname, std::vector * ts) { std::ifstream in(fname, std::ios::binary); if (!in) { LOG_ERR("%s: failed to open %s\n",__func__, fname); @@ -370,21 +379,58 @@ bool IMatrixCollector::load_imatrix(const char * fname, std::vector * tal } // Recreate the state as expected by save_imatrix(), and correct for weighted sum. - double total = 0; + std::vector activations; + activations.reserve(nval); + for (int i = 0; i < nval; i++) { e.values[i] += tmp[i]; e.counts[i] += ncall; - const double avg_sq = (1.0 * e.values[i]) / e.counts[i]; - total += avg_sq; + activations.push_back(e.values[i] / static_cast(e.counts[i])); } e.ncall += ncall; - if (tally) { - tally->emplace_back(); - auto & [tensor, bias, count] = (*tally)[i]; + if (ts) { + float total_bias = std::accumulate(activations.begin(), activations.end(), 0.0f); + float max_bias = * std::max_element(activations.begin(), activations.end()); + float min_bias = * std::min_element(activations.begin(), activations.end()); + float mean_bias = total_bias / activations.size(); + float sq_total_bias = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f); + float dev = std::sqrt((sq_total_bias / activations.size()) - (mean_bias * mean_bias)); + float rmsd = mean_bias > 0.0f ? 
dev / mean_bias : 0.0f; + + float threshold = 1e-6f; + int inactive_count = std::count_if(activations.begin(), activations.end(), [threshold](const float v) { return fabs(v) < threshold; }); + float active_ratio = 1 - (static_cast(inactive_count) / activations.size()); + + float ent = 0.0f; + if (total_bias > 0) { + for (auto act : activations) { + if (float p = act / total_bias; p > 0) { + ent -= p* std::log2(p); + } + } + } + + int z_score = 0; + for (auto act : activations) { + if (float p = (act - mean_bias) / dev; p > 1) { + z_score++; + } + } + + ts->emplace_back(); + auto & [tensor, total, mean, max, min, stddev, cv, zd, active, entropy, elements] = (*ts)[i]; tensor = name_as_vec.data(); - bias = total; - count = nval; + total = total_bias; + mean = mean_bias; + max = max_bias; + min = min_bias; + stddev = dev; + cv = rmsd; + active = active_ratio; + entropy = ent; + elements = static_cast(activations.size()); + zd = static_cast(z_score) / static_cast(elements); } } return true; @@ -633,42 +679,35 @@ int main(int argc, char ** argv) { return 1; } - std::vector tallies; + std::vector ts; if (params.show_statistics) { if (params.in_files.empty() || params.in_files.size() > 1) { LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n"); return 1; } - if (!g_collector.load_imatrix(params.in_files[0].c_str(), & tallies)) { + if (!g_collector.load_imatrix(params.in_files[0].c_str(), & ts)) { LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str()); return 1; } - if (tallies.empty()) { + if (ts.empty()) { LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str()); return 1; } - double total = 0; - for (const auto & tallie : tallies) { - total += tallie.bias; - } - struct tally_sort { - bool operator()(const Tally& x, const Tally & y) const { - return x.bias > y.bias; - } - }; - std::sort(tallies.begin(), tallies.end(), tally_sort()); - - LOG_INF("\nComputing statistics for %s (%d 
tensors)\n", params.in_files[0].c_str(), static_cast(tallies.size())); - LOG_INF("\n Layer\t Tensor\t Total Bias\tAvg Bias\t Contribution\n"); - LOG_INF("===============================================================================================\n"); - for (const auto & [tensor, bias, count] : tallies) { + std::sort(ts.begin(), ts.end(), [](const tensor_statistics &a, const tensor_statistics &b) { return a.total > b.total; }); + LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(ts.size())); + LOG_INF("\n%5s\t%-20s\t%10s\t%7s\t%12s\t%9s\t%10s\t%9s\t%6s\t%12s\t%7s\t%10s\n", + "Layer", "Tensor", "Σ(Bias)", "Min", "Max", "μ", "σ", "% Active", "N", "Entropy", "E (norm)", "ZD Score"); + LOG_INF("==========================================================================================================================================================================\n"); + for (const auto & [tensor, total, mean, max, min, stddev, cv, zd, active, entropy, elements] : ts) { std::string layer, name; process_tensor_name(tensor, layer, name); - LOG_INF("%5s\t%30s\t%15.2f\t%15.4f\t%19.4f%%\n", layer.c_str(), name.c_str(), bias, bias / count, 100.0 * bias / total); + LOG_INF("%5s\t%-20s\t%10.2f\t%7.4f\t%12.4f\t%8.4f\t%9.4f\t%8.2f%%\t%6d\t%12.4f\t%7.2f%%\t%10.4f\n", + layer.c_str(), name.c_str(), total, min, max, mean, stddev, active * 100.0f, elements, entropy, 100.0f * (entropy / std::log2(elements)), 1000.0f * zd); } LOG_INF("\n"); + return 0; } From 0b7f9c40c8cbe1bc52c8152805baf9794b439cf5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 15 Apr 2025 08:16:19 +0100 Subject: [PATCH 08/35] Change statistics' sort order --- examples/imatrix/imatrix.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 21980635b9e6a..bc9cf0108b492 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -695,7 +695,16 @@ int 
main(int argc, char ** argv) { return 1; } - std::sort(ts.begin(), ts.end(), [](const tensor_statistics &a, const tensor_statistics &b) { return a.total > b.total; }); + struct tensor_comparer { + bool operator()(const tensor_statistics & a, const tensor_statistics & b) const { + std::string layer, name_a, name_b;; + process_tensor_name(a.tensor, layer, name_a); + process_tensor_name(b.tensor, layer, name_b); + return name_a < name_b || (name_a == name_b && a.total > b.total); + } + }; + std::sort(ts.begin(), ts.end(), tensor_comparer()); + LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(ts.size())); LOG_INF("\n%5s\t%-20s\t%10s\t%7s\t%12s\t%9s\t%10s\t%9s\t%6s\t%12s\t%7s\t%10s\n", "Layer", "Tensor", "Σ(Bias)", "Min", "Max", "μ", "σ", "% Active", "N", "Entropy", "E (norm)", "ZD Score"); @@ -703,8 +712,8 @@ int main(int argc, char ** argv) { for (const auto & [tensor, total, mean, max, min, stddev, cv, zd, active, entropy, elements] : ts) { std::string layer, name; process_tensor_name(tensor, layer, name); - LOG_INF("%5s\t%-20s\t%10.2f\t%7.4f\t%12.4f\t%8.4f\t%9.4f\t%8.2f%%\t%6d\t%12.4f\t%7.2f%%\t%10.4f\n", - layer.c_str(), name.c_str(), total, min, max, mean, stddev, active * 100.0f, elements, entropy, 100.0f * (entropy / std::log2(elements)), 1000.0f * zd); + LOG_INF("%5s\t%-20s\t%10.2f\t%7.4f\t%12.4f\t%8.4f\t%9.4f\t%8.2f%%\t%6d\t%12.4f\t%7.2f%%\t%9.2f%%\n", + layer.c_str(), name.c_str(), total, min, max, mean, stddev, active * 100.0f, elements, entropy, 100.0f * (entropy / std::log2(elements)), 100.0f * zd); } LOG_INF("\n"); From 755c1efbc15e94d5e6be47852c52ef329eb122b4 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 22 Apr 2025 18:42:31 +0100 Subject: [PATCH 09/35] Add Cosine Similarity --- examples/imatrix/imatrix.cpp | 131 +++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 53 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 
bc9cf0108b492..2c6a06cdf236f 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -1,3 +1,9 @@ +#include "arg.h" +#include "common.h" +#include "llama-impl.h" +#include "llama.h" +#include "log.h" + #include #include #include @@ -10,11 +16,7 @@ #include #include #include - -#include "arg.h" -#include "common.h" -#include "llama.h" -#include "log.h" +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -36,16 +38,17 @@ struct Stats { struct tensor_statistics { std::string tensor; - float total = 0; - float mean = 0; - float max = 0; - float min = 0; + Stats stats; + float total_bias = 0; + float mean_bias = 0; + float max_bias = 0; + float min_bias = 0; + int elements = 0; float stddev = 0; - float cv = 0; - float zd = 0; float active = 0; float entropy = 0; - int elements = 0; + float zd = 0; + float cossim = 0; }; class IMatrixCollector { @@ -332,7 +335,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); } -bool IMatrixCollector::load_imatrix(const char * fname, std::vector * ts) { +bool IMatrixCollector::load_imatrix(const char * fname, std::vector * tstats) { std::ifstream in(fname, std::ios::binary); if (!in) { LOG_ERR("%s: failed to open %s\n",__func__, fname); @@ -381,31 +384,29 @@ bool IMatrixCollector::load_imatrix(const char * fname, std::vector activations; activations.reserve(nval); - for (int i = 0; i < nval; i++) { e.values[i] += tmp[i]; e.counts[i] += ncall; - activations.push_back(e.values[i] / static_cast(e.counts[i])); + activations.push_back(e.values[i] / e.counts[i]); } e.ncall += ncall; - if (ts) { - float total_bias = std::accumulate(activations.begin(), activations.end(), 0.0f); - float max_bias = * std::max_element(activations.begin(), activations.end()); - float min_bias = * std::min_element(activations.begin(), activations.end()); - float mean_bias = total_bias / 
activations.size(); - float sq_total_bias = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f); - float dev = std::sqrt((sq_total_bias / activations.size()) - (mean_bias * mean_bias)); - float rmsd = mean_bias > 0.0f ? dev / mean_bias : 0.0f; - - float threshold = 1e-6f; - int inactive_count = std::count_if(activations.begin(), activations.end(), [threshold](const float v) { return fabs(v) < threshold; }); - float active_ratio = 1 - (static_cast(inactive_count) / activations.size()); - - float ent = 0.0f; - if (total_bias > 0) { + if (tstats) { + float total = std::accumulate(activations.begin(), activations.end(), 0.0f); + float max = * std::max_element(activations.begin(), activations.end()); + float min = * std::min_element(activations.begin(), activations.end()); + float mean = total / activations.size(); + float sq_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f); + float dev = std::sqrt((sq_total / activations.size()) - (mean * mean)); + + float threshold = min + min * 0.5f; + int inactive_count = std::count_if(activations.begin(), activations.end(), [threshold](const float v) { return fabs(v) <= threshold; }); + float active_ratio = 1 - static_cast(inactive_count) / activations.size(); + + float ent = 0; + if (total > 0) { for (auto act : activations) { - if (float p = act / total_bias; p > 0) { + if (float p = act / total; p > 0) { ent -= p* std::log2(p); } } @@ -413,26 +414,48 @@ bool IMatrixCollector::load_imatrix(const char * fname, std::vector 1) { + if (float p = (act - mean) / dev; p > 1) { z_score++; } } - ts->emplace_back(); - auto & [tensor, total, mean, max, min, stddev, cv, zd, active, entropy, elements] = (*ts)[i]; - tensor = name_as_vec.data(); - total = total_bias; - mean = mean_bias; - max = max_bias; - min = min_bias; - stddev = dev; - cv = rmsd; - active = active_ratio; - entropy = ent; - elements = static_cast(activations.size()); - zd = static_cast(z_score) / 
static_cast(elements); + tstats->emplace_back(); + auto & ts = (*tstats)[i]; + ts.tensor = name_as_vec.data(); + ts.stats = e; + ts.total_bias = total; + ts.mean_bias = mean; + ts.max_bias = max; + ts.min_bias = min; + ts.elements = static_cast(activations.size()); + ts.stddev = dev; + ts.active = active_ratio; + ts.entropy = ent; + ts.zd = static_cast(z_score) / ts.elements; + } + } + + if (tstats) { + static const std::regex pattern(R"(blk\.(\d+)\.)"); + for (auto & ts : *tstats) { + if (std::smatch match; std::regex_search(ts.tensor, match, pattern)) { + const int blk = std::stoi(match[1]); + std::string tname(ts.tensor); + tname.replace(match.position(1), match.length(1), std::to_string(blk-1)); + auto prev = std::find_if(tstats->begin(), tstats->end(), [tname](const tensor_statistics & t) { return t.tensor == tname; }); + if (prev != tstats->end()) { + const float dp = std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), prev->stats.values.begin(), 0.0f); + const float curr_mag = std::sqrt(std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), ts.stats.values.begin(), 0.0f)); + const float prev_mag = std::sqrt(std::inner_product(prev->stats.values.begin(), prev->stats.values.end(), prev->stats.values.begin(), 0.0f)); + const float cs = dp / (curr_mag * prev_mag); + ts.cossim = cs; + } + } else { + ts.cossim = 0; + } } } + return true; } @@ -700,20 +723,22 @@ int main(int argc, char ** argv) { std::string layer, name_a, name_b;; process_tensor_name(a.tensor, layer, name_a); process_tensor_name(b.tensor, layer, name_b); - return name_a < name_b || (name_a == name_b && a.total > b.total); + return name_a < name_b || (name_a == name_b && a.total_bias > b.total_bias); } }; std::sort(ts.begin(), ts.end(), tensor_comparer()); LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(ts.size())); - LOG_INF("\n%5s\t%-20s\t%10s\t%7s\t%12s\t%9s\t%10s\t%9s\t%6s\t%12s\t%7s\t%10s\n", - "Layer", "Tensor", 
"Σ(Bias)", "Min", "Max", "μ", "σ", "% Active", "N", "Entropy", "E (norm)", "ZD Score"); - LOG_INF("==========================================================================================================================================================================\n"); - for (const auto & [tensor, total, mean, max, min, stddev, cv, zd, active, entropy, elements] : ts) { + LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + " Layer", " Tensor", " Σ(Bias)", " Min", " Max", " μ", " σ", " % Active", "N", " Entropy", "E (norm)", "ZD", " CosSim"); + LOG_INF("=========================================================================================================================================================================\n"); + for (const auto & tstat : ts) { std::string layer, name; - process_tensor_name(tensor, layer, name); - LOG_INF("%5s\t%-20s\t%10.2f\t%7.4f\t%12.4f\t%8.4f\t%9.4f\t%8.2f%%\t%6d\t%12.4f\t%7.2f%%\t%9.2f%%\n", - layer.c_str(), name.c_str(), total, min, max, mean, stddev, active * 100.0f, elements, entropy, 100.0f * (entropy / std::log2(elements)), 100.0f * zd); + process_tensor_name(tstat.tensor, layer, name); + LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n", + layer.c_str(), name.c_str(), tstat.total_bias, tstat.min_bias, tstat.max_bias, tstat.mean_bias, tstat.stddev, + tstat.active * 100.0f, tstat.elements, tstat.entropy, 100.0f * (tstat.entropy / std::log2(tstat.elements)), + 100.0f * tstat.zd, tstat.cossim); } LOG_INF("\n"); From 5cd20e4322a7e3eb590b86e9beaac3bd40167125 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 3 May 2025 08:34:10 +0100 Subject: [PATCH 10/35] Add header search path --- tools/imatrix/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt index 412696c47c31c..bb7111144278a 100644 --- a/tools/imatrix/CMakeLists.txt +++ b/tools/imatrix/CMakeLists.txt @@ -2,4 +2,5 @@ 
set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(${TARGET} PUBLIC ../../src) target_compile_features(${TARGET} PRIVATE cxx_std_17) From 1dbe6c37ebf691b7e393532df38a9257c3955039 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 3 May 2025 08:39:06 +0100 Subject: [PATCH 11/35] Change header search path to private --- tools/imatrix/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt index bb7111144278a..078e73161dd10 100644 --- a/tools/imatrix/CMakeLists.txt +++ b/tools/imatrix/CMakeLists.txt @@ -2,5 +2,5 @@ set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_include_directories(${TARGET} PUBLIC ../../src) +target_include_directories(${TARGET} PRIVATE ../../src) target_compile_features(${TARGET} PRIVATE cxx_std_17) From 3eb556e9e7bee3b0346f501accd71db0031bbbb9 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 25 May 2025 22:44:17 +0100 Subject: [PATCH 12/35] Add weighted statistics per layer --- tools/imatrix/imatrix.cpp | 56 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index d81416809ffea..78bbd76e0f56f 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -729,6 +730,14 @@ int main(int argc, char ** argv) { }; std::sort(ts.begin(), ts.end(), tensor_comparer()); + struct weighted_stats { + float weighted_bias = 0.0f; + float weighted_zd = 0.0f; + float weighted_cossim = 0.0f; + int total_elements = 0; + }; + std::map ws; + LOG_INF("\nComputing statistics for %s (%d tensors)\n", 
params.in_files[0].c_str(), static_cast(ts.size())); LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", " Tensor", " Σ(Bias)", " Min", " Max", " μ", " σ", " % Active", "N", " Entropy", "E (norm)", "ZD", " CosSim"); @@ -736,11 +745,58 @@ int main(int argc, char ** argv) { for (const auto & tstat : ts) { std::string layer, name; process_tensor_name(tstat.tensor, layer, name); + + int blk; + try { + blk = std::stoi(layer); + } catch (const std::exception & e) { + blk = -1; // not a block layer + } + LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n", layer.c_str(), name.c_str(), tstat.total_bias, tstat.min_bias, tstat.max_bias, tstat.mean_bias, tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy, 100.0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim); + + const float weighted_bias = tstat.elements * tstat.total_bias; + const float weighted_zd = tstat.elements * tstat.zd; + const float weighted_cossim = tstat.elements * tstat.cossim; + + if (ws.find(blk) != ws.end()) { + ws[blk].weighted_bias += weighted_bias; + ws[blk].weighted_zd += weighted_zd; + ws[blk].weighted_cossim += weighted_cossim; + ws[blk].total_elements += tstat.elements; + } else { + weighted_stats temp_ws; + temp_ws.weighted_bias = weighted_bias; + temp_ws.weighted_zd = weighted_zd; + temp_ws.weighted_cossim = weighted_cossim; + temp_ws.total_elements = tstat.elements; + ws[blk] = temp_ws; + } } + + const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; }); + LOG_INF("\nComputing weighted statistics per layer (%d layers)\n", layers); + LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " Σ(Bias)", " ZD", "CosSim"); + LOG_INF("===============================================\n"); + + for (const auto & [first, second] : ws) { + const auto & layer = first; + const auto & stats = second; + + if (stats.total_elements == 0) continue; + + if (layer >= 
0) { + const float bias = stats.weighted_bias / stats.total_elements; + const float zd = stats.weighted_zd / stats.total_elements; + const float cossim = stats.weighted_cossim / stats.total_elements; + + LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, 100.0f * zd, cossim); + } + } + LOG_INF("\n"); return 0; From 5cfc4436ff1d0acce1bb2e682c241532aef85ed1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 21 Jun 2025 09:56:20 +0100 Subject: [PATCH 13/35] Update report title --- tools/imatrix/imatrix.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index ec4b57e0d125c..eff9597431b57 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -41,15 +41,15 @@ struct tensor_statistics { std::string tensor; Stats stats; float total_bias = 0; - float mean_bias = 0; - float max_bias = 0; - float min_bias = 0; - int elements = 0; - float stddev = 0; - float active = 0; - float entropy = 0; - float zd = 0; - float cossim = 0; + float mean_bias = 0; + float max_bias = 0; + float min_bias = 0; + int elements = 0; + float stddev = 0; + float active = 0; + float entropy = 0; + float zd = 0; + float cossim = 0; }; class IMatrixCollector { @@ -778,8 +778,8 @@ int main(int argc, char ** argv) { } const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; }); - LOG_INF("\nComputing weighted statistics per layer (%d layers)\n", layers); - LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " Σ(Bias)", " ZD", "CosSim"); + LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers); + LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Bias)", " μZD", "μCosSim"); LOG_INF("===============================================\n"); for (const auto & [first, second] : ws) { From 235442a6155cb121ef00aeff3caa596948b09897 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 22 Jun 2025 10:58:46 +0100 Subject: [PATCH 14/35] Refactor 
compute_statistics out of main --- tools/imatrix/imatrix.cpp | 194 +++++++++++++++++++++----------------- 1 file changed, 105 insertions(+), 89 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index eff9597431b57..ee19f988e2d44 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -692,113 +692,129 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { return true; } -int main(int argc, char ** argv) { - common_params params; +static bool compute_statistics(const common_params & params) { + std::vector ts; + if (params.in_files.empty() || params.in_files.size() > 1) { + LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n"); + return false; + } + if (!g_collector.load_imatrix(params.in_files[0].c_str(), &ts)) { + LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str()); + return false; + } + if (ts.empty()) { + LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str()); + return false; + } - params.out_file = "imatrix.dat" ; + struct tensor_comparer { + bool operator()(const tensor_statistics & a, const tensor_statistics & b) const { + std::string layer, name_a, name_b; + ; + process_tensor_name(a.tensor, layer, name_a); + process_tensor_name(b.tensor, layer, name_b); + return name_a < name_b || (name_a == name_b && a.total_bias > b.total_bias); + } + }; - params.n_ctx = 512; - params.escape = false; + std::sort(ts.begin(), ts.end(), tensor_comparer()); - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { - return 1; + struct weighted_stats { + float weighted_bias = 0.0f; + float weighted_zd = 0.0f; + float weighted_cossim = 0.0f; + int total_elements = 0; + }; + + std::map ws; + + LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(ts.size())); + LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", " 
Tensor", " Σ(Bias)", + " Min", " Max", " μ", " σ", " % Active", "N", " Entropy", "E (norm)", "ZD", + " CosSim"); + LOG_INF( + "==============================================================================================================" + "===========================================================\n"); + for (const auto & tstat : ts) { + std::string layer, name; + process_tensor_name(tstat.tensor, layer, name); + + int blk; + try { + blk = std::stoi(layer); + } catch (const std::exception & e) { + blk = -1; // not a block layer + } + + LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n", + layer.c_str(), name.c_str(), tstat.total_bias, tstat.min_bias, tstat.max_bias, tstat.mean_bias, + tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy, + 100.0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim); + + const float weighted_bias = tstat.elements * tstat.total_bias; + const float weighted_zd = tstat.elements * tstat.zd; + const float weighted_cossim = tstat.elements * tstat.cossim; + + if (ws.find(blk) != ws.end()) { + ws[blk].weighted_bias += weighted_bias; + ws[blk].weighted_zd += weighted_zd; + ws[blk].weighted_cossim += weighted_cossim; + ws[blk].total_elements += tstat.elements; + } else { + weighted_stats temp_ws; + temp_ws.weighted_bias = weighted_bias; + temp_ws.weighted_zd = weighted_zd; + temp_ws.weighted_cossim = weighted_cossim; + temp_ws.total_elements = tstat.elements; + ws[blk] = temp_ws; + } } - std::vector ts; + const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; }); + LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers); + LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Bias)", " μZD", "μCosSim"); + LOG_INF("===============================================\n"); - if (params.show_statistics) { - if (params.in_files.empty() || params.in_files.size() > 1) { - LOG_ERR("\nError: a 
single imatrix file is required to compute tensor statistics\n\n"); - return 1; - } - if (!g_collector.load_imatrix(params.in_files[0].c_str(), & ts)) { - LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str()); - return 1; - } - if (ts.empty()) { - LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str()); - return 1; + for (const auto & [first, second] : ws) { + const auto & layer = first; + const auto & stats = second; + + if (stats.total_elements == 0) { + continue; } - struct tensor_comparer { - bool operator()(const tensor_statistics & a, const tensor_statistics & b) const { - std::string layer, name_a, name_b;; - process_tensor_name(a.tensor, layer, name_a); - process_tensor_name(b.tensor, layer, name_b); - return name_a < name_b || (name_a == name_b && a.total_bias > b.total_bias); - } - }; - std::sort(ts.begin(), ts.end(), tensor_comparer()); - - struct weighted_stats { - float weighted_bias = 0.0f; - float weighted_zd = 0.0f; - float weighted_cossim = 0.0f; - int total_elements = 0; - }; - std::map ws; - - LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(ts.size())); - LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", - " Layer", " Tensor", " Σ(Bias)", " Min", " Max", " μ", " σ", " % Active", "N", " Entropy", "E (norm)", "ZD", " CosSim"); - LOG_INF("=========================================================================================================================================================================\n"); - for (const auto & tstat : ts) { - std::string layer, name; - process_tensor_name(tstat.tensor, layer, name); - - int blk; - try { - blk = std::stoi(layer); - } catch (const std::exception & e) { - blk = -1; // not a block layer - } + if (layer >= 0) { + const float bias = stats.weighted_bias / stats.total_elements; + const float zd = stats.weighted_zd / stats.total_elements; + const float cossim = stats.weighted_cossim / 
stats.total_elements; - LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n", - layer.c_str(), name.c_str(), tstat.total_bias, tstat.min_bias, tstat.max_bias, tstat.mean_bias, tstat.stddev, - tstat.active * 100.0f, tstat.elements, tstat.entropy, 100.0f * (tstat.entropy / std::log2(tstat.elements)), - 100.0f * tstat.zd, tstat.cossim); + LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, 100.0f * zd, cossim); + } + } - const float weighted_bias = tstat.elements * tstat.total_bias; - const float weighted_zd = tstat.elements * tstat.zd; - const float weighted_cossim = tstat.elements * tstat.cossim; + LOG_INF("\n"); - if (ws.find(blk) != ws.end()) { - ws[blk].weighted_bias += weighted_bias; - ws[blk].weighted_zd += weighted_zd; - ws[blk].weighted_cossim += weighted_cossim; - ws[blk].total_elements += tstat.elements; - } else { - weighted_stats temp_ws; - temp_ws.weighted_bias = weighted_bias; - temp_ws.weighted_zd = weighted_zd; - temp_ws.weighted_cossim = weighted_cossim; - temp_ws.total_elements = tstat.elements; - ws[blk] = temp_ws; - } - } + return true; +} - const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; }); - LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers); - LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Bias)", " μZD", "μCosSim"); - LOG_INF("===============================================\n"); +int main(int argc, char ** argv) { + common_params params; - for (const auto & [first, second] : ws) { - const auto & layer = first; - const auto & stats = second; + params.out_file = "imatrix.dat" ; - if (stats.total_elements == 0) continue; + params.n_ctx = 512; + params.escape = false; - if (layer >= 0) { - const float bias = stats.weighted_bias / stats.total_elements; - const float zd = stats.weighted_zd / stats.total_elements; - const float cossim = stats.weighted_cossim / stats.total_elements; + if 
(!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { + return 1; + } - LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, 100.0f * zd, cossim); - } + std::vector ts; + if (params.show_statistics) { + if (!compute_statistics(params)) { + return 1; } - LOG_INF("\n"); - return 0; } From c823d167a883d821d1e060bfcf4f3d55d76d0d66 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 22 Jun 2025 12:50:58 +0100 Subject: [PATCH 15/35] Refactor compute_cossim out of load_imatrix --- tools/imatrix/imatrix.cpp | 42 ++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index ee19f988e2d44..59d997527182e 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -116,6 +116,30 @@ static void process_tensor_name(const std::string & input, std::string & layer, } } +static void compute_cossim(std::vector & tstats) { + static const std::regex pattern(R"(blk\.(\d+)\.)"); + for (auto & ts : tstats) { + if (std::smatch match; std::regex_search(ts.tensor, match, pattern)) { + const int blk = std::stoi(match[1]); + std::string tname(ts.tensor); + tname.replace(match.position(1), match.length(1), std::to_string(blk-1)); + auto prev = std::find_if(tstats.begin(), tstats.end(), + [tname](const tensor_statistics & t) { return t.tensor == tname; }); + if (prev != tstats.end()) { + const float dp = std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), + prev->stats.values.begin(), 0.0f); + const float curr_mag = std::sqrt(std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), + ts.stats.values.begin(), 0.0f)); + const float prev_mag = std::sqrt(std::inner_product(prev->stats.values.begin(), prev->stats.values.end(), + prev->stats.values.begin(), 0.0f)); + const float cs = dp / (curr_mag * prev_mag); + ts.cossim = cs; + } + } else { + ts.cossim = 0; + } + } +} bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, 
bool ask, void * user_data) { GGML_UNUSED(user_data); @@ -438,24 +462,6 @@ bool IMatrixCollector::load_imatrix(const char * fname, std::vectorbegin(), tstats->end(), [tname](const tensor_statistics & t) { return t.tensor == tname; }); - if (prev != tstats->end()) { - const float dp = std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), prev->stats.values.begin(), 0.0f); - const float curr_mag = std::sqrt(std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), ts.stats.values.begin(), 0.0f)); - const float prev_mag = std::sqrt(std::inner_product(prev->stats.values.begin(), prev->stats.values.end(), prev->stats.values.begin(), 0.0f)); - const float cs = dp / (curr_mag * prev_mag); - ts.cossim = cs; - } - } else { - ts.cossim = 0; - } } } From a5c464059ee761e4b9b0d60eed794b2b941d8261 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 22 Jun 2025 12:52:21 +0100 Subject: [PATCH 16/35] Refactor compute_statistics out of load_imatrix --- tools/imatrix/imatrix.cpp | 87 ++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 59d997527182e..0e2d47cbe132f 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -116,6 +116,49 @@ static void process_tensor_name(const std::string & input, std::string & layer, } } +static void compute_statistics(std::vector & tstats, const int & i, const std::string & name, const Stats & e, const std::vector & activations) { + const float total = std::accumulate(activations.begin(), activations.end(), 0.0f); + const float max = *std::max_element(activations.begin(), activations.end()); + const float min = *std::min_element(activations.begin(), activations.end()); + const float mean = total / activations.size(); + const float sq_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f); + const float dev = std::sqrt((sq_total / activations.size()) - (mean * 
mean)); + float threshold = min + min * 0.5f; + const int inactive_count = std::count_if(activations.begin(), activations.end(), + [threshold](const float v) { return fabs(v) <= threshold; }); + const float active_ratio = 1 - static_cast(inactive_count) / activations.size(); + + float entropy = 0; + if (total > 0) { + for (const auto act : activations) { + if (const float p = act / total; p > 0) { + entropy -= p * std::log2(p); + } + } + } + + int z_score = 0; + for (const auto act : activations) { + if (const float p = (act - mean) / dev; p > 1) { + z_score++; + } + } + + tstats.emplace_back(); + auto & ts = tstats[i]; + ts.tensor = name; + ts.stats = e; + ts.total_bias = total; + ts.mean_bias = mean; + ts.max_bias = max; + ts.min_bias = min; + ts.elements = static_cast(activations.size()); + ts.stddev = dev; + ts.active = active_ratio; + ts.entropy = entropy; + ts.zd = static_cast(z_score) / ts.elements; +} + static void compute_cossim(std::vector & tstats) { static const std::regex pattern(R"(blk\.(\d+)\.)"); for (auto & ts : tstats) { @@ -419,49 +462,7 @@ bool IMatrixCollector::load_imatrix(const char * fname, std::vector(inactive_count) / activations.size(); - - float ent = 0; - if (total > 0) { - for (auto act : activations) { - if (float p = act / total; p > 0) { - ent -= p* std::log2(p); - } - } - } - - int z_score = 0; - for (auto act : activations) { - if (float p = (act - mean) / dev; p > 1) { - z_score++; - } - } - - tstats->emplace_back(); - auto & ts = (*tstats)[i]; - ts.tensor = name_as_vec.data(); - ts.stats = e; - ts.total_bias = total; - ts.mean_bias = mean; - ts.max_bias = max; - ts.min_bias = min; - ts.elements = static_cast(activations.size()); - ts.stddev = dev; - ts.active = active_ratio; - ts.entropy = ent; - ts.zd = static_cast(z_score) / ts.elements; - } - } - + compute_statistics(*tstats, i, name_as_vec.data(), e, activations); } } From 655be19fd5a725a78b1cb315de5b070cf0914390 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 22 Jun 
2025 12:54:17 +0100 Subject: [PATCH 17/35] Move imatrix statistics calculation into its own functions --- tools/imatrix/imatrix.cpp | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 0e2d47cbe132f..f63510928ca8d 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -183,6 +183,7 @@ static void compute_cossim(std::vector & tstats) { } } } + bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { GGML_UNUSED(user_data); @@ -417,6 +418,7 @@ bool IMatrixCollector::load_imatrix(const char * fname, std::vector name_as_vec(len+1); @@ -699,7 +701,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { return true; } -static bool compute_statistics(const common_params & params) { +static bool show_statistics(const common_params & params) { std::vector ts; if (params.in_files.empty() || params.in_files.size() > 1) { LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n"); @@ -709,7 +711,10 @@ static bool compute_statistics(const common_params & params) { LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str()); return false; } - if (ts.empty()) { + + if (!ts.empty()) { + compute_cossim(ts); + } else { LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str()); return false; } @@ -723,7 +728,6 @@ static bool compute_statistics(const common_params & params) { return name_a < name_b || (name_a == name_b && a.total_bias > b.total_bias); } }; - std::sort(ts.begin(), ts.end(), tensor_comparer()); struct weighted_stats { @@ -732,7 +736,6 @@ static bool compute_statistics(const common_params & params) { float weighted_cossim = 0.0f; int total_elements = 0; }; - std::map ws; LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(ts.size())); @@ -781,7 +784,6 @@ static bool 
compute_statistics(const common_params & params) { LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers); LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Bias)", " μZD", "μCosSim"); LOG_INF("===============================================\n"); - for (const auto & [first, second] : ws) { const auto & layer = first; const auto & stats = second; @@ -798,7 +800,6 @@ static bool compute_statistics(const common_params & params) { LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, 100.0f * zd, cossim); } } - LOG_INF("\n"); return true; @@ -806,9 +807,7 @@ static bool compute_statistics(const common_params & params) { int main(int argc, char ** argv) { common_params params; - params.out_file = "imatrix.dat" ; - params.n_ctx = 512; params.escape = false; @@ -816,19 +815,15 @@ int main(int argc, char ** argv) { return 1; } - std::vector ts; if (params.show_statistics) { - if (!compute_statistics(params)) { + if (!show_statistics(params)) { return 1; } - return 0; } common_init(); - params.n_batch = std::min(params.n_batch, params.n_ctx); - g_collector.set_params(params); for (const auto & in_file : params.in_files) { @@ -855,7 +850,6 @@ int main(int argc, char ** argv) { // init common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); @@ -888,12 +882,10 @@ int main(int argc, char ** argv) { } } - g_collector.save_imatrix(); LOG("\n"); llama_perf_context_print(ctx); - llama_backend_free(); return 0; From 23ecca8b4a752f595273850b69c0ffcda817ffea Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 22 Jun 2025 16:18:50 +0100 Subject: [PATCH 18/35] Add checks and validations --- tools/imatrix/imatrix.cpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index f63510928ca8d..78a5b56232760 100644 --- a/tools/imatrix/imatrix.cpp +++ 
b/tools/imatrix/imatrix.cpp @@ -116,14 +116,20 @@ static void process_tensor_name(const std::string & input, std::string & layer, } } -static void compute_statistics(std::vector & tstats, const int & i, const std::string & name, const Stats & e, const std::vector & activations) { +static void compute_statistics(std::vector & tstats, const std::string & name, const Stats & e, const std::vector & activations) { + if (activations.empty()) { + LOG_ERR("%s: there are no activations for tensor %s. The imatrix may be suboptimal\n", __func__, name.c_str()); + return; + } + const float total = std::accumulate(activations.begin(), activations.end(), 0.0f); const float max = *std::max_element(activations.begin(), activations.end()); const float min = *std::min_element(activations.begin(), activations.end()); const float mean = total / activations.size(); const float sq_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f); - const float dev = std::sqrt((sq_total / activations.size()) - (mean * mean)); - float threshold = min + min * 0.5f; + const float variance = (sq_total / activations.size()) - (mean * mean); + const float dev = std::sqrt(std::max(0.0f, variance)); + float threshold = 1e-6f; const int inactive_count = std::count_if(activations.begin(), activations.end(), [threshold](const float v) { return fabs(v) <= threshold; }); const float active_ratio = 1 - static_cast(inactive_count) / activations.size(); @@ -138,14 +144,15 @@ static void compute_statistics(std::vector & tstats, const in } int z_score = 0; - for (const auto act : activations) { - if (const float p = (act - mean) / dev; p > 1) { - z_score++; + if (dev > 0.0f) { + for (const auto act : activations) { + if (const float p = (act - mean) / dev; p > 1) { + z_score++; + } } } - tstats.emplace_back(); - auto & ts = tstats[i]; + auto & ts = tstats.emplace_back(); ts.tensor = name; ts.stats = e; ts.total_bias = total; @@ -464,7 +471,7 @@ bool 
IMatrixCollector::load_imatrix(const char * fname, std::vector Date: Sun, 22 Jun 2025 22:48:56 +0100 Subject: [PATCH 19/35] Remove unnecessary include directory --- tools/imatrix/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt index 078e73161dd10..412696c47c31c 100644 --- a/tools/imatrix/CMakeLists.txt +++ b/tools/imatrix/CMakeLists.txt @@ -2,5 +2,4 @@ set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_include_directories(${TARGET} PRIVATE ../../src) target_compile_features(${TARGET} PRIVATE cxx_std_17) From 19f8e1568680c26ac597b4851aec768bfa2ea976 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 24 Jun 2025 21:16:35 +0100 Subject: [PATCH 20/35] Rename labels --- tools/imatrix/imatrix.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 78a5b56232760..19e56343e1bab 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -746,7 +746,7 @@ static bool show_statistics(const common_params & params) { std::map ws; LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(ts.size())); - LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", " Tensor", " Σ(Bias)", + LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", " Tensor", " Σ(Act²)", " Min", " Max", " μ", " σ", " % Active", "N", " Entropy", "E (norm)", "ZD", " CosSim"); LOG_INF( @@ -789,7 +789,7 @@ static bool show_statistics(const common_params & params) { const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; }); LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers); - LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Bias)", " μZD", "μCosSim"); + 
LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Act²)", " μZD", "μCosSim"); LOG_INF("===============================================\n"); for (const auto & [first, second] : ws) { const auto & layer = first; From f5fd2b70f08aa9d75a4f1110f49e3c2ddfbd7043 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 24 Jun 2025 21:24:20 +0100 Subject: [PATCH 21/35] Add m_stats getter and refactor compute_statistics out of load_imatrix --- tools/imatrix/imatrix.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 19e56343e1bab..3ced209fc4edb 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -58,7 +58,8 @@ class IMatrixCollector { void set_params(common_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix(int ncall = -1) const; - bool load_imatrix(const char * fname, std::vector * tstats = nullptr); + bool load_imatrix(const char * fname); + const std::unordered_map & get_mstats() const { return m_stats; } private: std::unordered_map m_stats; common_params m_params; @@ -116,8 +117,12 @@ static void process_tensor_name(const std::string & input, std::string & layer, } } -static void compute_statistics(std::vector & tstats, const std::string & name, const Stats & e, const std::vector & activations) { - if (activations.empty()) { +static void compute_statistics(std::vector & tstats, const std::string & name, const Stats & e) { + if (e.values.size() != e.counts.size()) { + LOG_ERR("%s: activation size mismatch for tensor %s (%zu vs %zu)\n", __func__, name.c_str(), e.counts.size(), e.values.size()); + return; + } + if (e.counts.empty()) { LOG_ERR("%s: there are no activations for tensor %s. 
The imatrix may be suboptimal\n", __func__, name.c_str()); return; } @@ -413,7 +418,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); } -bool IMatrixCollector::load_imatrix(const char * fname, std::vector * tstats) { +bool IMatrixCollector::load_imatrix(const char * fname) { std::ifstream in(fname, std::ios::binary); if (!in) { LOG_ERR("%s: failed to open %s\n",__func__, fname); @@ -461,18 +466,11 @@ bool IMatrixCollector::load_imatrix(const char * fname, std::vector activations; - activations.reserve(nval); for (int i = 0; i < nval; i++) { e.values[i] += tmp[i]; e.counts[i] += ncall; - activations.push_back(e.values[i] / e.counts[i]); } e.ncall += ncall; - - if (tstats) { - compute_statistics(*tstats, name_as_vec.data(), e, activations); - } } return true; @@ -714,11 +712,14 @@ static bool show_statistics(const common_params & params) { LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n"); return false; } - if (!g_collector.load_imatrix(params.in_files[0].c_str(), &ts)) { + if (g_collector.load_imatrix(params.in_files[0].c_str())) { + for (const auto & [name, stats] :g_collector.get_mstats()) { + compute_statistics(ts, name, stats); + } + } else { LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str()); return false; } - if (!ts.empty()) { compute_cossim(ts); } else { From bc3bd576f5b2b5787d50fae9ce2d974bdcd67f3d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 24 Jun 2025 21:25:19 +0100 Subject: [PATCH 22/35] Refactor variable names --- tools/imatrix/imatrix.cpp | 71 +++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 3ced209fc4edb..008454fb5dbf4 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -40,16 +40,16 @@ struct Stats { struct tensor_statistics { 
std::string tensor; Stats stats; - float total_bias = 0; - float mean_bias = 0; - float max_bias = 0; - float min_bias = 0; - int elements = 0; - float stddev = 0; - float active = 0; - float entropy = 0; - float zd = 0; - float cossim = 0; + float total_sqract = 0.0f; + float mean_sqract = 0.0f; + float max_sqract = 0.0f; + float min_sqract = 0.0f; + int elements = 0; + float stddev = 0.0f; + float active = 0.0f; + float entropy = 0.0f; + float zd = 0.0f; + float cossim = 0.0f; }; class IMatrixCollector { @@ -127,31 +127,38 @@ static void compute_statistics(std::vector & tstats, const st return; } - const float total = std::accumulate(activations.begin(), activations.end(), 0.0f); - const float max = *std::max_element(activations.begin(), activations.end()); - const float min = *std::min_element(activations.begin(), activations.end()); - const float mean = total / activations.size(); - const float sq_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f); - const float variance = (sq_total / activations.size()) - (mean * mean); - const float dev = std::sqrt(std::max(0.0f, variance)); - float threshold = 1e-6f; - const int inactive_count = std::count_if(activations.begin(), activations.end(), + const int size = e.counts.size(); + std::vector activations; + activations.reserve(size); + for (int i = 0; i < size; i++) { + activations.push_back(e.values[i] / e.counts[i]); + } + + const float act_total = std::accumulate(activations.begin(), activations.end(), 0.0f); + const float act_max = *std::max_element(activations.begin(), activations.end()); + const float act_min = *std::min_element(activations.begin(), activations.end()); + const float act_mean = act_total / activations.size(); + const float act_sqr_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f); + const float act_var = (act_sqr_total / activations.size()) - (act_mean * act_mean); + const float act_dev = std::sqrt(std::max(0.0f, 
act_var)); + float threshold = 1e-5f; + const int inactive_count = std::count_if(activations.begin(), activations.end(), [threshold](const float v) { return fabs(v) <= threshold; }); - const float active_ratio = 1 - static_cast(inactive_count) / activations.size(); + const float active_ratio = 1 - static_cast(inactive_count) / activations.size(); float entropy = 0; - if (total > 0) { + if (act_total > 0) { for (const auto act : activations) { - if (const float p = act / total; p > 0) { + if (const float p = act / act_total; p > 0) { entropy -= p * std::log2(p); } } } int z_score = 0; - if (dev > 0.0f) { + if (act_dev > 0.0f) { for (const auto act : activations) { - if (const float p = (act - mean) / dev; p > 1) { + if (const float p = (act - act_mean) / act_dev; p > 1) { z_score++; } } @@ -160,12 +167,12 @@ static void compute_statistics(std::vector & tstats, const st auto & ts = tstats.emplace_back(); ts.tensor = name; ts.stats = e; - ts.total_bias = total; - ts.mean_bias = mean; - ts.max_bias = max; - ts.min_bias = min; + ts.total_sqract = act_total; + ts.mean_sqract = act_mean; + ts.max_sqract = act_max; + ts.min_sqract = act_min; ts.elements = static_cast(activations.size()); - ts.stddev = dev; + ts.stddev = act_dev; ts.active = active_ratio; ts.entropy = entropy; ts.zd = static_cast(z_score) / ts.elements; @@ -733,7 +740,7 @@ static bool show_statistics(const common_params & params) { ; process_tensor_name(a.tensor, layer, name_a); process_tensor_name(b.tensor, layer, name_b); - return name_a < name_b || (name_a == name_b && a.total_bias > b.total_bias); + return name_a < name_b || (name_a == name_b && a.total_sqract > b.total_sqract); } }; std::sort(ts.begin(), ts.end(), tensor_comparer()); @@ -765,11 +772,11 @@ static bool show_statistics(const common_params & params) { } LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n", - layer.c_str(), name.c_str(), tstat.total_bias, tstat.min_bias, tstat.max_bias, 
tstat.mean_bias, + layer.c_str(), name.c_str(), tstat.total_sqract, tstat.min_sqract, tstat.max_sqract, tstat.mean_sqract, tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy, 100.0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim); - const float weighted_bias = tstat.elements * tstat.total_bias; + const float weighted_bias = tstat.elements * tstat.total_sqract; const float weighted_zd = tstat.elements * tstat.zd; const float weighted_cossim = tstat.elements * tstat.cossim; From fde3089910a65a8f697f68060cb56bc2edb0148f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 29 Jun 2025 11:59:20 +0100 Subject: [PATCH 23/35] Minor cosmetic change --- tools/imatrix/imatrix.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 008454fb5dbf4..34466c557bfec 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -798,7 +798,7 @@ static bool show_statistics(const common_params & params) { const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; }); LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers); LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Act²)", " μZD", "μCosSim"); - LOG_INF("===============================================\n"); + LOG_INF("================================================\n"); for (const auto & [first, second] : ws) { const auto & layer = first; const auto & stats = second; From c5a3d0abd5623f3d15aeb95462e7df68b3531181 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 1 Jul 2025 06:54:09 +0100 Subject: [PATCH 24/35] Retrigger checks (empty commit) From b1c481ac34e59cc4f1b4612c30f2cbac657ec761 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 5 Jul 2025 21:30:58 +0100 Subject: [PATCH 25/35] Rerun checks (empty commit) From dd1317592ffe9046a396ad52045308993287aea1 Mon Sep 17 00:00:00 2001 From: Ed Addario 
<29247825+EAddario@users.noreply.github.com> Date: Mon, 7 Jul 2025 21:07:28 +0100 Subject: [PATCH 26/35] Fix unnecessary type promotion Co-authored-by: compilade --- tools/imatrix/imatrix.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 34466c557bfec..2957bc8019326 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -143,7 +143,7 @@ static void compute_statistics(std::vector & tstats, const st const float act_dev = std::sqrt(std::max(0.0f, act_var)); float threshold = 1e-5f; const int inactive_count = std::count_if(activations.begin(), activations.end(), - [threshold](const float v) { return fabs(v) <= threshold; }); + [threshold](const float v) { return fabsf(v) <= threshold; }); const float active_ratio = 1 - static_cast(inactive_count) / activations.size(); float entropy = 0; From 0cd8e676b3aaad609e9a95f6e317018b64ea9fb6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 7 Jul 2025 21:31:54 +0100 Subject: [PATCH 27/35] Reverting change to improve code readability --- tools/imatrix/imatrix.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 2957bc8019326..5bca411af5ac4 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -865,6 +865,7 @@ int main(int argc, char ** argv) { // init common_init_result llama_init = common_init_from_params(params); + llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); From 68263419f0b6cab48020d49b7b7027ea245c3ca2 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 7 Jul 2025 23:37:48 +0100 Subject: [PATCH 28/35] Rerun checks (empty commit) From 432650b5dc1f118c046aa681af5ae570d9f38d3c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 8 Jul 2025 08:27:06 +0100 Subject: [PATCH 29/35] Rerun checks (empty commit) From 61a21a4f35d5391365e536f4314af29fe06c0329 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: 
Wed, 9 Jul 2025 10:51:11 +0100 Subject: [PATCH 30/35] =?UTF-8?q?Rerun=20checks=20-=20third=20time's=20the?= =?UTF-8?q?=20Charm=20=F0=9F=A4=9E=20(empty=20commit)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From a3fdb2bdbd1ef024d7e373955896e920be95f965 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 12 Jul 2025 09:51:37 +0100 Subject: [PATCH 31/35] Minor cosmetic change --- tools/imatrix/imatrix.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 5bca411af5ac4..fd3c5863b1ac2 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -27,7 +27,7 @@ static void print_usage(int, char ** argv) { LOG("\n %s -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output]\n" " [--chunk 123] [--output-frequency 10] [--save-frequency 0] [--show-statistics]\n" " [--no-ppl] [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" - " [--parse-special]\n" , argv[0]); + " [--parse-special] [...]\n" , argv[0]); LOG("\n"); } From f9391bd6ed7183dabd79ead4e5ef6e1fc4db9de2 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 12 Jul 2025 09:52:08 +0100 Subject: [PATCH 32/35] Update README --- tools/imatrix/README.md | 73 +++++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/tools/imatrix/README.md b/tools/imatrix/README.md index 6d8897d98bb61..f914ae89691d9 100644 --- a/tools/imatrix/README.md +++ b/tools/imatrix/README.md @@ -1,33 +1,80 @@ # llama.cpp/tools/imatrix Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models. 
-More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861 +More information is [available here](https://github.com/ggml-org/llama.cpp/pull/4861) ## Usage ``` ./llama-imatrix \ - -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \ - [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \ - [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] + -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \ + [--chunk 123] [--output-frequency 10] [--save-frequency 0] [--show-statistics] \ + [--no-ppl] [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] \ + [--parse-special] [...] ``` -Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory. +Here `-m | --model` with a model name and `-f | --file` with a file containing calibration data (such as e.g. `wiki.train.raw`) are mandatory. The parameters in square brackets are optional and have the following meaning: -* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used. -* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`. -* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks) +* `-h | --help` shows usage information and exits. +* `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. 
Default verbosity level is `1`. +* `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing, `imatrix.dat` is used. +* `-ofreq | --output-frequency` specifies how often the result computed so far is saved to disk. Default is 10 (i.e., every 10 chunks) * `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never) -* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. +* `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. +* `--in-file` specifies one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets. +* `--parse-special` enables parsing of special tokens (e.g., `<|im_start|>` in some models). Useful for models with custom tokenizers. +* `--chunk` skips the first `n` chunks of tokens from the input data. Useful for resuming or skipping initial low-quality data. +* `-n | --n-chunks` specifies the maximum number of chunks to process. Default is -1 for all available chunks. +* `--no-ppl` disables the calculation of perplexity for the processed chunks. Useful if you want to speed up the processing and do not care about perplexity. +* `--show-statistics` displays the imatrix file's statistics and then exits. 
-For faster computation, make sure to use GPU offloading via the `-ngl` argument +For faster computation, make sure to use GPU offloading via the `-ngl | --n-gpu-layers` argument -## Example +## Examples ```bash -# generate importance matrix (imatrix.dat) -./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 +# generate importance matrix using default filename (imatrix.dat), offloading 99 layers to GPU +./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -ngl 99 # use the imatrix to perform a Q4_K_M quantization ./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m ``` + +```bash +# combine Existing imatrices +./llama-imatrix --in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat -o imatrix-combined.dat +``` + +```bash +# skip first 5 chunks, save intermediates every 20 chunks and snapshots every 50, parsing special tokens +./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --chunk 5 --output-frequency 20 --save-frequency 50 --parse-special +``` + +```bash +# analyse imatrix file and display summary statistics instead of running inference +./llama-imatrix --imatrix imatrix.dat --show-statistics +``` + +`--show-statistics` will display the following statistics: + +#### Per tensor + +* Σ(Act²): sum of all squared activations (the importance scores) +* Min & Max: minimum and maximum squared activation values +* μ & σ: Squared activations' mean and standard deviation +* % Active: proportion of elements whose average squared activation exceeds a small threshold (1e-5). Helpful to determine how alive/dormant the tensor is during inference +* N: number of squared activations +* Entropy: entropy of the squared activation distribution, in bits (standard Shannon entropy measurement) $S = -\sum_{i=1}^N p_i \log_2 p_i$ +* E (norm): Normalized entropy. $E(norm)=\frac{-\sum_{i=1}^N p_i \log_2 p_i}{\log_2 N}$. 
These two metrics can be used to determine how well a prompt "exercises" the model's capabilities +* ZD Score: z-score distribution as described in _3.1 Layer Importance Scores_ of [Layer-Wise Quantization](https://arxiv.org/abs/2406.17415) +* CosSim: cosine similarity with respect to the previous layer's tensor. Useful to determine how similar the squared activations of the current layer are to the previous layer's squared activations. + +#### Per layer + +Weighted averages of Σ(Act²), ZD Score and CosSim are also calculated. + +#### Important note on the computed Statistics + +When using these statistics, please note that they are computed on the squared activations, **not on the actual (raw) activations**. +Whilst the results are still useful, they're less accurate than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors. +This limitation is due to the current implementation of the importance matrix, but a pull request ([use GGUF to store importance matrices](https://github.com/ggml-org/llama.cpp/pull/9400)) aims to address this. 
From 98bcd3e55bbd10fbf02c68c51995e06156fb39f4 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 12 Jul 2025 12:03:38 +0100 Subject: [PATCH 33/35] Fix typo --- tools/imatrix/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/imatrix/README.md b/tools/imatrix/README.md index f914ae89691d9..c5ac382bf5005 100644 --- a/tools/imatrix/README.md +++ b/tools/imatrix/README.md @@ -52,7 +52,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl | --n-gpu- ```bash # analyse imatrix file and display summary statistics instead of running inference -./llama-imatrix --imatrix imatrix.dat --show-statistics +./llama-imatrix --in-file imatrix.dat --show-statistics ``` `--show-statistics` will display the following statistics: From 69a0b1712e951643f4cca69fbdc3952871205bd1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 13 Jul 2025 17:43:58 +0100 Subject: [PATCH 34/35] Update README --- tools/imatrix/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/imatrix/README.md b/tools/imatrix/README.md index c5ac382bf5005..3de1d3adda297 100644 --- a/tools/imatrix/README.md +++ b/tools/imatrix/README.md @@ -41,7 +41,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl | --n-gpu- ``` ```bash -# combine Existing imatrices +# combine existing imatrices ./llama-imatrix --in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat -o imatrix-combined.dat ``` @@ -76,5 +76,4 @@ Weighted averages of Σ(Act²), ZD Score and CosSim are also calculated. #### Important note on the computed Statistics When using these statistics, please note that they are computed on the squared activations, **not on the actual (raw) activations**. -Whilst the results are still useful, they're less accurate than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors. 
-This limitation is due to the current implementation of the importance matrix, but a pull request ([use GGUF to store importance matrices](https://github.com/ggml-org/llama.cpp/pull/9400)) aims to address this. +Whilst the results are still useful, they're less reliable than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors. From 9f2c5585c5d1186fbd8ce5a6152ad60ff4bf589f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 13 Jul 2025 20:26:44 +0100 Subject: [PATCH 35/35] Rerun checks (empty commit)