Skip to content

Add User-Friendly Quantize Type #93

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 16 additions & 14 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
@@ -1,24 +1,25 @@
#include "tts.h"
#include "args.h"
#include <stdio.h>
#include <thread>
#include "ggml.h"
#include <map>
#include <vector>

std::vector<ggml_type> valid_quantization_types = {
GGML_TYPE_F16,
GGML_TYPE_Q4_0,
GGML_TYPE_Q5_0,
GGML_TYPE_Q8_0,
#include "args.h"
#include "ggml.h"
#include "tts.h"

// Translation table from the user-facing quantization type name (the value
// accepted by --quantized-type) to the corresponding ggml tensor type enum.
// Kept ordered (std::map) so any future listing of supported names is stable.
const std::map<std::string, ggml_type> valid_quantization_types = {
    std::make_pair("FP16", GGML_TYPE_F16),
    std::make_pair("Q4_0", GGML_TYPE_Q4_0),
    std::make_pair("Q5_0", GGML_TYPE_Q5_0),
    std::make_pair("Q8_0", GGML_TYPE_Q8_0)
};

int main(int argc, const char ** argv) {
int default_quantization = (int) GGML_TYPE_Q4_0;
int default_n_threads = std::max((int)std::thread::hardware_concurrency(), 1);
arg_list args;
args.add_argument(string_arg("--model-path", "(REQUIRED) The local path of the gguf model file for Parler TTS mini v1 to quantize.", "-mp", true));
args.add_argument(string_arg("--quantized-model-path", "(REQUIRED) The path to save the model in a quantized format.", "-qp", true));
args.add_argument(int_arg("--quantized-type", "(OPTIONAL) The ggml enum of the quantized type to convert compatible model tensors to. For more information see readme. Defaults to Q4_0 quantization (2).", "-qt", false, &default_quantization));
args.add_argument(string_arg("--quantized-type", "(OPTIONAL) The ggml enum of the quantized type to convert compatible model tensors to. For more information see readme. Defaults to Q4_0 quantization (2).", "-qt", false, "Q4_0"));
args.add_argument(int_arg("--n-threads", "(OPTIONAL) The number of cpu threads to run the quantization process with. Defaults to known hardware concurrency.", "-nt", false, &default_n_threads));
args.add_argument(bool_arg("--convert-dac-to-f16", "(OPTIONAL) Whether to convert the DAC audio decoder model to a 16 bit float.", "-df"));
args.add_argument(bool_arg("--quantize-output-heads", "(OPTIONAL) Whether to quantize the output heads. Defaults to false and is true when passed (does not accept a parameter).", "-qh"));
Expand All @@ -31,12 +32,13 @@ int main(int argc, const char ** argv) {
return 0;
}
args.validate();
enum ggml_type qtype = static_cast<ggml_type>(*args.get_int_param("--quantized-type"));
if (std::find(valid_quantization_types.begin(), valid_quantization_types.end(), qtype) == valid_quantization_types.end()) {
fprintf(stderr, "ERROR: %d is not a valid quantization type.\n", qtype);
std::string qtype = args.get_string_param("--quantized-type");
if (!valid_quantization_types.contains(qtype)) {
fprintf(stderr, "ERROR: %s is not a valid quantization type.\n",
qtype.c_str());
exit(1);
}
struct quantization_params * qp = new quantization_params((uint32_t) *args.get_int_param("--n-threads"), qtype);
struct quantization_params * qp = new quantization_params((uint32_t) *args.get_int_param("--n-threads"), valid_quantization_types.at(qtype));
qp->quantize_output_heads = args.get_bool_param("--quantize-output-heads");
qp->quantize_text_embeddings = args.get_bool_param("--quantize-text-embedding");
qp->quantize_cross_attn_kv = args.get_bool_param("--quantize-cross-attn-kv");
Expand Down