
Commit c443aae

refactor: Encapsulate args and quantize_impl into examples/
constexpr also fixed ODR problems.
1 parent d1399e8 commit c443aae
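
The ODR remark presumably refers to constants that had been defined in a header shared by several examples: a plain (non-const) global defined in a header becomes a separate definition of the same external symbol in every translation unit that includes it, and the link step fails with duplicate symbols. Marking such a table constexpr implies const, which gives it internal linkage, so each translation unit keeps its own private copy and the clash disappears. A minimal sketch of the pattern, under that assumption (kExampleTable is an illustrative name, not a symbol from this commit):

    // shared.h -- hypothetical header included by several example targets
    #include <array>

    // std::array<const char *, 2> kExampleTable = {"a", "b"};
    // ^ a plain global here would be defined in every TU that includes this
    //   header, violating the One Definition Rule and producing duplicate-symbol
    //   link errors.

    // constexpr implies const, so the table has internal linkage: each TU gets
    // its own copy and the linker never sees a duplicate definition.
    constexpr std::array<const char *, 2> kExampleTable = {"a", "b"};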

18 files changed, with 358 additions and 343 deletions.


examples/CMakeLists.txt

Lines changed: 9 additions & 4 deletions
@@ -1,12 +1,17 @@
 # examples
 
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+add_library(examples_common
+    args.cpp
+    args.h
+    audio_file.h
+)
+target_include_directories(examples_common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(examples_common PUBLIC ggml tts)
 
-if (EMSCRIPTEN)
-else()
+if (NOT EMSCRIPTEN)
     add_subdirectory(cli)
     add_subdirectory(perf_battery)
     add_subdirectory(quantize)
     add_subdirectory(server)
     add_subdirectory(phonemize)
-endif()
+endif ()
3 files renamed without changes.

examples/cli/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -16,4 +16,4 @@ if (SDL2_FOUND)
     set_source_files_properties(playback.cpp PROPERTIES COMPILE_FLAGS -DSDL2_INSTALL=1)
 endif()
 
-target_link_libraries(${TARGET} PRIVATE ggml tts)
+target_link_libraries(${TARGET} PRIVATE examples_common)

examples/perf_battery/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 add_executable(perf_battery perf_battery.cpp)
-target_link_libraries(perf_battery PRIVATE ggml tts)
+target_link_libraries(perf_battery PRIVATE examples_common)

examples/phonemize/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 add_executable(phonemize phonemize.cpp)
-target_link_libraries(phonemize PRIVATE ggml tts)
+target_link_libraries(phonemize PRIVATE examples_common)

examples/quantize/CMakeLists.txt

Lines changed: 6 additions & 2 deletions
@@ -1,2 +1,6 @@
-add_executable(quantize quantize.cpp)
-target_link_libraries(quantize PRIVATE ggml tts)
+add_executable(quantize
+    quantize.cpp
+    quantize_impl.cpp
+    quantize_impl.h
+)
+target_link_libraries(quantize PRIVATE examples_common)

examples/quantize/quantize.cpp

Lines changed: 6 additions & 4 deletions
@@ -1,9 +1,11 @@
-#include "tts.h"
-#include "args.h"
-#include <stdio.h>
+#include <cstdio>
+#include <vector>
 #include <thread>
+
+#include "args.h"
 #include "ggml.h"
-#include <vector>
+#include "tts.h"
+#include "quantize_impl.h"
 
 std::vector<ggml_type> valid_quantization_types = {
     GGML_TYPE_F16,

examples/quantize/quantize_impl.cpp

Lines changed: 270 additions & 0 deletions
@@ -0,0 +1,270 @@
+#include <thread>
+#include "quantize_impl.h"
+
+#include <fstream>
+#include <mutex>
+
+#include "util.h"
+
+namespace {
+bool kokoro_is_f16_compatible(std::string name) {
+    return name.find("voice_tensors") == std::string::npos &&
+        name.find("bias") == std::string::npos &&
+        name.find("gamma") == std::string::npos &&
+        name.find("beta") == std::string::npos &&
+        name.find("alpha") == std::string::npos &&
+        !has_suffix(name, "embd") &&
+        !has_suffix(name, "norm");
+}
+
+bool kokoro_is_quantizable(std::string name, struct quantization_params * params) {
+    // A list of all of the top level GGUF names under kokoro.duration_predictor that have quantization compatible tensors.
+    constexpr std::array<const char *, 5> DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS = {
+        "duration_proj",
+        "encode",
+        "shared_lstm",
+        "duration_lstm",
+        "layers"
+    };
+    if (kokoro_is_f16_compatible(name)) {
+        if (has_prefix(name, "kokoro.albert") || has_prefix(name, "kokoro.text_encoder.lstm")) {
+            return true;
+        } else if (has_prefix(name, "kokoro.duration_predictor.")) {
+            std::vector<std::string> parts = split(name, ".");
+            for (std::string part : DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS) {
+                if (part == parts[2]) {
+                    return true;
+                }
+            }
+        }
+    }
+    return false;
+}
+
+bool dia_is_quantizable(std::string name, struct quantization_params * params) {
+    // The DAC audio encoder / decoder is not compatible with quantization and normalization tensors should not be quantized.
+    bool quantizable = !has_prefix(name, "audio_encoder") && !has_suffix(name, "norm");
+    if (!params->quantize_output_heads) {
+        quantizable = quantizable && !has_prefix(name, "dia.decoder.heads");
+    }
+    return quantizable;
+}
+
+bool parler_is_quanitizable(std::string name, struct quantization_params * params) {
+    // the DAC audio encoder / decoder is not compatible with quantization, normalization weight shouldn't be quantized, and the text encoding shouldn't be normalized.
+    bool quantizable = !has_prefix(name, "audio_encoder") && !has_suffix(name, "norm.weight") && !has_suffix(name, "text_encoding") && !has_suffix(name, "positional_embed") && !has_suffix(name, "norm.bias");
+    if (!params->quantize_output_heads) {
+        quantizable = quantizable && !has_suffix(name, "weight.head");
+    }
+    if (!params->quantize_text_embeddings) {
+        quantizable = quantizable && !has_suffix(name, "embed_prompts");
+    }
+    if (!params->quantize_cross_attn_kv) {
+        quantizable = quantizable && !has_suffix(name, "encoder_attn.k_proj.weight") && !has_suffix(name, "encoder_attn.v_proj.weight");
+    }
+    return quantizable;
+}
+
+bool is_quantizable(tts_arch arch, std::string name, struct quantization_params * params) {
+    switch(arch) {
+        case PARLER_TTS_ARCH:
+            return parler_is_quanitizable(name, params);
+        case DIA_ARCH:
+            return dia_is_quantizable(name, params);
+        case KOKORO_ARCH:
+            return kokoro_is_quantizable(name, params);
+        default:
+            TTS_ABORT("%s failed. The architecture '%d' is not supported.", __func__, arch);
+    }
+}
+
+size_t quantize_tensor(void * new_data, struct ggml_tensor * tensor, const float * imatrix, enum ggml_type qtype, uint32_t n_threads) {
+    // much of this is form copied from llama.cpp
+    int chunk_size_multiplier = 1;
+    if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8 || qtype == GGML_TYPE_Q4_0_8_8) {
+        if ((qtype == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) qtype = GGML_TYPE_Q4_0;
+        else if (tensor->ne[1] % 4 != 0) qtype = GGML_TYPE_Q4_0;
+        if (qtype == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
+        else if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
+    }
+    size_t out_size = 0;
+    const int32_t d3_step = tensor->ne[0] * tensor->ne[1];
+    const int32_t n_per_row = tensor->ne[0];
+    const int32_t nrows = tensor->ne[1];
+    static const int32_t min_chunk_size = 32 * 512;
+    const int32_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * chunk_size_multiplier;
+    uint32_t thread_count = std::max(1, std::min((int)n_threads, (int)(d3_step + chunk_size - 1) / chunk_size));
+    std::mutex mutex;
+
+    for (int32_t d3_index = 0; d3_index < tensor->ne[2]; d3_index++) {
+        const float * f32_data_d3 = ((float *) tensor->data) + d3_index * d3_step;
+        void * new_data_d3 = (char *)new_data + ggml_row_size(qtype, tensor->ne[0]) * d3_index * nrows;
+        const float * imatrix_03 = imatrix ? imatrix + d3_index * tensor->ne[0] : nullptr;
+        if (thread_count <= 1) {
+            // not threaded
+            out_size += ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, 0, nrows, n_per_row, imatrix);
+        } else {
+            std::vector <std::thread> threads;
+            int64_t counter = 0;
+            size_t new_size = 0;
+            bool valid = true;
+            for (uint32_t t = 0; t < thread_count; t++) {
+                auto func = [&mutex, &counter, &new_size, &valid, qtype, f32_data_d3, new_data_d3, chunk_size, nrows, n_per_row, imatrix]() {
+                    const int64_t nrows_per_chunk = chunk_size / n_per_row;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        int64_t first_row = counter;
+                        counter += nrows_per_chunk;
+                        if (first_row >= nrows) {
+                            if (local_size > 0) {
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+                        size_t this_size = ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+                        local_size += this_size;
+
+                        // validate the quantized data; I am not sure how this would occur, but there is always the safe fallback on doing this single threaded.
+                        const size_t row_size = ggml_row_size(qtype, n_per_row);
+                        void * this_data = (char *) new_data_d3 + first_row * row_size;
+                        if (!ggml_validate_row_data(qtype, this_data, this_size)) {
+                            std::unique_lock<std::mutex> lock(mutex);
+                            valid = false;
+                            break;
+                        }
+                    }
+                };
+                threads.push_back(std::thread(func));
+            }
+            for (auto & t : threads) t.join();
+
+            if (!valid) {
+                TTS_ABORT("Validation of quantized data failed. Please try again and/or switch to single thread quantization.\n");
+            }
+            out_size += new_size;
+        }
+    }
+    return out_size;
+}
+
+void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
+
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+}
+
+void quantize_gguf(const std::string & ifile, const std::string & ofile, struct quantization_params * params) {
+    ggml_context * weight_ctx = NULL;
+    struct gguf_init_params gguf_params = {
+        /*.no_alloc =*/ false,
+        /*.ctx =*/ &weight_ctx,
+    };
+    gguf_context * meta_ctx = gguf_init_from_file(ifile.c_str(), gguf_params);
+    std::string arch = "parler-tts"; // only parler-tts gguf files should lack an explicit architecture.
+
+    if (int arch_key = gguf_find_key(meta_ctx, "general.architecture"); arch_key != -1) {
+        arch = gguf_get_val_str(meta_ctx, arch_key);
+    }
+    const tts_arch arch_type{parse_arch_type(ifile.c_str(), arch)};
+
+    if (params->quantize_type != GGML_TYPE_Q5_0 && params->quantize_type != GGML_TYPE_Q8_0 && params->quantize_type != GGML_TYPE_F16 && params->quantize_type != GGML_TYPE_Q4_0) {
+        fprintf(stdout, "Warning, %s is untested for quantization type '%d'. Use at your own risk.\n", arch.c_str(), params->quantize_type);
+    }
+
+    const size_t align = GGUF_DEFAULT_ALIGNMENT;
+    gguf_context_ptr ctx_out { gguf_init_empty() };
+
+    // copy the KV pairs from the input file
+    gguf_set_kv(ctx_out.get(), meta_ctx);
+    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out.get(), "general.quantization_type", params->quantize_type);
+    for (ggml_tensor * tensor = ggml_get_first_tensor(weight_ctx); tensor; tensor = ggml_get_next_tensor(weight_ctx, tensor)) {
+        std::string name = ggml_get_name(tensor);
+        if (name.size() != 0) {
+            gguf_add_tensor(ctx_out.get(), tensor);
+        }
+    }
+
+    std::vector<no_init<uint8_t>> work;
+
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out.get()));
+            gguf_get_meta_data(ctx_out.get(), data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&]() {
+        std::string fname = ofile;
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_out.get());
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };
+    new_ofstream();
+    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
+        enum ggml_type new_type;
+        void * new_data;
+        size_t new_size;
+        std::string name = ggml_get_name(cur);
+
+        if (name.size() == 0) {
+            continue;
+        }
+
+        if (is_quantizable(arch_type, name, params)) {
+            if ((cur->type) != GGML_TYPE_F32) {
+                TTS_ABORT("ERROR: All quantized tensors must be transformed from 32bit floats. Tensor, '%s', has improper type, '%d'\n", cur->name, cur->type);
+            }
+            new_type = params->quantize_type;
+            if ((new_type >= GGML_TYPE_IQ2_XXS && new_type <= GGML_TYPE_IQ4_XS)) {
+                TTS_ABORT("ERROR: Quantization type '%d' requires an importance matrix.\n", new_type);
+            }
+            const int64_t nelement_size = ggml_nelements(cur) * 4;
+            if (work.size() < (size_t)nelement_size) {
+                work.resize(nelement_size); // upper bound on size
+            }
+            new_data = work.data();
+            new_size = quantize_tensor(new_data, cur, nullptr, new_type, params->n_threads);
+        } else if ((params->convert_non_quantizable_to_f16 && kokoro_is_f16_compatible(name)) || (params->convert_dac_to_f16 && has_prefix(name, "audio_encoder") && !has_suffix(name, "alpha"))) {
+            if ((cur->type) != GGML_TYPE_F32) {
+                TTS_ABORT("ERROR: All converted tensors must be transformed from 32bit floats. Tensor, '%s', has improper type, '%d'\n", cur->name, cur->type);
+            }
+            new_type = GGML_TYPE_F16;
+            const int64_t nelement_size = ggml_nelements(cur) * 4;
+            if (work.size() < (size_t)nelement_size) {
+                work.resize(nelement_size); // upper bound on size
+            }
+            new_data = work.data();
+            new_size = quantize_tensor(new_data, cur, nullptr, new_type, params->n_threads);
+        } else {
+            new_type = cur->type;
+            new_data = cur->data;
+            new_size = ggml_nbytes(cur);
+        }
+
+        gguf_set_tensor_type(ctx_out.get(), name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_out.get(), name.c_str(), new_data, new_size);
+        fprintf(stdout, "At tensor: '%s' with new size: %zu bytes\n", name.c_str(), new_size);
+        // write tensor data + padding
+        fout.write((const char *) new_data, new_size);
+        zeros(fout, GGML_PAD(new_size, align) - new_size);
+    }
+    close_ofstream();
+}
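
The companion header examples/quantize/quantize_impl.h is listed in the CMake target but its contents are not part of this view. A minimal sketch of what such a header would need to declare so that quantize.cpp can call into the code above; the include-guard style and the forward declaration are assumptions, not the committed file:

    // examples/quantize/quantize_impl.h -- illustrative sketch only
    #pragma once

    #include <string>

    // Defined elsewhere in the project (its definition must be visible to
    // quantize_impl.cpp through one of its includes); quantize_impl.cpp reads
    // n_threads, quantize_type, quantize_output_heads, quantize_text_embeddings,
    // quantize_cross_attn_kv, convert_non_quantizable_to_f16 and
    // convert_dac_to_f16 from it.
    struct quantization_params;

    // Reads the GGUF model at ifile, quantizes or converts the eligible
    // tensors, and writes the result to ofile.
    void quantize_gguf(const std::string & ifile, const std::string & ofile, struct quantization_params * params);

With a declaration along these lines, quantize.cpp only needs the #include "quantize_impl.h" added in its diff above plus a populated quantization_params before calling quantize_gguf.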
