
Commit e4122e5

kokoro: Fix double free
kokoro_model was being freed twice: first by kokoro_duration_runner while it was still in use by kokoro_runner, which then hit a use-after-free when it tried to free it properly. kokoro_context was also double-freeing some backend data that its base class runner_context would later free again, and prepare_post_load had a mismatched new/free.

After removing the double free, I added unique_ptr so this part keeps RAII-based ownership; reference_wrapper is used in another PR to indicate non-ownership. Nearby pointers have not been upgraded to unique_ptr because they involve backends, whose ownership has been difficult to untangle.

Before, there was a SIGSEGV on post-args-refactor server shutdown; after, the server exits cleanly on Ctrl+C. This isn't visible on the main branch, which just leaks the memory instead.
1 parent ace4295 commit e4122e5
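
As a minimal sketch of the ownership pattern this commit moves toward (illustrative type names, not the actual class definitions): exactly one runner owns the model through std::unique_ptr, the other keeps a plain non-owning pointer and no longer deletes it, so the model is destroyed exactly once.

// Sketch only: stand-ins for kokoro_runner / kokoro_duration_runner.
#include <memory>

struct model_t { /* weights, backends, ... */ };

struct duration_runner {
    model_t * model;                 // non-owning: its destructor must NOT delete this
    explicit duration_runner(model_t * m) : model{m} {}
};

struct speech_runner {
    std::unique_ptr<model_t> model;  // owning: freed exactly once, automatically
    duration_runner * drunner;
    speech_runner(std::unique_ptr<model_t> m, duration_runner * d)
        : model{std::move(m)}, drunner{d} {}
    ~speech_runner() { delete drunner; }  // frees the helper runner, not the shared model
};

int main() {
    auto model = std::make_unique<model_t>();
    auto * d   = new duration_runner(&*model);   // raw, non-owning view of the model
    speech_runner runner{std::move(model), d};   // runner now owns the model
}   // ~speech_runner deletes drunner; the unique_ptr deletes the model once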

File tree

5 files changed: +33, -34 lines


include/common.h

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ struct tts_runner {
     tts_arch arch;
     struct ggml_context * ctx = nullptr;
     float sampling_rate = 44100.0f;
+    virtual ~tts_runner() = default;
 
     void init_build(std::vector<uint8_t>* buf_compute_meta);
     void free_build();
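
For context on the single line added here (a generic sketch, not project code): the factory functions in tts.cpp hand back a tts_runner*, and deleting a derived runner through that base pointer is undefined behaviour unless the base destructor is virtual; the derived destructor would simply never run.

// Generic illustration: a polymorphically deleted base class needs a virtual destructor.
#include <cstdio>

struct base {
    virtual ~base() = default;   // without 'virtual', ~derived() below would never run
};

struct derived : base {
    ~derived() override { std::puts("derived cleaned up"); }
};

int main() {
    base * r = new derived();
    delete r;                    // prints "derived cleaned up" because ~base() is virtual
}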

src/kokoro_model.cpp

Lines changed: 3 additions & 3 deletions
@@ -958,7 +958,7 @@ struct ggml_cgraph * kokoro_duration_runner::build_kokoro_duration_graph(kokoro_
     kctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
     ggml_set_input(kctx->positions);
 
-    inpL = build_albert_inputs(ctx, model, kctx->inp_tokens, kctx->positions, kctx->token_types);
+    inpL = build_albert_inputs(ctx, &*model, kctx->inp_tokens, kctx->positions, kctx->token_types);
     ggml_set_name(inpL, "albert_embeddings");
     cur = inpL;
 
@@ -1233,7 +1233,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
     ggml_set_input(kctx->window_sq_sum);
 
     // run generation
-    cur = build_generator(ctx, model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
+    cur = build_generator(ctx, &*model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
     ggml_build_forward_expand(gf, cur);
     free_build();
     return gf;
@@ -1245,7 +1245,7 @@ void kokoro_runner::prepare_post_load() {
     auto batch = build_worst_case_batch();
     auto gf = build_kokoro_graph(batch);
     kctx->prep_schedule(gf);
-    free(batch.resp);
+    delete batch.resp;
 }
 
 void kokoro_runner::set_inputs(kokoro_ubatch & batch, uint32_t total_size) {
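
A side note on the &*model spelling used in these hunks (a small self-contained sketch, not project code): for a std::unique_ptr it yields the same raw, non-owning pointer as .get(), so the graph-building helpers keep receiving a plain kokoro_model* while ownership stays with the runner.

// Sketch: &*p and p.get() produce the same raw, non-owning pointer for a unique_ptr.
#include <cassert>
#include <memory>

int main() {
    auto p = std::make_unique<int>(42);
    int * raw1 = &*p;       // operator*, then address-of
    int * raw2 = p.get();   // the explicit accessor
    assert(raw1 == raw2);   // both point at the same owned object; neither owns it
}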

src/kokoro_model.h

Lines changed: 11 additions & 19 deletions
@@ -371,13 +371,15 @@ struct kokoro_duration_response {
 // Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
 // support the tensor dependent views that would otherwise be necessary.
 struct kokoro_duration_runner : tts_runner {
-    kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
+    explicit kokoro_duration_runner(/* shared */ kokoro_model * model, kokoro_duration_context * context,
+                                    single_pass_tokenizer * tokenizer)
+        : tokenizer{tokenizer}, model{model}, kctx{context} {
+    };
+
     ~kokoro_duration_runner() {
         if (ctx) {
             ggml_free(ctx);
         }
-        model->free();
-        delete model;
         delete kctx;
     }
     struct single_pass_tokenizer * tokenizer;
@@ -396,17 +398,7 @@ struct kokoro_duration_runner : tts_runner {
 };
 
 struct kokoro_context : runner_context {
-    kokoro_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
-    ~kokoro_context() {
-        ggml_backend_sched_free(sched);
-        ggml_backend_free(backend_cpu);
-        if (backend) {
-            ggml_backend_free(backend);
-        }
-        if (buf_output) {
-            ggml_backend_buffer_free(buf_output);
-        }
-    }
+    explicit kokoro_context(kokoro_model * model, int n_threads) : runner_context{n_threads}, model{model} {}
 
     std::string voice = "af_alloy";
 
@@ -437,21 +429,21 @@ struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, in
 
 // This manages the graph compilation of computation for the Kokoro model.
 struct kokoro_runner : tts_runner {
-    kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) {
-        tts_runner::sampling_rate = 24000.0f;
+    explicit kokoro_runner(unique_ptr<kokoro_model> && model, kokoro_context * context,
+                           single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr)
+        : tokenizer{tokenizer}, model{move(model)}, kctx{context}, drunner{drunner}, phmzr{phmzr} {
+        sampling_rate = 24000.0f;
     };
     ~kokoro_runner() {
        if (ctx) {
            ggml_free(ctx);
        }
        delete drunner;
-        model->free();
-        delete model;
        delete kctx;
        delete phmzr;
     }
     struct single_pass_tokenizer * tokenizer;
-    kokoro_model * model;
+    unique_ptr<kokoro_model> model;
     kokoro_context * kctx;
     kokoro_duration_runner * drunner;
     phonemizer * phmzr;
src/tts.cpp

Lines changed: 17 additions & 11 deletions
@@ -31,23 +31,23 @@ struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context *
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
-    kokoro_model * model = new kokoro_model;
+    unique_ptr<kokoro_model> model = make_unique<kokoro_model>();
     single_pass_tokenizer * spt = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens");
     model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
-    struct kokoro_duration_context * kdctx = build_new_duration_kokoro_context(model, n_threads, cpu_only);
-    struct kokoro_duration_runner * duration_runner = new kokoro_duration_runner(model, kdctx, spt);
-    struct kokoro_context * kctx = build_new_kokoro_context(model, n_threads, cpu_only);
+    kokoro_duration_context * kdctx = build_new_duration_kokoro_context(&*model, n_threads, cpu_only);
+    kokoro_duration_runner * duration_runner = new kokoro_duration_runner(&*model, kdctx, spt);
+    kokoro_context * kctx = build_new_kokoro_context(&*model, n_threads, cpu_only);
     // if an espeak voice id wasn't specifically set infer it from the kokoro voice, if it was override it, otherwise fallback to American English.
     std::string espeak_voice_id = config->espeak_voice_id;
     if (espeak_voice_id.empty()) {
         espeak_voice_id = !config->voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(config->voice.at(0)) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[config->voice.at(0)] : "gmw/en-US";
     }
-    struct phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
-    struct kokoro_runner * runner = new kokoro_runner(model, kctx, spt, duration_runner, phmzr);
+    phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
+    kokoro_runner * runner = new kokoro_runner(move(model), kctx, spt, duration_runner, phmzr);
 
     // TODO: change this weight assignment pattern to mirror llama.cpp
     for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
@@ -60,7 +60,7 @@ struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * wei
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
@@ -85,7 +85,7 @@ struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 // currently only metal and cpu devices are supported, so cpu_only only describes whether or not to try to load and run on metal.
@@ -137,8 +137,14 @@ int generate(tts_runner * runner, std::string sentence, struct tts_response * re
 }
 
 void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) {
-    int n_threads = ((parler_tts_runner*)runner)->pctx->n_threads;
-    ((parler_tts_runner*)runner)->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
+    const auto parler{dynamic_cast<parler_tts_runner *>(runner)};
+    if (!parler) {
+        fprintf(stderr, "Wrong model for conditional prompt\n");
+        return;
+    }
+
+    const int n_threads = parler->pctx->n_threads;
+    parler->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
 }
 
 bool kokoro_is_f16_compatible(std::string name) {
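
The new guard in update_conditional_prompt relies on dynamic_cast, which only works because tts_runner is now polymorphic thanks to the virtual destructor added in common.h. A minimal sketch of the pattern (illustrative names, not the real classes):

// Sketch: guarded downcast instead of a blind C-style cast.
#include <cstdio>

struct base_runner { virtual ~base_runner() = default; };        // polymorphic base
struct parler_runner  : base_runner { int n_threads = 4; };
struct kokoro_runner_ : base_runner {};

void parler_only_op(base_runner * runner) {
    const auto parler = dynamic_cast<parler_runner *>(runner);    // nullptr if the type is wrong
    if (!parler) {
        std::fprintf(stderr, "Wrong model for conditional prompt\n");
        return;                                                   // the old cast would have charged ahead
    }
    std::printf("running with %d threads\n", parler->n_threads);
}

int main() {
    kokoro_runner_ k;
    parler_only_op(&k);   // safely rejected instead of reading through the wrong type
}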
