Skip to content

Commit d1399e8

Browse files
committed
kokoro: Fix double free
kokoro_model was being freed twice, firstly by kokoro_duration_runner while in use by kokoro_runner, which then had a use-after-free when it tried to properly free it. kokoro_context was also double-freeing some backend data that its base class runner_context would later free again. There was a mismatched new/free in prepare_post_load. After removing the double-free, I chose to add unique_ptr to let this part hold on to RAII and ownership. reference_wrapper is instead used in another PR to indicate non-ownership. Nearby pointers have not been upgraded to unique_ptr, because they involve backends, the situation of which has been difficult to untangle. Before, there was a SIGSEGV on post-args-refactor server shutdown. After, there the server exits cleanly on Ctrl+C. This isn't visible on the main branch, which just leaks the memory.
1 parent f2ebb39 commit d1399e8

File tree

5 files changed

+33
-34
lines changed

5 files changed

+33
-34
lines changed

include/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ struct tts_runner {
5050
tts_arch arch;
5151
struct ggml_context * ctx = nullptr;
5252
float sampling_rate = 44100.0f;
53+
virtual ~tts_runner() = default;
5354

5455
void init_build(std::vector<uint8_t>* buf_compute_meta);
5556
void free_build();

src/kokoro_model.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -958,7 +958,7 @@ struct ggml_cgraph * kokoro_duration_runner::build_kokoro_duration_graph(kokoro_
958958
kctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
959959
ggml_set_input(kctx->positions);
960960

961-
inpL = build_albert_inputs(ctx, model, kctx->inp_tokens, kctx->positions, kctx->token_types);
961+
inpL = build_albert_inputs(ctx, &*model, kctx->inp_tokens, kctx->positions, kctx->token_types);
962962
ggml_set_name(inpL, "albert_embeddings");
963963
cur = inpL;
964964

@@ -1233,7 +1233,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
12331233
ggml_set_input(kctx->window_sq_sum);
12341234

12351235
// run generation
1236-
cur = build_generator(ctx, model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
1236+
cur = build_generator(ctx, &*model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
12371237
ggml_build_forward_expand(gf, cur);
12381238
free_build();
12391239
return gf;
@@ -1245,7 +1245,7 @@ void kokoro_runner::prepare_post_load() {
12451245
auto batch = build_worst_case_batch();
12461246
auto gf = build_kokoro_graph(batch);
12471247
kctx->prep_schedule(gf);
1248-
free(batch.resp);
1248+
delete batch.resp;
12491249
}
12501250

12511251
void kokoro_runner::set_inputs(kokoro_ubatch & batch, uint32_t total_size) {

src/kokoro_model.h

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -362,13 +362,15 @@ struct kokoro_duration_response {
362362
// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
363363
// support the tensor dependent views that would otherwise be necessary.
364364
struct kokoro_duration_runner : tts_runner {
365-
kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
365+
explicit kokoro_duration_runner(/* shared */ kokoro_model * model, kokoro_duration_context * context,
366+
single_pass_tokenizer * tokenizer)
367+
: tokenizer{tokenizer}, model{model}, kctx{context} {
368+
};
369+
366370
~kokoro_duration_runner() {
367371
if (ctx) {
368372
ggml_free(ctx);
369373
}
370-
model->free();
371-
delete model;
372374
delete kctx;
373375
}
374376
struct single_pass_tokenizer * tokenizer;
@@ -387,17 +389,7 @@ struct kokoro_duration_runner : tts_runner {
387389
};
388390

389391
struct kokoro_context : runner_context {
390-
kokoro_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
391-
~kokoro_context() {
392-
ggml_backend_sched_free(sched);
393-
ggml_backend_free(backend_cpu);
394-
if (backend) {
395-
ggml_backend_free(backend);
396-
}
397-
if (buf_output) {
398-
ggml_backend_buffer_free(buf_output);
399-
}
400-
}
392+
explicit kokoro_context(kokoro_model * model, int n_threads) : runner_context{n_threads}, model{model} {}
401393

402394
std::string voice = "af_alloy";
403395

@@ -428,21 +420,21 @@ struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, in
428420

429421
// This manages the graph compilation of computation for the Kokoro model.
430422
struct kokoro_runner : tts_runner {
431-
kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) {
432-
tts_runner::sampling_rate = 24000.0f;
423+
explicit kokoro_runner(unique_ptr<kokoro_model> && model, kokoro_context * context,
424+
single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr)
425+
: tokenizer{tokenizer}, model{move(model)}, kctx{context}, drunner{drunner}, phmzr{phmzr} {
426+
sampling_rate = 24000.0f;
433427
};
434428
~kokoro_runner() {
435429
if (ctx) {
436430
ggml_free(ctx);
437431
}
438432
delete drunner;
439-
model->free();
440-
delete model;
441433
delete kctx;
442434
delete phmzr;
443435
}
444436
struct single_pass_tokenizer * tokenizer;
445-
kokoro_model * model;
437+
unique_ptr<kokoro_model> model;
446438
kokoro_context * kctx;
447439
kokoro_duration_runner * drunner;
448440
phonemizer * phmzr;

src/tts.cpp

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,23 +40,23 @@ struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context *
4040
ggml_free(weight_ctx);
4141
runner->arch = arch;
4242

43-
return (tts_runner*)runner;
43+
return runner;
4444
}
4545

4646
struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
47-
kokoro_model * model = new kokoro_model;
47+
unique_ptr<kokoro_model> model = make_unique<kokoro_model>();
4848
single_pass_tokenizer * spt = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens");
4949
model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
50-
struct kokoro_duration_context * kdctx = build_new_duration_kokoro_context(model, n_threads, cpu_only);
51-
struct kokoro_duration_runner * duration_runner = new kokoro_duration_runner(model, kdctx, spt);
52-
struct kokoro_context * kctx = build_new_kokoro_context(model, n_threads, cpu_only);
50+
kokoro_duration_context * kdctx = build_new_duration_kokoro_context(&*model, n_threads, cpu_only);
51+
kokoro_duration_runner * duration_runner = new kokoro_duration_runner(&*model, kdctx, spt);
52+
kokoro_context * kctx = build_new_kokoro_context(&*model, n_threads, cpu_only);
5353
// if an espeak voice id wasn't specifically set infer it from the kokoro voice, if it was override it, otherwise fallback to American English.
5454
std::string espeak_voice_id = config->espeak_voice_id;
5555
if (espeak_voice_id.empty()) {
5656
espeak_voice_id = !config->voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(config->voice.at(0)) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[config->voice.at(0)] : "gmw/en-US";
5757
}
58-
struct phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
59-
struct kokoro_runner * runner = new kokoro_runner(model, kctx, spt, duration_runner, phmzr);
58+
phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
59+
kokoro_runner * runner = new kokoro_runner(move(model), kctx, spt, duration_runner, phmzr);
6060

6161
// TODO: change this weight assignment pattern to mirror llama.cpp
6262
for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
@@ -69,7 +69,7 @@ struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * wei
6969
ggml_free(weight_ctx);
7070
runner->arch = arch;
7171

72-
return (tts_runner*)runner;
72+
return runner;
7373
}
7474

7575
struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
@@ -94,7 +94,7 @@ struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight
9494
ggml_free(weight_ctx);
9595
runner->arch = arch;
9696

97-
return (tts_runner*)runner;
97+
return runner;
9898
}
9999

100100
// currently only metal and cpu devices are supported, so cpu_only only describes whether or not to try to load and run on metal.
@@ -146,8 +146,14 @@ int generate(tts_runner * runner, std::string sentence, struct tts_response * re
146146
}
147147

148148
void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) {
149-
int n_threads = ((parler_tts_runner*)runner)->pctx->n_threads;
150-
((parler_tts_runner*)runner)->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
149+
const auto parler{dynamic_cast<parler_tts_runner *>(runner)};
150+
if (!parler) {
151+
fprintf(stderr, "Wrong model for conditional prompt\n");
152+
return;
153+
}
154+
155+
const int n_threads = parler->pctx->n_threads;
156+
parler->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
151157
}
152158

153159
bool kokoro_is_f16_compatible(std::string name) {

0 commit comments

Comments
 (0)