
Commit e4122e5

kokoro: Fix double free
kokoro_model was being freed twice: first by kokoro_duration_runner while it was still in use by kokoro_runner, which then hit a use-after-free when it tried to free it properly. kokoro_context was also double-freeing some backend data that its base class runner_context would later free again, and prepare_post_load had a mismatched new/free.

After removing the double free, I added unique_ptr so this part keeps RAII-based ownership; reference_wrapper is used in another PR to indicate non-ownership. Nearby pointers have not been upgraded to unique_ptr because they involve backends, whose ownership has been difficult to untangle.

Before, there was a SIGSEGV on post-args-refactor server shutdown; after, the server exits cleanly on Ctrl+C. This isn't visible on the main branch, which just leaks the memory instead.
1 parent ace4295 commit e4122e5
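
As a minimal sketch of the ownership pattern this commit moves toward (illustrative type names, not the actual class definitions): exactly one runner owns the model through std::unique_ptr, the other keeps a plain non-owning pointer and no longer deletes it, so the model is destroyed exactly once.

// Sketch only: stand-ins for kokoro_runner / kokoro_duration_runner.
#include <memory>

struct model_t { /* weights, backends, ... */ };

struct duration_runner {
    model_t * model;                 // non-owning: its destructor must NOT delete this
    explicit duration_runner(model_t * m) : model{m} {}
};

struct speech_runner {
    std::unique_ptr<model_t> model;  // owning: freed exactly once, automatically
    duration_runner * drunner;
    speech_runner(std::unique_ptr<model_t> m, duration_runner * d)
        : model{std::move(m)}, drunner{d} {}
    ~speech_runner() { delete drunner; }  // frees the helper runner, not the shared model
};

int main() {
    auto model = std::make_unique<model_t>();
    auto * d   = new duration_runner(&*model);   // raw, non-owning view of the model
    speech_runner runner{std::move(model), d};   // runner now owns the model
}   // ~speech_runner deletes drunner; the unique_ptr deletes the model once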

File tree

5 files changed: +33, -34 lines


include/common.h

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ struct tts_runner {
     tts_arch arch;
     struct ggml_context * ctx = nullptr;
     float sampling_rate = 44100.0f;
+    virtual ~tts_runner() = default;
 
     void init_build(std::vector<uint8_t>* buf_compute_meta);
     void free_build();
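
For context on the single line added here (a generic sketch, not project code): the factory functions in tts.cpp hand back a tts_runner*, and deleting a derived runner through that base pointer is undefined behaviour unless the base destructor is virtual; the derived destructor would simply never run.

// Generic illustration: a polymorphically deleted base class needs a virtual destructor.
#include <cstdio>

struct base {
    virtual ~base() = default;   // without 'virtual', ~derived() below would never run
};

struct derived : base {
    ~derived() override { std::puts("derived cleaned up"); }
};

int main() {
    base * r = new derived();
    delete r;                    // prints "derived cleaned up" because ~base() is virtual
}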

src/kokoro_model.cpp

Lines changed: 3 additions & 3 deletions
@@ -958,7 +958,7 @@ struct ggml_cgraph * kokoro_duration_runner::build_kokoro_duration_graph(kokoro_
     kctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
     ggml_set_input(kctx->positions);
 
-    inpL = build_albert_inputs(ctx, model, kctx->inp_tokens, kctx->positions, kctx->token_types);
+    inpL = build_albert_inputs(ctx, &*model, kctx->inp_tokens, kctx->positions, kctx->token_types);
     ggml_set_name(inpL, "albert_embeddings");
     cur = inpL;
 
@@ -1233,7 +1233,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
     ggml_set_input(kctx->window_sq_sum);
 
     // run generation
-    cur = build_generator(ctx, model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
+    cur = build_generator(ctx, &*model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
     ggml_build_forward_expand(gf, cur);
     free_build();
     return gf;
@@ -1245,7 +1245,7 @@ void kokoro_runner::prepare_post_load() {
     auto batch = build_worst_case_batch();
     auto gf = build_kokoro_graph(batch);
     kctx->prep_schedule(gf);
-    free(batch.resp);
+    delete batch.resp;
 }
 
 void kokoro_runner::set_inputs(kokoro_ubatch & batch, uint32_t total_size) {
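
A side note on the &*model spelling used in these hunks (a small self-contained sketch, not project code): for a std::unique_ptr it yields the same raw, non-owning pointer as .get(), so the graph-building helpers keep receiving a plain kokoro_model* while ownership stays with the runner.

// Sketch: &*p and p.get() produce the same raw, non-owning pointer for a unique_ptr.
#include <cassert>
#include <memory>

int main() {
    auto p = std::make_unique<int>(42);
    int * raw1 = &*p;       // operator*, then address-of
    int * raw2 = p.get();   // the explicit accessor
    assert(raw1 == raw2);   // both point at the same owned object; neither owns it
}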

src/kokoro_model.h

Lines changed: 11 additions & 19 deletions
@@ -371,13 +371,15 @@ struct kokoro_duration_response {
 // Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
 // support the tensor dependent views that would otherwise be necessary.
 struct kokoro_duration_runner : tts_runner {
-    kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
+    explicit kokoro_duration_runner(/* shared */ kokoro_model * model, kokoro_duration_context * context,
+                                    single_pass_tokenizer * tokenizer)
+        : tokenizer{tokenizer}, model{model}, kctx{context} {
+    };
+
     ~kokoro_duration_runner() {
         if (ctx) {
             ggml_free(ctx);
         }
-        model->free();
-        delete model;
         delete kctx;
     }
     struct single_pass_tokenizer * tokenizer;
@@ -396,17 +398,7 @@ struct kokoro_duration_runner : tts_runner {
 };
 
 struct kokoro_context : runner_context {
-    kokoro_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
-    ~kokoro_context() {
-        ggml_backend_sched_free(sched);
-        ggml_backend_free(backend_cpu);
-        if (backend) {
-            ggml_backend_free(backend);
-        }
-        if (buf_output) {
-            ggml_backend_buffer_free(buf_output);
-        }
-    }
+    explicit kokoro_context(kokoro_model * model, int n_threads) : runner_context{n_threads}, model{model} {}
 
     std::string voice = "af_alloy";
 
@@ -437,21 +429,21 @@ struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, in
 
 // This manages the graph compilation of computation for the Kokoro model.
 struct kokoro_runner : tts_runner {
-    kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) {
-        tts_runner::sampling_rate = 24000.0f;
+    explicit kokoro_runner(unique_ptr<kokoro_model> && model, kokoro_context * context,
+                           single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr)
+        : tokenizer{tokenizer}, model{move(model)}, kctx{context}, drunner{drunner}, phmzr{phmzr} {
+        sampling_rate = 24000.0f;
     };
     ~kokoro_runner() {
        if (ctx) {
            ggml_free(ctx);
        }
        delete drunner;
-        model->free();
-        delete model;
        delete kctx;
        delete phmzr;
     }
     struct single_pass_tokenizer * tokenizer;
-    kokoro_model * model;
+    unique_ptr<kokoro_model> model;
     kokoro_context * kctx;
     kokoro_duration_runner * drunner;
     phonemizer * phmzr;
src/tts.cpp

Lines changed: 17 additions & 11 deletions
@@ -31,23 +31,23 @@ struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context *
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
-    kokoro_model * model = new kokoro_model;
+    unique_ptr<kokoro_model> model = make_unique<kokoro_model>();
     single_pass_tokenizer * spt = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens");
     model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
-    struct kokoro_duration_context * kdctx = build_new_duration_kokoro_context(model, n_threads, cpu_only);
-    struct kokoro_duration_runner * duration_runner = new kokoro_duration_runner(model, kdctx, spt);
-    struct kokoro_context * kctx = build_new_kokoro_context(model, n_threads, cpu_only);
+    kokoro_duration_context * kdctx = build_new_duration_kokoro_context(&*model, n_threads, cpu_only);
+    kokoro_duration_runner * duration_runner = new kokoro_duration_runner(&*model, kdctx, spt);
+    kokoro_context * kctx = build_new_kokoro_context(&*model, n_threads, cpu_only);
     // if an espeak voice id wasn't specifically set infer it from the kokoro voice, if it was override it, otherwise fallback to American English.
     std::string espeak_voice_id = config->espeak_voice_id;
     if (espeak_voice_id.empty()) {
         espeak_voice_id = !config->voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(config->voice.at(0)) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[config->voice.at(0)] : "gmw/en-US";
     }
-    struct phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
-    struct kokoro_runner * runner = new kokoro_runner(model, kctx, spt, duration_runner, phmzr);
+    phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
+    kokoro_runner * runner = new kokoro_runner(move(model), kctx, spt, duration_runner, phmzr);
 
     // TODO: change this weight assignment pattern to mirror llama.cpp
     for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
@@ -60,7 +60,7 @@ struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * wei
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
@@ -85,7 +85,7 @@ struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 // currently only metal and cpu devices are supported, so cpu_only only describes whether or not to try to load and run on metal.
@@ -137,8 +137,14 @@ int generate(tts_runner * runner, std::string sentence, struct tts_response * re
 }
 
 void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) {
-    int n_threads = ((parler_tts_runner*)runner)->pctx->n_threads;
-    ((parler_tts_runner*)runner)->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
+    const auto parler{dynamic_cast<parler_tts_runner *>(runner)};
+    if (!parler) {
+        fprintf(stderr, "Wrong model for conditional prompt\n");
+        return;
+    }
+
+    const int n_threads = parler->pctx->n_threads;
+    parler->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
 }
 
 bool kokoro_is_f16_compatible(std::string name) {
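
The new guard in update_conditional_prompt relies on dynamic_cast, which only works because tts_runner is now polymorphic thanks to the virtual destructor added in common.h. A minimal sketch of the pattern (illustrative names, not the real classes):

// Sketch: guarded downcast instead of a blind C-style cast.
#include <cstdio>

struct base_runner { virtual ~base_runner() = default; };        // polymorphic base
struct parler_runner  : base_runner { int n_threads = 4; };
struct kokoro_runner_ : base_runner {};

void parler_only_op(base_runner * runner) {
    const auto parler = dynamic_cast<parler_runner *>(runner);    // nullptr if the type is wrong
    if (!parler) {
        std::fprintf(stderr, "Wrong model for conditional prompt\n");
        return;                                                   // the old cast would have charged ahead
    }
    std::printf("running with %d threads\n", parler->n_threads);
}

int main() {
    kokoro_runner_ k;
    parler_only_op(&k);   // safely rejected instead of reading through the wrong type
}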
