From f2ebb39be9288f2a353e8fe1b4ab38f184d7323e Mon Sep 17 00:00:00 2001
From: Daniel Tang
Date: Tue, 3 Jun 2025 08:10:58 -0400
Subject: [PATCH 1/2] chore: Enable C++23 and abbreviations

It's been a few weeks and no opposition surfaced. This change allows new
code to be written faster and with less clutter.
---
 CMakeLists.txt | 2 +-
 README.md | 2 +-
 include/common.h | 9 ++-------
 include/imports.h | 17 +++++++++++++++++
 src/util.cpp | 1 +
 src/util.h | 13 +++----------
 6 files changed, 25 insertions(+), 19 deletions(-)
 create mode 100644 include/imports.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8fe3267..1661c33 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.14)
 project("tts.cpp" C CXX)
 include(CheckIncludeFileCXX)
 
-set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD 23)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
diff --git a/README.md b/README.md
index 6d475d6..b339488 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ Additional Model support will initially be added based on open source model perf
 #### Requirements:
 
 * Local GGUF format model file (see [py-gguf](./py-ggufs/README.md) for information on how to convert the hugging face models to GGUF).
-* C++17 and C17
+* C++23 and C11
 * XCode Command Line Tools (via `xcode-select --install`) should suffice for OS X
 * CMake (>=3.14)
 * GGML pulled locally
diff --git a/include/common.h b/include/common.h
index 02de8e1..991f184 100644
--- a/include/common.h
+++ b/include/common.h
@@ -1,10 +1,7 @@
-#ifndef common_h
-#define common_h
+#pragma once
 
-#include 
-#include 
 #include 
-#include 
+#include "imports.h"
 
 // Using this simple struct as opposed to a common std::vector allows us to return the cpu buffer
 // pointer directly rather than copying the contents of the buffer to a predefined std::vector.
@@ -57,5 +54,3 @@ struct tts_runner {
     void init_build(std::vector* buf_compute_meta);
     void free_build();
 };
-
-#endif
diff --git a/include/imports.h b/include/imports.h
new file mode 100644
index 0000000..ce47701
--- /dev/null
+++ b/include/imports.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+using namespace std;
+using namespace std::string_view_literals;
+typedef std::string_view sv;
+typedef const char * str;
+
+#define TTS_ABORT(...) tts_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define TTS_ASSERT(x) if (!(x)) TTS_ABORT("TTS_ASSERT(%s) failed", #x)
+[[noreturn]] void tts_abort(const char * file, int line, const char * fmt, ...);
diff --git a/src/util.cpp b/src/util.cpp
index a5bbb4b..37421ff 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -2,6 +2,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #ifdef __APPLE__
 #include 
diff --git a/src/util.h b/src/util.h
index 458d080..5b20b89 100644
--- a/src/util.h
+++ b/src/util.h
@@ -2,12 +2,9 @@
 #define util_h
 
 #include 
-#include 
+#include 
 #include 
-#include 
-#include 
-#include 
-#include 
+#include 
 #include 
 #include 
 #include "ggml-metal.h"
@@ -17,9 +14,7 @@
 #include "ggml.h"
 #include "ggml-impl.h"
 #include "ggml-cpp.h"
-
-#define TTS_ABORT(...) tts_abort(__FILE__, __LINE__, __VA_ARGS__)
-#define TTS_ASSERT(x) if (!(x)) TTS_ABORT("TTS_ASSERT(%s) failed", #x)
+#include "imports.h"
 
 struct model_tensor_meta {
     uint32_t n_tensors = 0;
@@ -60,6 +55,4 @@ std::vector split(std::string target, const char split_on, bool inc
 std::string strip(std::string target, std::string vals = " ");
 std::string replace_any(std::string target, std::string to_replace, std::string replacement);
 
-[[noreturn]] void tts_abort(const char * file, int line, const char * fmt, ...);
-
 #endif

From d1399e894f5c3142d1fa3b75722ce1269c9c818c Mon Sep 17 00:00:00 2001
From: Daniel Tang
Date: Tue, 3 Jun 2025 07:41:50 -0400
Subject: [PATCH 2/2] kokoro: Fix double free

kokoro_model was being freed twice: first by kokoro_duration_runner while
still in use by kokoro_runner, which then hit a use-after-free when it
tried to free it properly. kokoro_context was also double-freeing some
backend data that its base class runner_context would later free again.
There was a mismatched new/free in prepare_post_load.

After removing the double free, I chose to add unique_ptr to let this
part hold on to RAII and ownership. reference_wrapper is instead used in
another PR to indicate non-ownership. Nearby pointers have not been
upgraded to unique_ptr, because they involve backends, whose situation
has been difficult to untangle.

Before, there was a SIGSEGV on post-args-refactor server shutdown.
After, the server exits cleanly on Ctrl+C. This isn't visible on the
main branch, which just leaks the memory.
---
 ggml | 2 +-
 include/common.h | 1 +
 src/kokoro_model.cpp | 6 +++---
 src/kokoro_model.h | 30 +++++++++++-------------------
 src/tts.cpp | 28 +++++++++++++++++-----------
 5 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/ggml b/ggml
index 1e85c87..e486998 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 1e85c87aeaa70548ad52766f1881c2f1257962e2
+Subproject commit e486998a9848fce92858ca54691ac9e6f506e202
diff --git a/include/common.h b/include/common.h
index 991f184..df4ce11 100644
--- a/include/common.h
+++ b/include/common.h
@@ -50,6 +50,7 @@ struct tts_runner {
     tts_arch arch;
     struct ggml_context * ctx = nullptr;
     float sampling_rate = 44100.0f;
+    virtual ~tts_runner() = default;
 
     void init_build(std::vector* buf_compute_meta);
     void free_build();
diff --git a/src/kokoro_model.cpp b/src/kokoro_model.cpp
index dad1cf5..0bee7e2 100644
--- a/src/kokoro_model.cpp
+++ b/src/kokoro_model.cpp
@@ -958,7 +958,7 @@ struct ggml_cgraph * kokoro_duration_runner::build_kokoro_duration_graph(kokoro_
     kctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
     ggml_set_input(kctx->positions);
 
-    inpL = build_albert_inputs(ctx, model, kctx->inp_tokens, kctx->positions, kctx->token_types);
+    inpL = build_albert_inputs(ctx, &*model, kctx->inp_tokens, kctx->positions, kctx->token_types);
     ggml_set_name(inpL, "albert_embeddings");
     cur = inpL;
 
@@ -1233,7 +1233,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
     ggml_set_input(kctx->window_sq_sum);
 
     // run generation
-    cur = build_generator(ctx, model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
+    cur = build_generator(ctx, &*model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
     ggml_build_forward_expand(gf, cur);
     free_build();
     return gf;
@@ -1245,7 +1245,7 @@ void kokoro_runner::prepare_post_load() {
     auto batch = build_worst_case_batch();
     auto gf = build_kokoro_graph(batch);
     kctx->prep_schedule(gf);
-    free(batch.resp);
+    delete batch.resp;
 }
 
 void kokoro_runner::set_inputs(kokoro_ubatch & batch, uint32_t total_size) {
diff --git a/src/kokoro_model.h b/src/kokoro_model.h
index 328150d..bddc33c 100644
--- a/src/kokoro_model.h
+++ b/src/kokoro_model.h
@@ -362,13 +362,15 @@ struct kokoro_duration_response {
 // Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
 // support the tensor dependent views that would otherwise be necessary.
 struct kokoro_duration_runner : tts_runner {
-    kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
+    explicit kokoro_duration_runner(/* shared */ kokoro_model * model, kokoro_duration_context * context,
+                                    single_pass_tokenizer * tokenizer)
+        : tokenizer{tokenizer}, model{model}, kctx{context} {
+    };
+
     ~kokoro_duration_runner() {
         if (ctx) {
             ggml_free(ctx);
         }
-        model->free();
-        delete model;
         delete kctx;
     }
     struct single_pass_tokenizer * tokenizer;
@@ -387,17 +389,7 @@
 struct kokoro_context : runner_context {
-    kokoro_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
-    ~kokoro_context() {
-        ggml_backend_sched_free(sched);
-        ggml_backend_free(backend_cpu);
-        if (backend) {
-            ggml_backend_free(backend);
-        }
-        if (buf_output) {
-            ggml_backend_buffer_free(buf_output);
-        }
-    }
+    explicit kokoro_context(kokoro_model * model, int n_threads) : runner_context{n_threads}, model{model} {}
 
     std::string voice = "af_alloy";
@@ -428,21 +420,21 @@ struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, in
 // This manages the graph compilation of computation for the Kokoro model.
 struct kokoro_runner : tts_runner {
-    kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) {
-        tts_runner::sampling_rate = 24000.0f;
+    explicit kokoro_runner(unique_ptr<kokoro_model> && model, kokoro_context * context,
+                           single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr)
+        : tokenizer{tokenizer}, model{move(model)}, kctx{context}, drunner{drunner}, phmzr{phmzr} {
+        sampling_rate = 24000.0f;
     };
     ~kokoro_runner() {
         if (ctx) {
             ggml_free(ctx);
         }
         delete drunner;
-        model->free();
-        delete model;
         delete kctx;
         delete phmzr;
     }
     struct single_pass_tokenizer * tokenizer;
-    kokoro_model * model;
+    unique_ptr<kokoro_model> model;
     kokoro_context * kctx;
     kokoro_duration_runner * drunner;
     phonemizer * phmzr;
diff --git a/src/tts.cpp b/src/tts.cpp
index d426dae..f042537 100644
--- a/src/tts.cpp
+++ b/src/tts.cpp
@@ -40,23 +40,23 @@ struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context *
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
-    kokoro_model * model = new kokoro_model;
+    unique_ptr<kokoro_model> model = make_unique<kokoro_model>();
     single_pass_tokenizer * spt = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens");
     model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
-    struct kokoro_duration_context * kdctx = build_new_duration_kokoro_context(model, n_threads, cpu_only);
-    struct kokoro_duration_runner * duration_runner = new kokoro_duration_runner(model, kdctx, spt);
-    struct kokoro_context * kctx = build_new_kokoro_context(model, n_threads, cpu_only);
+    kokoro_duration_context * kdctx = build_new_duration_kokoro_context(&*model, n_threads, cpu_only);
+    kokoro_duration_runner * duration_runner = new kokoro_duration_runner(&*model, kdctx, spt);
+    kokoro_context * kctx = build_new_kokoro_context(&*model, n_threads, cpu_only);
     // if an espeak voice id wasn't specifically set infer it from the kokoro voice, if it was override it, otherwise fallback to American English.
     std::string espeak_voice_id = config->espeak_voice_id;
     if (espeak_voice_id.empty()) {
         espeak_voice_id = !config->voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(config->voice.at(0)) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[config->voice.at(0)] : "gmw/en-US";
     }
-    struct phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
-    struct kokoro_runner * runner = new kokoro_runner(model, kctx, spt, duration_runner, phmzr);
+    phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
+    kokoro_runner * runner = new kokoro_runner(move(model), kctx, spt, duration_runner, phmzr);
 
     // TODO: change this weight assignment pattern to mirror llama.cpp
     for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
@@ -69,7 +69,7 @@ struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * wei
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
@@ -94,7 +94,7 @@ struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 // currently only metal and cpu devices are supported, so cpu_only only describes whether or not to try to load and run on metal.
@@ -146,8 +146,14 @@ int generate(tts_runner * runner, std::string sentence, struct tts_response * re
 }
 
 void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) {
-    int n_threads = ((parler_tts_runner*)runner)->pctx->n_threads;
-    ((parler_tts_runner*)runner)->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
+    const auto parler{dynamic_cast<parler_tts_runner *>(runner)};
+    if (!parler) {
+        fprintf(stderr, "Wrong model for conditional prompt\n");
+        return;
+    }
+
+    const int n_threads = parler->pctx->n_threads;
+    parler->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
 }
 
 bool kokoro_is_f16_compatible(std::string name) {
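
The ownership pattern the second patch moves to can be reduced to the
self-contained sketch below. The names model_t, duration_helper, and
owner_runner are invented for illustration and are not part of tts.cpp; the
real types are kokoro_model, kokoro_duration_runner, and kokoro_runner, and
where the sketch writes model.get() the patch writes &*model.

#include <cstdio>
#include <memory>
#include <utility>

// Illustrative names only: model_t, duration_helper, and owner_runner stand
// in for kokoro_model, kokoro_duration_runner, and kokoro_runner.
struct model_t {
    int weights = 42;
};

// Borrows the model. Its destructor must not delete it, mirroring how
// kokoro_duration_runner no longer calls model->free() and delete model.
struct duration_helper {
    explicit duration_helper(model_t * m) : model{m} {}
    model_t * model; // non-owning
};

// Owns the model through unique_ptr, mirroring kokoro_runner: the model is
// destroyed exactly once, when the owner itself is destroyed.
struct owner_runner {
    explicit owner_runner(std::unique_ptr<model_t> && m)
        : model{std::move(m)}, helper{model.get()} {}
    std::unique_ptr<model_t> model;
    duration_helper helper;
};

int main() {
    owner_runner runner{std::make_unique<model_t>()};
    std::printf("weights: %d\n", runner.helper.model->weights);
    // No explicit delete anywhere: ~owner_runner releases the model exactly
    // once, and ~duration_helper deliberately leaves it alone.
}

The point is that exactly one object, the owner's unique_ptr, deletes the
model; every other holder is a borrower whose destructor leaves it alone,
which is why the model->free()/delete model pairs come out of the helper
destructors.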