From f2ebb39be9288f2a353e8fe1b4ab38f184d7323e Mon Sep 17 00:00:00 2001
From: Daniel Tang
Date: Tue, 3 Jun 2025 08:10:58 -0400
Subject: [PATCH 1/2] chore: Enable C++23 and abbreviations

It's been a few weeks and no opposition surfaced. This change allows new
code to be written faster and with less clutter.
---
 CMakeLists.txt | 2 +-
 README.md | 2 +-
 include/common.h | 9 ++-------
 include/imports.h | 17 +++++++++++++++++
 src/util.cpp | 1 +
 src/util.h | 13 +++----------
 6 files changed, 25 insertions(+), 19 deletions(-)
 create mode 100644 include/imports.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8fe3267..1661c33 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.14)
 project("tts.cpp" C CXX)
 include(CheckIncludeFileCXX)
 
-set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD 23)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
diff --git a/README.md b/README.md
index 6d475d6..b339488 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ Additional Model support will initially be added based on open source model perf
 #### Requirements:
 
 * Local GGUF format model file (see [py-gguf](./py-ggufs/README.md) for information on how to convert the hugging face models to GGUF).
-* C++17 and C17
+* C++23 and C11
 * XCode Command Line Tools (via `xcode-select --install`) should suffice for OS X
 * CMake (>=3.14)
 * GGML pulled locally
diff --git a/include/common.h b/include/common.h
index 02de8e1..991f184 100644
--- a/include/common.h
+++ b/include/common.h
@@ -1,10 +1,7 @@
-#ifndef common_h
-#define common_h
+#pragma once
 
-#include 
-#include 
 #include 
-#include 
+#include "imports.h"
 
 // Using this simple struct as opposed to a common std::vector allows us to return the cpu buffer
 // pointer directly rather than copying the contents of the buffer to a predefined std::vector.
@@ -57,5 +54,3 @@ struct tts_runner {
     void init_build(std::vector* buf_compute_meta);
     void free_build();
 };
-
-#endif
diff --git a/include/imports.h b/include/imports.h
new file mode 100644
index 0000000..ce47701
--- /dev/null
+++ b/include/imports.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+using namespace std;
+using namespace std::string_view_literals;
+typedef std::string_view sv;
+typedef const char * str;
+
+#define TTS_ABORT(...) tts_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define TTS_ASSERT(x) if (!(x)) TTS_ABORT("TTS_ASSERT(%s) failed", #x)
+[[noreturn]] void tts_abort(const char * file, int line, const char * fmt, ...);
diff --git a/src/util.cpp b/src/util.cpp
index a5bbb4b..37421ff 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -2,6 +2,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #ifdef __APPLE__
 #include 
diff --git a/src/util.h b/src/util.h
index 458d080..5b20b89 100644
--- a/src/util.h
+++ b/src/util.h
@@ -2,12 +2,9 @@
 #define util_h
 
 #include 
-#include 
+#include 
 #include 
-#include 
-#include 
-#include 
-#include 
+#include 
 #include 
 #include 
 #include "ggml-metal.h"
@@ -17,9 +14,7 @@
 #include "ggml.h"
 #include "ggml-impl.h"
 #include "ggml-cpp.h"
-
-#define TTS_ABORT(...) tts_abort(__FILE__, __LINE__, __VA_ARGS__)
-#define TTS_ASSERT(x) if (!(x)) TTS_ABORT("TTS_ASSERT(%s) failed", #x)
+#include "imports.h"
 
 struct model_tensor_meta {
     uint32_t n_tensors = 0;
@@ -60,6 +55,4 @@ std::vector split(std::string target, const char split_on, bool inc
 std::string strip(std::string target, std::string vals = " ");
 std::string replace_any(std::string target, std::string to_replace, std::string replacement);
 
-[[noreturn]] void tts_abort(const char * file, int line, const char * fmt, ...);
-
 #endif

From d1399e894f5c3142d1fa3b75722ce1269c9c818c Mon Sep 17 00:00:00 2001
From: Daniel Tang
Date: Tue, 3 Jun 2025 07:41:50 -0400
Subject: [PATCH 2/2] kokoro: Fix double free

kokoro_model was being freed twice: first by kokoro_duration_runner while
still in use by kokoro_runner, which then hit a use-after-free when it
tried to free it properly. kokoro_context was also double-freeing some
backend data that its base class runner_context would later free again.
There was a mismatched new/free in prepare_post_load.

After removing the double free, I chose to add unique_ptr to let this
part hold on to RAII and ownership. reference_wrapper is instead used in
another PR to indicate non-ownership. Nearby pointers have not been
upgraded to unique_ptr, because they involve backends, whose situation
has been difficult to untangle.

Before, there was a SIGSEGV on post-args-refactor server shutdown.
After, the server exits cleanly on Ctrl+C. This isn't visible on the
main branch, which just leaks the memory.
---
 ggml | 2 +-
 include/common.h | 1 +
 src/kokoro_model.cpp | 6 +++---
 src/kokoro_model.h | 30 +++++++++++-------------------
 src/tts.cpp | 28 +++++++++++++++++-----------
 5 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/ggml b/ggml
index 1e85c87..e486998 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 1e85c87aeaa70548ad52766f1881c2f1257962e2
+Subproject commit e486998a9848fce92858ca54691ac9e6f506e202
diff --git a/include/common.h b/include/common.h
index 991f184..df4ce11 100644
--- a/include/common.h
+++ b/include/common.h
@@ -50,6 +50,7 @@ struct tts_runner {
     tts_arch arch;
     struct ggml_context * ctx = nullptr;
     float sampling_rate = 44100.0f;
+    virtual ~tts_runner() = default;
 
     void init_build(std::vector* buf_compute_meta);
     void free_build();
diff --git a/src/kokoro_model.cpp b/src/kokoro_model.cpp
index dad1cf5..0bee7e2 100644
--- a/src/kokoro_model.cpp
+++ b/src/kokoro_model.cpp
@@ -958,7 +958,7 @@ struct ggml_cgraph * kokoro_duration_runner::build_kokoro_duration_graph(kokoro_
     kctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
     ggml_set_input(kctx->positions);
 
-    inpL = build_albert_inputs(ctx, model, kctx->inp_tokens, kctx->positions, kctx->token_types);
+    inpL = build_albert_inputs(ctx, &*model, kctx->inp_tokens, kctx->positions, kctx->token_types);
     ggml_set_name(inpL, "albert_embeddings");
     cur = inpL;
 
@@ -1233,7 +1233,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
     ggml_set_input(kctx->window_sq_sum);
 
     // run generation
-    cur = build_generator(ctx, model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
+    cur = build_generator(ctx, &*model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
     ggml_build_forward_expand(gf, cur);
     free_build();
     return gf;
@@ -1245,7 +1245,7 @@ void kokoro_runner::prepare_post_load() {
     auto batch = build_worst_case_batch();
     auto gf = build_kokoro_graph(batch);
     kctx->prep_schedule(gf);
-    free(batch.resp);
+    delete batch.resp;
 }
 
 void kokoro_runner::set_inputs(kokoro_ubatch & batch, uint32_t total_size) {
diff --git a/src/kokoro_model.h b/src/kokoro_model.h
index 328150d..bddc33c 100644
--- a/src/kokoro_model.h
+++ b/src/kokoro_model.h
@@ -362,13 +362,15 @@ struct kokoro_duration_response {
 // Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
 // support the tensor dependent views that would otherwise be necessary.
 struct kokoro_duration_runner : tts_runner {
-    kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
+    explicit kokoro_duration_runner(/* shared */ kokoro_model * model, kokoro_duration_context * context,
+                                    single_pass_tokenizer * tokenizer)
+        : tokenizer{tokenizer}, model{model}, kctx{context} {
+    };
+
     ~kokoro_duration_runner() {
         if (ctx) {
             ggml_free(ctx);
         }
-        model->free();
-        delete model;
         delete kctx;
     }
     struct single_pass_tokenizer * tokenizer;
@@ -387,17 +389,7 @@
 struct kokoro_context : runner_context {
-    kokoro_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
-    ~kokoro_context() {
-        ggml_backend_sched_free(sched);
-        ggml_backend_free(backend_cpu);
-        if (backend) {
-            ggml_backend_free(backend);
-        }
-        if (buf_output) {
-            ggml_backend_buffer_free(buf_output);
-        }
-    }
+    explicit kokoro_context(kokoro_model * model, int n_threads) : runner_context{n_threads}, model{model} {}
 
     std::string voice = "af_alloy";
@@ -428,21 +420,21 @@ struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, in
 // This manages the graph compilation of computation for the Kokoro model.
 struct kokoro_runner : tts_runner {
-    kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) {
-        tts_runner::sampling_rate = 24000.0f;
+    explicit kokoro_runner(unique_ptr<kokoro_model> && model, kokoro_context * context,
+                           single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr)
+        : tokenizer{tokenizer}, model{move(model)}, kctx{context}, drunner{drunner}, phmzr{phmzr} {
+        sampling_rate = 24000.0f;
     };
     ~kokoro_runner() {
         if (ctx) {
             ggml_free(ctx);
         }
         delete drunner;
-        model->free();
-        delete model;
         delete kctx;
         delete phmzr;
     }
     struct single_pass_tokenizer * tokenizer;
-    kokoro_model * model;
+    unique_ptr<kokoro_model> model;
     kokoro_context * kctx;
     kokoro_duration_runner * drunner;
     phonemizer * phmzr;
diff --git a/src/tts.cpp b/src/tts.cpp
index d426dae..f042537 100644
--- a/src/tts.cpp
+++ b/src/tts.cpp
@@ -40,23 +40,23 @@ struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context *
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
-    kokoro_model * model = new kokoro_model;
+    unique_ptr<kokoro_model> model = make_unique<kokoro_model>();
     single_pass_tokenizer * spt = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens");
     model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
-    struct kokoro_duration_context * kdctx = build_new_duration_kokoro_context(model, n_threads, cpu_only);
-    struct kokoro_duration_runner * duration_runner = new kokoro_duration_runner(model, kdctx, spt);
-    struct kokoro_context * kctx = build_new_kokoro_context(model, n_threads, cpu_only);
+    kokoro_duration_context * kdctx = build_new_duration_kokoro_context(&*model, n_threads, cpu_only);
+    kokoro_duration_runner * duration_runner = new kokoro_duration_runner(&*model, kdctx, spt);
+    kokoro_context * kctx = build_new_kokoro_context(&*model, n_threads, cpu_only);
     // if an espeak voice id wasn't specifically set infer it from the kokoro voice, if it was override it, otherwise fallback to American English.
     std::string espeak_voice_id = config->espeak_voice_id;
     if (espeak_voice_id.empty()) {
         espeak_voice_id = !config->voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(config->voice.at(0)) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[config->voice.at(0)] : "gmw/en-US";
     }
-    struct phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
-    struct kokoro_runner * runner = new kokoro_runner(model, kctx, spt, duration_runner, phmzr);
+    phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
+    kokoro_runner * runner = new kokoro_runner(move(model), kctx, spt, duration_runner, phmzr);
 
     // TODO: change this weight assignment pattern to mirror llama.cpp
     for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
@@ -69,7 +69,7 @@ struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * wei
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
@@ -94,7 +94,7 @@ struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight
     ggml_free(weight_ctx);
     runner->arch = arch;
 
-    return (tts_runner*)runner;
+    return runner;
 }
 
 // currently only metal and cpu devices are supported, so cpu_only only describes whether or not to try to load and run on metal.
@@ -146,8 +146,14 @@ int generate(tts_runner * runner, std::string sentence, struct tts_response * re
 }
 
 void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) {
-    int n_threads = ((parler_tts_runner*)runner)->pctx->n_threads;
-    ((parler_tts_runner*)runner)->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
+    const auto parler{dynamic_cast<parler_tts_runner *>(runner)};
+    if (!parler) {
+        fprintf(stderr, "Wrong model for conditional prompt\n");
+        return;
+    }
+
+    const int n_threads = parler->pctx->n_threads;
+    parler->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
 }
 
 bool kokoro_is_f16_compatible(std::string name) {
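
The ownership pattern the second patch moves to can be reduced to the
self-contained sketch below. The names model_t, duration_helper, and
owner_runner are invented for illustration and are not part of tts.cpp; the
real types are kokoro_model, kokoro_duration_runner, and kokoro_runner, and
where the sketch writes model.get() the patch writes &*model.

#include <cstdio>
#include <memory>
#include <utility>

// Illustrative names only: model_t, duration_helper, and owner_runner stand
// in for kokoro_model, kokoro_duration_runner, and kokoro_runner.
struct model_t {
    int weights = 42;
};

// Borrows the model. Its destructor must not delete it, mirroring how
// kokoro_duration_runner no longer calls model->free() and delete model.
struct duration_helper {
    explicit duration_helper(model_t * m) : model{m} {}
    model_t * model; // non-owning
};

// Owns the model through unique_ptr, mirroring kokoro_runner: the model is
// destroyed exactly once, when the owner itself is destroyed.
struct owner_runner {
    explicit owner_runner(std::unique_ptr<model_t> && m)
        : model{std::move(m)}, helper{model.get()} {}
    std::unique_ptr<model_t> model;
    duration_helper helper;
};

int main() {
    owner_runner runner{std::make_unique<model_t>()};
    std::printf("weights: %d\n", runner.helper.model->weights);
    // No explicit delete anywhere: ~owner_runner releases the model exactly
    // once, and ~duration_helper deliberately leaves it alone.
}

The point is that exactly one object, the owner's unique_ptr, deletes the
model; every other holder is a borrower whose destructor leaves it alone,
which is why the model->free()/delete model pairs come out of the helper
destructors.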