Skip to content

kokoro: Fix double free #85

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.14)
project("tts.cpp" C CXX)
include(CheckIncludeFileCXX)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Additional Model support will initially be added based on open source model perf
#### Requirements:

* Local GGUF format model file (see [py-gguf](./py-ggufs/README.md) for information on how to convert the hugging face models to GGUF).
* C++17 and C17
* C++23 and C11
* XCode Command Line Tools (via `xcode-select --install`) should suffice for OS X
* CMake (>=3.14)
* GGML pulled locally
Expand Down
2 changes: 1 addition & 1 deletion ggml
10 changes: 3 additions & 7 deletions include/common.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
#ifndef common_h
#define common_h
#pragma once

#include <cstdint>
#include <string>
#include <map>
#include <vector>
#include "imports.h"

// Using this simple struct as opposed to a common std::vector allows us to return the cpu buffer
// pointer directly rather than copying the contents of the buffer to a predefined std::vector.
Expand Down Expand Up @@ -53,9 +50,8 @@ struct tts_runner {
tts_arch arch;
struct ggml_context * ctx = nullptr;
float sampling_rate = 44100.0f;
virtual ~tts_runner() = default;

void init_build(std::vector<uint8_t>* buf_compute_meta);
void free_build();
};

#endif
17 changes: 17 additions & 0 deletions include/imports.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

// Project-wide common includes and shorthand aliases, pulled into every
// translation unit via common.h.

#include <algorithm>
#include <cstdint>
#include <memory>
#include <string_view>
#include <ranges>
#include <vector>

// NOTE(review): `using namespace std` in a header leaks the whole std
// namespace into every includer — a known anti-pattern (C++ Core Guidelines
// SF.7). It is kept because downstream headers already rely on unqualified
// std names (unique_ptr, move, ...); removing it would break them. Prefer
// qualifying call sites and deleting these directives in a follow-up.
using namespace std;
using namespace std::string_view_literals;

// Modern `using` aliases instead of `typedef` (identical meaning).
using sv  = std::string_view;   // non-owning string slice
using str = const char *;       // borrowed, NUL-terminated C string (non-owning)

// Abort helpers: report file/line with a printf-style message, then abort.
#define TTS_ABORT(...) tts_abort(__FILE__, __LINE__, __VA_ARGS__)
#define TTS_ASSERT(x) if (!(x)) TTS_ABORT("TTS_ASSERT(%s) failed", #x)
[[noreturn]] void tts_abort(const char * file, int line, const char * fmt, ...);
6 changes: 3 additions & 3 deletions src/kokoro_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -958,7 +958,7 @@ struct ggml_cgraph * kokoro_duration_runner::build_kokoro_duration_graph(kokoro_
kctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
ggml_set_input(kctx->positions);

inpL = build_albert_inputs(ctx, model, kctx->inp_tokens, kctx->positions, kctx->token_types);
inpL = build_albert_inputs(ctx, &*model, kctx->inp_tokens, kctx->positions, kctx->token_types);
ggml_set_name(inpL, "albert_embeddings");
cur = inpL;

Expand Down Expand Up @@ -1233,7 +1233,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
ggml_set_input(kctx->window_sq_sum);

// run generation
cur = build_generator(ctx, model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
cur = build_generator(ctx, &*model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
ggml_build_forward_expand(gf, cur);
free_build();
return gf;
Expand All @@ -1245,7 +1245,7 @@ void kokoro_runner::prepare_post_load() {
auto batch = build_worst_case_batch();
auto gf = build_kokoro_graph(batch);
kctx->prep_schedule(gf);
free(batch.resp);
delete batch.resp;
}

void kokoro_runner::set_inputs(kokoro_ubatch & batch, uint32_t total_size) {
Expand Down
30 changes: 11 additions & 19 deletions src/kokoro_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -362,13 +362,15 @@ struct kokoro_duration_response {
// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
// support the tensor dependent views that would otherwise be necessary.
struct kokoro_duration_runner : tts_runner {
kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
explicit kokoro_duration_runner(/* shared */ kokoro_model * model, kokoro_duration_context * context,
single_pass_tokenizer * tokenizer)
: tokenizer{tokenizer}, model{model}, kctx{context} {
};

~kokoro_duration_runner() {
if (ctx) {
ggml_free(ctx);
}
model->free();
delete model;
delete kctx;
}
struct single_pass_tokenizer * tokenizer;
Expand All @@ -387,17 +389,7 @@ struct kokoro_duration_runner : tts_runner {
};

struct kokoro_context : runner_context {
kokoro_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
~kokoro_context() {
ggml_backend_sched_free(sched);
ggml_backend_free(backend_cpu);
if (backend) {
ggml_backend_free(backend);
}
if (buf_output) {
ggml_backend_buffer_free(buf_output);
}
}
explicit kokoro_context(kokoro_model * model, int n_threads) : runner_context{n_threads}, model{model} {}

std::string voice = "af_alloy";

Expand Down Expand Up @@ -428,21 +420,21 @@ struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, in

// This manages the graph compilation of computation for the Kokoro model.
struct kokoro_runner : tts_runner {
kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) {
tts_runner::sampling_rate = 24000.0f;
explicit kokoro_runner(unique_ptr<kokoro_model> && model, kokoro_context * context,
single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr)
: tokenizer{tokenizer}, model{move(model)}, kctx{context}, drunner{drunner}, phmzr{phmzr} {
sampling_rate = 24000.0f;
};
~kokoro_runner() {
if (ctx) {
ggml_free(ctx);
}
delete drunner;
model->free();
delete model;
delete kctx;
delete phmzr;
}
struct single_pass_tokenizer * tokenizer;
kokoro_model * model;
unique_ptr<kokoro_model> model;
kokoro_context * kctx;
kokoro_duration_runner * drunner;
phonemizer * phmzr;
Expand Down
28 changes: 17 additions & 11 deletions src/tts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,23 +40,23 @@ struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context *
ggml_free(weight_ctx);
runner->arch = arch;

return (tts_runner*)runner;
return runner;
}

struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
kokoro_model * model = new kokoro_model;
unique_ptr<kokoro_model> model = make_unique<kokoro_model>();
single_pass_tokenizer * spt = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens");
model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
struct kokoro_duration_context * kdctx = build_new_duration_kokoro_context(model, n_threads, cpu_only);
struct kokoro_duration_runner * duration_runner = new kokoro_duration_runner(model, kdctx, spt);
struct kokoro_context * kctx = build_new_kokoro_context(model, n_threads, cpu_only);
kokoro_duration_context * kdctx = build_new_duration_kokoro_context(&*model, n_threads, cpu_only);
kokoro_duration_runner * duration_runner = new kokoro_duration_runner(&*model, kdctx, spt);
kokoro_context * kctx = build_new_kokoro_context(&*model, n_threads, cpu_only);
// if an espeak voice id wasn't specifically set infer it from the kokoro voice, if it was override it, otherwise fallback to American English.
std::string espeak_voice_id = config->espeak_voice_id;
if (espeak_voice_id.empty()) {
espeak_voice_id = !config->voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(config->voice.at(0)) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[config->voice.at(0)] : "gmw/en-US";
}
struct phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
struct kokoro_runner * runner = new kokoro_runner(model, kctx, spt, duration_runner, phmzr);
phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
kokoro_runner * runner = new kokoro_runner(move(model), kctx, spt, duration_runner, phmzr);

// TODO: change this weight assignment pattern to mirror llama.cpp
for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
Expand All @@ -69,7 +69,7 @@ struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * wei
ggml_free(weight_ctx);
runner->arch = arch;

return (tts_runner*)runner;
return runner;
}

struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
Expand All @@ -94,7 +94,7 @@ struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight
ggml_free(weight_ctx);
runner->arch = arch;

return (tts_runner*)runner;
return runner;
}

// currently only metal and cpu devices are supported, so cpu_only only describes whether or not to try to load and run on metal.
Expand Down Expand Up @@ -146,8 +146,14 @@ int generate(tts_runner * runner, std::string sentence, struct tts_response * re
}

void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) {
int n_threads = ((parler_tts_runner*)runner)->pctx->n_threads;
((parler_tts_runner*)runner)->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
const auto parler{dynamic_cast<parler_tts_runner *>(runner)};
if (!parler) {
fprintf(stderr, "Wrong model for conditional prompt\n");
return;
}

const int n_threads = parler->pctx->n_threads;
parler->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
}

bool kokoro_is_f16_compatible(std::string name) {
Expand Down
1 change: 1 addition & 0 deletions src/util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <stdarg.h>
#ifdef __APPLE__
#include <sys/sysctl.h>
Expand Down
13 changes: 3 additions & 10 deletions src/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,9 @@
#define util_h

#include <functional>
#include <math.h>
#include <cmath>
#include <random>
#include <stdio.h>
#include <string>
#include <cstring>
#include <vector>
#include <cstdio>
#include <stdint.h>
#include <sys/types.h>
#include "ggml-metal.h"
Expand All @@ -17,9 +14,7 @@
#include "ggml.h"
#include "ggml-impl.h"
#include "ggml-cpp.h"

#define TTS_ABORT(...) tts_abort(__FILE__, __LINE__, __VA_ARGS__)
#define TTS_ASSERT(x) if (!(x)) TTS_ABORT("TTS_ASSERT(%s) failed", #x)
#include "imports.h"

struct model_tensor_meta {
uint32_t n_tensors = 0;
Expand Down Expand Up @@ -60,6 +55,4 @@ std::vector<std::string> split(std::string target, const char split_on, bool inc
std::string strip(std::string target, std::string vals = " ");
std::string replace_any(std::string target, std::string to_replace, std::string replacement);

[[noreturn]] void tts_abort(const char * file, int line, const char * fmt, ...);

#endif