ggml-org
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
Lines changed: 6 additions & 2 deletions
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
Lines changed: 28 additions & 28 deletions
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
Lines changed: 22 additions & 18 deletions
@@ -464,7 +464,7 @@ struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * gram
     return result;
 }
 
-void llama_grammar_sample(const struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token_data_array * candidates) {
+void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token_data_array * candidates) {
     GGML_ASSERT(grammar);
     GGML_ASSERT(vocab);
 
@@ -488,7 +488,7 @@ void llama_grammar_sample(const struct llama_grammar * grammar, const struct lla
         const llama_token id      = candidates->data[i].id;
         const std::string & piece = vocab->cache_token_to_piece.at(id);
 
-        if (llama_token_is_eog(*vocab, id)) {
+        if (llama_token_is_eog_impl(*vocab, id)) {
             if (!allow_eog) {
                 candidates->data[i].logit = -INFINITY;
             }
@@ -508,10 +508,10 @@ void llama_grammar_sample(const struct llama_grammar * grammar, const struct lla
     smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
-void llama_grammar_accept_token(struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token token) {
+void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token token) {
     const int64_t t_start_sample_us = ggml_time_us();
 
-    if (llama_token_is_eog(*vocab, token)) {
+    if (llama_token_is_eog_impl(*vocab, token)) {
         for (const auto & stack : grammar->stacks) {
             if (stack.empty()) {
                 return;
 
@@ -15,6 +15,10 @@ struct llama_grammar {
 
 struct llama_grammar * llama_get_grammar(struct llama_context * ctx);
 
+//
+// internal API
+//
+
 struct llama_grammar * llama_grammar_init_impl(
             const llama_grammar_element ** rules,
                                  size_t    n_rules,
@@ -24,13 +28,13 @@ void llama_grammar_free_impl(struct llama_grammar * grammar);
 
 struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar);
 
-void llama_grammar_sample(
+void llama_grammar_sample_impl(
         const struct llama_grammar * grammar,
           const struct llama_vocab * vocab,
        const struct llama_sampling * smpl,
             llama_token_data_array * candidates);
 
-void llama_grammar_accept_token(
+void llama_grammar_accept_token_impl(
               struct llama_grammar * grammar,
           const struct llama_vocab * vocab,
        const struct llama_sampling * smpl,
 
@@ -21,15 +21,15 @@ static void llama_log_softmax(float * array, size_t size) {
     }
 }
 
-void llama_set_rng_seed(struct llama_sampling * smpl, uint32_t seed) {
+void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed) {
     if (seed == LLAMA_DEFAULT_SEED) {
         seed = time(NULL);
     }
 
     smpl->rng.seed(seed);
 }
 
-void llama_sample_softmax(struct llama_sampling * smpl, llama_token_data_array * candidates) {
+void llama_sample_softmax_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
     GGML_ASSERT(candidates->size > 0);
 
     const int64_t t_start_sample_us = ggml_time_us();
@@ -58,7 +58,7 @@ void llama_sample_softmax(struct llama_sampling * smpl, llama_token_data_array *
     }
 }
 
-void llama_sample_top_k(struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
     // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
     // if (k >= (int32_t)candidates->size) {
     //     return;
@@ -139,12 +139,12 @@ void llama_sample_top_k(struct llama_sampling * smpl, llama_token_data_array * c
     }
 }
 
-void llama_sample_top_p(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
+void llama_sample_top_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
     if (p >= 1.0f) {
         return;
     }
 
-    llama_sample_softmax(smpl, candidates);
+    llama_sample_softmax_impl(smpl, candidates);
 
     const int64_t t_start_sample_us = ggml_time_us();
 
@@ -171,7 +171,7 @@ void llama_sample_top_p(struct llama_sampling * smpl, llama_token_data_array * c
     }
 }
 
-void llama_sample_min_p(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
+void llama_sample_min_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
     if (p <= 0.0f || !candidates->size) {
         return;
     }
@@ -232,12 +232,12 @@ void llama_sample_min_p(struct llama_sampling * smpl, llama_token_data_array * c
     }
 }
 
-void llama_sample_tail_free(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep) {
+void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep) {
     if (z >= 1.0f || candidates->size <= 2) {
         return;
     }
 
-    llama_sample_softmax((struct llama_sampling *) nullptr, candidates);
+    llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
     const int64_t t_start_sample_us = ggml_time_us();
 
     // Compute the first and second derivatives
@@ -291,15 +291,15 @@ void llama_sample_tail_free(struct llama_sampling * smpl, llama_token_data_array
     }
 }
 
-void llama_sample_typical(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
+void llama_sample_typical_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
     // Reference implementation:
     // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
     if (p >= 1.0f) {
         return;
     }
 
     // Compute the softmax of logits and calculate entropy
-    llama_sample_softmax((struct llama_sampling *) nullptr, candidates);
+    llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
 
     const int64_t t_start_sample_us = ggml_time_us();
 
@@ -355,7 +355,7 @@ void llama_sample_typical(struct llama_sampling * smpl, llama_token_data_array *
     }
 }
 
-void llama_sample_entropy(struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {
+void llama_sample_entropy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {
     const int64_t t_start_sample_us = ggml_time_us();
 
     // no need to do anything if there is only one (or zero) candidates
@@ -366,7 +366,7 @@ void llama_sample_entropy(struct llama_sampling * smpl, llama_token_data_array *
     // Calculate maximum possible entropy
     float max_entropy = -logf(1.0f / candidates->size);
 
-    llama_sample_softmax((struct llama_sampling *) nullptr, candidates);
+    llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
 
     // Calculate entropy of the softmax probabilities
     float entropy = 0.0f;
@@ -422,7 +422,7 @@ void llama_sample_entropy(struct llama_sampling * smpl, llama_token_data_array *
     }
 }
 
-void llama_sample_temp(struct llama_sampling * smpl, llama_token_data_array * candidates, float temp) {
+void llama_sample_temp_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float temp) {
     const int64_t t_start_sample_us = ggml_time_us();
 
     for (size_t i = 0; i < candidates->size; ++i) {
@@ -434,7 +434,7 @@ void llama_sample_temp(struct llama_sampling * smpl, llama_token_data_array * ca
     }
 }
 
-void llama_sample_repetition_penalties(
+void llama_sample_repetition_penalties_impl(
         struct llama_sampling * smpl,
        llama_token_data_array * candidates,
             const llama_token * last_tokens,
@@ -481,7 +481,7 @@ void llama_sample_repetition_penalties(
     }
 }
 
-void llama_sample_apply_guidance(
+void llama_sample_apply_guidance_impl(
         struct llama_sampling * smpl,
                         float * logits,
                         float * logits_guidance,
@@ -504,14 +504,14 @@ void llama_sample_apply_guidance(
     smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
-llama_token llama_sample_token_mirostat(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
+llama_token llama_sample_token_mirostat_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
     GGML_ASSERT(smpl);
 
     const int32_t n_vocab = float(smpl->n_vocab);
 
     int64_t t_start_sample_us = ggml_time_us();
 
-    llama_sample_softmax((struct llama_sampling *) nullptr, candidates);
+    llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
 
     // Estimate s_hat using the most probable m tokens
     float s_hat = 0.0;
@@ -530,9 +530,9 @@ llama_token llama_sample_token_mirostat(struct llama_sampling * smpl, llama_toke
     float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(n_vocab, -epsilon_hat)), 1 / s_hat);
 
     // Sample the next word X using top-k sampling
-    llama_sample_top_k((struct llama_sampling *) nullptr, candidates, int(k), 1);
+    llama_sample_top_k_impl((struct llama_sampling *) nullptr, candidates, int(k), 1);
     smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
-    llama_token X = llama_sample_token(smpl, candidates);
+    llama_token X = llama_sample_token_impl(smpl, candidates);
     t_start_sample_us = ggml_time_us();
 
     // Compute error as the difference between observed surprise and target surprise value
@@ -549,11 +549,11 @@ llama_token llama_sample_token_mirostat(struct llama_sampling * smpl, llama_toke
     return X;
 }
 
-llama_token llama_sample_token_mirostat_v2(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) {
+llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) {
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();
 
-    llama_sample_softmax(smpl, candidates);
+    llama_sample_softmax_impl(smpl, candidates);
 
     // Truncate the words with surprise values greater than mu
     candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
@@ -569,10 +569,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_sampling * smpl, llama_t
     }
 
     // Normalize the probabilities of the remaining words
-    llama_sample_softmax(smpl, candidates);
+    llama_sample_softmax_impl(smpl, candidates);
 
     // Sample the next word X from the remaining words
-    llama_token X = llama_sample_token(smpl, candidates);
+    llama_token X = llama_sample_token_impl(smpl, candidates);
     t_start_sample_us = ggml_time_us();
 
     // Compute error as the difference between observed surprise and target surprise value
@@ -591,7 +591,7 @@ llama_token llama_sample_token_mirostat_v2(struct llama_sampling * smpl, llama_t
     return X;
 }
 
-llama_token llama_sample_token_greedy(struct llama_sampling * smpl, llama_token_data_array * candidates) {
+llama_token llama_sample_token_greedy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
     const int64_t t_start_sample_us = ggml_time_us();
 
     // Find max element
@@ -607,11 +607,11 @@ llama_token llama_sample_token_greedy(struct llama_sampling * smpl, llama_token_
     return result;
 }
 
-llama_token llama_sample_token_with_rng(struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng) {
+llama_token llama_sample_token_with_rng_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng) {
     GGML_ASSERT(smpl);
 
     const int64_t t_start_sample_us = ggml_time_us();
-    llama_sample_softmax((struct llama_sampling *) nullptr, candidates);
+    llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
 
     std::vector<float> probs;
     probs.reserve(candidates->size);
@@ -630,6 +630,6 @@ llama_token llama_sample_token_with_rng(struct llama_sampling * smpl, llama_toke
     return result;
 }
 
-llama_token llama_sample_token(struct llama_sampling * smpl, llama_token_data_array * candidates) {
-    return llama_sample_token_with_rng(smpl, candidates, smpl->rng);
+llama_token llama_sample_token_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
+    return llama_sample_token_with_rng_impl(smpl, candidates, smpl->rng);
 }
@@ -20,18 +20,22 @@ struct llama_sampling {
 
 struct llama_sampling * llama_get_sampling(struct llama_context * ctx);
 
-void llama_set_rng_seed(struct llama_sampling * smpl, uint32_t seed);
-
-void llama_sample_softmax  (struct llama_sampling * smpl, llama_token_data_array * candidates);
-void llama_sample_top_k    (struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep);
-void llama_sample_top_p    (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_sample_min_p    (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_sample_tail_free(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep);
-void llama_sample_typical  (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_sample_entropy  (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
-void llama_sample_temp     (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
-
-void llama_sample_repetition_penalties(
+//
+// internal API
+//
+
+void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed);
+
+void llama_sample_softmax_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates);
+void llama_sample_top_k_impl    (struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep);
+void llama_sample_top_p_impl    (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
+void llama_sample_min_p_impl    (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
+void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep);
+void llama_sample_typical_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
+void llama_sample_entropy_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
+void llama_sample_temp_impl     (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
+
+void llama_sample_repetition_penalties_impl(
         struct llama_sampling * smpl,
        llama_token_data_array * candidates,
             const llama_token * last_tokens,
@@ -40,15 +44,15 @@ void llama_sample_repetition_penalties(
                         float   penalty_freq,
                         float   penalty_present);
 
-void llama_sample_apply_guidance(
+void llama_sample_apply_guidance_impl(
         struct llama_sampling * smpl,
                         float * logits,
                         float * logits_guidance,
                         float   scale);
 
-llama_token llama_sample_token_mirostat   (struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu);
-llama_token llama_sample_token_mirostat_v2(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu);
-llama_token llama_sample_token_greedy     (struct llama_sampling * smpl, llama_token_data_array * candidates);
-llama_token llama_sample_token_with_rng   (struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng);
-llama_token llama_sample_token            (struct llama_sampling * smpl, llama_token_data_array * candidates);
+llama_token llama_sample_token_mirostat_impl   (struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu);
+llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu);
+llama_token llama_sample_token_greedy_impl     (struct llama_sampling * smpl, llama_token_data_array * candidates);
+llama_token llama_sample_token_with_rng_impl   (struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng);
+llama_token llama_sample_token_impl            (struct llama_sampling * smpl, llama_token_data_array * candidates);
Original file line number	Diff line number	Diff line change
`@@ -21,15 +21,15 @@ static void llama_log_softmax(float * array, size_t size) {`
`21`	`21`	`}`
`22`	`22`	`}`
`23`	`23`
`24`		`-void llama_set_rng_seed(struct llama_sampling * smpl, uint32_t seed) {`
	`24`	`+void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed) {`
`25`	`25`	`if (seed == LLAMA_DEFAULT_SEED) {`
`26`	`26`	`seed = time(NULL);`
`27`	`27`	`}`
`28`	`28`
`29`	`29`	`smpl->rng.seed(seed);`
`30`	`30`	`}`
`31`	`31`
`32`		`-void llama_sample_softmax(struct llama_sampling * smpl, llama_token_data_array * candidates) {`
	`32`	`+void llama_sample_softmax_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {`
`33`	`33`	`GGML_ASSERT(candidates->size > 0);`
`34`	`34`
`35`	`35`	`const int64_t t_start_sample_us = ggml_time_us();`
`@@ -58,7 +58,7 @@ void llama_sample_softmax(struct llama_sampling * smpl, llama_token_data_array *`
`58`	`58`	`}`
`59`	`59`	`}`
`60`	`60`
`61`		`-void llama_sample_top_k(struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep) {`
	`61`	`+void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep) {`
`62`	`62`	`// TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast`
`63`	`63`	`// if (k >= (int32_t)candidates->size) {`
`64`	`64`	`// return;`
`@@ -139,12 +139,12 @@ void llama_sample_top_k(struct llama_sampling * smpl, llama_token_data_array * c`
`139`	`139`	`}`
`140`	`140`	`}`
`141`	`141`
`142`		`-void llama_sample_top_p(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {`
	`142`	`+void llama_sample_top_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {`
`143`	`143`	`if (p >= 1.0f) {`
`144`	`144`	`return;`
`145`	`145`	`}`
`146`	`146`
`147`		`- llama_sample_softmax(smpl, candidates);`
	`147`	`+ llama_sample_softmax_impl(smpl, candidates);`
`148`	`148`
`149`	`149`	`const int64_t t_start_sample_us = ggml_time_us();`
`150`	`150`
`@@ -171,7 +171,7 @@ void llama_sample_top_p(struct llama_sampling * smpl, llama_token_data_array * c`
`171`	`171`	`}`
`172`	`172`	`}`
`173`	`173`
`174`		`-void llama_sample_min_p(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {`
	`174`	`+void llama_sample_min_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {`
`175`	`175`	`if (p <= 0.0f \|\| !candidates->size) {`
`176`	`176`	`return;`
`177`	`177`	`}`
`@@ -232,12 +232,12 @@ void llama_sample_min_p(struct llama_sampling * smpl, llama_token_data_array * c`
`232`	`232`	`}`
`233`	`233`	`}`
`234`	`234`
`235`		`-void llama_sample_tail_free(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep) {`
	`235`	`+void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep) {`
`236`	`236`	`if (z >= 1.0f \|\| candidates->size <= 2) {`
`237`	`237`	`return;`
`238`	`238`	`}`
`239`	`239`
`240`		`- llama_sample_softmax((struct llama_sampling *) nullptr, candidates);`
	`240`	`+ llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);`
`241`	`241`	`const int64_t t_start_sample_us = ggml_time_us();`
`242`	`242`
`243`	`243`	`// Compute the first and second derivatives`
`@@ -291,15 +291,15 @@ void llama_sample_tail_free(struct llama_sampling * smpl, llama_token_data_array`
`291`	`291`	`}`
`292`	`292`	`}`
`293`	`293`
`294`		`-void llama_sample_typical(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {`
	`294`	`+void llama_sample_typical_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {`
`295`	`295`	`// Reference implementation:`
`296`	`296`	`// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr`
`297`	`297`	`if (p >= 1.0f) {`
`298`	`298`	`return;`
`299`	`299`	`}`
`300`	`300`
`301`	`301`	`// Compute the softmax of logits and calculate entropy`
`302`		`- llama_sample_softmax((struct llama_sampling *) nullptr, candidates);`
	`302`	`+ llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);`
`303`	`303`
`304`	`304`	`const int64_t t_start_sample_us = ggml_time_us();`
`305`	`305`
`@@ -355,7 +355,7 @@ void llama_sample_typical(struct llama_sampling * smpl, llama_token_data_array *`
`355`	`355`	`}`
`356`	`356`	`}`
`357`	`357`
`358`		`-void llama_sample_entropy(struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {`
	`358`	`+void llama_sample_entropy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {`
`359`	`359`	`const int64_t t_start_sample_us = ggml_time_us();`
`360`	`360`
`361`	`361`	`// no need to do anything if there is only one (or zero) candidates`
`@@ -366,7 +366,7 @@ void llama_sample_entropy(struct llama_sampling * smpl, llama_token_data_array *`
`366`	`366`	`// Calculate maximum possible entropy`
`367`	`367`	`float max_entropy = -logf(1.0f / candidates->size);`
`368`	`368`
`369`		`- llama_sample_softmax((struct llama_sampling *) nullptr, candidates);`
	`369`	`+ llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);`
`370`	`370`
`371`	`371`	`// Calculate entropy of the softmax probabilities`
`372`	`372`	`float entropy = 0.0f;`
`@@ -422,7 +422,7 @@ void llama_sample_entropy(struct llama_sampling * smpl, llama_token_data_array *`
`422`	`422`	`}`
`423`	`423`	`}`
`424`	`424`
`425`		`-void llama_sample_temp(struct llama_sampling * smpl, llama_token_data_array * candidates, float temp) {`
	`425`	`+void llama_sample_temp_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float temp) {`
`426`	`426`	`const int64_t t_start_sample_us = ggml_time_us();`
`427`	`427`
`428`	`428`	`for (size_t i = 0; i < candidates->size; ++i) {`
`@@ -434,7 +434,7 @@ void llama_sample_temp(struct llama_sampling * smpl, llama_token_data_array * ca`
`434`	`434`	`}`
`435`	`435`	`}`
`436`	`436`
`437`		`-void llama_sample_repetition_penalties(`
	`437`	`+void llama_sample_repetition_penalties_impl(`
`438`	`438`	`struct llama_sampling * smpl,`
`439`	`439`	`llama_token_data_array * candidates,`
`440`	`440`	`const llama_token * last_tokens,`
`@@ -481,7 +481,7 @@ void llama_sample_repetition_penalties(`
`481`	`481`	`}`
`482`	`482`	`}`
`483`	`483`
`484`		`-void llama_sample_apply_guidance(`
	`484`	`+void llama_sample_apply_guidance_impl(`
`485`	`485`	`struct llama_sampling * smpl,`
`486`	`486`	`float * logits,`
`487`	`487`	`float * logits_guidance,`
`@@ -504,14 +504,14 @@ void llama_sample_apply_guidance(`
`504`	`504`	`smpl->t_sample_us += ggml_time_us() - t_start_sample_us;`
`505`	`505`	`}`
`506`	`506`
`507`		`-llama_token llama_sample_token_mirostat(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {`
	`507`	`+llama_token llama_sample_token_mirostat_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {`
`508`	`508`	`GGML_ASSERT(smpl);`
`509`	`509`
`510`	`510`	`const int32_t n_vocab = float(smpl->n_vocab);`
`511`	`511`
`512`	`512`	`int64_t t_start_sample_us = ggml_time_us();`
`513`	`513`
`514`		`- llama_sample_softmax((struct llama_sampling *) nullptr, candidates);`
	`514`	`+ llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);`
`515`	`515`
`516`	`516`	`// Estimate s_hat using the most probable m tokens`
`517`	`517`	`float s_hat = 0.0;`
`@@ -530,9 +530,9 @@ llama_token llama_sample_token_mirostat(struct llama_sampling * smpl, llama_toke`
`530`	`530`	`float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(n_vocab, -epsilon_hat)), 1 / s_hat);`
`531`	`531`
`532`	`532`	`// Sample the next word X using top-k sampling`
`533`		`- llama_sample_top_k((struct llama_sampling *) nullptr, candidates, int(k), 1);`
	`533`	`+ llama_sample_top_k_impl((struct llama_sampling *) nullptr, candidates, int(k), 1);`
`534`	`534`	`smpl->t_sample_us += ggml_time_us() - t_start_sample_us;`
`535`		`- llama_token X = llama_sample_token(smpl, candidates);`
	`535`	`+ llama_token X = llama_sample_token_impl(smpl, candidates);`
`536`	`536`	`t_start_sample_us = ggml_time_us();`
`537`	`537`
`538`	`538`	`// Compute error as the difference between observed surprise and target surprise value`
`@@ -549,11 +549,11 @@ llama_token llama_sample_token_mirostat(struct llama_sampling * smpl, llama_toke`
`549`	`549`	`return X;`
`550`	`550`	`}`
`551`	`551`
`552`		`-llama_token llama_sample_token_mirostat_v2(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) {`
	`552`	`+llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) {`
`553`	`553`	`int64_t t_start_sample_us;`
`554`	`554`	`t_start_sample_us = ggml_time_us();`
`555`	`555`
`556`		`- llama_sample_softmax(smpl, candidates);`
	`556`	`+ llama_sample_softmax_impl(smpl, candidates);`
`557`	`557`
`558`	`558`	`// Truncate the words with surprise values greater than mu`
`559`	`559`	`candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {`
`@@ -569,10 +569,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_sampling * smpl, llama_t`
`569`	`569`	`}`
`570`	`570`
`571`	`571`	`// Normalize the probabilities of the remaining words`
`572`		`- llama_sample_softmax(smpl, candidates);`
	`572`	`+ llama_sample_softmax_impl(smpl, candidates);`
`573`	`573`
`574`	`574`	`// Sample the next word X from the remaining words`
`575`		`- llama_token X = llama_sample_token(smpl, candidates);`
	`575`	`+ llama_token X = llama_sample_token_impl(smpl, candidates);`
`576`	`576`	`t_start_sample_us = ggml_time_us();`
`577`	`577`
`578`	`578`	`// Compute error as the difference between observed surprise and target surprise value`
`@@ -591,7 +591,7 @@ llama_token llama_sample_token_mirostat_v2(struct llama_sampling * smpl, llama_t`
`591`	`591`	`return X;`
`592`	`592`	`}`
`593`	`593`
`594`		`-llama_token llama_sample_token_greedy(struct llama_sampling * smpl, llama_token_data_array * candidates) {`
	`594`	`+llama_token llama_sample_token_greedy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {`
`595`	`595`	`const int64_t t_start_sample_us = ggml_time_us();`
`596`	`596`
`597`	`597`	`// Find max element`
`@@ -607,11 +607,11 @@ llama_token llama_sample_token_greedy(struct llama_sampling * smpl, llama_token_`
`607`	`607`	`return result;`
`608`	`608`	`}`
`609`	`609`
`610`		`-llama_token llama_sample_token_with_rng(struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng) {`
	`610`	`+llama_token llama_sample_token_with_rng_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng) {`
`611`	`611`	`GGML_ASSERT(smpl);`
`612`	`612`
`613`	`613`	`const int64_t t_start_sample_us = ggml_time_us();`
`614`		`- llama_sample_softmax((struct llama_sampling *) nullptr, candidates);`
	`614`	`+ llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);`
`615`	`615`
`616`	`616`	`std::vector<float> probs;`
`617`	`617`	`probs.reserve(candidates->size);`
`@@ -630,6 +630,6 @@ llama_token llama_sample_token_with_rng(struct llama_sampling * smpl, llama_toke`
`630`	`630`	`return result;`
`631`	`631`	`}`
`632`	`632`
`633`		`-llama_token llama_sample_token(struct llama_sampling * smpl, llama_token_data_array * candidates) {`
`634`		`- return llama_sample_token_with_rng(smpl, candidates, smpl->rng);`
	`633`	`+llama_token llama_sample_token_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {`
	`634`	`+ return llama_sample_token_with_rng_impl(smpl, candidates, smpl->rng);`
`635`	`635`	`}`