
Commit 97731bf

cont : simplify common/sampling
ggml-ci
1 parent 694c4b1 commit 97731bf

File tree

12 files changed: +68 -127 lines changed


common/sampling.cpp

Lines changed: 35 additions & 60 deletions
@@ -2,6 +2,35 @@
 
 #include "common.h"
 
+std::string gpt_sampling_params::print_all() const {
+    char result[1024];
+
+    snprintf(result, sizeof(result),
+            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
+            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
+            top_k, tfs_z, top_p, min_p, typ_p, temp,
+            mirostat, mirostat_eta, mirostat_tau);
+
+    return std::string(result);
+}
+
+std::string gpt_sampling_params::print_samplers() const {
+    std::string result = "CFG -> Penalties ";
+    if (mirostat == 0) {
+        for (const auto & sampler : samplers) {
+            const auto name = llama_sampling_type_to_str(sampler);
+            if (!name.empty()) {
+                result += "-> " + name + " ";
+            }
+        }
+    } else {
+        result += "-> mirostat ";
+    }
+
+    return result;
+}
+
 struct llama_sampling_context * llama_sampling_init(const struct llama_model * model, const struct gpt_sampling_params & params) {
     struct llama_sampling_context * result = new llama_sampling_context();
 
@@ -52,10 +81,6 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
     delete ctx;
 }
 
-void llama_sampling_reset(llama_sampling_context * ctx) {
-    llama_sampling_reset(ctx->smpl);
-}
-
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
     if (dst->smpl) {
         llama_sampling_free(dst->smpl);
@@ -89,38 +114,8 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
     return result;
 }
 
-std::string llama_sampling_print(const gpt_sampling_params & params) {
-    char result[1024];
-
-    snprintf(result, sizeof(result),
-            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
-            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
-            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typ_p, params.temp,
-            params.mirostat, params.mirostat_eta, params.mirostat_tau);
-
-    return std::string(result);
-}
-
-std::string llama_sampling_order_print(const gpt_sampling_params & params) {
-    std::string result = "CFG -> Penalties ";
-    if (params.mirostat == 0) {
-        for (auto sampler_type : params.samplers) {
-            const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
-            if (!sampler_type_name.empty()) {
-                result += "-> " + sampler_type_name + " ";
-            }
-        }
-    } else {
-        result += "-> mirostat ";
-    }
-
-    return result;
-}
-
-char llama_sampling_type_to_chr(llama_sampler_type sampler_type) {
-    switch (sampler_type) {
+char llama_sampling_type_to_chr(llama_sampler_type sampler) {
+    switch (sampler) {
         case LLAMA_SAMPLER_TYPE_TOP_K: return 'k';
         case LLAMA_SAMPLER_TYPE_TFS_Z: return 'f';
         case LLAMA_SAMPLER_TYPE_TYPICAL_P: return 'y';
@@ -131,8 +126,8 @@ char llama_sampling_type_to_chr(llama_sampler_type sampler_type) {
     }
 }
 
-std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
-    switch (sampler_type) {
+std::string llama_sampling_type_to_str(llama_sampler_type sampler) {
+    switch (sampler) {
        case LLAMA_SAMPLER_TYPE_TOP_K: return "top_k";
        case LLAMA_SAMPLER_TYPE_TFS_Z: return "tfs_z";
        case LLAMA_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
@@ -210,35 +205,15 @@ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::strin
     return sampler_types;
 }
 
-void llama_sampling_prepare(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        int idx) {
-    llama_sampling_set_logits(ctx_sampling->smpl, llama_get_logits_ith(ctx_main, idx));
-}
-
-static llama_token llama_sampling_sample(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_token_data_array * cur_p) {
-    return llama_sampling_sample(ctx_sampling->smpl, cur_p);
-}
-
 llama_token llama_sampling_sample(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
         int idx) {
-    llama_sampling_prepare(ctx_sampling, ctx_main, idx);
+    llama_sampling_set_logits(ctx_sampling->smpl, llama_get_logits_ith(ctx_main, idx));
 
     auto * cur_p = llama_sampling_get_candidates(ctx_sampling->smpl);
 
     llama_sampling_grammar(ctx_sampling->smpl, cur_p);
 
-    return llama_sampling_sample(ctx_sampling, cur_p);
-}
-
-void llama_sampling_accept(
-        struct llama_sampling_context * ctx_sampling,
-        llama_token id,
-        bool apply_grammar) {
-    llama_sampling_accept(ctx_sampling->smpl, id, apply_grammar);
+    return llama_sampling_sample(ctx_sampling->smpl, cur_p);
 }
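
For reference, a minimal sketch (not part of this commit) of how an example program might log the sampling configuration with the two new gpt_sampling_params member functions instead of the removed llama_sampling_print / llama_sampling_order_print free functions. It assumes the common headers from this repository and the LOG_TEE macro used by the examples; the helper name log_sampling_config is hypothetical.

// Hypothetical helper: logs the sampling setup using the new member functions.
#include "common.h"
#include "sampling.h"

static void log_sampling_config(const gpt_sampling_params & sparams) {
    // numeric parameters (penalties, top_k, top_p, mirostat, ...)
    LOG_TEE("sampling params: \n%s\n", sparams.print_all().c_str());
    // the configured sampler chain, e.g. "CFG -> Penalties -> top_k -> ..."
    LOG_TEE("sampling order: \n%s\n", sparams.print_samplers().c_str());
}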

common/sampling.h

Lines changed: 6 additions & 42 deletions
@@ -42,6 +42,12 @@ typedef struct gpt_sampling_params {
     std::string grammar; // optional BNF-like grammar to constrain sampling
 
     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print_all() const;
+
+    // print the samplers into a string
+    std::string print_samplers() const;
 } gpt_sampling_params;
 
 // general sampler context
@@ -58,11 +64,6 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_model * m
 
 void llama_sampling_free(struct llama_sampling_context * ctx);
 
-// Reset the sampler context
-// - clear prev tokens
-// - reset grammar
-void llama_sampling_reset(llama_sampling_context * ctx);
-
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
 
@@ -72,50 +73,13 @@ llama_token llama_sampling_last(llama_sampling_context * ctx);
 // Get a string representation of the last accepted tokens
 std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
 
-// Print sampling parameters into a string
-std::string llama_sampling_print(const gpt_sampling_params & params);
-
-// Print sampling order into a string
-std::string llama_sampling_order_print(const gpt_sampling_params & params);
-
 char llama_sampling_type_to_chr(llama_sampler_type sampler_type);
 std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
 
 std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
 
-// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
-void llama_sampling_prepare(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        int idx);
-
-// this is a common sampling function used across the examples for convenience
-// it can serve as a starting point for implementing your own sampling function
-// Note: When using multiple sequences, it is the caller's responsibility to call
-//       llama_sampling_reset when a sequence ends
-//
-// required:
-//  - ctx_main:     context to use for sampling
-//  - ctx_sampling: sampling-specific context
-//
-// optional:
-//  - idx: sample from llama_get_logits_ith(ctx, idx)
-//
-// returns:
-//  - token:      sampled token
-//  - candidates: vector of candidate tokens
-//
-//llama_token llama_sampling_sample(
-//        struct llama_sampling_context * ctx_sampling,
-//        struct llama_token_data_array * cur_p);
-
 llama_token llama_sampling_sample(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
         int idx = -1);
-
-void llama_sampling_accept(
-        struct llama_sampling_context * ctx_sampling,
-        llama_token id,
-        bool apply_grammar);

examples/infill/infill.cpp

Lines changed: 4 additions & 4 deletions
@@ -301,7 +301,7 @@ int main(int argc, char ** argv) {
             LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
-    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
+    LOG_TEE("sampling: \n%s\n", sparams.print_all().c_str());
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     LOG_TEE("\n\n");
 
@@ -419,7 +419,7 @@ int main(int argc, char ** argv) {
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
             const llama_token id = llama_sampling_sample(ctx_sampling, ctx);
 
-            llama_sampling_accept(ctx_sampling, id, true);
+            llama_sampling_accept(ctx_sampling->smpl, id, true);
 
             // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev.to_vector()).c_str());
 
@@ -440,7 +440,7 @@ int main(int argc, char ** argv) {
 
             // push the prompt in the sampling context in order to apply repetition penalties later
             // for the prompt, we don't apply grammar rules
-            llama_sampling_accept(ctx_sampling, embd_inp[n_consumed], false);
+            llama_sampling_accept(ctx_sampling->smpl, embd_inp[n_consumed], false);
 
             ++n_consumed;
             if ((int) embd.size() >= params.n_batch) {
@@ -611,7 +611,7 @@ int main(int argc, char ** argv) {
 
         if (n_past > 0) {
             if (is_interacting) {
-                llama_sampling_reset(ctx_sampling);
+                llama_sampling_reset(ctx_sampling->smpl);
             }
             is_interacting = false;
         }

examples/llava/llava-cli.cpp

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
                            struct llama_context * ctx_llama,
                            int * n_past) {
     const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama);
-    llama_sampling_accept(ctx_sampling, id, true);
+    llama_sampling_accept(ctx_sampling->smpl, id, true);
     static std::string ret;
     if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
         ret = "</s>";

examples/llava/minicpmv-cli.cpp

Lines changed: 1 addition & 1 deletion
@@ -167,7 +167,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
                            struct llama_context * ctx_llama,
                            int * n_past) {
     const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama);
-    llama_sampling_accept(ctx_sampling, id, true);
+    llama_sampling_accept(ctx_sampling->smpl, id, true);
     static std::string ret;
     if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
         ret = "</s>";

examples/lookahead/lookahead.cpp

Lines changed: 2 additions & 2 deletions
@@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
     {
         id = llama_sampling_sample(ctx_sampling, ctx, 0);
 
-        llama_sampling_accept(ctx_sampling, id, true);
+        llama_sampling_accept(ctx_sampling->smpl, id, true);
 
         {
             const std::string token_str = llama_token_to_piece(ctx, id);
@@ -285,7 +285,7 @@ int main(int argc, char ** argv) {
             // sample the next token
             id = llama_sampling_sample(ctx_sampling, ctx, i_batch);
 
-            llama_sampling_accept(ctx_sampling, id, true);
+            llama_sampling_accept(ctx_sampling->smpl, id, true);
 
             // print
             {

examples/lookup/lookup.cpp

Lines changed: 1 addition & 1 deletion
@@ -130,7 +130,7 @@ int main(int argc, char ** argv){
         // sample from the target model
         llama_token id = llama_sampling_sample(ctx_sampling, ctx, i_dft);
 
-        llama_sampling_accept(ctx_sampling, id, true);
+        llama_sampling_accept(ctx_sampling->smpl, id, true);
 
         const std::string token_str = llama_token_to_piece(ctx, id);
 

examples/main/main.cpp

Lines changed: 5 additions & 5 deletions
@@ -426,8 +426,8 @@ int main(int argc, char ** argv) {
             }
         }
     }
-    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
-    LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
+    LOG_TEE("sampling params: \n%s\n", sparams.print_all().c_str());
+    LOG_TEE("sampling order: \n%s\n", sparams.print_samplers().c_str());
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
     // group-attention state
@@ -652,7 +652,7 @@ int main(int argc, char ** argv) {
 
             const llama_token id = llama_sampling_sample(ctx_sampling, ctx);
 
-            llama_sampling_accept(ctx_sampling, id, /* apply_grammar= */ true);
+            llama_sampling_accept(ctx_sampling->smpl, id, /* apply_grammar= */ true);
 
             // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev.to_vector()).c_str());
 
@@ -673,7 +673,7 @@ int main(int argc, char ** argv) {
 
             // push the prompt in the sampling context in order to apply repetition penalties later
             // for the prompt, we don't apply grammar rules
-            llama_sampling_accept(ctx_sampling, embd_inp[n_consumed], /* apply_grammar= */ false);
+            llama_sampling_accept(ctx_sampling->smpl, embd_inp[n_consumed], /* apply_grammar= */ false);
 
             ++n_consumed;
             if ((int) embd.size() >= params.n_batch) {
@@ -872,7 +872,7 @@ int main(int argc, char ** argv) {
 
         if (n_past > 0) {
             if (is_interacting) {
-                llama_sampling_reset(ctx_sampling);
+                llama_sampling_reset(ctx_sampling->smpl);
            }
             is_interacting = false;
         }

examples/parallel/parallel.cpp

Lines changed: 2 additions & 2 deletions
@@ -253,7 +253,7 @@ int main(int argc, char ** argv) {
             client.prompt = client.input + "\nAssistant:";
             client.response = "";
 
-            llama_sampling_reset(client.ctx_sampling);
+            llama_sampling_reset(client.ctx_sampling->smpl);
 
             // do not prepend BOS because we have a system prompt!
             std::vector<llama_token> tokens_prompt;
@@ -343,7 +343,7 @@ int main(int argc, char ** argv) {
 
            const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, client.i_batch - i);
 
-            llama_sampling_accept(client.ctx_sampling, id, true);
+            llama_sampling_accept(client.ctx_sampling->smpl, id, true);
 
            if (client.n_decoded == 1) {
                // start measuring generation time after the first token to make sure all concurrent clients

examples/server/server.cpp

Lines changed: 4 additions & 4 deletions
@@ -2098,7 +2098,7 @@ struct server_context {
                     GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                 }
 
-                llama_sampling_reset(slot.ctx_sampling);
+                llama_sampling_reset(slot.ctx_sampling->smpl);
 
                 if (!slot.params.cache_prompt) {
                     slot.n_past_se = 0;
@@ -2111,7 +2111,7 @@ struct server_context {
 
                     // push the prompt into the sampling context (do not apply grammar)
                     for (int i = 0; i < slot.n_past; ++i) {
-                        llama_sampling_accept(slot.ctx_sampling, slot.cache_tokens[i], false);
+                        llama_sampling_accept(slot.ctx_sampling->smpl, slot.cache_tokens[i], false);
                     }
                 }
             }
@@ -2164,7 +2164,7 @@ struct server_context {
                         slot.n_past_se = 0;
                         slot.ga_i = 0;
                         // TODO: is the system prompt ever in the sampling context?
-                        llama_sampling_reset(slot.ctx_sampling);
+                        llama_sampling_reset(slot.ctx_sampling->smpl);
                     }
 
                     // remove the non-common part from the cache
@@ -2343,7 +2343,7 @@ struct server_context {
             completion_token_output result;
             const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
 
-            llama_sampling_accept(slot.ctx_sampling, id, true);
+            llama_sampling_accept(slot.ctx_sampling->smpl, id, true);
 
             slot.n_decoded += 1;
             if (slot.n_decoded == 1) {
