
Commit e8dbe04

llama : introduce llama_sampling_params
1 parent ab5a99e commit e8dbe04

File tree

23 files changed: 297 additions, 183 deletions

common/common.cpp

Lines changed: 13 additions & 17 deletions
@@ -252,7 +252,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
     const std::string arg_prefix = "--";
-    llama_sampling_params & sparams = params.sparams;
+    auto & sparams = params.sparams;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -320,7 +320,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
     const char split_delim = ',';
 
-    llama_sampling_params & sparams = params.sparams;
+    auto & sparams = params.sparams;
 
     if (arg == "-s" || arg == "--seed") {
         CHECK_ARG
@@ -1039,7 +1039,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         return true;
     }
     if (arg == "--ignore-eos") {
-        params.ignore_eos = true;
+        sparams.ignore_eos = true;
         return true;
     }
     if (arg == "--penalize-nl") {
@@ -1054,7 +1054,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         std::string value_str;
         try {
             if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                sparams.logit_bias.push_back({key, bias});
             }
             else {
                 throw std::exception();
@@ -1401,7 +1402,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif
 
 void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    const llama_sampling_params & sparams = params.sparams;
+    const auto & sparams = params.sparams;
 
     std::string sampler_type_chars;
     std::string sampler_type_names;
@@ -2165,8 +2166,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
-    if (params.ignore_eos) {
-        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+    if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
+        fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        params.sparams.ignore_eos = false;
     }
 
     if (params.warmup) {
@@ -3142,7 +3144,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 
 void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
                                const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const llama_sampling_params & sparams = params.sparams;
+    const auto & sparams = params.sparams;
 
     fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
     fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
@@ -3205,10 +3207,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
     fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
     fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-
-    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
-    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
-    fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
+    fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
 
     yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
     fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
@@ -3219,11 +3218,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
 
     fprintf(stream, "logit_bias:\n");
-    for (std::pair<llama_token, float> lb : sparams.logit_bias) {
-        if (ignore_eos && lb.first == logit_bias_eos->first) {
-            continue;
-        }
-        fprintf(stream, " %d: %f", lb.first, lb.second);
+    for (const auto & logit_bias : sparams.logit_bias) {
+        fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
     }
 
     fprintf(stream, "lora:\n");

common/common.h

Lines changed: 1 addition & 3 deletions
@@ -108,8 +108,7 @@ struct gpt_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    // // sampling parameters
-    struct llama_sampling_params sparams;
+    struct gpt_sampling_params sparams;
 
     std::string model = ""; // model path
     std::string model_draft = ""; // draft model for speculative decoding
@@ -173,7 +172,6 @@ struct gpt_params {
     bool flash_attn = false; // flash attention
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos = false; // ignore generated EOS tokens
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory

common/sampling.cpp

Lines changed: 49 additions & 15 deletions
@@ -1,19 +1,49 @@
 #include "sampling.h"
 
-#include <random>
+#include "common.h"
 
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, const struct llama_model * model) {
+struct llama_sampling_context * llama_sampling_init(const struct gpt_sampling_params & params, const struct llama_model * model) {
     struct llama_sampling_context * result = new llama_sampling_context();
 
     result->params = params;
-    result->smpl = llama_sampling_init(model, params.grammar.c_str(), "root");
+
+    {
+        auto lp = llama_sampling_default_params();
+
+        lp.seed              = params.seed;
+        lp.n_prev            = params.n_prev;
+        lp.n_probs           = params.n_probs;
+        lp.min_keep          = params.min_keep;
+        lp.top_k             = params.top_k;
+        lp.top_p             = params.top_p;
+        lp.min_p             = params.min_p;
+        lp.tfs_z             = params.tfs_z;
+        lp.typical_p         = params.typical_p;
+        lp.temp              = params.temp;
+        lp.dynatemp_range    = params.dynatemp_range;
+        lp.dynatemp_exponent = params.dynatemp_exponent;
+        lp.penalty_last_n    = params.penalty_last_n;
+        lp.penalty_repeat    = params.penalty_repeat;
+        lp.penalty_freq      = params.penalty_freq;
+        lp.penalty_present   = params.penalty_present;
+        lp.mirostat          = params.mirostat;
+        lp.mirostat_tau      = params.mirostat_tau;
+        lp.mirostat_eta      = params.mirostat_eta;
+        lp.penalize_nl       = params.penalize_nl;
+        lp.ignore_eos        = params.ignore_eos;
+
+        result->smpl = llama_sampling_init(model, lp);
+
+        llama_sampling_set_rng_seed  (result->smpl, params.seed);
+        llama_sampling_set_grammar   (result->smpl, params.grammar.c_str(), "root");
+        llama_sampling_set_cfg       (result->smpl, params.cfg_negative_prompt.c_str(), params.cfg_scale);
+        llama_sampling_set_logit_bias(result->smpl, params.logit_bias.size(), params.logit_bias.data());
+    }
 
     result->prev.resize(params.n_prev);
 
     result->n_valid = 0;
 
-    llama_sampling_set_rng_seed(result->smpl, params.seed);
-
     return result;
 }
 
@@ -24,7 +54,7 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
 }
 
 void llama_sampling_reset(llama_sampling_context * ctx) {
-    llama_sampling_reset(ctx->smpl, ctx->params.grammar.c_str(), "root");
+    llama_sampling_reset(ctx->smpl);
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
@@ -58,7 +88,7 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
     return result;
 }
 
-std::string llama_sampling_print(const llama_sampling_params & params) {
+std::string llama_sampling_print(const gpt_sampling_params & params) {
     char result[1024];
 
     snprintf(result, sizeof(result),
@@ -72,7 +102,7 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
     return std::string(result);
 }
 
-std::string llama_sampling_order_print(const llama_sampling_params & params) {
+std::string llama_sampling_order_print(const gpt_sampling_params & params) {
     std::string result = "CFG -> Penalties ";
     if (params.mirostat == 0) {
         for (auto sampler_type : params.samplers_sequence) {
@@ -176,7 +206,7 @@ static void sampler_queue(
                           size_t min_keep) {
     llama_sampling * smpl = ctx_sampling->smpl;
 
-    const llama_sampling_params & params = ctx_sampling->params;
+    const gpt_sampling_params & params = ctx_sampling->params;
 
     const float temp = params.temp;
     const float dynatemp_range = params.dynatemp_range;
@@ -217,7 +247,7 @@ static llama_token llama_sampling_sample_impl(
                   bool is_resampling) {
     llama_sampling * smpl = ctx_sampling->smpl;
 
-    const llama_sampling_params & params = ctx_sampling->params;
+    const gpt_sampling_params & params = ctx_sampling->params;
 
     const float temp = params.temp;
     const int mirostat = params.mirostat;
@@ -308,7 +338,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
                   std::vector<float> * original_logits) {
     llama_sampling * smpl = ctx_sampling->smpl;
 
-    const llama_sampling_params & params = ctx_sampling->params;
+    const gpt_sampling_params & params = ctx_sampling->params;
 
     const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
 
@@ -332,13 +362,17 @@ static llama_token_data_array llama_sampling_prepare_impl(
     }
 
     // apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
+    for (const auto & logit_bias : params.logit_bias) {
+        logits[logit_bias.token] += logit_bias.bias;
+    }
+
+    if (params.ignore_eos) {
+        logits[llama_token_eos(llama_get_model(ctx_main))] = -INFINITY;
     }
 
     if (ctx_cfg) {
         float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
-        llama_sampling_apply_guidance(smpl, logits, logits_guidance, params.cfg_scale);
+        llama_sampling_cfg(smpl, logits, logits_guidance, params.cfg_scale);
    }
 
     cur.resize(n_vocab);
@@ -350,7 +384,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };
 
     // apply penalties
-    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
+    const auto & penalty_tokens = prev;
     const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
     if (penalty_tokens_used_size) {
         const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
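
For code in common/, the practical effect is that the whole sampling configuration lives in one gpt_sampling_params struct, which llama_sampling_init() copies into the new llama_sampling_params and applies through the setter calls shown above. A rough usage sketch, assuming an already loaded model (the values are arbitrary examples, not defaults from the commit):

// Rough usage sketch of the new common-level API; assumes "model" is a loaded llama_model.
#include "common.h"
#include "sampling.h"

#include <cstdio>

static llama_sampling_context * make_sampling_ctx(const llama_model * model) {
    gpt_sampling_params sparams;

    sparams.top_k      = 40;
    sparams.top_p      = 0.95f;
    sparams.temp       = 0.8f;
    sparams.ignore_eos = true;                     // replaces the old -INFINITY EOS logit bias
    sparams.logit_bias.push_back({ 15043, 1.0f }); // bias an arbitrary token id

    fprintf(stderr, "%s\n", llama_sampling_print(sparams).c_str());

    // copies the values into llama_sampling_params and applies
    // grammar / CFG / logit biases via the setters shown above
    return llama_sampling_init(sparams, model);
}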

common/sampling.h

Lines changed: 9 additions & 15 deletions
@@ -2,9 +2,7 @@
 
 #include "llama.h"
 
-#include <random>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 // sampler types
@@ -18,7 +16,8 @@ enum class llama_sampler_type : char {
 };
 
 // sampling parameters
-typedef struct llama_sampling_params {
+typedef struct gpt_sampling_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
     int32_t n_prev = 64; // number of previous tokens to remember
     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
@@ -38,7 +37,7 @@ typedef struct llama_sampling_params {
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
     bool penalize_nl = false; // consider newlines as a repeatable token
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
+    bool ignore_eos = false;
 
     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
@@ -56,17 +55,14 @@ typedef struct llama_sampling_params {
     std::string cfg_negative_prompt; // string to help guidance
     float cfg_scale = 1.f; // how strong is guidance
 
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-
-    std::vector<llama_token> penalty_prompt_tokens;
-    bool use_penalty_prompt_tokens = false;
-} llama_sampling_params;
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+} gpt_sampling_params;
 
 // general sampler context
 // TODO: move to llama.h
 struct llama_sampling_context {
     // parameters that will be used for sampling
-    llama_sampling_params params;
+    gpt_sampling_params params;
 
     // mirostat sampler state
     float mirostat_mu;
@@ -80,10 +76,8 @@ struct llama_sampling_context {
     size_t n_valid; // Number of correct top tokens with correct probabilities.
 };
 
-#include "common.h"
-
 // Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, const struct llama_model * model);
+struct llama_sampling_context * llama_sampling_init(const struct gpt_sampling_params & params, const struct llama_model * model);
 
 void llama_sampling_free(struct llama_sampling_context * ctx);
 
@@ -102,10 +96,10 @@ llama_token llama_sampling_last(llama_sampling_context * ctx);
 std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
 
 // Print sampling parameters into a string
-std::string llama_sampling_print(const llama_sampling_params & params);
+std::string llama_sampling_print(const gpt_sampling_params & params);
 
 // Print sampling order into a string
-std::string llama_sampling_order_print(const llama_sampling_params & params);
+std::string llama_sampling_order_print(const gpt_sampling_params & params);
 
 std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);

examples/batched.swift/Sources/main.swift

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ defer {
     llama_free(context)
 }
 
-let smpl = llama_sampling_init(model, nil, nil)
+let smpl = llama_sampling_init(model, llama_sampling_default_params())
 guard smpl != nil else {
     print("Failed to initialize sampling")
     exit(1)

examples/batched/batched.cpp

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
     ctx_params.n_batch = std::max(n_predict, n_parallel);
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);
+    llama_sampling * smpl = llama_sampling_init(model, llama_sampling_default_params());
 
     if (ctx == NULL) {
         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);

examples/gritlm/gritlm.cpp

Lines changed: 1 addition & 1 deletion
@@ -172,7 +172,7 @@ int main(int argc, char * argv[]) {
     // create generation context
     llama_context * ctx = llama_new_context_with_model(model, cparams);
 
-    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);
+    llama_sampling * smpl = llama_sampling_init(model, llama_sampling_default_params());
 
     // ### Embedding/Representation ###
     // samples taken from: https://github.com/ContextualAI/gritlm#basic

examples/infill/infill.cpp

Lines changed: 2 additions & 1 deletion
@@ -103,14 +103,15 @@ static void sigint_handler(int signo) {
 
 int main(int argc, char ** argv) {
     gpt_params params;
-    llama_sampling_params & sparams = params.sparams;
     g_params = &params;
 
     if (!gpt_params_parse(argc, argv, params)) {
         gpt_params_print_usage(argc, argv, params);
         return 1;
     }
 
+    auto & sparams = params.sparams;
+
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("infill", "log"));
     LOG_TEE("Log start\n");

examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ actor LlamaContext {
         self.tokens_list = []
         self.batch = llama_batch_init(512, 0, 1)
         self.temporary_invalid_cchars = []
-        self.sampling = llama_sampling_init(context, nil, nil);
+        self.sampling = llama_sampling_init(context, llama_sampling_default_params())
     }
 
     deinit {

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_sampling_params & sparams = params.sparams;
+    auto & sparams = params.sparams;
 
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("main", "log"));
