Commit 5d4c807

minor : clean-up + comments

ggml-ci

1 parent 6420268 commit 5d4c807

File tree

8 files changed: +128 −113 lines changed


common/common.cpp

Lines changed: 1 addition & 0 deletions
@@ -360,6 +360,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
+
         bool invalid_param = false;
         if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
             throw std::invalid_argument("error: unknown argument: " + arg);
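
Note: the context lines above show the argument normalization that precedes the new blank line — long options have their underscores rewritten to dashes, so `--top_k` and `--top-k` parse identically. A standalone sketch of that normalization (not the project's code, just the same std::replace idiom):

#include <algorithm>
#include <cassert>
#include <string>

int main() {
    std::string arg = "--top_k";
    const std::string arg_prefix = "--";

    // only normalize long options, exactly as the context lines above do
    if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
        std::replace(arg.begin(), arg.end(), '_', '-');
    }

    assert(arg == "--top-k");
}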

common/sampling.cpp

Lines changed: 79 additions & 80 deletions
@@ -31,44 +31,41 @@ std::string gpt_sampling_params::print_samplers() const {

     return result;
 }
+
 struct llama_sampling * llama_sampling_init(const struct llama_model * model, const struct gpt_sampling_params & params) {
-    struct llama_sampling * result = nullptr;
-
-    {
-        auto lparams = llama_sampling_default_params();
-
-        lparams.seed              = params.seed;
-        lparams.n_prev            = params.n_prev;
-        lparams.n_probs           = params.n_probs;
-        lparams.min_keep          = params.min_keep;
-        lparams.top_k             = params.top_k;
-        lparams.top_p             = params.top_p;
-        lparams.min_p             = params.min_p;
-        lparams.tfs_z             = params.tfs_z;
-        lparams.typ_p             = params.typ_p;
-        lparams.temp              = params.temp;
-        lparams.dynatemp_range    = params.dynatemp_range;
-        lparams.dynatemp_exponent = params.dynatemp_exponent;
-        lparams.penalty_last_n    = params.penalty_last_n;
-        lparams.penalty_repeat    = params.penalty_repeat;
-        lparams.penalty_freq      = params.penalty_freq;
-        lparams.penalty_present   = params.penalty_present;
-        lparams.mirostat          = params.mirostat;
-        lparams.mirostat_tau      = params.mirostat_tau;
-        lparams.mirostat_eta      = params.mirostat_eta;
-        lparams.penalize_nl       = params.penalize_nl;
-        lparams.ignore_eos        = params.ignore_eos;
-
-        lparams.n_samplers = params.samplers.size();
-        for (int i = 0; i < lparams.n_samplers; i++) {
-            lparams.samplers[i] = params.samplers[i];
-        }
+    llama_sampling_params lparams = llama_sampling_default_params();
+
+    lparams.seed              = params.seed;
+    lparams.n_prev            = params.n_prev;
+    lparams.n_probs           = params.n_probs;
+    lparams.min_keep          = params.min_keep;
+    lparams.top_k             = params.top_k;
+    lparams.top_p             = params.top_p;
+    lparams.min_p             = params.min_p;
+    lparams.tfs_z             = params.tfs_z;
+    lparams.typ_p             = params.typ_p;
+    lparams.temp              = params.temp;
+    lparams.dynatemp_range    = params.dynatemp_range;
+    lparams.dynatemp_exponent = params.dynatemp_exponent;
+    lparams.penalty_last_n    = params.penalty_last_n;
+    lparams.penalty_repeat    = params.penalty_repeat;
+    lparams.penalty_freq      = params.penalty_freq;
+    lparams.penalty_present   = params.penalty_present;
+    lparams.mirostat          = params.mirostat;
+    lparams.mirostat_tau      = params.mirostat_tau;
+    lparams.mirostat_eta      = params.mirostat_eta;
+    lparams.penalize_nl       = params.penalize_nl;
+    lparams.ignore_eos        = params.ignore_eos;
+
+    lparams.n_samplers = params.samplers.size();
+    for (int i = 0; i < lparams.n_samplers; i++) {
+        lparams.samplers[i] = params.samplers[i];
+    }

-        result = llama_sampling_init(model, lparams);
+    struct llama_sampling * result = llama_sampling_init(model, lparams);

-        llama_sampling_set_grammar   (result, params.grammar.c_str(), "root");
-        llama_sampling_set_logit_bias(result, params.logit_bias.size(), params.logit_bias.data());
-    }
+    llama_sampling_set_grammar   (result, params.grammar.c_str(), "root");
+    llama_sampling_set_logit_bias(result, params.logit_bias.size(), params.logit_bias.data());

     return result;
 }
@@ -81,6 +78,35 @@ void llama_sampling_cp(llama_sampling * src, llama_sampling * dst) {
     dst = llama_sampling_cp(src);
 }

+llama_token llama_sampling_sample(
+        struct llama_sampling * smpl,
+        struct llama_context * ctx,
+        int idx) {
+    llama_sampling_set_logits(smpl, llama_get_logits_ith(ctx, idx));
+
+    // first, sample the token without any grammar constraints
+    const llama_token id = llama_sampling_sample(smpl, nullptr);
+
+    // create an array with a single token data element for the sampled id
+    llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
+    llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
+
+    llama_sampling_grammar(smpl, &single_token_data_array);
+
+    // check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
+    const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+    if (is_valid) {
+        return id;
+    }
+
+    // if the token is not valid, sample again, after applying the grammar constraints
+    llama_sampling_set_logits(smpl, llama_get_logits_ith(ctx, idx));
+
+    llama_sampling_grammar(smpl, nullptr);
+
+    return llama_sampling_sample(smpl, nullptr);
+}
+
 std::string llama_sampling_prev_str(llama_sampling * smpl, llama_context * ctx_main, int n) {
     n = std::min(n, llama_sampling_n_prev(smpl));

@@ -152,27 +178,27 @@ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vecto
         { "temp", LLAMA_SAMPLER_TYPE_TEMPERATURE },
     };

-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names.size());
+    std::vector<llama_sampler_type> samplers;
+    samplers.reserve(names.size());

     for (const auto & name : names) {
-        auto sampler_item = sampler_canonical_name_map.find(name);
-        if (sampler_item != sampler_canonical_name_map.end()) {
-            sampler_types.push_back(sampler_item->second);
+        auto sampler = sampler_canonical_name_map.find(name);
+        if (sampler != sampler_canonical_name_map.end()) {
+            samplers.push_back(sampler->second);
         } else {
             if (allow_alt_names) {
-                sampler_item = sampler_alt_name_map.find(name);
-                if (sampler_item != sampler_alt_name_map.end()) {
-                    sampler_types.push_back(sampler_item->second);
+                sampler = sampler_alt_name_map.find(name);
+                if (sampler != sampler_alt_name_map.end()) {
+                    samplers.push_back(sampler->second);
                 }
             }
         }
     }

-    return sampler_types;
+    return samplers;
 }

-std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
+std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & chars) {
     std::unordered_map<char, llama_sampler_type> sampler_name_map {
         { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TOP_K), LLAMA_SAMPLER_TYPE_TOP_K },
         { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TFS_Z), LLAMA_SAMPLER_TYPE_TFS_Z },
@@ -182,42 +208,15 @@ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::strin
         { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TEMPERATURE), LLAMA_SAMPLER_TYPE_TEMPERATURE }
     };

-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names_string.size());
-    for (const auto & c : names_string) {
-        const auto sampler_item = sampler_name_map.find(c);
-        if (sampler_item != sampler_name_map.end()) {
-            sampler_types.push_back(sampler_item->second);
-        }
-    }
-    return sampler_types;
-}
+    std::vector<llama_sampler_type> samplers;
+    samplers.reserve(chars.size());

-llama_token llama_sampling_sample(
-        struct llama_sampling * smpl,
-        struct llama_context * ctx,
-        int idx) {
-    llama_sampling_set_logits(smpl, llama_get_logits_ith(ctx, idx));
-
-    // first, sample the token without any grammar constraints
-    auto id = llama_sampling_sample(smpl, nullptr);
-
-    // create an array with a single token data element for the sampled id
-    llama_token_data single_token_data = {id, 1.0f, 0.0f};
-    llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
-
-    llama_sampling_grammar(smpl, &single_token_data_array);
-
-    // check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
-    const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-    if (is_valid) {
-        return id;
+    for (const auto & c : chars) {
+        const auto sampler = sampler_name_map.find(c);
+        if (sampler != sampler_name_map.end()) {
+            samplers.push_back(sampler->second);
+        }
     }

-    // if the token is not valid, sample again, after applying the grammar constraints
-    llama_sampling_set_logits(smpl, llama_get_logits_ith(ctx, idx));
-
-    llama_sampling_grammar(smpl, nullptr);
-
-    return llama_sampling_sample(smpl, nullptr);
+    return samplers;
 }
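
The relocated llama_sampling_sample is the single entry point callers now use. A minimal sketch of a generation loop built on it, assuming a prepared model/context/batch with caller-side counters (n_past, n_decoded, n_predict), and assuming llama_sampling_accept (to update penalty and grammar state) and llama_token_is_eog behave as elsewhere in this codebase:

while (n_decoded < n_predict) {
    // fast path first: sample unconstrained; fall back to the grammar-filtered
    // distribution only if the sampled token violates the grammar
    const llama_token id = llama_sampling_sample(smpl, ctx, batch.n_tokens - 1);

    if (llama_token_is_eog(model, id)) {
        break;
    }

    // assumption: accepting the token keeps repetition penalties and the
    // grammar parser state in sync for the next step
    llama_sampling_accept(smpl, id, /*apply_grammar=*/true);

    // feed the sampled token back for the next decode step
    llama_batch_clear(batch);
    llama_batch_add(batch, id, n_past++, { 0 }, true);
    llama_decode(ctx, batch);

    n_decoded++;
}

The two-pass structure pays for a full-vocabulary grammar pass only on rejection; when the unconstrained sample already satisfies the grammar, the single-token check is all that runs.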

common/sampling.h

Lines changed: 16 additions & 7 deletions
@@ -39,7 +39,7 @@ typedef struct gpt_sampling_params {
         LLAMA_SAMPLER_TYPE_TEMPERATURE
     };

-    std::string grammar; // optional BNF-like grammar to constrain sampling
+    std::string grammar;      // optional BNF-like grammar to constrain sampling

     std::vector<llama_logit_bias> logit_bias; // logit biases to apply

@@ -55,16 +55,25 @@ struct llama_sampling * llama_sampling_init(const struct llama_model * model, co

 void llama_sampling_cp(llama_sampling * src, llama_sampling * dst);

+// common sampling implementation:
+//
+// - set logits
+// - apply the configured sampling constraints
+// - check if the token fits the grammar (if any)
+// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
+//
+llama_token llama_sampling_sample(
+        struct llama_sampling * smpl,
+        struct llama_context * ctx,
+        int idx);
+
+// helpers
+
 // get a string representation of the last accepted tokens
 std::string llama_sampling_prev_str(llama_sampling * smpl, llama_context * ctx, int n);

 char        llama_sampling_type_to_chr(enum llama_sampler_type sampler_type);
 std::string llama_sampling_type_to_str(enum llama_sampler_type sampler_type);

 std::vector<enum llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<enum llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
-
-llama_token llama_sampling_sample(
-        struct llama_sampling * smpl,
-        struct llama_context * ctx,
-        int idx);
+std::vector<enum llama_sampler_type> llama_sampling_types_from_chars(const std::string & chars);
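
With the renames, the two helpers read as a pair: from_names parses full sampler names (optionally accepting alternative spellings), while from_chars parses the compact one-character codes produced by llama_sampling_type_to_chr. A hedged usage sketch — the exact character codes and name strings below are assumptions inferred from the map entries in sampling.cpp (e.g. "temp" for temperature):

// e.g. a CLI flag that specifies the sampler chain one character per stage
const std::vector<enum llama_sampler_type> chain =
    llama_sampling_types_from_chars("kpt"); // assumed: top-k, top-p, temperature

// or by name, tolerating alternative spellings
const std::vector<enum llama_sampler_type> chain2 =
    llama_sampling_types_from_names({ "temp" }, /*allow_alt_names=*/true);

Unrecognized characters or names are silently skipped by both helpers, so a typo shortens the chain rather than failing.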

examples/gritlm/gritlm.cpp

Lines changed: 8 additions & 4 deletions
@@ -109,14 +109,18 @@ static std::string generate(llama_context * ctx, llama_sampling * smpl, const st

     while (true) {
         llama_batch_clear(bat);
-        auto n_inputs = (int32_t)inputs.size();
-        for (int32_t i = 0; i < n_inputs; i++) {
-            llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+        {
+            const int32_t n_inputs = inputs.size();
+
+            for (int32_t i = 0; i < n_inputs; i++) {
+                llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+            }
         }
         inputs.clear();

         llama_decode(ctx, bat);
-        auto * logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
+
+        const auto * logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);

         llama_sampling_set_logits(smpl, logits);

examples/llama.android/llama/src/main/cpp/llama-android.cpp

Lines changed: 3 additions & 3 deletions
@@ -120,7 +120,8 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
     LOGi("Using %d threads", n_threads);

     llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.n_ctx = 2048;
+
+    ctx_params.n_ctx           = 2048;
     ctx_params.n_threads       = n_threads;
     ctx_params.n_threads_batch = n_threads;

@@ -393,8 +394,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
     if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");

-    auto n_vocab = llama_n_vocab(model);
-    auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
+    const auto * logits = llama_get_logits_ith(context, batch->n_tokens - 1);

     llama_sampling_set_logits(sampling, logits);

examples/server/server.cpp

Lines changed: 2 additions & 0 deletions
@@ -2356,6 +2356,8 @@ struct server_context {

             const auto * cur_p = llama_sampling_get_candidates(slot.smpl);

+            // TODO: this logic might have been broken during https://github.com/ggerganov/llama.cpp/pull/8643
+            //       fix if necessary
             for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                 result.probs.push_back({
                     cur_p->data[i].id,
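
A defensive variant of the loop the TODO points at would clamp the iteration count to the candidates actually present — a sketch under the assumption that cur_p->size reflects the candidate count after sampling:

// assumption: the candidates array may hold fewer than n_probs entries
const size_t n_probs = std::min((size_t) slot.sparams.n_probs, cur_p->size);

for (size_t i = 0; i < n_probs; ++i) {
    // push cur_p->data[i] into result.probs as before
}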

include/llama.h

Lines changed: 1 addition & 1 deletion
@@ -439,8 +439,8 @@ extern "C" {
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params          llama_model_default_params(void);
     LLAMA_API struct llama_context_params        llama_context_default_params(void);
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
     LLAMA_API struct llama_sampling_params       llama_sampling_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);

     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations

src/llama.cpp

Lines changed: 18 additions & 18 deletions
@@ -17399,24 +17399,6 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }

-struct llama_model_quantize_params llama_model_quantize_default_params() {
-    struct llama_model_quantize_params result = {
-        /*.nthread                =*/ 0,
-        /*.ftype                  =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
-        /*.output_tensor_type     =*/ GGML_TYPE_COUNT,
-        /*.token_embedding_type   =*/ GGML_TYPE_COUNT,
-        /*.allow_requantize       =*/ false,
-        /*.quantize_output_tensor =*/ true,
-        /*.only_copy              =*/ false,
-        /*.pure                   =*/ false,
-        /*.keep_split             =*/ false,
-        /*.imatrix                =*/ nullptr,
-        /*.kv_overrides           =*/ nullptr,
-    };
-
-    return result;
-}
-
 struct llama_sampling_params llama_sampling_default_params() {
     struct llama_sampling_params result = {
         /*.seed =*/ LLAMA_DEFAULT_SEED,
@@ -17447,6 +17429,24 @@ struct llama_sampling_params llama_sampling_default_params() {
     return result;
 }

+struct llama_model_quantize_params llama_model_quantize_default_params() {
+    struct llama_model_quantize_params result = {
+        /*.nthread                =*/ 0,
+        /*.ftype                  =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.output_tensor_type     =*/ GGML_TYPE_COUNT,
+        /*.token_embedding_type   =*/ GGML_TYPE_COUNT,
+        /*.allow_requantize       =*/ false,
+        /*.quantize_output_tensor =*/ true,
+        /*.only_copy              =*/ false,
+        /*.pure                   =*/ false,
+        /*.keep_split             =*/ false,
+        /*.imatrix                =*/ nullptr,
+        /*.kv_overrides           =*/ nullptr,
+    };
+
+    return result;
+}
+
 size_t llama_max_devices(void) {
 #if defined(GGML_USE_RPC)
     return GGML_RPC_MAX_SERVERS;