wip [no ci]

ggerganov · ggerganov · commit 201a190cfdb7 · 2024-07-26T15:02:03.000+03:00
diff --git a/include/llama.h b/include/llama.h
@@ -55,6 +55,7 @@ extern "C" {
     // TODO: show sample usage
     //
 
+    // struct llama_vocab; // TODO: add in the future
     struct llama_model;
     struct llama_context;
 
@@ -423,24 +424,23 @@ extern "C" {
     LLAMA_API bool llama_supports_mlock      (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
 
-    LLAMA_API const struct llama_model    * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API       struct llama_sampling * llama_get_sampling(      struct llama_context * ctx);
-
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
 
-    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
-
-    LLAMA_API enum llama_vocab_type   llama_vocab_type  (const struct llama_model * model);
-    LLAMA_API enum llama_rope_type    llama_rope_type   (const struct llama_model * model);
-
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
     LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
 
+    LLAMA_API const struct llama_model    * llama_get_model   (const struct llama_context * ctx);
+    LLAMA_API       struct llama_sampling * llama_get_sampling(      struct llama_context * ctx);
+
+    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API enum llama_vocab_type   llama_vocab_type  (const struct llama_model * model);
+    LLAMA_API enum llama_rope_type    llama_rope_type   (const struct llama_model * model);
+
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
@@ -967,36 +967,16 @@ extern "C" {
     //
 
     // TODO: args become llama_sampling_params
-    LLAMA_API struct llama_sampling * llama_sampling_init(int32_t n_vocab, const char * grammar_str, const char * grammar_root);
+    // TODO: llama_model should become llama_vocab
+    LLAMA_API struct llama_sampling * llama_sampling_init(const struct llama_model * model, const char * grammar_str, const char * grammar_root);
 
     LLAMA_API void llama_sampling_free(struct llama_sampling * smpl);
 
-    LLAMA_API struct llama_sampling * llama_sampling_cp(const struct llama_grammar * grammar);
+    LLAMA_API struct llama_sampling * llama_sampling_cp(const struct llama_sampling * smpl);
 
     // Sets the current rng seed.
     LLAMA_API void llama_sampling_set_rng_seed(struct llama_sampling * smpl, uint32_t seed);
 
-    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sampling_repetition_penalties(
-            struct llama_sampling * smpl,
-           llama_token_data_array * candidates,
-                const llama_token * last_tokens,
-                           size_t   penalty_last_n,
-                            float   penalty_repeat,
-                            float   penalty_freq,
-                            float   penalty_present);
-
-    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param logits Logits extracted from the original generation context.
-    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sampling_apply_guidance(
-            struct llama_sampling * smpl,
-                            float * logits,
-                            float * logits_guidance,
-                            float   scale);
-
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sampling_softmax(
             struct llama_sampling * smpl,
@@ -1050,6 +1030,32 @@ extern "C" {
            llama_token_data_array * candidates,
                             float   temp);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sampling_grammar(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates);
+
+    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    LLAMA_API void llama_sampling_repetition_penalties(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates,
+                const llama_token * last_tokens,
+                           size_t   penalty_last_n,
+                            float   penalty_repeat,
+                            float   penalty_freq,
+                            float   penalty_present);
+
+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param logits Logits extracted from the original generation context.
+    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sampling_apply_guidance(
+            struct llama_sampling * smpl,
+                            float * logits,
+                            float * logits_guidance,
+                            float   scale);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1082,21 +1088,14 @@ extern "C" {
             struct llama_sampling * smpl,
            llama_token_data_array * candidates);
 
-    /// @details Randomly selects a token from the candidates based on their probabilities using RNG[0] of smpl.
+    /// @details Randomly selects a token from the candidates based on their probabilities
     LLAMA_API llama_token llama_sampling_sample(
             struct llama_sampling * smpl,
            llama_token_data_array * candidates);
 
-    /// @details Apply constraints from grammar
-    LLAMA_API void llama_sampling_grammar(
-            const struct llama_sampling * smpl,
-             const struct llama_context * ctx,
-                 llama_token_data_array * candidates);
-
     /// @details Accepts the sampled token into the grammar
     LLAMA_API void llama_sampling_accept(
             struct llama_sampling * smpl,
-             struct llama_context * ctx,
                       llama_token   token);
 
     //
@@ -1116,8 +1115,8 @@ extern "C" {
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
 
-    LLAMA_API void llama_print_timings(struct llama_context * ctx, struct llama_sampling * smpl, struct llama_grammar * grammar);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx, struct llama_sampling * smpl, struct llama_grammar * grammar);
+    LLAMA_API void llama_print_timings(struct llama_context * ctx, struct llama_sampling * smpl);
+    LLAMA_API void llama_reset_timings(struct llama_context * ctx, struct llama_sampling * smpl);
 
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
@@ -502,16 +502,16 @@ bool llama_grammar_parser::parse(const char * src) {
     return true;
 }
 
-void llama_grammar::print(FILE * file) {
+void llama_grammar_parser::print(FILE * file) {
     try {
         std::map<uint32_t, std::string> symbol_id_names;
-        for (const auto & kv : parser.symbol_ids) {
+        for (const auto & kv : symbol_ids) {
             symbol_id_names[kv.second] = kv.first;
         }
-        for (size_t i = 0, end = parser.rules.size(); i < end; i++) {
+        for (size_t i = 0, end = rules.size(); i < end; i++) {
             // fprintf(file, "%zu: ", i);
-            // print_rule_binary(file, parser.rules[i]);
-            print_rule(file, uint32_t(i), parser.rules[i], symbol_id_names);
+            // print_rule_binary(file, rules[i]);
+            print_rule(file, uint32_t(i), rules[i], symbol_id_names);
             // fprintf(file, "\n");
         }
     } catch (const std::exception & err) {
@@ -848,7 +848,7 @@ struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar & gram
     return result;
 }
 
-void llama_grammar_sample_impl(const struct llama_grammar & grammar, const struct llama_vocab & vocab, llama_token_data_array * candidates) {
+void llama_grammar_apply_impl(const struct llama_grammar & grammar, const struct llama_vocab & vocab, llama_token_data_array * candidates) {
     bool allow_eog = false;
     for (const auto & stack : grammar.stacks) {
         if (stack.empty()) {
@@ -885,7 +885,7 @@ void llama_grammar_sample_impl(const struct llama_grammar & grammar, const struc
     }
 }
 
-void llama_grammar_accept_token_impl(struct llama_grammar & grammar, const struct llama_vocab & vocab, llama_token token) {
+void llama_grammar_accept_impl(struct llama_grammar & grammar, const struct llama_vocab & vocab, llama_token token) {
     if (llama_token_is_eog_impl(vocab, token)) {
         for (const auto & stack : grammar.stacks) {
             if (stack.empty()) {
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
@@ -109,6 +109,7 @@ struct llama_grammar_parser {
     const char * parse_rule(const char * src);
 
     bool parse(const char * src);
+    void print(FILE * file);
 };
 
 struct llama_grammar {
@@ -118,14 +119,10 @@ struct llama_grammar {
     // buffer for partially generated UTF-8 sequence from accepted tokens
     llama_partial_utf8 partial_utf8;
 
-    llama_grammar_parser parser;
-
     mutable int64_t t_total_us;
 
     mutable int32_t n_sample;
     mutable int32_t n_accept;
-
-    void print(FILE * file);
 };
 
 //
@@ -138,12 +135,13 @@ void llama_grammar_free_impl(struct llama_grammar * grammar);
 
 struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar & grammar);
 
-void llama_grammar_sample_impl(
+// TODO: turn the free functions below into member functions of llama_grammar
+void llama_grammar_apply_impl(
         const struct llama_grammar & grammar,
           const struct llama_vocab & vocab,
             llama_token_data_array * candidates);
 
-void llama_grammar_accept_token_impl(
+void llama_grammar_accept_impl(
               struct llama_grammar & grammar,
           const struct llama_vocab & vocab,
                        llama_token   token);
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
@@ -1,5 +1,6 @@
 #include "llama-sampling.h"
 
+#include "llama-vocab.h"
 #include "llama-grammar.h"
 
 #include <algorithm>
@@ -23,7 +24,7 @@ static void llama_log_softmax(float * array, size_t size) {
     }
 }
 
-llama_sampling::llama_sampling(int32_t n_vocab, const char * grammar_str, const char * grammar_root) : n_vocab(n_vocab) {
+llama_sampling::llama_sampling(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root) : vocab(vocab) {
     if (grammar_str != nullptr) {
         grammar = llama_grammar_init_impl(grammar_str, grammar_root);
     }
@@ -35,8 +36,8 @@ llama_sampling::~llama_sampling() {
     }
 }
 
-struct llama_sampling * llama_sampling_init_impl(int32_t n_vocab, const char * grammar_str, const char * grammar_root) {
-    return new llama_sampling(n_vocab, grammar_str, grammar_root);
+struct llama_sampling * llama_sampling_init_impl(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root) {
+    return new llama_sampling(vocab, grammar_str, grammar_root);
 }
 
 void llama_sampling_free_impl(struct llama_sampling * sampling) {
@@ -411,6 +412,12 @@ void llama_sampling_temp_impl(struct llama_sampling & /*smpl*/, llama_token_data
     }
 }
 
+void llama_sampling_grammar_impl(struct llama_sampling & smpl, llama_token_data_array * candidates) {
+    if (smpl.grammar) {
+        llama_grammar_apply_impl(*smpl.grammar, smpl.vocab, candidates);
+    }
+}
+
 void llama_sampling_repetition_penalties_impl(
         struct llama_sampling & /*smpl*/,
        llama_token_data_array * candidates,
@@ -457,12 +464,12 @@ void llama_sampling_apply_guidance_impl(
                         float * logits,
                         float * logits_guidance,
                         float   scale) {
-    const auto n_vocab = smpl.n_vocab;
+    const auto n_vocab = smpl.vocab.n_vocab;
 
     llama_log_softmax(logits, n_vocab);
     llama_log_softmax(logits_guidance, n_vocab);
 
-    for (int i = 0; i < n_vocab; ++i) {
+    for (uint32_t i = 0; i < n_vocab; ++i) {
               auto & l = logits[i];
         const auto & g = logits_guidance[i];
 
@@ -471,7 +478,7 @@ void llama_sampling_apply_guidance_impl(
 }
 
 llama_token llama_sampling_sample_mirostat_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
-    const int32_t n_vocab = float(smpl.n_vocab);
+    const int32_t n_vocab = float(smpl.vocab.n_vocab);
 
     llama_sampling_softmax_impl(smpl, candidates);
 
@@ -570,3 +577,11 @@ llama_token llama_sampling_sample_with_rng_impl(struct llama_sampling & smpl, ll
 llama_token llama_sampling_sample_impl(struct llama_sampling & smpl, llama_token_data_array * candidates) {
     return llama_sampling_sample_with_rng_impl(smpl, candidates, smpl.rng);
 }
+
+void llama_sampling_accept_impl(struct llama_sampling & smpl, llama_token token) {
+    // TODO: implement token storing in history
+
+    if (smpl.grammar) {
+        llama_grammar_accept_impl(*smpl.grammar, smpl.vocab, token);
+    }
+}
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
@@ -3,13 +3,14 @@
 #include "llama-impl.h"
 #include "llama-grammar.h"
 
+struct llama_vocab;
 struct llama_grammar;
 
 struct llama_sampling {
-    llama_sampling(int32_t n_vocab, const char * grammar_str, const char * grammar_root);
+    llama_sampling(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root);
     ~llama_sampling();
 
-    const int32_t n_vocab;
+    const struct llama_vocab & vocab;
 
     std::mt19937 rng;
 
@@ -24,10 +25,11 @@ struct llama_sampling {
 // internal API
 //
 
-struct llama_sampling * llama_sampling_init_impl(int32_t n_vocab, const char * grammar_str, const char * grammar_root);
+struct llama_sampling * llama_sampling_init_impl(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root);
 
 void llama_sampling_free_impl(struct llama_sampling * sampling);
 
+// TODO: turn the free functions below into member functions of llama_sampling
 void llama_sampling_set_rng_seed_impl(struct llama_sampling & smpl, uint32_t seed);
 
 void llama_sampling_softmax_impl  (struct llama_sampling & smpl, llama_token_data_array * candidates);
@@ -38,6 +40,7 @@ void llama_sampling_tail_free_impl(struct llama_sampling & smpl, llama_token_dat
 void llama_sampling_typical_impl  (struct llama_sampling & smpl, llama_token_data_array * candidates, float p, size_t min_keep);
 void llama_sampling_entropy_impl  (struct llama_sampling & smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
 void llama_sampling_temp_impl     (struct llama_sampling & smpl, llama_token_data_array * candidates, float temp);
+void llama_sampling_grammar_impl  (struct llama_sampling & smpl, llama_token_data_array * candidates);
 
 void llama_sampling_repetition_penalties_impl(
         struct llama_sampling & smpl,
@@ -60,3 +63,4 @@ llama_token llama_sampling_sample_greedy_impl     (struct llama_sampling & smpl,
 llama_token llama_sampling_sample_with_rng_impl   (struct llama_sampling & smpl, llama_token_data_array * candidates, std::mt19937 & rng);
 llama_token llama_sampling_sample_impl            (struct llama_sampling & smpl, llama_token_data_array * candidates);
 
+void llama_sampling_accept_impl(struct llama_sampling & smpl, llama_token token);
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
@@ -18,6 +18,8 @@ struct llama_vocab {
         tattr attr;
     };
 
+    uint32_t n_vocab = 0; // TODO: not great because it has to be kept in sync with hparams.n_vocab
+
     enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
@@ -61,8 +63,6 @@ struct llama_vocab {
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
 };
 
-const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx);
-
 //
 // internal API
 //
@@ -75,6 +75,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
         bool add_special,
         bool parse_special = false);
 
+// TODO: turn the free functions below into member functions of llama_vocab
 llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
 
 const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
diff --git a/src/llama.cpp b/src/llama.cpp