@@ -20,7 +20,7 @@ llama_context::llama_context(
         const llama_model & model,
               llama_context_params params) :
     model(model),
-    balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
+    batch_allocr(std::make_unique<llama_batch_allocr>()) {
     LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);

     t_start_us = model.t_start_us;
@@ -722,26 +722,23 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch,
 }

 int llama_context::encode(const llama_batch & batch_inp) {
-    GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
-
     if (batch_inp.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }

-    const auto & hparams = model.hparams;
-
-    const int64_t n_embd = hparams.n_embd;
-
+    // temporary allocate memory for the input batch if needed
     // note: during encode, we always pass the full sequence starting from pos = 0
-    if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
+    if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : 0)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }

-    const uint32_t n_tokens = balloc->get_n_tokens();
+    const llama_batch & batch = batch_allocr->get_batch();

-    const llama_ubatch ubatch = balloc->split_simple(n_tokens);
+    const uint32_t n_tokens = batch.n_tokens;
+
+    const int64_t n_embd = hparams.n_embd;

     // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
     GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
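The hunk above also changes how `encode()` seeds token positions: the `batch_allocr->init(..., batch_inp.pos ? -1 : 0)` call auto-assigns positions from 0 whenever the caller leaves `batch.pos` unset. A minimal caller-side sketch of that case, assuming the public `llama.h` API (`llama_batch_get_one`, `llama_encode`); the helper name is hypothetical:

```cpp
#include "llama.h"

#include <vector>

// Encode a prompt without explicit positions: llama_batch_get_one() leaves
// batch.pos == nullptr, so the batch allocator assigns positions 0..n-1,
// matching the "full sequence starting from pos = 0" note in the hunk above.
static int encode_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());

    return llama_encode(ctx, batch); // 0 on success
}
```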
@@ -755,6 +752,8 @@ int llama_context::encode(const llama_batch & batch_inp) {

     n_queued_tokens += n_tokens;

+    const auto & hparams = model.hparams;
+
     const int64_t n_embd = hparams.n_embd;

     llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true);
@@ -824,6 +823,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
                         const llama_seq_id seq_id = ubatch.seq_id_unq[s];
                         const int32_t seq_idx = ubatch.seq_idx[seq_id];

+                    // TODO: fix indexing [UBATCH_IDX]
+                    for (uint32_t i = 0; i < n_tokens; i++) {
+                        const llama_seq_id seq_id = ubatch.seq_id[i][0];
+                        if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                            continue;
+                        }
                         embd_seq_out[seq_id].resize(n_embd);
                         ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
                     }
@@ -835,10 +840,12 @@ int llama_context::encode(const llama_batch & batch_inp) {

                     const uint32_t n_cls_out = hparams.n_cls_out;

-                    for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
-                        const llama_seq_id seq_id = ubatch.seq_id_unq[s];
-                        const int32_t seq_idx = ubatch.seq_idx[seq_id];
-
+                    // TODO: fix indexing [UBATCH_IDX]
+                    for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+                        const llama_seq_id seq_id = ubatch.seq_id[s][0];
+                        if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                            continue;
+                        }
                         embd_seq_out[seq_id].resize(n_cls_out);
                         ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_cls_out*seq_idx)*sizeof(float), n_cls_out*sizeof(float));
                     }
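The two hunks above fill `embd_seq_out` with one pooled vector (or `n_cls_out` rank scores) per sequence id. On the caller side those buffers are read back through `llama_get_embeddings_seq()`; a hedged sketch, assuming embeddings and a non-`NONE` pooling type were enabled on the context, with a hypothetical helper name:

```cpp
#include "llama.h"

#include <cstdio>

// Print the pooled embedding of one sequence after llama_encode()/llama_decode().
// For LLAMA_POOLING_TYPE_RANK the buffer holds n_cls_out scores instead of n_embd values.
static void print_seq_embedding(llama_context * ctx, const llama_model * model, llama_seq_id seq_id) {
    const float * embd = llama_get_embeddings_seq(ctx, seq_id);
    if (!embd) {
        fprintf(stderr, "no pooled embedding for seq %d\n", seq_id);
        return;
    }

    const int32_t n_embd = llama_n_embd(model);
    for (int32_t i = 0; i < n_embd; ++i) {
        printf("%.6f ", embd[i]);
    }
    printf("\n");
}
```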
@@ -868,15 +875,11 @@ int llama_context::encode(const llama_batch & batch_inp) {
         const auto & batch = balloc->get_batch();

         // remember the sequence ids used during the encoding - needed for cross attention later
-        // TODO: the seuqence indexing here is likely not correct in the general case
-        //       probably works only for split_simple
         cross.seq_ids_enc.resize(n_tokens);
         for (uint32_t i = 0; i < n_tokens; i++) {
             cross.seq_ids_enc[i].clear();
-
             for (int s = 0; s < batch.n_seq_id[i]; s++) {
-                const llama_seq_id seq_id = batch.seq_id[i][s];
-
+                llama_seq_id seq_id = batch.seq_id[i][s];
                 cross.seq_ids_enc[i].insert(seq_id);
             }
         }
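The hunk above records, per encoder token, which sequence ids were used (`cross.seq_ids_enc`), so later decoder batches can cross-attend to the matching encoder output. A hedged encoder-decoder sketch using the public API; error handling and sampling are omitted and the helper name is hypothetical:

```cpp
#include "llama.h"

#include <vector>

// Encode the prompt, then start decoding from the model's decoder start token.
// The cross-attention state captured during llama_encode() (including the
// remembered sequence ids) is reused by the subsequent llama_decode() call.
static int run_enc_dec(llama_context * ctx, const llama_model * model, std::vector<llama_token> & prompt) {
    if (llama_encode(ctx, llama_batch_get_one(prompt.data(), (int32_t) prompt.size())) != 0) {
        return -1;
    }

    llama_token dec_start = llama_model_decoder_start_token(model);

    return llama_decode(ctx, llama_batch_get_one(&dec_start, 1));
}
```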
@@ -886,57 +889,44 @@ int llama_context::encode(const llama_batch & batch_inp) {
 }

 int llama_context::decode(const llama_batch & batch_inp) {
-    GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
-
     if (!memory) {
         LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
         return encode(batch_inp);
+        return encode(batch_inp);
     }

     if (batch_inp.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }

+    // temporary allocate memory for the input batch if needed
+    if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : memory->seq_pos_max(0) + 1)) {
+        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
+        return -1;
+    }
+
+    const llama_batch & batch = batch_allocr->get_batch();
+
     const auto & vocab = model.vocab;
     const auto & hparams = model.hparams;

     const int32_t n_vocab = vocab.n_tokens();
+    const int64_t n_embd = hparams.n_embd;

-    const int64_t n_tokens_all = batch.n_tokens;
-    const int64_t n_embd = hparams.n_embd;
+    const uint32_t n_tokens_all = batch.n_tokens;

     GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

-    // TODO: move the validation to the llama_batch_allocr
-    if (batch.token) {
-        for (int64_t i = 0; i < n_tokens_all; ++i) {
-            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
-                LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]);
-                return -1;
-            }
-
-            if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
-                LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
-                return -1;
-            }
-        }
-    }
-
     // this indicates we are doing pooled embedding
     const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;

-    int64_t n_outputs_all = 0;
-
-    // count outputs
-    for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        n_outputs_all += batch.logits[i] != 0;
-    }
+    const uint32_t n_outputs_all = batch_allocr->get_n_outputs();

     if (embd_pooled) {
         // require that all tokens are output
         if (n_outputs_all != n_tokens_all) {
-            LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %" PRId64 ", n_tokens_all = %" PRId64 ")\n",
+            LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n",
                     __func__, n_outputs_all, n_tokens_all);
             return -1;
         }
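In the `decode()` hunk above, the new `init` call falls back to `memory->seq_pos_max(0) + 1` when `batch.pos` is null, i.e. new tokens are appended right after what is already stored for the sequence. A caller-side sketch of that behaviour, again assuming the public API and a hypothetical helper name:

```cpp
#include "llama.h"

// Append a single token without specifying positions or sequence ids:
// its position is derived from the current end of sequence 0.
static int append_token(llama_context * ctx, llama_token tok) {
    llama_batch batch = llama_batch_get_one(&tok, 1);

    return llama_decode(ctx, batch); // 0 on success
}
```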
@@ -1044,6 +1034,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     pos_min[s] = std::numeric_limits<llama_pos>::max();
                 }

+                // TODO: fix sequence indexing
                 for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
                     const auto & seq_id = ubatch.seq_id[i][0];

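The `// TODO: fix sequence indexing` added above flags that only `seq_id[i][0]` is consulted when computing the minimum failing position per sequence. One possible shape of a fix, sketched under the assumption that `llama_ubatch` exposes the `n_tokens`, `n_seq_id`, `seq_id`, and `pos` fields used elsewhere in this diff; this is not the project's actual resolution:

```cpp
#include <algorithm>
#include <limits>
#include <vector>

// Track the minimum position per sequence, visiting every sequence id attached
// to each token instead of only seq_id[i][0].
static std::vector<llama_pos> min_pos_per_seq(const llama_ubatch & ubatch, int32_t n_seq_max) {
    std::vector<llama_pos> pos_min(n_seq_max, std::numeric_limits<llama_pos>::max());

    for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
        for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) {
            const llama_seq_id seq_id = ubatch.seq_id[i][s];

            pos_min[seq_id] = std::min(pos_min[seq_id], ubatch.pos[i]);
        }
    }

    return pos_min;
}
```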