cont : add comments

ggerganov · ggerganov · commit 4fc4b0d647bb · 2025-06-17T16:10:58.000+03:00
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
@@ -88,6 +88,7 @@ bool llama_batch_allocr::init(
         llama_pos p0[LLAMA_MAX_SEQ];
         for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
             if (!memory) {
+                // if no memory -> start from 0
                 p0[s] = 0;
             } else {
                 p0[s] = memory->seq_pos_max(s) + 1;
@@ -99,8 +100,11 @@ bool llama_batch_allocr::init(
 
             pos[i] = p0[seq_id];
 
+            // update the starting position for all sequences that are assigned to this token
             for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
-                p0[batch.seq_id[i][s]] = pos[i] + 1;
+                const llama_seq_id seq_id = batch.seq_id[i][s];
+
+                p0[seq_id] = pos[i] + 1;
             }
         }
 
@@ -141,6 +145,7 @@ bool llama_batch_allocr::init(
 
     this->n_embd = n_embd;
 
+    // count the outputs in this batch
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         n_outputs += batch.logits[i] != 0;
     }
@@ -159,22 +164,23 @@ bool llama_batch_allocr::init(
                 // mark that sequence s1 is coupled to s0
                 seq_cpl[s1][s0] = true;
 
-                // note: the other way around is not necessary for now
+                // note: tracking the other way around is not necessary for now
                 //seq_cpl[s0][s1] = true;
             }
         }
     }
 
+    // precompute the sequence sets for each token and determine the unique sequence ids that participate in the batch
     {
         seq_set_t seq_set_unq;
 
         for (int32_t i = 0; i < batch.n_tokens; ++i) {
             seq_set_t cur;
             for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
-                const llama_seq_id s0 = batch.seq_id[i][s];
+                const llama_seq_id seq_id = batch.seq_id[i][s];
 
-                cur.set(s0);
-                seq_set_unq.set(s0);
+                cur        .set(seq_id);
+                seq_set_unq.set(seq_id);
             }
 
             seq_set.push_back(cur);
@@ -263,6 +269,15 @@ bool llama_batch_allocr::init(
         }
     }
 
+    // disallow disjoint sequence sets:
+    //
+    // invalid:          x
+    //            i: 0 1 2 ...
+    // ---------------------------------------
+    // seq_id[i][0]: 0 0 1
+    // seq_id[i][1]: 1 1 2
+    // seq_id[i][2]: 2
+    //
     {
         seq_set_t cur_seq_set[LLAMA_MAX_SEQ];
         for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
@@ -368,11 +383,13 @@ void llama_batch_allocr::split_reset() {
 }
 
 llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
+    // find the first unused token
     uint32_t cur_idx = 0;
     while (cur_idx < used.size() && used[cur_idx]) {
         ++cur_idx;
     }
 
+    // we are done
     if (cur_idx >= used.size()) {
         return {};
     }
@@ -401,7 +418,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
 llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
     std::vector<seq_set_t> cur_seq_set;
 
-    // determine the sequence sets participating in this ubatch
+    // determine the non-overlapping sequence sets participating in this ubatch
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         if (used[i]) {
             continue;
@@ -428,10 +445,12 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
 
     const uint32_t n_seqs = cur_seq_set.size();
 
+    // we are done
     if (n_seqs == 0) {
         return {};
     }
 
+    // the current batch index of each sequence set
     std::vector<int32_t> cur_idx(n_seqs, 0);
 
     for (uint32_t s = 0; s < n_seqs; ++s) {
@@ -440,9 +459,13 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
         }
     }
 
+    // the list of batch indices for each sequence set
+    // at the end we will concat these to get the final ubatch
     std::vector<idx_vec_t> idxs_per_seq(n_seqs);
 
     while (true) {
+        // we can only add new n_seq_tokens tokens if all the sequence sets have at least one more unused token and
+        //   if we haven't reached n_ubatch
         bool can_expand = true;
 
         for (uint32_t s = 0; s < n_seqs; ++s) {
@@ -458,6 +481,7 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
 
         for (uint32_t s = 0; s < n_seqs; ++s) {
             const int32_t idx = seq_set_map[cur_seq_set[s]][cur_idx[s]];
+
             idxs_per_seq[s].push_back(idx);
 
             used[idx] = true;
@@ -470,6 +494,7 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
         }
     }
 
+    // concat the per-sequence-set lists
     std::vector<int32_t> idxs;
 
     for (uint32_t s = 0; s < n_seqs; ++s) {
@@ -480,15 +505,19 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
 }
 
 llama_ubatch llama_batch_allocr::split_seq(uint32_t n_ubatch) {
+    // find the first unused token
     uint32_t cur_idx = 0;
     while (cur_idx < used.size() && used[cur_idx]) {
         ++cur_idx;
     }
 
+    // we are done
     if (cur_idx >= used.size()) {
         return {};
     }
 
+    // this is the starting sequence set
+    // we allow adding tokens only if their sequence set is a subset of the current sequence set
     auto cur_seq_set = seq_set[cur_idx];
 
     std::vector<int32_t> idxs;
diff --git a/src/llama-batch.h b/src/llama-batch.h
@@ -10,6 +10,8 @@
 #include <bitset>
 #include <unordered_map>
 
+// keep this struct lightweight
+// it points to data in `llama_batch_allocr`
 struct llama_ubatch {
     bool equal_seqs;
     // TODO: whole_seqs for embeddings?
@@ -19,14 +21,19 @@ struct llama_ubatch {
     uint32_t n_seqs;       // sequence sets in the ubatch
     uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
 
-    llama_token  *  token;      // [n_tokens]
-    float        *  embd;       // [n_embd, n_tokens]
-    llama_pos    *  pos;        // [n_tokens]
-    int32_t      *  n_seq_id;   // [n_tokens]
-    llama_seq_id ** seq_id;     // [n_tokens]
-    llama_seq_id *  seq_id_unq; // [n_seqs_unq]
-    int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ]
-    int8_t       *  output;     // [n_tokens]
+    // seq_id_unq: unique sequence ids in the ubatch
+    // seq_idx:    indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
+    //             used for extracting sequence pooled embeddings
+
+    //                          // size               | idx | val
+    llama_token  *  token;      // [n_tokens]         | i   | id, token
+    float        *  embd;       // [n_embd, n_tokens] | i   | embd
+    llama_pos    *  pos;        // [n_tokens]         | i   | pos
+    int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
+    llama_seq_id ** seq_id;     // [n_tokens]         | s   | s0, s1, seq_id
+    llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id
+    int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ]    | -   | seq_idx
+    int8_t       *  output;     // [n_tokens]         | i   | -
 };
 
 // a helper for sanitizing, fulfilling and splitting a batch
@@ -48,11 +55,14 @@ class llama_batch_allocr {
     uint32_t get_n_tokens()  const;
     uint32_t get_n_outputs() const;
 
+    // the array of output indices in the order they were encountered during the ubatch splitting
     std::vector<int32_t> & get_out_ids();
 
+    // min/max positions of each sequence in the current ubatch
     llama_pos seq_pos_min(llama_seq_id seq_id) const;
     llama_pos seq_pos_max(llama_seq_id seq_id) const;
 
+    // call once before splitting the batch to reset the internal state
     void split_reset();
 
     // simple split, unknown number of sequences of unequal lengths
@@ -62,15 +72,21 @@ class llama_batch_allocr {
     llama_ubatch split_equal(uint32_t n_ubatch);
 
     // sequence-wise split - each ubatch contains a single sequence
+    // TODO: fit more than one full sequence, as long as they fit in the ubatch
     llama_ubatch split_seq(uint32_t n_ubatch);
 
+    // a helper method for creating a well-defined ubatch of tokens
+    // TODO: support embeddings if needed in the future
     llama_ubatch ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs);
 
 private:
     void clear();
 
+    // create the next ubatch based on the provided batch indices (idxs) and the number of sequence sets (n_seqs)
+    // returns a llama_ubatch with n_tokens == 0 if the entire batch was consumed
     llama_ubatch ubatch_add(const std::vector<int32_t> & idxs, uint32_t n_seqs, bool equal_seqs);
 
+    // for debugging, start with LLAMA_BATCH_DEBUG=2
     void ubatch_print(const llama_ubatch & ubatch, int debug);
 
     llama_batch batch;
@@ -99,15 +115,17 @@ class llama_batch_allocr {
     using idx_vec_t = std::vector<int32_t>;
     using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
 
-    std::vector<seq_set_t> seq_set;
+    std::vector<seq_set_t> seq_set; // seq_set[i]: the sequence set of token i
 
-    std::unordered_map<seq_set_t, idx_vec_t> seq_set_map;
+    std::unordered_map<seq_set_t, idx_vec_t> seq_set_map; // the indices at which the sequence set appears
 
     // batch indices of the output
     std::vector<int32_t> out_ids;
 
+    // used[i] indicates if token i has already been used in a previous ubatch
     std::vector<bool> used;
 
+    // llama_ubatch points to this data:
     struct ubatch {
         std::vector<llama_token>    token;
         std::vector<float>          embd;
@@ -119,6 +137,7 @@ class llama_batch_allocr {
         std::vector<int8_t>         output;
     };
 
+    // current splitting state:
     std::vector<ubatch> ubatches;
 
     int debug;
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -865,8 +865,10 @@ int llama_context::encode(const llama_batch & batch_inp) {
         cross.seq_ids_enc.resize(n_tokens);
         for (uint32_t i = 0; i < n_tokens; i++) {
             cross.seq_ids_enc[i].clear();
+
             for (int s = 0; s < batch.n_seq_id[i]; s++) {
-                llama_seq_id seq_id = batch.seq_id[i][s];
+                const llama_seq_id seq_id = batch.seq_id[i][s];
+
                 cross.seq_ids_enc[i].insert(seq_id);
             }
         }

Original file line number	Diff line number	Diff line change
`@@ -865,8 +865,10 @@ int llama_context::encode(const llama_batch & batch_inp) {`
`865`	`865`	`cross.seq_ids_enc.resize(n_tokens);`
`866`	`866`	`for (uint32_t i = 0; i < n_tokens; i++) {`
`867`	`867`	`cross.seq_ids_enc[i].clear();`
	`868`	`+`
`868`	`869`	`for (int s = 0; s < batch.n_seq_id[i]; s++) {`
`869`		`- llama_seq_id seq_id = batch.seq_id[i][s];`
	`870`	`+ const llama_seq_id seq_id = batch.seq_id[i][s];`
	`871`	`+`
`870`	`872`	`cross.seq_ids_enc[i].insert(seq_id);`
`871`	`873`	`}`
`872`	`874`	`}`