Commit 3fd6eb4

ggerganov authored and Minh141120 committed
context : simplify output counting logic during decode (ggml-org#14142)
* batch : remove logits_all flag

ggml-ci

* context : simplify output counting logic during decode

ggml-ci

* cont : fix comments
1 parent 3724d37 commit 3fd6eb4

3 files changed: +66 -47 lines
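The change described by the commit message can be summarized as: instead of a batch-wide logits_all flag, decode now derives the number of outputs directly from the per-token output flags carried by the batch. The following is a minimal standalone sketch of that counting rule, not the actual llama.cpp API (the helper name count_outputs is ours):

    // Minimal sketch: the number of outputs is just the number of tokens whose
    // per-token flag is set; a batch-wide "output everything" flag is not needed.
    #include <cstdint>

    static int64_t count_outputs(const int8_t * output_flags, int64_t n_tokens) {
        int64_t n_outputs = 0;
        for (int64_t i = 0; i < n_tokens; ++i) {
            n_outputs += output_flags[i] != 0; // each non-zero flag requests one output
        }
        return n_outputs;
    }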

src/llama-batch.cpp (30 additions, 23 deletions)

@@ -639,30 +639,37 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
     for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
         std::vector<int8_t> seq_id(seq_id_max);
 
-        for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
-            seq_id[ubatch.seq_id[i][s]] = 1;
-        }
-
-        std::stringstream ss;
-        for (int s = 0; s < seq_id_max; ++s) {
-            if (seq_id[s]) {
-                ss << s%10;
-            } else {
-                ss << ".";
-            }
-        }
-
-        if (ubatch.token) {
-            LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
-                    __func__, i, ubatch.token[i], vocab->token_to_piece(ubatch.token[i]).c_str(),
-                    ubatch.pos[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]);
-        } else {
-            LLAMA_LOG_DEBUG("%s: %4d: [embd], pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
-                    __func__, i, ubatch.pos[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]);
-        }
-    }
-    LLAMA_LOG_DEBUG("%s: ]\n", __func__);
+llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0) {
+    batch = in_batch;
+    GGML_ASSERT(batch.n_tokens > 0);
+    if (!batch.pos) {
+        assert(p0 >= 0);
+        pos.resize(batch.n_tokens);
+        for (int32_t i = 0; i < batch.n_tokens; i++) {
+            pos[i] = p0 + i;
         }
+        batch.pos = pos.data();
+    }
+    if (!batch.n_seq_id) {
+        n_seq_id.resize(batch.n_tokens);
+        for (int32_t i = 0; i < batch.n_tokens; i++) {
+            n_seq_id[i] = seq_id_0.size();
+        }
+        batch.n_seq_id = n_seq_id.data();
+    }
+    if (!batch.seq_id) {
+        seq_id.resize(batch.n_tokens + 1);
+        seq_id[batch.n_tokens] = NULL;
+        for (int32_t i = 0; i < batch.n_tokens; i++) {
+            seq_id[i] = seq_id_0.data();
+        }
+        batch.seq_id = seq_id.data();
+    }
+    if (!batch.logits) {
+        // by default return the output only for the last token
+        output.resize(batch.n_tokens);
+        output[output.size() - 1] = true;
+        batch.logits = output.data();
     }
 }
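The constructor added above normalizes a caller-supplied batch by filling in any fields left null: consecutive positions starting at p0, a single default sequence id 0 for every token, and output requested only for the last token. A simplified standalone sketch of those defaulting rules (the FilledDefaults struct and fill_defaults helper are illustrative, not part of llama.cpp):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Illustrative only: mirrors the defaults filled in by the constructor above.
    struct FilledDefaults {
        std::vector<int32_t> pos;      // p0, p0+1, ..., p0+n-1
        std::vector<int32_t> n_seq_id; // one sequence per token
        std::vector<int32_t> seq_id;   // default sequence id 0 for every token
        std::vector<int8_t>  output;   // only the last token produces output
    };

    static FilledDefaults fill_defaults(int32_t n_tokens, int32_t p0) {
        assert(n_tokens > 0 && p0 >= 0);
        FilledDefaults d;
        d.pos.resize(n_tokens);
        for (int32_t i = 0; i < n_tokens; i++) {
            d.pos[i] = p0 + i;
        }
        d.n_seq_id.assign(n_tokens, 1);
        d.seq_id.assign(n_tokens, 0);
        d.output.assign(n_tokens, 0);
        d.output[n_tokens - 1] = 1; // by default, only the last token's output is returned
        return d;
    }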

src/llama-batch.h (5 additions, 14 deletions)

@@ -94,20 +94,11 @@ struct llama_sbatch {
     // used[i] indicates if token i has already been used in a previous ubatch
     std::vector<bool> used;
 
-    // llama_ubatch points to this data:
-    struct ubatch {
-        std::vector<llama_token> token;
-        std::vector<float> embd;
-        std::vector<llama_pos> pos;
-        std::vector<int32_t> n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
-        std::vector<llama_seq_id> seq_id_unq;
-        std::vector<int32_t> seq_idx;
-        std::vector<int8_t> output;
-    };
-
-    // current splitting state:
-    std::vector<ubatch> ubatches;
+    std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+    std::vector<llama_pos> pos;
+    std::vector<int32_t> n_seq_id;
+    std::vector<llama_seq_id *> seq_id;
+    std::vector<int8_t> output;
 
     int debug;
 };

src/llama-context.cpp (31 additions, 10 deletions)

@@ -902,23 +902,41 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;
 
     const int32_t n_vocab = vocab.n_tokens();
-    const int64_t n_embd = hparams.n_embd;
 
-    // when computing embeddings, all tokens are output
-    const bool output_all = cparams.embeddings;
+    const int64_t n_tokens_all = batch.n_tokens;
+    const int64_t n_embd       = hparams.n_embd;
 
-    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, output_all)) {
-        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
-        return -1;
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
+
+    // TODO: move the validation to the llama_batch_allocr
+    if (batch.token) {
+        for (int64_t i = 0; i < n_tokens_all; ++i) {
+            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
+                LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]);
+                return -1;
+            }
+
+            if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
+                LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
+                return -1;
+            }
+        }
     }
 
-    const uint32_t n_tokens_all = balloc->get_n_tokens();
-    const uint32_t n_outputs_all = balloc->get_n_outputs();
+    // this indicates we are doing pooled embedding
+    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
+    int64_t n_outputs_all = 0;
 
-    if (output_all) {
+    // count outputs
+    for (uint32_t i = 0; i < n_tokens_all; ++i) {
+        n_outputs_all += batch.logits[i] != 0;
+    }
+
+    if (embd_pooled) {
         // require that all tokens are output
         if (n_outputs_all != n_tokens_all) {
-            LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n",
+            LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %" PRId64 ", n_tokens_all = %" PRId64 ")\n",
                 __func__, n_outputs_all, n_tokens_all);
             return -1;
         }

@@ -2045,6 +2063,9 @@ void llama_context::opt_epoch_iter(
 
     n_queued_tokens += n_tokens_all;
 
+    // this indicates we are doing pooled embedding
+    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
     embd_seq.clear();
 
     uint32_t n_outputs_all = n_tokens_all;
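With the counting loop above, whatever the caller marks in the per-token logits array is exactly what decode produces, and a pooled-embedding batch is rejected unless every token is marked. A hedged usage sketch (only the fact that batch.logits is an int8_t-per-token flag array comes from the diff; the variable names are illustrative):

    #include <cstdint>
    #include <vector>

    int main() {
        const int n_tokens = 8;

        // Generation-style batch: only the last token needs an output.
        std::vector<int8_t> flags_last(n_tokens, 0);
        flags_last.back() = 1; // decode() would count exactly one output

        // Pooled-embedding batch: every token must be flagged, otherwise the
        // check above fails and decode() returns -1.
        std::vector<int8_t> flags_all(n_tokens, 1);

        (void) flags_last;
        (void) flags_all;
        return 0;
    }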
