
Commit 75a0d52

cont : move output counting to class

ggml-ci

1 parent 99be6b7

File tree

3 files changed: +18 -6 lines changed

src/llama-batch.cpp

Lines changed: 10 additions & 0 deletions

@@ -339,14 +339,24 @@ bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab &
         batch.logits = output.data();
     }
 
+    for (int32_t i = 0; i < batch.n_tokens; ++i) {
+        n_outputs += batch.logits[i] != 0;
+    }
+
     return true;
 }
 
 const llama_batch & llama_batch_allocr::get_batch() const {
     return batch;
 }
 
+uint32_t llama_batch_allocr::get_n_outputs() const {
+    return n_outputs;
+}
+
 void llama_batch_allocr::clear() {
+    n_outputs = 0;
+
     batch = {};
     pos.clear();
     n_seq_id.clear();
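
The counting added to init() above is just a tally of the non-zero entries in the per-token output flags. A minimal, self-contained sketch of that idiom, where toy_batch and the sample data are hypothetical stand-ins for llama_batch rather than llama.cpp code:

#include <cstdint>
#include <cstdio>
#include <vector>

// hypothetical stand-in for the output flags carried by llama_batch
struct toy_batch {
    int32_t n_tokens;
    std::vector<int8_t> logits; // non-zero => output (logits/embeddings) requested for this token
};

// same idiom as the loop added to llama_batch_allocr::init()
static uint32_t count_outputs(const toy_batch & batch) {
    uint32_t n_outputs = 0;
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        n_outputs += batch.logits[i] != 0;
    }
    return n_outputs;
}

int main() {
    const toy_batch batch = { 4, { 0, 0, 0, 1 } }; // typical decode: only the last token is output
    printf("n_outputs = %u\n", count_outputs(batch)); // prints: n_outputs = 1
    return 0;
}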

src/llama-batch.h

Lines changed: 5 additions & 0 deletions

@@ -87,12 +87,17 @@ class llama_batch_allocr {
 
     const llama_batch & get_batch() const;
 
+    uint32_t get_n_outputs() const;
+
 private:
     void clear();
 
     llama_batch batch;
 
+    uint32_t n_outputs;
+
     std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+
     std::vector<llama_pos> pos;
     std::vector<int32_t> n_seq_id;
     std::vector<llama_seq_id *> seq_id;
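
Taken with the .cpp change above, the header follows a simple caching pattern: the count is computed once in init(), stored in a private member, exposed through a const accessor, and reset in clear(). A stand-alone toy with the same shape (toy names, not the actual llama_batch_allocr interface):

#include <cstdint>
#include <vector>

class toy_allocr {
public:
    // validate the batch and cache the number of requested outputs
    bool init(const std::vector<int8_t> & output_flags) {
        clear();
        for (int8_t f : output_flags) {
            n_outputs += f != 0;
        }
        return true;
    }

    // callers read the cached count instead of re-scanning the flags
    uint32_t get_n_outputs() const { return n_outputs; }

private:
    void clear() { n_outputs = 0; }

    uint32_t n_outputs = 0;
};

Encapsulating the count this way means every consumer of the allocator sees the same value without re-walking batch.logits, which is what the llama-context.cpp change below relies on.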

src/llama-context.cpp

Lines changed: 3 additions & 6 deletions

@@ -730,6 +730,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     // temporary allocate memory for the input batch if needed
     // note: during encode, we always pass the full sequence starting from pos = 0
     if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : 0)) {
+        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
 
@@ -904,6 +905,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     // temporary allocate memory for the input batch if needed
     if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : memory->seq_pos_max(0) + 1)) {
+        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
 
@@ -922,12 +924,7 @@
     // this indicates we are doing pooled embedding
     const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
 
-    uint32_t n_outputs_all = 0;
-
-    // count outputs
-    for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        n_outputs_all += batch.logits[i] != 0;
-    }
+    const uint32_t n_outputs_all = batch_allocr->get_n_outputs();
 
     if (embd_pooled) {
         // require that all tokens are output