Skip to content

Commit 74edb42

Browse files
committed
llama : rename batch.logits to batch.output
This commit renames the `logits` field of the `llama_batch` struct to `output`. The motivation for this change (apart from the existing TODO comment) is that the `logits` field is actually used to specify whether output should be generated for a token at all. For example, when generating embeddings, having to set `logits` to true is confusing, since no logits are involved in that case.
1 parent 873279b commit 74edb42

File tree

16 files changed

+49
-49
lines changed

16 files changed

+49
-49
lines changed

common/common.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
554554
<< ":pos " << std::to_string(batch.pos[i])
555555
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
556556
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
557-
<< ":logits " << std::to_string(batch.logits[i]);
557+
<< ":output " << std::to_string(batch.output[i]);
558558
}
559559

560560
buf << " ]";
@@ -1480,7 +1480,7 @@ void common_batch_add(
14801480
llama_token id,
14811481
llama_pos pos,
14821482
const std::vector<llama_seq_id> & seq_ids,
1483-
bool logits) {
1483+
bool output) {
14841484
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
14851485

14861486
batch.token [batch.n_tokens] = id;
@@ -1489,7 +1489,7 @@ void common_batch_add(
14891489
for (size_t i = 0; i < seq_ids.size(); ++i) {
14901490
batch.seq_id[batch.n_tokens][i] = seq_ids[i];
14911491
}
1492-
batch.logits [batch.n_tokens] = logits;
1492+
batch.output [batch.n_tokens] = output;
14931493

14941494
batch.n_tokens++;
14951495
}

examples/batched-bench/batched-bench.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ int main(int argc, char ** argv) {
7373
batch.pos + i,
7474
batch.n_seq_id + i,
7575
batch.seq_id + i,
76-
batch.logits + i,
76+
batch.output + i,
7777
};
7878

7979
const int ret = llama_decode(ctx, batch_view);
@@ -128,7 +128,7 @@ int main(int argc, char ** argv) {
128128
common_batch_add(batch, 0, i, { j }, false);
129129
}
130130
}
131-
batch.logits[batch.n_tokens - 1] = true;
131+
batch.output[batch.n_tokens - 1] = true;
132132

133133
const auto t_pp_start = ggml_time_us();
134134

examples/batched.swift/Sources/main.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,11 @@ for (i, token) in tokens.enumerated() {
9999
if let seq_id = batch.seq_id[i] {
100100
seq_id[0] = 0
101101
}
102-
batch.logits[i] = 0
102+
batch.output[i] = 0
103103
}
104104

105105
// llama_decode will output logits only for the last token of the prompt
106-
batch.logits[Int(batch.n_tokens) - 1] = 1
106+
batch.output[Int(batch.n_tokens) - 1] = 1
107107

108108
if llama_decode(context, batch) != 0 {
109109
print("llama_decode() failed")
@@ -166,7 +166,7 @@ while n_cur <= n_len {
166166
if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
167167
seq_id[0] = Int32(i)
168168
}
169-
batch.logits[Int(batch.n_tokens)] = 1
169+
batch.output[Int(batch.n_tokens)] = 1
170170

171171
i_batch[i] = batch.n_tokens
172172

examples/batched/batched.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ int main(int argc, char ** argv) {
128128
}
129129

130130
// llama_decode will output logits only for the last token of the prompt
131-
batch.logits[batch.n_tokens - 1] = true;
131+
batch.output[batch.n_tokens - 1] = true;
132132

133133
if (llama_decode(ctx, batch) != 0) {
134134
LOG_ERR("%s: llama_decode() failed\n", __func__);

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
5454
}
5555

5656
for (int i = 0; i < batch.n_tokens; i++) {
57-
if (!batch.logits[i]) {
57+
if (!batch.output[i]) {
5858
continue;
5959
}
6060

examples/llama.android/llama/src/main/cpp/llama-android.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
193193
common_batch_add(*batch, 0, i, { 0 }, false);
194194
}
195195

196-
batch->logits[batch->n_tokens - 1] = true;
196+
batch->output[batch->n_tokens - 1] = true;
197197
llama_kv_cache_clear(context);
198198

199199
const auto t_pp_start = ggml_time_us();
@@ -297,7 +297,7 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
297297
for (int i = 0; i < n_tokens; ++i) {
298298
batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
299299
}
300-
batch->logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
300+
batch->output = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
301301

302302
return reinterpret_cast<jlong>(batch);
303303
}
@@ -377,7 +377,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
377377
}
378378

379379
// llama_decode will output logits only for the last token of the prompt
380-
batch->logits[batch->n_tokens - 1] = true;
380+
batch->output[batch->n_tokens - 1] = true;
381381

382382
if (llama_decode(context, *batch) != 0) {
383383
LOGe("llama_decode() failed");

examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ actor LlamaContext {
137137
let i = Int(i1)
138138
llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
139139
}
140-
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
140+
batch.output[Int(batch.n_tokens) - 1] = 1 // true
141141

142142
if llama_decode(context, batch) != 0 {
143143
print("llama_decode() failed")
@@ -206,7 +206,7 @@ actor LlamaContext {
206206
for i in 0..<n_tokens {
207207
llama_batch_add(&batch, 0, Int32(i), [0], false)
208208
}
209-
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
209+
batch.output[Int(batch.n_tokens) - 1] = 1 // true
210210

211211
llama_kv_cache_clear(context)
212212

examples/llava/llava.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -406,13 +406,13 @@ struct llava_embd_batch {
406406
std::vector<int32_t> n_seq_id;
407407
std::vector<llama_seq_id> seq_id_0;
408408
std::vector<llama_seq_id *> seq_ids;
409-
std::vector<int8_t> logits;
409+
std::vector<int8_t> outputs;
410410
llama_batch batch;
411411
llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
412412
pos .resize(n_tokens);
413413
n_seq_id.resize(n_tokens);
414414
seq_ids .resize(n_tokens + 1);
415-
logits .resize(n_tokens);
415+
outputs .resize(n_tokens);
416416
seq_id_0.resize(1);
417417
seq_id_0[0] = seq_id;
418418
seq_ids [n_tokens] = nullptr;
@@ -423,13 +423,13 @@ struct llava_embd_batch {
423423
/*pos =*/ pos.data(),
424424
/*n_seq_id =*/ n_seq_id.data(),
425425
/*seq_id =*/ seq_ids.data(),
426-
/*logits =*/ logits.data(),
426+
/*output =*/ outputs.data(),
427427
};
428428
for (int i = 0; i < n_tokens; i++) {
429429
batch.pos [i] = pos_0 + i;
430430
batch.n_seq_id[i] = 1;
431431
batch.seq_id [i] = seq_id_0.data();
432-
batch.logits [i] = false;
432+
batch.output [i] = false;
433433
}
434434
}
435435
};

examples/parallel/parallel.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ int main(int argc, char ** argv) {
264264

265265
// extract the logits only for the last token
266266
if (batch.n_tokens > 0) {
267-
batch.logits[batch.n_tokens - 1] = true;
267+
batch.output[batch.n_tokens - 1] = true;
268268
}
269269

270270
client.n_prompt = tokens_prompt.size();
@@ -307,7 +307,7 @@ int main(int argc, char ** argv) {
307307
batch.pos + i,
308308
batch.n_seq_id + i,
309309
batch.seq_id + i,
310-
batch.logits + i,
310+
batch.output + i,
311311
};
312312

313313
const int ret = llama_decode(ctx, batch_view);

examples/passkey/passkey.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ int main(int argc, char ** argv) {
144144
}
145145

146146
if (i + n_batch >= n_tokens_all) {
147-
batch.logits[batch.n_tokens - 1] = true;
147+
batch.output[batch.n_tokens - 1] = true;
148148
}
149149

150150
if (llama_decode(ctx, batch) != 0) {
@@ -178,7 +178,7 @@ int main(int argc, char ** argv) {
178178
}
179179

180180
if (i + n_batch >= n_tokens_all) {
181-
batch.logits[batch.n_tokens - 1] = true;
181+
batch.output[batch.n_tokens - 1] = true;
182182
}
183183

184184
if (llama_decode(ctx, batch) != 0) {

0 commit comments

Comments (0)