Commit 85e1f18

parallel: fix adding tokens to batch
A crash was observed when the number of tokens added to a batch exceeded the context size. Assertions have been added to ensure that the number of tokens added to the batch stays within the bounds of the context size.
1 parent 95bc82f commit 85e1f18
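
For context, a minimal sketch of how the overflow can occur, assuming the batch is allocated for at most n_ctx tokens via llama_batch_init (as this example's setup does); overflow_sketch and token_id are illustrative names, not from the commit:

    #include "common.h"
    #include "llama.h"

    // Hypothetical repro sketch: the batch buffers have room for n_ctx
    // entries, so the (n_ctx + 1)-th llama_batch_add writes out of
    // bounds -- the crash this commit guards against.
    static int overflow_sketch(llama_context * ctx, llama_token token_id) {
        const int n_ctx = llama_n_ctx(ctx);

        llama_batch batch = llama_batch_init(n_ctx, 0, 1);

        for (int32_t i = 0; i < n_ctx + 1; ++i) {
            // once i == n_ctx, this writes past batch.token / batch.pos / ...
            llama_batch_add(batch, token_id, i, { 0 }, false);
        }

        llama_batch_free(batch);
        return 0;
    }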

1 file changed

examples/parallel/parallel.cpp

Lines changed: 18 additions & 1 deletion
@@ -13,6 +13,13 @@
 #include <vector>
 #include <ctime>

+#define LLAMA_ASSERT(condition, ...) { \
+    if (!(condition)) { \
+        LOG_ERR(__VA_ARGS__); \
+        return 1; \
+    } \
+}
+
 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
     size_t start = 0;
@@ -188,6 +195,9 @@ int main(int argc, char ** argv) {
     {
         LOG_INF("%s: Evaluating the system prompt ...\n", __func__);

+        LLAMA_ASSERT((batch.n_tokens + n_tokens_system < n_ctx),
+            "%s: Unable to add system tokens (%d tokens) to batch due to context overflow. "
+            "Consider increasing context size (%d).\n", __func__, n_tokens_system, n_ctx);
         for (int32_t i = 0; i < n_tokens_system; ++i) {
             llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
         }
@@ -223,6 +233,9 @@ int main(int argc, char ** argv) {

             client.i_batch = batch.n_tokens;

+            LLAMA_ASSERT((batch.n_tokens + 1 < n_ctx),
+                "%s: Unable to add client %d's sampled token to batch due to context overflow. "
+                "Consider increasing context size (Found: %d).\n", __func__, client.id, n_ctx);
             llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);

             client.n_decoded += 1;
@@ -258,7 +271,11 @@ int main(int argc, char ** argv) {
                 std::vector<llama_token> tokens_prompt;
                 tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);

-                for (size_t i = 0; i < tokens_prompt.size(); ++i) {
+                const size_t n_tokens_prompt = tokens_prompt.size();
+                LLAMA_ASSERT((batch.n_tokens + n_tokens_prompt < (size_t) n_ctx),
+                    "%s: Unable to add client %d's prompt tokens (%zu tokens) to batch due to context overflow. "
+                    "Consider increasing context size (Found: %d).\n", __func__, client.id, n_tokens_prompt, n_ctx);
+                for (size_t i = 0; i < n_tokens_prompt; ++i) {
                     llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
                 }
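Note that LLAMA_ASSERT is not a classic assert(): it expands to an early return 1;, so it is only valid inside functions that return int (in this file, main). At the first call site it expands to roughly the following:

    // approximate expansion of the first LLAMA_ASSERT call site
    if (!(batch.n_tokens + n_tokens_system < n_ctx)) {
        LOG_ERR("%s: Unable to add system tokens (%d tokens) to batch due to context overflow. "
                "Consider increasing context size (%d).\n", __func__, n_tokens_system, n_ctx);
        return 1;
    }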