batch : add option for sequential equal split

ggerganov · ggerganov · commit 4acfc25c2428 · 2025-07-03T11:09:02.000+03:00
ggml-ci
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
@@ -166,6 +166,8 @@ bool llama_batch_allocr::init(
 
                 // note: tracking the other way around is not necessary for now
                 //seq_cpl[s0][s1] = true;
+
+                has_cpl = true;
             }
         }
     }
@@ -459,9 +461,17 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
     return ubatch_add(idxs, idxs.size(), false);
 }
 
-llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
+llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
+    if (sequential && has_cpl) {
+        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
+
+        return {};
+    }
+
     std::vector<seq_set_t> cur_seq_set;
 
+    llama_seq_id last_seq_id = -1;
+
     // determine the non-overlapping sequence sets participating in this ubatch
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         if (used[i]) {
@@ -478,9 +488,16 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
             }
         }
 
+        // accept only increasing sequence ids
+        if (sequential) {
+            add = add && (cur_seq_set.empty() || batch.seq_id[i][0] == last_seq_id + 1);
+        }
+
         if (add) {
             cur_seq_set.push_back(seq_set[i]);
 
+            last_seq_id = batch.seq_id[i][0];
+
             if (cur_seq_set.size() > n_ubatch) {
                 break;
             }
diff --git a/src/llama-batch.h b/src/llama-batch.h
@@ -69,7 +69,8 @@ class llama_batch_allocr {
     llama_ubatch split_simple(uint32_t n_ubatch);
 
     // make ubatches of equal-length sequences sets
-    llama_ubatch split_equal(uint32_t n_ubatch);
+    // if sequential == true, the tokens in the ubatch will have increasing sequential sequence ids
+    llama_ubatch split_equal(uint32_t n_ubatch, bool sequential);
 
     // sequence-set-wise split - each ubatch contains a single sequence-set
     llama_ubatch split_seq(uint32_t n_ubatch);
@@ -112,6 +113,9 @@ class llama_batch_allocr {
     using pos_set_t = std::set<llama_pos>;
     using seq_cpl_t = std::vector<bool>;
 
+    // helper flag to quickly determine if there are any coupled sequences in the batch
+    bool has_cpl;
+
     std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
     std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
 
diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-unified-iswa.cpp
@@ -135,7 +135,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
 
         std::vector<llama_ubatch> ubatches;
         while (true) {
-            auto ubatch = balloc.split_equal(n_ubatch);
+            auto ubatch = balloc.split_equal(n_ubatch, false);
 
             if (ubatch.n_tokens == 0) {
                 break;
diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp
@@ -70,7 +70,7 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
                 // if all tokens are output, split by sequence
                 ubatch = balloc.split_seq(n_ubatch);
             } else {
-                ubatch = balloc.split_equal(n_ubatch);
+                ubatch = balloc.split_equal(n_ubatch, false);
             }
 
             if (ubatch.n_tokens == 0) {
diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp
@@ -374,7 +374,7 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
                 // if all tokens are output, split by sequence
                 ubatch = balloc.split_seq(n_ubatch);
             } else {
-                ubatch = balloc.split_equal(n_ubatch);
+                ubatch = balloc.split_equal(n_ubatch, false);
             }
 
             if (ubatch.n_tokens == 0) {

Original file line number	Diff line number	Diff line change
`@@ -166,6 +166,8 @@ bool llama_batch_allocr::init(`
`166`	`166`
`167`	`167`	`// note: tracking the other way around is not necessary for now`
`168`	`168`	`//seq_cpl[s0][s1] = true;`
	`169`	`+`
	`170`	`+ has_cpl = true;`
`169`	`171`	`}`
`170`	`172`	`}`
`171`	`173`	`}`
`@@ -459,9 +461,17 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {`
`459`	`461`	`return ubatch_add(idxs, idxs.size(), false);`
`460`	`462`	`}`
`461`	`463`
`462`		`-llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {`
	`464`	`+llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {`
	`465`	`+ if (sequential && has_cpl) {`
	`466`	`+ LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);`
	`467`	`+`
	`468`	`+ return {};`
	`469`	`+ }`
	`470`	`+`
`463`	`471`	`std::vector<seq_set_t> cur_seq_set;`
`464`	`472`
	`473`	`+ llama_seq_id last_seq_id = -1;`
	`474`	`+`
`465`	`475`	`// determine the non-overlapping sequence sets participating in this ubatch`
`466`	`476`	`for (int32_t i = 0; i < batch.n_tokens; ++i) {`
`467`	`477`	`if (used[i]) {`
`@@ -478,9 +488,16 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {`
`478`	`488`	`}`
`479`	`489`	`}`
`480`	`490`
	`491`	`+ // accept only increasing sequence ids`
	`492`	`+ if (sequential) {`
	`493`	`+ add = add && (cur_seq_set.empty() || batch.seq_id[i][0] == last_seq_id + 1);`
	`494`	`+ }`
	`495`	`+`
`481`	`496`	`if (add) {`
`482`	`497`	`cur_seq_set.push_back(seq_set[i]);`
`483`	`498`
	`499`	`+ last_seq_id = batch.seq_id[i][0];`
	`500`	`+`
`484`	`501`	`if (cur_seq_set.size() > n_ubatch) {`
`485`	`502`	`break;`
`486`	`503`	`}`
Original file line number	Diff line number	Diff line change
`@@ -70,7 +70,7 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba`
`70`	`70`	`// if all tokens are output, split by sequence`
`71`	`71`	`ubatch = balloc.split_seq(n_ubatch);`
`72`	`72`	`} else {`
`73`		`- ubatch = balloc.split_equal(n_ubatch);`
	`73`	`+ ubatch = balloc.split_equal(n_ubatch, false);`
`74`	`74`	`}`
`75`	`75`
`76`	`76`	`if (ubatch.n_tokens == 0) {`
Original file line number	Diff line number	Diff line change
`@@ -374,7 +374,7 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &`
`374`	`374`	`// if all tokens are output, split by sequence`
`375`	`375`	`ubatch = balloc.split_seq(n_ubatch);`
`376`	`376`	`} else {`
`377`		`- ubatch = balloc.split_equal(n_ubatch);`
	`377`	`+ ubatch = balloc.split_equal(n_ubatch, false);`
`378`	`378`	`}`
`379`	`379`
`380`	`380`	`if (ubatch.n_tokens == 0) {`