Commit 7e2c722

Add Continuous Decoding support in GQA (microsoft#21523)
### Description

This PR adds support for Continuous Decoding with batch_size = 1 input. From now on, GQA can take arbitrary-length input by passing seqlens_k as total_sequence_length - 1 and using the sequence length of qkv as new_sequence_length. **This change does not affect the default behavior of GQA.**

### Motivation and Context

Prior to this change it was impossible to support sequence_length > 1 inputs when past context was given. This use case is essential to making continuous decoding work, which is one of our current efforts in ORT-GenAI.
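For illustration, here is a minimal sketch of how a caller would derive the two length inputs for a continuous-decoding step under the new convention. This is not ORT API code and the names are hypothetical; it only encodes the relationship stated above (seqlens_k = total_sequence_length - 1 per batch entry, total = past + new).

```cpp
// Hypothetical helper (illustration only, not part of ORT): derive GQA's length
// inputs for a continuous-decoding step with batch_size == 1.
#include <cstdint>
#include <vector>

struct GqaLengthInputs {
  std::vector<int32_t> seqlens_k;  // shape (batch_size); each entry = total_sequence_length - 1
  int32_t total_sequence_length;   // scalar; past + new tokens
};

GqaLengthInputs MakeLengthInputs(int32_t past_sequence_length, int32_t new_sequence_length) {
  GqaLengthInputs in;
  in.total_sequence_length = past_sequence_length + new_sequence_length;
  in.seqlens_k = {in.total_sequence_length - 1};  // batch_size == 1
  return in;
}

// Example: 16 cached tokens plus 4 new tokens -> seqlens_k = {19},
// total_sequence_length = 20, while qkv are fed with sequence_length = 4.
```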
1 parent 59b7b6b · commit 7e2c722

17 files changed: +498 −502 lines

docs/ContribOperators.md

Lines changed: 4 additions & 2 deletions
@@ -2521,6 +2521,8 @@ This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
   Only supports causal and local attention.
   Supports rotary position embedding for CPU and CUDA.
   Supports packed input for CPU and CUDA.
+  Supports continuous decoding for batch_size == 1 for CPU and CUDA.
+
 
 #### Version
 
@@ -2561,9 +2563,9 @@ This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
 <dt><tt>past_value</tt> (optional) : T</dt>
 <dd>past state value with support for format BNSH. When past_value uses same tensor as present_value(k-v cache), it is of length max_sequence_length... otherwise of length past_sequence_length.</dd>
 <dt><tt>seqlens_k</tt> : M</dt>
-<dd>1d Tensor of shape (batch_size). Indicates past sequence lengths for token generation case.</dd>
+<dd>1D Tensor of shape (batch_size). Equivalent to (total_sequence_lengths - 1).</dd>
 <dt><tt>total_sequence_length</tt> : M</dt>
-<dd>Scalar tensor of total sequence length (past + new).</dd>
+<dd>Scalar tensor equivalent to the maximum total sequence length (past + new) of the batch. Used for checking inputs and determining prompt vs token generation case.</dd>
 <dt><tt>cos_cache</tt> (optional) : T</dt>
 <dd>2D tensor with shape (max_sequence_length, head_size / 2).</dd>
 <dt><tt>sin_cache</tt> (optional) : T</dt>

onnxruntime/contrib_ops/cpu/bert/attention_common.h

Lines changed: 2 additions & 1 deletion
@@ -114,7 +114,8 @@ struct GroupQueryAttentionParameters {
   int local_window_size;
   bool kv_share_buffer;
   bool is_packed_qkv;
-  bool is_prompt;  // determines if seqlens_k is past or kv sequence length tensor
+  bool is_subsequent_prompt;  // indicates whether we have past context and seqlen > 1
+  bool is_first_prompt;       // indicates whether this is first decoding step
   bool do_rotary;
   bool rotary_interleaved;
   bool use_smooth_softmax;

onnxruntime/contrib_ops/cpu/bert/attention_helper.h

Lines changed: 4 additions & 7 deletions
@@ -236,19 +236,16 @@ T* ConcatStateChunkGQA(const T* past,
                        size_t past_buff_chunk_length,
                        size_t past_chunk_length,
                        size_t new_chunk_length,
-                       bool is_prompt,
                        bool past_present_share_buffer,
                        std::ptrdiff_t i) {
   T* start = present + i * present_buff_chunk_length;
 
   T* p = start;
-  if (!is_prompt) {
-    if (!past_present_share_buffer) {
-      const T* src_past = past + i * past_buff_chunk_length;
-      memcpy(p, src_past, past_chunk_length * sizeof(T));
-    }
-    p += past_chunk_length;
+  if (!past_present_share_buffer && past_chunk_length > 0) {
+    const T* src_past = past + i * past_buff_chunk_length;
+    memcpy(p, src_past, past_chunk_length * sizeof(T));
   }
+  p += past_chunk_length;
 
   memcpy(p, chunk, new_chunk_length * sizeof(T));
   return start;
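For reference, a standalone sketch of the behavior after this refactor (simplified signature, not the ORT helper itself): with the is_prompt flag removed, callers pass past_chunk_length == 0 for the first prompt, which naturally skips the past copy and only appends the new chunk.

```cpp
// Simplified stand-in for ConcatStateChunkGQA (illustration only): copy the past
// chunk when buffers are not shared and there is past context, then append the
// new chunk. Passing past_len == 0 covers the first-prompt case without a flag.
#include <cassert>
#include <cstring>
#include <vector>

template <typename T>
T* ConcatChunk(const T* past, const T* chunk, T* present,
               size_t past_len, size_t new_len, bool share_buffer) {
  T* p = present;
  if (!share_buffer && past_len > 0) {
    std::memcpy(p, past, past_len * sizeof(T));
  }
  p += past_len;  // with a shared buffer the past is already in place
  std::memcpy(p, chunk, new_len * sizeof(T));
  return present;
}

int main() {
  std::vector<float> past = {1, 2}, chunk = {3, 4, 5}, present(5);
  ConcatChunk(past.data(), chunk.data(), present.data(), past.size(), chunk.size(), false);
  assert(present[0] == 1.0f && present[4] == 5.0f);
  // First prompt: past_len == 0, only the new chunk is written.
  std::vector<float> prompt_out(3);
  ConcatChunk<float>(nullptr, chunk.data(), prompt_out.data(), 0, chunk.size(), false);
  assert(prompt_out[2] == 5.0f);
  return 0;
}
```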

onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h

Lines changed: 89 additions & 88 deletions
Large diffs are not rendered by default.

onnxruntime/contrib_ops/cpu/bert/group_query_attention.cc

Lines changed: 23 additions & 8 deletions
@@ -45,7 +45,7 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {
   const Tensor* past_key = context->Input<Tensor>(3);
   const Tensor* past_value = context->Input<Tensor>(4);
   const Tensor* seqlens_k = context->Input<Tensor>(5);
-  const Tensor* total_seqlen = context->Input<Tensor>(6);
+  const Tensor* total_seqlen_tensor = context->Input<Tensor>(6);
   const Tensor* cos_cache = context->Input<Tensor>(7);
   const Tensor* sin_cache = context->Input<Tensor>(8);
 
@@ -61,7 +61,7 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {
                                                 num_heads_,
                                                 kv_num_heads_,
                                                 seqlens_k,
-                                                total_seqlen,
+                                                total_seqlen_tensor,
                                                 scale_,
                                                 softcap_));
 
@@ -103,6 +103,7 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {
   }
 
   if (do_rotary_) {
+    // Initialize rotary parameters
     rotary_embedding_helper::RotaryParameters rotary_params = {};
     rotary_params.batch_size = batch_size;
     rotary_params.sequence_length = sequence_length;
@@ -114,17 +115,29 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {
     rotary_params.seq_stride = head_size;
     rotary_params.head_stride = sequence_length * rotary_params.seq_stride;
     rotary_params.batch_stride = (packed_qkv ? (num_heads_ + 2 * kv_num_heads_) : num_heads_) * rotary_params.head_stride;
-    rotary_params.position_ids_format = sequence_length == 1 ? 1 : 0;
+    rotary_params.position_ids_format = !parameters.is_first_prompt ? 1 : 0;
     rotary_params.transposed = true;
     auto* tp = context->GetOperatorThreadPool();
-    std::vector<int64_t> pos_ids(sequence_length == 1 ? batch_size : 1);
-    if (sequence_length == 1) {
+    // Generate position ids
+    const int pos_ids_size = parameters.is_first_prompt ? 1 : batch_size * sequence_length;
+    std::vector<int64_t> pos_ids(pos_ids_size);
+    if (parameters.is_first_prompt) {
+      pos_ids[0] = static_cast<int64_t>(0);
+    } else {
+      // Note: As of now, interactive decoding supports only batch size 1 and token generation supports only sequence length 1.
       for (int b = 0; b < batch_size; b++) {
-        pos_ids[b] = static_cast<int64_t>(seqlens_k->Data<int32_t>()[b]);
+        const int total_seqlen = seqlens_k->Data<int32_t>()[b] + 1;
+        const int past_seqlen = total_seqlen - sequence_length;
+        for (int s = 0; s < sequence_length; s++) {
+          if (past_seqlen + s < total_seqlen) {
+            pos_ids[b * sequence_length + s] = static_cast<int64_t>(past_seqlen) + s;
+          } else {
+            pos_ids[b * sequence_length + s] = static_cast<int64_t>(1);
+          }
+        }
       }
-    } else {
-      pos_ids[0] = static_cast<int64_t>(0);
     }
+    // Initialize separate buffers for rotary embeddings
     const T* q_input;
     const T* k_input;
     T* q_rotary;
@@ -149,6 +162,7 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {
       Q = RotaryQ;
       K = RotaryK;
     }
+    // Run rotary embedding for Q and K
     ORT_RETURN_IF_ERROR(RunRotaryEmbedding<T>(tp, rotary_params, q_input,
                                               pos_ids.data(), cos_cache->Data<T>(),
                                               sin_cache->Data<T>(), q_rotary, rotary_interleaved_));
@@ -161,6 +175,7 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {
     ORT_RETURN_IF_ERROR(RunRotaryEmbedding<T>(tp, rotary_params, k_input,
                                               pos_ids.data(), cos_cache->Data<T>(),
                                               sin_cache->Data<T>(), k_rotary, rotary_interleaved_));
+    // Pack V into rotary QKV buffer
     if (packed_qkv) {
       const T* v_input = k_input + kv_num_heads_ * sequence_length * head_size;
       T* v_rotary = k_rotary + kv_num_heads_ * sequence_length * head_size;
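To make the position-id logic above easier to follow, here is a standalone sketch of the non-first-prompt path (not the kernel code; the helper name is made up): the past length is recovered from seqlens_k[b] = total_sequence_length - 1, and each new token at offset s gets position past_seqlen + s.

```cpp
// Illustrative helper (not ORT code): compute RoPE position ids for a
// continuous-decoding step from seqlens_k and the new sequence length.
#include <cstdint>
#include <vector>

std::vector<int64_t> MakePositionIds(const std::vector<int32_t>& seqlens_k,
                                     int sequence_length) {
  const int batch_size = static_cast<int>(seqlens_k.size());
  std::vector<int64_t> pos_ids(static_cast<size_t>(batch_size) * sequence_length);
  for (int b = 0; b < batch_size; ++b) {
    const int total_seqlen = seqlens_k[b] + 1;             // seqlens_k = total - 1
    const int past_seqlen = total_seqlen - sequence_length;
    for (int s = 0; s < sequence_length; ++s) {
      pos_ids[b * sequence_length + s] = static_cast<int64_t>(past_seqlen) + s;
    }
  }
  return pos_ids;
}

// Example: 4 cached tokens and 3 new tokens -> seqlens_k = {6}, sequence_length = 3,
// giving positions {4, 5, 6}.
```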

onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h

Lines changed: 28 additions & 8 deletions
@@ -168,14 +168,13 @@ Status CheckInputs(const Tensor* query,
                            "Input 'past_key' and 'past_value' shall be both present or both absent.");
   }
 
-  // Check seqlens_k tensor (holding past seqlen for token gen)
-  const auto& seqlens_dim = seqlens_k->Shape().GetDims();
-  if (seqlens_dim.size() != 1 && seqlens_dim[0] != batch_size) {
+  const auto& seqlens_k_dim = seqlens_k->Shape().GetDims();
+  if (seqlens_k_dim.size() != 1 && seqlens_k_dim[0] != batch_size) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                            "seqlens_k must be shape (batch_size).");
   }
 
-  // Set present sequence length and kv_share_buffer from input total_seqlen tensor
+  // Set present sequence length from input total_seqlen tensor
   if (!onnxruntime::IsScalarOr1ElementVector(total_seqlen)) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                            "total_sequence_length tensor must be of one element.");
@@ -195,11 +194,11 @@ Status CheckInputs(const Tensor* query,
     }
     if (cos_dims[0] < total_sequence_length) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                             "cos_cache dimension 0 should be not be less than total_sequence_length.");
+                             "cos_cache dimension 0 shall not be less than total_sequence_length.");
     }
     if (sin_dims[0] < total_sequence_length) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                             "sin_cache dimension 0 should be not be less than total_sequence_length.");
+                             "sin_cache dimension 0 shall not be less than total_sequence_length.");
     }
     if (cos_dims[1] > (head_size / 16) * 8 || cos_dims[1] % 8 != 0) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
@@ -219,14 +218,34 @@ Status CheckInputs(const Tensor* query,
                            "Input 'cos_cache' and 'sin_cache' shall be both present or both absent.");
   }
 
-  bool is_prompt = sequence_length != 1;
+  bool is_subsequent_prompt = false;
+  if (sequence_length > 1 && sequence_length != total_sequence_length) {
+    if (batch_size != 1) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "batch_size must be 1 when sequence_length > 1 and past context is given.");
+    }
+    is_subsequent_prompt = true;
+  }
+
+  bool is_first_prompt;
+  if (is_subsequent_prompt) {
+    is_first_prompt = false;  // irrelevant for interactive decoding
+  } else {
+    // If not interactive, sequence_length is 1 for token gen and arbitrarily large for prompt
+    is_first_prompt = (sequence_length == total_sequence_length);
+    if (!is_first_prompt && sequence_length != 1) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "sequence_length shall be 1 when it is not prompt.");
+    }
+  }
 
   if (parameters != nullptr) {
     GroupQueryAttentionParameters* output_parameters = reinterpret_cast<GroupQueryAttentionParameters*>(parameters);
     output_parameters->batch_size = batch_size;
     output_parameters->sequence_length = sequence_length;                  // sequence length of Q
     output_parameters->seqlen_past_kv_cache = past_sequence_length;        // max sequence length of past kv tensors
     output_parameters->seqlen_present_kv_cache = present_sequence_length;  // max sequence length of present kv tensors
+    output_parameters->total_sequence_length = total_sequence_length;      // total sequence length
     output_parameters->hidden_size = q_hidden_size;
     output_parameters->num_heads = num_heads;
     output_parameters->head_size = head_size;
@@ -235,7 +254,8 @@ Status CheckInputs(const Tensor* query,
     output_parameters->rotary_dim = rotary_dim;
     output_parameters->is_packed_qkv = is_packed_qkv;
     output_parameters->is_unidirectional = true;
-    output_parameters->is_prompt = is_prompt;
+    output_parameters->is_subsequent_prompt = is_subsequent_prompt;
+    output_parameters->is_first_prompt = is_first_prompt;
     output_parameters->scale = scale;
     output_parameters->softcap = softcap;
     output_parameters->qkv_format = qkv_format;
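As a summary of the new input validation, here is a sketch of the three cases CheckInputs now distinguishes. The names are hypothetical and this is not the ORT helper; it only mirrors the branching shown in the diff above.

```cpp
// Illustration only: classify a GQA call into first prompt, subsequent prompt
// (continuous decoding with past context), or single-token generation.
#include <stdexcept>

enum class GqaPhase { kFirstPrompt, kSubsequentPrompt, kTokenGeneration };

GqaPhase ClassifyPhase(int batch_size, int sequence_length, int total_sequence_length) {
  if (sequence_length > 1 && sequence_length != total_sequence_length) {
    if (batch_size != 1) {
      throw std::invalid_argument("batch_size must be 1 when sequence_length > 1 and past context is given");
    }
    return GqaPhase::kSubsequentPrompt;  // past context plus multiple new tokens
  }
  if (sequence_length == total_sequence_length) {
    return GqaPhase::kFirstPrompt;  // no past context
  }
  if (sequence_length != 1) {
    throw std::invalid_argument("sequence_length shall be 1 when it is not prompt");
  }
  return GqaPhase::kTokenGeneration;  // single new token with past context
}
```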

onnxruntime/contrib_ops/cpu/sparse/sparse_attention_base.h

Lines changed: 2 additions & 2 deletions
@@ -184,7 +184,7 @@ class SparseAttentionBase {
       // Concatenate past_k + k -> present_k
       // TODO: avoid copying mutiple times for a group.
       k = ConcatStateChunkGQA(past_key, k, present_key, present_buff_chunk_length, past_buff_chunk_length,
-                              past_chunk_length, kv_input_chunk_length, is_prompt, past_present_share_buffer,
+                              is_prompt ? 0 : past_chunk_length, kv_input_chunk_length, past_present_share_buffer,
                               i / kv_num_heads_factor);
 
       // Compute Q*K' + AttentionMask
@@ -365,7 +365,7 @@ class SparseAttentionBase {
 
       // Concatenate past_v + v -> present_v
       v = ConcatStateChunkGQA(past_value, v, present_value, present_buff_chunk_length, past_buff_chunk_length,
-                              past_chunk_length, kv_input_chunk_length, is_prompt, past_present_share_buffer,
+                              is_prompt ? 0 : past_chunk_length, kv_input_chunk_length, past_present_share_buffer,
                               i / kv_num_heads_factor);
 
       DUMP_CPU_TENSOR("present_value", v, total_seq_len, head_size);

onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h

Lines changed: 0 additions & 1 deletion
@@ -42,7 +42,6 @@ struct RightPaddingBatchHook {
 
     auto lse_dim = ceil_div((int32_t)(p.num_queries), kAlignLSE) * kAlignLSE;
 
-    // Advance to current batch - in case of different sequence lengths
     if (p.seqlen_k_ptr) {
       p.num_keys = p.seqlen_k_ptr[batch_id];
     }

onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc

Lines changed: 2 additions & 3 deletions
@@ -5,7 +5,7 @@
 #include "core/platform/env_var_utils.h"
 #include "contrib_ops/cuda/bert/group_query_attention_impl.h"
 #include "contrib_ops/cuda/bert/group_query_attention.h"
-#include "contrib_ops/cuda/bert/group_query_attention_helper.h"
+#include "contrib_ops/cpu/bert/group_query_attention_helper.h"
 #include "contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h"
 #include "contrib_ops/cuda/bert/flash_attention/flash_api.h"
 
@@ -95,7 +95,6 @@ Status GroupQueryAttention<T>::ComputeInternal(OpKernelContext* context) const {
                                                 kv_num_heads_,
                                                 seqlens_k,
                                                 total_seqlen,
-                                                is_past_bsnh_,
                                                 scale_,
                                                 softcap_,
                                                 device_prop.maxThreadsPerBlock));
@@ -253,7 +252,7 @@ Status GroupQueryAttention<T>::ComputeInternal(OpKernelContext* context) const {
     data.out_accum = reinterpret_cast<CudaT*>(out_accum_buffer.get());
   }
   if (seqlens_k_buffer != nullptr) {
-    data.seqlens_k_total = reinterpret_cast<int*>(seqlens_k_buffer.get());
+    data.seqlens_k_buff = reinterpret_cast<int*>(seqlens_k_buffer.get());
   }
   // Memory Efficient Buffers
   if (k_buffer != nullptr) {
