
Commit ab802f3

minor
1 parent 349e17a commit ab802f3

File tree

2 files changed: +6 -5 lines changed


vllm/attention/backends/differential_flash_attn.py

Lines changed: 3 additions & 2 deletions

@@ -54,6 +54,7 @@ def get_kv_cache_shape(
     ) -> Tuple[int, ...]:
         if block_size % 16 != 0:
             raise ValueError("Block size must be a multiple of 16.")
+        assert num_kv_heads % 2 == 0, "num_kv_heads must be divisible by 2"
         return (2, 2, num_blocks, block_size, num_kv_heads // 2, head_size)

     @staticmethod
@@ -872,7 +873,7 @@ def forward(
         k1, k2 = self.split_heads(k)
         v1, v2 = self.split_heads(v)

-        # kv_cache shape is (2, 2, num_blocks, block_size * num_kv_heads // 2 * head_size)
+        # kv_cache shape is (2, 2, num_blocks, block_size, num_kv_heads // 2, head_size)
         # Split by half along the first dimension.
         kv_cache1, kv_cache2 = self.split_kv_cache(kv_cache)
         assert kv_cache1.is_contiguous(), "kv_cache1 is not contiguous"
@@ -909,7 +910,7 @@ def forward(
         else: # re-use the kv cache, full attention
             q = q.view(-1, self.num_heads, self.head_size)
             q1, q2 = self.split_heads(q)
-            # kv_cache shape is (2, num_blocks, block_size * num_kv_heads * head_size)
+            # kv_cache shape is (2, num_blocks, block_size, num_kv_heads, head_size)
            kv_cache1, kv_cache2 = self.split_kv_cache(kv_cache)
            key_cache1, value_cache1 = kv_cache1[0], kv_cache1[1]
            key_cache2, value_cache2 = kv_cache2[0], kv_cache2[1]
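For context, a minimal sketch of why the corrected comment matches the shape returned by get_kv_cache_shape. The sizes below are made up for illustration, and torch.unbind stands in for the backend's split_kv_cache helper, whose implementation is not shown in this diff:

import torch

# Hypothetical sizes, chosen only for illustration.
num_blocks, block_size, num_kv_heads, head_size = 8, 16, 4, 64
assert num_kv_heads % 2 == 0, "num_kv_heads must be divisible by 2"

# Two differential KV groups (leading 2), each holding a key/value pair
# (second 2); each group owns half of the KV heads.
kv_cache = torch.zeros(2, 2, num_blocks, block_size, num_kv_heads // 2, head_size)

# "Split by half along the first dimension" yields one cache per group.
kv_cache1, kv_cache2 = kv_cache.unbind(0)
key_cache1, value_cache1 = kv_cache1[0], kv_cache1[1]
print(key_cache1.shape)  # torch.Size([8, 16, 2, 64])

The new assertion guards the num_kv_heads // 2 in this shape: with an odd head count the two groups could not split the KV heads evenly.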

vllm/model_executor/models/phi4flash.py

Lines changed: 3 additions & 3 deletions

@@ -517,9 +517,9 @@ def forward(
         if kv_cache[0].numel() == 0:
             break

-        # Starting from this layer, we do not need to cuculate the kv cache since we reuse
-        # the kv cache from last layer. If in prefill phase, we can prune truncate
-        # hidden state to save computation cost.
+        # Starting from this layer, we do not need to calculate the kv cache since we reuse
+        # the kv cache from last layer. If in prefill phase, we can truncate
+        # the hidden state to save computation cost.
         if attn_metadata.prefill_metadata:
             selected_token_indices = torch.cumsum(attn_metadata.seq_lens_tensor, dim=0) - 1
             hidden_states = hidden_states.index_select(0, selected_token_indices)
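As a rough illustration of the truncation this comment describes (toy sizes, not taken from the model): in the prefill phase only the last token of each sequence needs to flow through the remaining layers, and cumsum - 1 over the sequence lengths picks out exactly those positions in the flattened batch:

import torch

# Hypothetical prefill batch: three sequences of lengths 3, 5, and 2,
# flattened into a single (10, hidden_size) tensor of token states.
seq_lens_tensor = torch.tensor([3, 5, 2])
hidden_states = torch.arange(10, dtype=torch.float32).unsqueeze(-1)

# cumsum - 1 gives the index of each sequence's last token: [2, 7, 9].
selected_token_indices = torch.cumsum(seq_lens_tensor, dim=0) - 1
hidden_states = hidden_states.index_select(0, selected_token_indices)
print(hidden_states.squeeze(-1))  # tensor([2., 7., 9.])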
