@@ -6,7 +6,8 @@
 import pytest
 import torch
 
-from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
+from tests.v1.attention.utils import (BatchSpec, _Backend,
+                                      create_common_attn_metadata,
                                       create_standard_kv_cache_spec,
                                       get_attention_backend)
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
@@ -56,31 +57,6 @@ def _create_proposer(method: str, k: int) -> EagleProposer:
                             device=current_platform.device_type)
 
 
-def _create_common_attn_metadata(
-        cu_target_query_lens: torch.Tensor,
-        device: torch.device) -> CommonAttentionMetadata:
-    """Create minimal CommonAttentionMetadata for testing."""
-    batch_size = cu_target_query_lens.shape[0] - 1
-    num_tokens = cu_target_query_lens[-1].item()
-    seq_lens = cu_target_query_lens[1:] - cu_target_query_lens[:-1]
-
-    return CommonAttentionMetadata(
-        query_start_loc=cu_target_query_lens,
-        query_start_loc_cpu=cu_target_query_lens.cpu(),
-        seq_lens=seq_lens,
-        seq_lens_cpu=seq_lens.cpu(),
-        num_computed_tokens_cpu=seq_lens.cpu(),
-        num_reqs=batch_size,
-        num_actual_tokens=int(num_tokens),
-        max_query_len=int(seq_lens.max().item()),
-        block_table_tensor=torch.zeros((batch_size, 1),
-                                       dtype=torch.int32,
-                                       device=device),
-        slot_mapping=torch.arange(num_tokens, dtype=torch.int64,
-                                  device=device),
-    )
-
-
 def test_prepare_inputs():
     """
     cu_target_query_lens: [0, a, a + b, a + b + c]
@@ -97,7 +73,6 @@ def test_prepare_inputs():
     # n1 = 1, n2 = 3, n3 = 2
 
     batch_spec = BatchSpec(
-        batch_size=4,
         seq_lens=[4, 7, 5],
         query_lens=[4, 7, 5],
     )
@@ -324,9 +299,28 @@ def create_deterministic_logits(token_ids):
                                  device=device)
     sampling_metadata = mock.MagicMock()
 
-    # Create CommonAttentionMetadata for new API
-    common_attn_metadata = _create_common_attn_metadata(cu_num_tokens, device)
-    attn_metadata_builder_cls, _ = get_attention_backend("flash_attn")
+    batch_size = cu_num_tokens.shape[0] - 1
+    num_tokens = cu_num_tokens[-1].item()
+    seq_lens = cu_num_tokens[1:] - cu_num_tokens[:-1]
+
+    common_attn_metadata = CommonAttentionMetadata(
+        query_start_loc=cu_num_tokens,
+        query_start_loc_cpu=cu_num_tokens.cpu(),
+        seq_lens=seq_lens,
+        seq_lens_cpu=seq_lens.cpu(),
+        num_computed_tokens_cpu=seq_lens.cpu(),
+        num_reqs=batch_size,
+        num_actual_tokens=int(num_tokens),
+        max_query_len=int(seq_lens.max().item()),
+        block_table_tensor=torch.zeros((batch_size, 1),
+                                       dtype=torch.int32,
+                                       device=device),
+        slot_mapping=torch.arange(num_tokens, dtype=torch.int64,
+                                  device=device),
+    )
+
+    attn_metadata_builder_cls, _ = get_attention_backend(
+        _Backend.FLASH_ATTN_VLLM_V1)
 
     attn_metadata_builder = attn_metadata_builder_cls(
         kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config),
         vllm_config=proposer.vllm_config,
@@ -335,8 +329,7 @@ def create_deterministic_logits(token_ids):
 
     # Mock runner for attention metadata building
     proposer.runner = mock.MagicMock()
-    proposer.runner.attn_metadata_builders = [mock.MagicMock()]
-    proposer.runner.attn_metadata_builders[0] = attn_metadata_builder
+    proposer.runner.attn_metadata_builders = [attn_metadata_builder]
 
     result = proposer.propose(target_token_ids=target_token_ids,
                               target_positions=target_positions,
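
For context, the inlined construction derives every metadata field from the cumulative query-length tensor. A minimal sketch of that arithmetic, using field names from the diff above (the three-request lengths here are hypothetical):

```python
import torch

# Hypothetical cumulative query lengths for three requests of 1, 3, and 2 tokens.
cu_num_tokens = torch.tensor([0, 1, 4, 6], dtype=torch.int32)

batch_size = cu_num_tokens.shape[0] - 1            # 3 requests
num_tokens = int(cu_num_tokens[-1].item())         # 6 tokens in total
seq_lens = cu_num_tokens[1:] - cu_num_tokens[:-1]  # tensor([1, 3, 2])

# max_query_len, as in the diff: the longest per-request query span.
max_query_len = int(seq_lens.max().item())         # 3
```

Note also that `get_attention_backend` is now called with the `_Backend.FLASH_ATTN_VLLM_V1` enum member instead of the `"flash_attn"` string.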