Commit e171dd5

Fix pre-commit
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
1 parent: 9f2a48c

4 files changed: +10 lines, -14 lines
tests/v1/e2e/test_kv_sharing_skip_prefill.py (3 additions, 2 deletions)

@@ -317,6 +317,9 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 @pytest.fixture
 def test_prompts():
+    """
+    Adapted from tests/v1/e2e/test_spec_decode.py
+    """
     prompt_types = ["repeat", "sentence"]
     # Setting higher num prompts increases the chance of numerics mismatch
     # due to matrix multiplication numerics depending on batch dimension
@@ -326,8 +329,6 @@ def test_prompts():
     random.seed(0)
     random_prompt_type_choices = random.choices(prompt_types, k=num_prompts)
 
-    # Generate a mixed batch of prompts, some of which can be easily
-    # predicted by n-gram matching and some which likely cannot.
     for kind in random_prompt_type_choices:
         word_choices = ["test", "temp", "hello", "where"]
         word = random.choice(word_choices)
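
For readers outside the repo, a standalone sketch of the seeded selection the fixture relies on; the value of num_prompts is an assumption, since the real constant is defined between the two hunks:

import random

prompt_types = ["repeat", "sentence"]
num_prompts = 8  # assumed value; not shown in the hunks above

random.seed(0)  # fixed seed keeps the prompt mix identical across runs
random_prompt_type_choices = random.choices(prompt_types, k=num_prompts)
print(random_prompt_type_choices)  # deterministic mix of "repeat"/"sentence"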

vllm/compilation/decorators.py (0 additions, 2 deletions)

@@ -25,8 +25,6 @@
 
 def skip_torch_compile(cls: _T) -> _T:
     cls._skip_compile_vllm = True
-    for base in cls.__bases__:
-        setattr(base,"_skip_compile_vllm",True)
     return cls
 
 
vllm/v1/attention/backends/flash_attn.py (2 additions, 0 deletions)

@@ -273,6 +273,8 @@ def build(
         max_seq_len = int(self.runner.seq_lens_np[:num_reqs].max())
         query_start_loc = common_attn_metadata.query_start_loc
         query_start_loc_np = common_attn_metadata.query_start_loc_np
+        if query_start_loc_np is None:
+            query_start_loc_np = self.runner.query_start_loc_np[:num_reqs + 1]
         seq_lens = common_attn_metadata.seq_lens
         block_table = self.block_table
         block_table_tensor = block_table.get_device_tensor()[:num_reqs]
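
The two added lines implement a prefer-then-fallback lookup: use the numpy copy carried in the metadata when present, otherwise slice the runner's persistent CPU buffer down to the current batch. A self-contained sketch, with runner_buf standing in for self.runner.query_start_loc_np:

import numpy as np

def resolve_query_start_loc_np(meta_np, runner_buf, num_reqs):
    # Prefer the metadata's CPU copy; otherwise slice the runner's buffer,
    # which may be longer than the current batch needs.
    if meta_np is None:
        return runner_buf[:num_reqs + 1]
    return meta_np

# Three requests with query lengths 4, 2, 5 -> cumulative starts [0, 4, 6, 11].
runner_buf = np.array([0, 4, 6, 11, 0, 0], dtype=np.int32)
print(resolve_query_start_loc_np(None, runner_buf, 3))  # [ 0  4  6 11]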

vllm/v1/attention/backends/utils.py (5 additions, 10 deletions)

@@ -33,9 +33,6 @@ class CommonAttentionMetadata:
     query_start_loc: torch.Tensor
     """(batch_size + 1,), the start location of each request in query Tensor"""
 
-    query_start_loc_np: np.ndarray
-    """(batch_size + 1,), numpy version of query_start_loc on the CPU"""
-
     seq_lens: torch.Tensor
     """(batch_size,), the length of each request including both computed tokens
     and newly scheduled tokens"""
@@ -50,6 +47,9 @@ class CommonAttentionMetadata:
     decode_indices: Optional[torch.Tensor] = None
     """indices used for decoding"""
 
+    query_start_loc_np: Optional[np.ndarray] = None
+    """(batch_size + 1,), numpy equivalent of query_start_loc on the CPU"""
+
 
 M = TypeVar("M")
 
@@ -59,13 +59,8 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
     full_cudagraph_supported: ClassVar[bool] = False
 
     @abstractmethod
-    def build(
-        self,
-        common_prefix_len: int,
-        common_attn_metadata: CommonAttentionMetadata,
-        decode_only_common_attn_metadata: Optional[
-            CommonAttentionMetadata] = None,
-    ) -> M:
+    def build(self, common_prefix_len: int,
+              common_attn_metadata: CommonAttentionMetadata) -> M:
         """
         Central method that builds attention metadata.
         Some builders (MLA) require reorder_batch to be called prior to build.
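
Making query_start_loc_np optional also explains why it moved below decode_indices: dataclass fields with defaults must follow fields without them. A reduced sketch of the constraint, with the field list trimmed to the essentials:

from dataclasses import dataclass
from typing import Optional

import numpy as np
import torch

@dataclass
class CommonAttentionMetadataSketch:
    query_start_loc: torch.Tensor  # required (no default)
    seq_lens: torch.Tensor         # required (no default)
    decode_indices: Optional[torch.Tensor] = None
    # Defaulted fields must trail the non-defaulted ones, so the
    # now-Optional numpy copy moves to the end of the class.
    query_start_loc_np: Optional[np.ndarray] = None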
