
Commit 75c05ea

remove syncs and move back necessary sync
Signed-off-by: Leo Tian <leo.tian@centml.ai>
1 parent ca2f6b9
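
The functional change in this commit is in vllm/v1/spec_decode/eagle.py: two .max().item() reductions are hoisted out of the draft-proposal hot path and their results are passed in as plain ints. Calling .item() on a CUDA tensor copies a scalar to the host and blocks the CPU until every queued kernel has finished. A minimal sketch of that stall, independent of vLLM (the matrix size and loop count are arbitrary; requires a CUDA device):

import time
import torch

if torch.cuda.is_available():
    x = torch.randn(4096, 4096, device="cuda")
    for _ in range(50):
        # Kernel launches are asynchronous; the CPU races ahead here.
        x = x @ x.softmax(dim=-1)
    t0 = time.perf_counter()
    # .item() must wait for every queued kernel before it can copy the
    # scalar back, so the host stalls for the full GPU backlog.
    _ = x.max().item()
    print(f".item() blocked the host for {time.perf_counter() - t0:.3f}s")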

File tree

3 files changed: +194, -79 lines


requirements/test.txt

20 additions, 2 deletions
@@ -27,6 +27,10 @@ argcomplete==3.5.1
     # via datamodel-code-generator
 arrow==1.3.0
     # via isoduration
+async-timeout==5.0.1
+    # via
+    #   aiohttp
+    #   redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -129,6 +133,11 @@ eval-type-backport==0.2.2
     # via mteb
 evaluate==0.4.3
     # via lm-eval
+exceptiongroup==1.3.0
+    # via
+    #   anyio
+    #   hypothesis
+    #   pytest
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -641,7 +650,6 @@ setuptools==77.0.3
     # via
     #   mamba-ssm
     #   pytablewriter
-    #   torch
     #   triton
 shellingham==1.5.4
     # via typer
@@ -701,8 +709,13 @@ tokenizers==0.21.1
     # via
     #   -r requirements/test.in
     #   transformers
+toml==0.10.2
+    # via datamodel-code-generator
 tomli==2.2.1
-    # via schemathesis
+    # via
+    #   black
+    #   pytest
+    #   schemathesis
 tomli-w==1.2.0
     # via schemathesis
 torch==2.7.0+cu128
@@ -776,13 +789,18 @@ types-python-dateutil==2.9.0.20241206
     # via arrow
 typing-extensions==4.12.2
     # via
+    #   anyio
+    #   black
+    #   exceptiongroup
     #   huggingface-hub
     #   librosa
     #   mistral-common
     #   mteb
+    #   multidict
     #   pqdm
     #   pydantic
     #   pydantic-core
+    #   rich
     #   torch
     #   typer
     #   typing-inspection

vllm/v1/spec_decode/eagle.py

4 additions, 9 deletions
@@ -90,6 +90,8 @@ def propose(
         cu_num_tokens: torch.Tensor,
         # [batch_size, max_num_blocks_per_req]
         block_table: torch.Tensor,
+        max_seq_len: int,
+        max_num_tokens: int,
         sampling_metadata: SamplingMetadata,
     ) -> torch.Tensor:
         num_tokens = target_token_ids.shape[0]
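
The two new int parameters move those maxima to the caller. A hypothetical caller-side sketch (the variable names, and the assumption that the runner already tracks per-request lengths on the CPU, are illustrative and not part of this diff):

import torch

# Assumed host-side metadata the runner would already track.
seq_lens_cpu = torch.tensor([17, 33, 25])
num_tokens_per_req_cpu = torch.tensor([4, 8, 6])

# Plain host-side reductions: no CUDA tensor involved, so no sync.
max_seq_len = int(seq_lens_cpu.max())
max_num_tokens = int(num_tokens_per_req_cpu.max())

# proposer.propose(..., max_seq_len=max_seq_len,
#                  max_num_tokens=max_num_tokens, ...)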
@@ -113,10 +115,6 @@ def propose(
         seq_lens = (target_positions[last_token_indices] + 1).int()
 
         if self.method in ["eagle", "eagle3"]:
-            # FIXME(woosuk): The below two ops cause synchronization. Optimize.
-            max_seq_len = seq_lens.max().item()
-            max_num_tokens = (cu_num_tokens[1:] -
-                              cu_num_tokens[:-1]).max().item()
             attn_metadata = FlashAttentionMetadata(
                 num_actual_tokens=num_tokens,
                 max_query_len=max_num_tokens,
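
The deleted lines are the two ops the FIXME flagged: each .item() call forced a device-to-host sync on every propose() step. If the passed-in value needs verifying against what these lines used to compute, a debug-only check keeps the sync out of production runs; this is a hypothetical helper, not part of this commit:

import torch

def check_max_num_tokens(cu_num_tokens: torch.Tensor,
                         max_num_tokens: int) -> None:
    # This .max().item() is exactly the synchronizing reduction the
    # commit removes from the hot path; run it only under __debug__.
    if __debug__:
        computed = (cu_num_tokens[1:] - cu_num_tokens[:-1]).max().item()
        assert computed == max_num_tokens, (computed, max_num_tokens)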
@@ -133,9 +131,6 @@ def propose(
                 suffix_kv_lens=None,
             )
         elif self.method == "deepseek_mtp":
-            query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1]
-            max_query_len = query_lens.max().item()
-
             common_attn_metadata = CommonAttentionMetadata(
                 query_start_loc=cu_num_tokens, seq_lens=seq_lens)
 
@@ -145,7 +140,7 @@ def propose(
             attn_metadata = self.runner.attn_metadata_builder.build(
                 num_reqs=batch_size,
                 num_actual_tokens=num_tokens,
-                max_query_len=max_query_len,
+                max_query_len=max_num_tokens,
                 common_prefix_len=0,
                 common_attn_metadata=common_attn_metadata,
             )
@@ -298,7 +293,7 @@ def prepare_inputs(
     # [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3]
     cu_num_tokens = torch.zeros_like(cu_target_query_lens)
     torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:])
-    token_indices = torch.empty(
+    token_indices = torch.zeros(
         num_tokens,
         dtype=torch.int32,
         device=cu_target_query_lens.device,
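
torch.empty returns uninitialized memory, while torch.zeros guarantees every element is defined. Switching token_indices to zeros presumably makes any slot that a subsequent write does not touch read back as 0 instead of stale garbage, which matters once the surrounding syncs are gone. A minimal illustration of the difference, not the vLLM logic:

import torch

buf = torch.empty(8, dtype=torch.int32)   # arbitrary stale bytes
safe = torch.zeros(8, dtype=torch.int32)  # every slot defined as 0

# A partial write leaves buf[5:] as garbage but safe[5:] as zeros.
buf[:5] = torch.arange(5, dtype=torch.int32)
safe[:5] = torch.arange(5, dtype=torch.int32)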
