@@ -527,24 +527,27 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                self.input_batch.num_tokens[req_index] = end_token_index
        else:
            req_data = scheduler_output.scheduled_cached_reqs
+           is_last_rank = get_pp_group().is_last_rank
            for i, req_id in enumerate(req_data.req_ids):
                req_state = self.requests[req_id]
                num_computed_tokens = req_data.num_computed_tokens[i]
-               new_token_ids = req_data.new_token_ids[i]
                new_block_ids = req_data.new_block_ids[i]
                resumed_from_preemption = req_data.resumed_from_preemption[i]

                req_state.num_computed_tokens = num_computed_tokens
-               # Add the sampled token(s) from the previous step (if any).
-               # This doesn't include "unverified" tokens like spec decode tokens.
-               num_new_tokens = (num_computed_tokens + len(new_token_ids) -
-                                 req_state.num_tokens)
-               if num_new_tokens == 1:
-                   # Avoid slicing list in most common case.
-                   req_state.output_token_ids.append(new_token_ids[-1])
-               elif num_new_tokens > 0:
-                   req_state.output_token_ids.extend(
-                       new_token_ids[-num_new_tokens:])
+               if not is_last_rank:
+                   new_token_ids = req_data.new_token_ids[i]
+                   # Add the sampled token(s) from the previous step (if any).
+                   # This doesn't include "unverified" tokens like spec decode tokens.
+                   num_new_tokens = (num_computed_tokens +
+                                     len(new_token_ids) -
+                                     req_state.num_tokens)
+                   if num_new_tokens == 1:
+                       # Avoid slicing list in most common case.
+                       req_state.output_token_ids.append(new_token_ids[-1])
+                   elif num_new_tokens > 0:
+                       req_state.output_token_ids.extend(
+                           new_token_ids[-num_new_tokens:])

                # Update the block IDs.
                if not resumed_from_preemption:
                    # Append the new blocks to the existing block IDs.
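Context for this hunk (not part of the diff): on ranks other than the last PP rank, the scheduler still echoes the previously sampled tokens in new_token_ids, and the num_new_tokens arithmetic picks out only the tail that this runner has not recorded yet. A minimal standalone sketch of that bookkeeping, using made-up values rather than anything from the PR:

# Standalone illustration of the num_new_tokens arithmetic above (toy values).
num_computed_tokens = 10          # tokens the scheduler reports as computed
new_token_ids = [42, 43, 44]      # token ids echoed back by the scheduler
req_num_tokens = 12               # tokens this runner already tracks (req_state.num_tokens)

# Only the part of new_token_ids extending past req_num_tokens is new output.
num_new_tokens = num_computed_tokens + len(new_token_ids) - req_num_tokens  # -> 1
output_token_ids: list[int] = []
if num_new_tokens == 1:
    # Most common case: exactly one freshly sampled token.
    output_token_ids.append(new_token_ids[-1])
elif num_new_tokens > 0:
    output_token_ids.extend(new_token_ids[-num_new_tokens:])
assert output_token_ids == [44]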
@@ -570,25 +573,27 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                self.input_batch.block_table.append_row(
                    new_block_ids, req_index)
-               # Add new_token_ids to token_ids_cpu.
-               start_token_index = num_computed_tokens
-               end_token_index = num_computed_tokens + len(new_token_ids)
-               self.input_batch.token_ids_cpu[
-                   req_index,
-                   start_token_index:end_token_index] = new_token_ids
-               self.input_batch.num_tokens_no_spec[
-                   req_index] = end_token_index
-               # Add spec_token_ids to token_ids_cpu.
-               spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
-                   req_id, ())
-               if spec_token_ids:
-                   start_index = end_token_index
-                   end_token_index += len(spec_token_ids)
-                   self.input_batch.token_ids_cpu[
-                       req_index,
-                       start_index:end_token_index] = spec_token_ids
-               # NOTE(woosuk): `num_tokens` here may include spec decode tokens.
-               self.input_batch.num_tokens[req_index] = end_token_index
+
+               if not is_last_rank:
+                   # Add new_token_ids to token_ids_cpu.
+                   start_token_index = num_computed_tokens
+                   end_token_index = num_computed_tokens + len(new_token_ids)
+                   self.input_batch.token_ids_cpu[
+                       req_index,
+                       start_token_index:end_token_index] = new_token_ids
+                   self.input_batch.num_tokens_no_spec[
+                       req_index] = end_token_index
+                   # Add spec_token_ids to token_ids_cpu.
+                   spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
+                       req_id, ())
+                   if spec_token_ids:
+                       start_index = end_token_index
+                       end_token_index += len(spec_token_ids)
+                       self.input_batch.token_ids_cpu[
+                           req_index,
+                           start_index:end_token_index] = spec_token_ids
+                   # NOTE(woosuk): `num_tokens` here may include spec decode tokens.
+                   self.input_batch.num_tokens[req_index] = end_token_index

        # Check if the batch has changed. If not, we can skip copying the
        # sampling metadata from CPU to GPU.
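For context (outside the diff): on ranks that still take tokens from the scheduler, the per-request row of token_ids_cpu is filled in two segments, verified new_token_ids first and then any scheduled draft tokens, while num_tokens_no_spec deliberately stops before the speculative segment. A rough sketch with toy buffers, assuming numpy arrays in place of the real InputBatch fields:

import numpy as np

# Toy single-request buffers standing in for self.input_batch (illustrative only).
token_ids_cpu = np.zeros((1, 32), dtype=np.int64)
req_index = 0
num_computed_tokens = 8
new_token_ids = [101, 102]    # verified tokens sent by the scheduler
spec_token_ids = [7, 8, 9]    # unverified draft tokens for the next step

start_token_index = num_computed_tokens
end_token_index = num_computed_tokens + len(new_token_ids)
token_ids_cpu[req_index, start_token_index:end_token_index] = new_token_ids
num_tokens_no_spec = end_token_index          # 10: excludes draft tokens

if spec_token_ids:
    start_index = end_token_index
    end_token_index += len(spec_token_ids)
    token_ids_cpu[req_index, start_index:end_token_index] = spec_token_ids
num_tokens = end_token_index                  # 13: may include draft tokens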
@@ -1641,6 +1646,30 @@ def execute_model(
        for i in discard_sampled_tokens_req_indices:
            valid_sampled_token_ids[i].clear()
+       if not vllm_version_is("0.9.1"):
+           # Cache the sampled tokens in the model runner, so that the scheduler
+           # doesn't need to send them back.
+           # NOTE(woosuk): As an exception, when using PP, the scheduler sends
+           # the sampled tokens back, because there's no direct communication
+           # between the first-stage worker and the last-stage worker.
+           for req_idx, sampled_ids in enumerate(valid_sampled_token_ids):
+               if not sampled_ids:
+                   continue
+
+               start_idx = self.input_batch.num_tokens_no_spec[req_idx]
+               end_idx = start_idx + len(sampled_ids)
+               assert end_idx <= self.model_config.max_model_len, (
+                   "Sampled token IDs exceed the max model length. "
+                   f"Total number of tokens: {end_idx} > max_model_len: "
+                   f"{self.model_config.max_model_len}")
+
+               self.input_batch.token_ids_cpu[
+                   req_idx, start_idx:end_idx] = sampled_ids
+               self.input_batch.num_tokens_no_spec[req_idx] = end_idx
+               self.input_batch.num_tokens[req_idx] = end_idx
+               req_id = self.input_batch.req_ids[req_idx]
+               req_state = self.requests[req_id]
+               req_state.output_token_ids.extend(sampled_ids)

        spec_token_ids = self._get_spec_token_ids(
            valid_sampled_token_ids,
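The added block above is the other half of the change: on the last PP rank the runner writes its own sampled IDs into the same CPU buffers that _update_states fills on the earlier ranks, so the next step starts from consistent state without the scheduler echoing tokens back. A minimal sketch of that update for a single request, using toy stand-ins for the InputBatch fields rather than the real objects:

import numpy as np

# Toy stand-ins for self.input_batch / self.requests (illustrative only).
token_ids_cpu = np.zeros((1, 64), dtype=np.int64)
num_tokens_no_spec = [10]          # verified tokens currently in the row
num_tokens = [13]                  # may still include last step's draft tokens
output_token_ids = [[]]
max_model_len = 64

valid_sampled_token_ids = [[5, 6]]  # e.g. two draft tokens accepted this step
for req_idx, sampled_ids in enumerate(valid_sampled_token_ids):
    if not sampled_ids:
        continue
    start_idx = num_tokens_no_spec[req_idx]
    end_idx = start_idx + len(sampled_ids)
    assert end_idx <= max_model_len, "Sampled token IDs exceed the max model length."
    token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids
    num_tokens_no_spec[req_idx] = end_idx
    num_tokens[req_idx] = end_idx   # stale draft slots are simply overwritten/dropped
    output_token_ids[req_idx].extend(sampled_ids)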