@@ -869,72 +869,9 @@ def _get_spec_token_ids(
             spec_token_ids = self._generate_draft_token_ids(
                 valid_sampled_token_ids, sampling_metadata)
         elif self.speculative_config.method == "eagle":
-            raise NotImplementedError("eagle method for spec decode doesn't work on vllm-ascend currently")
-            assert isinstance(self.drafter, EagleProposer)
-            # TODO(woosuk): Refactor the loop.
-            next_token_ids: list[int] = []
-            for i, token_ids in enumerate(valid_sampled_token_ids):
-                if token_ids:
-                    # Common case.
-                    next_token_id = token_ids[-1]
-                else:
-                    # Partial prefill (rare case).
-                    # Get the next token id from the request state.
-                    req_id = self.input_batch.req_ids[i]
-                    req_state = self.requests[req_id]
-                    seq_len = (req_state.num_computed_tokens +
-                               scheduler_output.num_scheduled_tokens[req_id])
-                    next_token_id = req_state.get_token_id(seq_len)
-                next_token_ids.append(next_token_id)
-            next_token_ids = torch.tensor(next_token_ids,
-                                          dtype=torch.int32,
-                                          device=self.device)
-
-            if spec_decode_metadata is None:
-                # input_ids can be None for multimodal models.
-                # We need to slice token_ids, positions, and hidden_states
-                # because the eagle head does not use cuda graph and should
-                # not include padding.
-                target_token_ids = self.input_ids[:num_scheduled_tokens]
-                target_positions = positions[:num_scheduled_tokens]
-                target_hidden_states = hidden_states[:num_scheduled_tokens]
-                target_slot_mapping = attn_metadata.slot_mapping
-                cu_num_tokens = attn_metadata.query_start_loc
-            else:
-                # TODO(woosuk): Refactor this.
-                num_draft_tokens = spec_decode_metadata.num_draft_tokens
-                num_rejected_tokens = [
-                    n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
-                    for i, n in enumerate(num_draft_tokens)
-                ]
-                num_rejected_tokens = torch.tensor(
-                    num_rejected_tokens,
-                    dtype=torch.int32,
-                    device=self.device,
-                )
-                cu_num_tokens, token_indices = self.drafter.prepare_inputs(
-                    attn_metadata.query_start_loc,
-                    num_rejected_tokens,
-                )
-                target_token_ids = self.input_ids[token_indices]
-                target_positions = positions[token_indices]
-                target_hidden_states = hidden_states[token_indices]
-                target_slot_mapping = attn_metadata.slot_mapping[token_indices]
-
-            draft_token_ids, draft_probs = self.drafter.propose(
-                target_token_ids=target_token_ids,
-                target_positions=target_positions,
-                target_hidden_states=target_hidden_states,
-                target_slot_mapping=target_slot_mapping,
-                next_token_ids=next_token_ids,
-                cu_num_tokens=cu_num_tokens,
-                block_table=attn_metadata.block_tables,
-                sampling_metadata=sampling_metadata,
+            raise NotImplementedError(
+                "eagle method for spec decode doesn't work on vllm-ascend currently"
             )
-            spec_token_ids = draft_token_ids.tolist()
-            # TODO(woosuk): Cache draft_probs and use it for rejection sampling
-            # in the next step.
-            del draft_probs
         return spec_token_ids
 
     @torch.inference_mode()