
Commit 26f2b7f

fix bug
Signed-off-by: skylee-01 <497627264@qq.com>
Parent: 4711051

3 files changed: 5 additions & 9 deletions


vllm/engine/arg_utils.py

Lines changed: 1 addition & 1 deletion
@@ -1477,7 +1477,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
         speculative_model = self.speculative_config.get("model")
         if speculative_model in ("ngram", "[ngram]"):
             is_ngram_enabled = True
-        if not (is_ngram_enabled or is_eagle_enabled or is_medusa_enabled):
+        if not (is_ngram_enabled or is_eagle_enabled or is_medusa_enabled or is_mlp_speculator_enabled):
             # Other speculative decoding methods are not supported yet.
             _raise_or_fallback(feature_name="Speculative Decoding",
                                recommend_to_remove=False)
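
This hunk adds is_mlp_speculator_enabled to the allow-list so the V1 engine no longer falls back to V0 when MLP-speculator decoding is requested. A minimal, self-contained sketch of the allow-list pattern; how each flag is derived earlier in _is_v1_supported_oracle is an assumption here, not vLLM's actual derivation:

    # Sketch only, not vLLM's implementation: the oracle's boolean
    # allow-list once the MLP-speculator flag is included.
    def spec_decode_supported(method: str) -> bool:
        is_ngram_enabled = method in ("ngram", "[ngram]")
        is_eagle_enabled = method == "eagle"                    # assumed derivation
        is_medusa_enabled = method == "medusa"                  # assumed derivation
        is_mlp_speculator_enabled = method == "mlp_speculator"  # assumed derivation
        return (is_ngram_enabled or is_eagle_enabled
                or is_medusa_enabled or is_mlp_speculator_enabled)

    # Before this commit, "mlp_speculator" fell through to _raise_or_fallback.
    assert spec_decode_supported("mlp_speculator")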

vllm/v1/spec_decode/mlp_speculator.py

Lines changed: 3 additions & 3 deletions
@@ -31,7 +31,8 @@ def __init__(
         self.hidden_size = vllm_config.speculative_config.\
             draft_model_config.get_hidden_size(
             )
-        self.num_speculative_tokens = vllm_config.speculative_config.num_speculative_tokens
+        self.num_speculative_tokens = vllm_config.speculative_config.\
+            num_speculative_tokens
         self.dtype = vllm_config.model_config.dtype
 
     def propose(
@@ -43,8 +44,7 @@ def propose(
     ) -> torch.Tensor:
         # Generate blocks and compute logits
         draft_tokens = self.model.generate_proposals(input_ids, previous_hidden_states, num_predict_tokens,sampling_metadata)
-        draft_tokens = list(map(lambda x: x[0], zip(*[i.sampled_token_ids.tolist() for i in draft_tokens])))
-        return draft_tokens
+        return list(map(lambda x: x[0], zip(*[i.sampled_token_ids.tolist() for i in draft_tokens])))
 
     def load_model(self, target_model: nn.Module) -> None:
         self.model = get_model(vllm_config=self.vllm_config,
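
The second hunk folds the intermediate draft_tokens variable into the return expression. The zip(*...) / map combination transposes the step-major list of sampler outputs into sequence-major order and keeps the first element of each resulting tuple. A toy run of the idiom with stand-in data (plain lists, not vLLM sampler objects):

    # One inner list per speculative step, one entry per sequence,
    # standing in for i.sampled_token_ids.tolist().
    per_step = [[101, 201], [102, 202], [103, 203]]  # 3 steps, 2 sequences

    transposed = zip(*per_step)                      # sequence-major tuples
    result = list(map(lambda x: x[0], transposed))   # first step per sequence
    print(result)                                    # [101, 201]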

vllm/v1/worker/gpu_model_runner.py

Lines changed: 1 addition & 5 deletions
@@ -1607,16 +1607,13 @@ def propose_draft_token_ids(
             )
         elif self.speculative_config.method == "mlp_speculator":
             assert isinstance(self.drafter, MLPSpeculatorProposer)
-
             is_sample_match = sample_hidden_states.shape[0] == len(
                 sampled_token_ids)
             # Get last token from each sequence
             draft_input_ids = torch.tensor(
-                sampled_token_ids[0] if is_sample_match else
                 [tokens[-1] for tokens in sampled_token_ids],
                 device=sample_hidden_states.device)
-
-            if is_sample_match:
+            if not is_sample_match:
                 # Calculate indices for hidden states
                 indices = []
                 offset = 0
@@ -1629,7 +1626,6 @@ def propose_draft_token_ids(
                 hidden_states = sample_hidden_states[indices]
             else:
                 hidden_states = sample_hidden_states
-
             spec_token_ids = self.drafter.propose(
                 input_ids=draft_input_ids,
                 previous_hidden_states=hidden_states,
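
This is the actual bug fix: the branch condition was inverted. Indices into sample_hidden_states only need to be computed when the sampler produced more rows than there are sequences (is_sample_match is False); when the shapes already match, the hidden states are used as-is. A hedged sketch of the gather the fixed branch performs; the loop body between the two hunks is not shown in the diff, so the offset bookkeeping below is an assumption:

    import torch

    sampled_token_ids = [[5, 7], [9]]           # 2 sequences, 3 sampled rows
    sample_hidden_states = torch.randn(3, 8)    # one hidden row per token

    # Assumed bookkeeping: pick the last token's hidden state per sequence.
    indices, offset = [], 0
    for tokens in sampled_token_ids:
        offset += len(tokens)
        indices.append(offset - 1)
    hidden_states = sample_hidden_states[indices]  # shape [2, 8]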
