vllm-project · rahul-tuli · Jul 3, 2025 · Jul 3, 2025 · Jul 4, 2025 · Jul 9, 2025
@@ -2767,9 +2767,15 @@
                 # Automatically detect the method
                 if self.method in ('eagle', 'eagle3'):
                     pass
+                elif hasattr(self.draft_model_config.hf_config, 
+                           "speculators_model_type") and \
+                        self.draft_model_config.hf_config.speculators_model_type in ("eagle", "eagle3"):
+                    self.method = self.draft_model_config.hf_config.speculators_model_type
                 elif "eagle-" in self.draft_model_config.model.lower() or \
                         "eagle3-" in self.draft_model_config.model.lower():
                     self.method = "eagle"
+                elif self.draft_model_config.hf_config.model_type == "eagle":
+                    self.method = "eagle"
                 elif self.draft_model_config.hf_config.model_type == "medusa":
                     self.method = "medusa"
                 elif (self.draft_model_config.hf_config.model_type ==
@@ -2986,10 +2992,14 @@
                              "speculative decoding is > 1, but got "
                              f"{self.disable_by_batch_size=}")
 
-        if self.method == "eagle3" and self.target_model_config and \
-            "llama" not in self.target_model_config.hf_text_config.model_type:
+        if (
+            self.method == "eagle3"
+            and self.target_model_config
+            and "llama" not in self.target_model_config.hf_text_config.model_type
+            and "qwen" not in self.target_model_config.hf_text_config.model_type
+        ):
             raise ValueError(
-                "Eagle3 is only supported for Llama models. "
+                "Eagle3 is only supported for Llama/Qwen models. "
                 f"Got {self.target_model_config.hf_text_config.model_type=}")
 
         return self

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -41,6 +41,7 @@
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
                         GiB_bytes, get_ip, is_in_ray_actor)
+from vllm.transformers_utils.configs.speculators_eagle import is_speculators_eagle_config
 
 # yapf: enable
 
@@ -1416,6 +1417,8 @@
         if self.speculative_config is not None:
             # This is supported but experimental (handled below).
             speculative_method = self.speculative_config.get("method")
+            speculative_model = self.speculative_config.get("model")
+
             if speculative_method:
                 if speculative_method in ("ngram", "[ngram]"):
                     is_ngram_enabled = True
@@ -1424,9 +1427,15 @@
                 elif speculative_method in ("eagle", "eagle3", "deepseek_mtp"):
                     is_eagle_enabled = True
             else:
-                speculative_model = self.speculative_config.get("model")
-                if speculative_model in ("ngram", "[ngram]"):
-                    is_ngram_enabled = True
+                # If method is not set, try to detect from model
+                if speculative_model:
+                    if speculative_model in ("ngram", "[ngram]"):
+                        is_ngram_enabled = True
+                    # Detect speculators format Eagle models which don't set the method
+                    # field explicitly but can be identified by their config structure
+                    elif is_speculators_eagle_config(speculative_model):
+                        is_eagle_enabled = True
+
             if not (is_ngram_enabled or is_eagle_enabled or is_medusa_enabled):
                 # Other speculative decoding methods are not supported yet.
                 _raise_or_fallback(feature_name="Speculative Decoding",

diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
+from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -11,6 +12,7 @@
 from vllm.config import VllmConfig
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
+from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
@@ -22,6 +24,27 @@
 
 logger = init_logger(__name__)
 
+# Exact-match renames: speculators weight names -> vLLM parameter names.
+SPECULATORS_WEIGHT_MAP = {
+    "fusion_fc.weight": "model.fc.weight",
+    "fusion_fc.bias": "model.fc.bias",
+    "embedding_layernorm.weight": "model.embedding_layernorm.weight",
+    "pre_lm_head_layernorm.weight": "model.hidden_states_layernorm.weight",
+}
+
+
+def remap_speculators_weight_name(name: str) -> Optional[str]:
+    """Translate a speculators-format weight name to its vLLM name.
+
+    Unmapped names come back unchanged (no "model." prefix is added); no path
+    returns None today, so ``Optional`` only serves the callers' skip checks.
+    """
+    if name in SPECULATORS_WEIGHT_MAP:
+        return SPECULATORS_WEIGHT_MAP[name]
+    if name.startswith("transformer."):  # maps onto decoder layer index 0
+        return "model.layers.0." + name.removeprefix("transformer.")
+    return name
+
 
 class LlamaDecoderLayer(LlamaDecoderLayer):
 
@@ -70,7 +93,15 @@ def __init__(
         ])
         self.fc = torch.nn.Linear(self.config.hidden_size * 2,
                                   self.config.hidden_size,
-                                  bias=False)
+                                  bias=getattr(self.config, "fusion_bias", False))
+
+        # HASS variant support
+        self.has_embedding_layernorms = getattr(self.config, "add_para_norm", False)
+        if self.has_embedding_layernorms:
+            self.embedding_layernorm = RMSNorm(self.config.hidden_size, 
+                                               eps=self.config.rms_norm_eps)
+            self.hidden_states_layernorm = RMSNorm(self.config.hidden_size,
+                                                   eps=self.config.rms_norm_eps)
 
     def forward(
         self,
@@ -79,6 +110,12 @@ def forward(
         hidden_states: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         input_embeds = self.embed_tokens(input_ids)
+
+        # Apply HASS normalization if enabled
+        if self.has_embedding_layernorms:
+            input_embeds = self.embedding_layernorm(input_embeds)
+            hidden_states = self.hidden_states_layernorm(hidden_states)
+
         hidden_states = self.fc(
             torch.cat((input_embeds, hidden_states), dim=-1))
         residual = None
@@ -104,6 +141,11 @@ def load_weights(self, weights: Iterable[tuple[str,
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
+            remapped_name = remap_speculators_weight_name(name)
+            if remapped_name is None:
+                continue
+            name = remapped_name
+
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
@@ -119,6 +161,10 @@ def load_weights(self, weights: Iterable[tuple[str,
                     "embed_tokens." in name:
                     continue
 
+                # Skip weights that don't exist in the model
+                if name not in params_dict:
+                    continue
+
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
@@ -159,7 +205,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
 
         model_weights = {}
         for name, loaded_weight in weights:
-            if "lm_head" not in name:
-                name = "model." + name
-            model_weights[name] = loaded_weight
+            remapped_name = remap_speculators_weight_name(name)
+            if remapped_name is None:
+                continue
+            model_weights[remapped_name] = loaded_weight
         loader.load_weights(model_weights.items())
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
@@ -50,6 +50,7 @@ def __init__(
         )
 
         self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm_before_residual = getattr(config, "norm_before_residual", False)
 
     def forward(
         self,
@@ -59,9 +60,14 @@ def forward(
         residual: Optional[torch.Tensor],
     ) -> tuple[torch.Tensor, torch.Tensor]:
 
-        residual = hidden_states
         embeds = self.input_layernorm(embeds)
-        hidden_states = self.hidden_norm(hidden_states)
+
+        if self.norm_before_residual:
+            hidden_states = self.hidden_norm(hidden_states)
+            residual = hidden_states
+        else:
+            residual = hidden_states
+            hidden_states = self.hidden_norm(hidden_states)
 
         hidden_states = torch.cat([embeds, hidden_states], dim=-1)
         # Self Attention

diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
@@ -284,7 +284,7 @@ def __init__(self,
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-
+        self.aux_hidden_state_layers: tuple[int] = tuple()
         # TODO (@robertgshaw2): see if this can be moved out
         if (cache_config.sliding_window is not None
                 and hasattr(config, "max_window_layers")):
@@ -351,18 +351,22 @@ def forward(
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
-        for layer in self.layers[self.start_layer:self.end_layer]:
-            hidden_states, residual = layer(
-                positions,
-                hidden_states,
-                residual,
-            )
+
+        aux_hidden_states = []
+        for idx, layer in enumerate(
+                self.layers[self.start_layer:self.end_layer]):
+            if self.start_layer + idx in self.aux_hidden_state_layers:
+                aux_hidden_states.append(hidden_states + residual)
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
         if not get_pp_group().is_last_rank:
             return IntermediateTensors({
                 "hidden_states": hidden_states,
                 "residual": residual
             })
         hidden_states, _ = self.norm(hidden_states, residual)
+        if len(aux_hidden_states) > 0:   
+            return hidden_states, aux_hidden_states
         return hidden_states
 
     def load_weights(self, weights: Iterable[tuple[str,

diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
@@ -271,7 +271,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.quant_config = quant_config
         self.model = Qwen3Model(vllm_config=vllm_config,
                                 prefix=maybe_prefix(prefix, "model"))
-
+        
         if get_pp_group().is_last_rank:
             if config.tie_word_embeddings:
                 self.lm_head = self.model.embed_tokens
@@ -302,6 +302,15 @@ def forward(
         hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                    inputs_embeds)
         return hidden_states
+
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        """Return default EAGLE3 aux layer indices: 2, n // 2 and n - 3."""
+        return (2, len(self.model.layers) // 2, len(self.model.layers) - 3)
+
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        self.model.aux_hidden_state_layers = layers  # stored on inner model
+
 
     def compute_logits(
         self,
@@ -322,4 +331,4 @@ def load_weights(self, weights: Iterable[tuple[str,
         return loader.load_weights(weights)
 
 
-Qwen3ForSequenceClassification = as_seq_cls_model(Qwen3ForCausalLM)
+Qwen3ForSequenceClassification = as_seq_cls_model(Qwen3ForCausalLM)
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
@@ -40,9 +40,11 @@
                                              NemotronConfig, NVLM_D_Config,
                                              OvisConfig, RWConfig,
                                              SkyworkR1VChatConfig, SolarConfig,
+                                             SpeculatorsEagleConfig,
                                              Telechat2Config, UltravoxConfig)
 # yapf: enable
 from vllm.transformers_utils.configs.mistral import adapt_config_dict
+from vllm.transformers_utils.configs.speculators_eagle import is_speculators_eagle_config
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.utils import resolve_obj_by_qualname
 
@@ -350,6 +352,19 @@
             raise ValueError(error_message) from e
 
     if config_format == ConfigFormat.HF:
+        # Speculators Eagle models use a different config format that requires
+        # translation to vLLM's expected format. This must be handled before
+        # the standard config loading to ensure proper model initialization.
+        if is_speculators_eagle_config(model):
+            config = SpeculatorsEagleConfig.from_pretrained(
+                model,
+                revision=revision,
+                code_revision=code_revision,
+                token=_get_hf_token(),
+                **kwargs,
+            )
+            return config
+
         config_dict, _ = PretrainedConfig.get_config_dict(
             model,
             revision=revision,

diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
@@ -7,6 +7,7 @@
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
 from vllm.transformers_utils.configs.eagle import EAGLEConfig
 from vllm.transformers_utils.configs.exaone import ExaoneConfig
+from vllm.transformers_utils.configs.speculators_eagle import SpeculatorsEagleConfig
 # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 # `FalconConfig` class from the official HuggingFace transformers library.
@@ -40,6 +41,7 @@
     "MedusaConfig",
     "EAGLEConfig",
     "ExaoneConfig",
+    "SpeculatorsEagleConfig",
     "MiniMaxText01Config",
     "MiniMaxVL01Config",
     "MllamaConfig",