Skip to content

Enable auto-detection for Eagle speculators format models #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
from vllm.transformers_utils.utils import check_gguf_file
from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
GiB_bytes, get_ip, is_in_ray_actor)
from vllm.transformers_utils.configs.speculators_eagle import is_speculators_eagle_config

Check failure on line 44 in vllm/engine/arg_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

vllm/engine/arg_utils.py:44:81: E501 Line too long (89 > 80)

# yapf: enable

Expand Down Expand Up @@ -394,6 +394,7 @@
str] = ModelConfig.logits_processor_pattern

speculative_config: Optional[Dict[str, Any]] = None
draft_tensor_parallel_size: Optional[int] = None

show_hidden_metrics_for_version: Optional[str] = \
ObservabilityConfig.show_hidden_metrics_for_version
Expand Down Expand Up @@ -767,6 +768,13 @@
default=None,
help="The configurations for speculative decoding. Should be a "
"JSON string.")
speculative_group.add_argument(
"--draft-tensor-parallel-size",
type=int,
default=None,
help="Number of tensor parallel replicas for the draft model. "
"Only used with speculative decoding. "
"Note: draft_tensor_parallel_size > 1 is not supported at the moment.")

Check failure on line 777 in vllm/engine/arg_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

vllm/engine/arg_utils.py:777:81: E501 Line too long (82 > 80)

# Observability arguments
observability_kwargs = get_kwargs(ObservabilityConfig)
Expand Down Expand Up @@ -874,6 +882,42 @@

@classmethod
def from_cli_args(cls, args: argparse.Namespace):
# Auto-detect speculators format models
if args.model and not args.speculative_config:
from vllm.transformers_utils.configs import extract_speculators_info
from vllm.logger import init_logger
logger = init_logger(__name__)

speculators_info = extract_speculators_info(args.model)
if speculators_info:
# Log what we're doing
logger.info("🦅 Auto-detected Eagle speculators format model")
logger.info(f" Target model: {speculators_info['target_model']}")
logger.info(f" Draft model: {args.model}")
logger.info(f" Method: {speculators_info['method']}")

Check failure on line 897 in vllm/engine/arg_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

vllm/engine/arg_utils.py:897:21: G004 Logging statement uses f-string
logger.info(f" Speculative tokens: {speculators_info['num_tokens']}")

Check failure on line 898 in vllm/engine/arg_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

vllm/engine/arg_utils.py:898:29: G004 Logging statement uses f-string

Check failure on line 899 in vllm/engine/arg_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

vllm/engine/arg_utils.py:899:29: G004 Logging statement uses f-string
# Build speculative config
spec_config = {

Check failure on line 901 in vllm/engine/arg_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

vllm/engine/arg_utils.py:901:21: G004 Logging statement uses f-string
"method": speculators_info["method"],
"model": args.model, # Original model becomes draft
"num_speculative_tokens": speculators_info["num_tokens"],
}

# Add draft tensor parallel size if specified
if hasattr(args, 'draft_tensor_parallel_size') and args.draft_tensor_parallel_size is not None:
spec_config["draft_tensor_parallel_size"] = args.draft_tensor_parallel_size

# Set the speculative config directly (it's already parsed by argparse)
args.speculative_config = spec_config

# Swap the model to target

Check failure on line 914 in vllm/engine/arg_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

vllm/engine/arg_utils.py:914:81: E501 Line too long (87 > 80)
args.model = speculators_info["target_model"]

# Also update tokenizer if not explicitly set
if not hasattr(args, 'tokenizer') or args.tokenizer is None:
args.tokenizer = speculators_info["target_model"]

# Get the list of attributes of this dataclass.
attrs = [attr.name for attr in dataclasses.fields(cls)]
# Set the attributes from the parsed arguments.
Expand Down
4 changes: 3 additions & 1 deletion vllm/transformers_utils/configs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
from vllm.transformers_utils.configs.eagle import EAGLEConfig
from vllm.transformers_utils.configs.exaone import ExaoneConfig
from vllm.transformers_utils.configs.speculators_eagle import SpeculatorsEagleConfig
from vllm.transformers_utils.configs.speculators_eagle import (
SpeculatorsEagleConfig, extract_speculators_info)
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
Expand Down Expand Up @@ -42,6 +43,7 @@
"EAGLEConfig",
"ExaoneConfig",
"SpeculatorsEagleConfig",
"extract_speculators_info",
"MiniMaxText01Config",
"MiniMaxVL01Config",
"MllamaConfig",
Expand Down
62 changes: 61 additions & 1 deletion vllm/transformers_utils/configs/speculators_eagle.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
from typing import Any, Union
from typing import Any, Optional, Union

from transformers import PretrainedConfig

Expand Down Expand Up @@ -282,3 +282,63 @@ def is_speculators_eagle_config(config_path: Union[str, os.PathLike]) -> bool:
return model_type in SUPPORTED_SPECULATORS_TYPES
except Exception:
return False


def extract_speculators_info(
        model_path: Union[str, os.PathLike]) -> Optional[dict[str, Any]]:
    """Extract target model and speculation settings from a speculators
    format model config.

    Args:
        model_path: Local path or HF repo id of the candidate draft model.

    Returns:
        A dict with keys:
            - ``target_model`` (str): the target (verifier) model name/path
            - ``method`` (str): the speculative method (e.g. eagle/eagle3)
            - ``num_tokens`` (int): number of speculative tokens
        or ``None`` if the model is not in speculators format or the config
        does not name a target model.
    """
    try:
        # Bail out early for non-speculators configs.
        if not is_speculators_eagle_config(model_path):
            return None

        config_dict, _ = PretrainedConfig.get_config_dict(model_path)

        # "eagle" is the default when the model type field is absent.
        method = config_dict.get("speculators_model_type", "eagle")

        # The number of speculative tokens lives in the first proposal
        # method; fall back to the global default when missing.
        speculators_cfg = config_dict.get("speculators_config", {})
        proposal_methods = speculators_cfg.get("proposal_methods", [])
        num_tokens = DEFAULT_NUM_LOOKAHEAD_TOKENS
        if proposal_methods:
            num_tokens = proposal_methods[0].get(
                "speculative_tokens", DEFAULT_NUM_LOOKAHEAD_TOKENS)

        # The target model may be recorded in either of two layouts:
        # target_config.model_name (original format) or
        # verifier.name_or_path (new format). Try both in order.
        target_config = speculators_cfg.get("target_config", {})
        target_model = target_config.get("model_name")
        if not target_model:
            verifier_config = speculators_cfg.get("verifier", {})
            target_model = verifier_config.get("name_or_path")

        # Without a target model the caller must supply one explicitly.
        if not target_model:
            return None

        return {
            "target_model": target_model,
            "method": method,
            "num_tokens": num_tokens,
        }
    except Exception as e:
        # Best-effort helper used for auto-detection: failures here must
        # never be fatal, so log at debug level and report "not found".
        from vllm.logger import init_logger
        logger = init_logger(__name__)
        logger.debug("Failed to extract speculators info from %s.",
                     model_path,
                     exc_info=e)
        return None
Loading