From 6927b02554530925d9b0fc1b33ce37f2afed5e14 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 2 Jul 2025 04:51:51 +0000 Subject: [PATCH 01/25] resolve conflict --- vllm/model_executor/layers/fused_moe/layer.py | 25 +- .../layers/quantization/modelopt.py | 239 +++++++++++++++++- 2 files changed, 254 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 36ac75a8df4..24129484125 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1049,12 +1049,25 @@ def weight_loader(self, # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern if "ModelOpt" in quant_method_name: - if ('weight_scale_2' in weight_name - or 'input_scale' in weight_name): - self._load_per_tensor_weight_scale(shard_id=shard_id, - param=param, - loaded_weight=loaded_weight, - expert_id=expert_id) + # Determine per-tensor weight scale patterns based on variant + is_fp4_variant = ( + "ModelOptNvFp4FusedMoEMethod" in self.quant_method.__class__.__name__ + ) + + # FP4 uses "weight_scale_2" for per-tensor, FP8 uses "weight_scale" for per-tensor + per_tensor_conditions = ( + "weight_scale_2" in weight_name + if is_fp4_variant + else "weight_scale" in weight_name + ) or "input_scale" in weight_name + + if per_tensor_conditions: + self._load_per_tensor_weight_scale( + shard_id=shard_id, + param=param, + loaded_weight=loaded_weight, + expert_id=expert_id, + ) elif "weight" in weight_name: self._load_model_weight_or_group_weight_scale( shard_id=shard_id, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 9db87533023..7ac00a71343 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -85,18 +85,20 @@ def get_quant_method(self, layer: torch.nn.Module, return ModelOptFp8LinearMethod(self) elif isinstance(layer, Attention): return ModelOptFp8KVCacheMethod(self) + elif isinstance(layer, FusedMoE): + return ModelOptFp8MoEMethod(self) return None class ModelOptFp8LinearMethod(LinearMethodBase): """Linear method for Model Optimizer static quantization. Supports loading FP8 checkpoints with static weight scale and - activation scale. Future support might be added for dynamic + activation scale. Future support might be added for dynamic scales. Limitations: 1. Only support per-tensor quantization due to torch._scaled_mm support. - 2. Only support float8_e4m3fn datatype + 2. Only support float8_e4m3fn datatype Args: quant_config: The ModelOpt quantization config. """ @@ -170,6 +172,235 @@ def apply( input_scale=layer.input_scale, bias=bias) +class ModelOptFp8MoEMethod: + """MoE method for ModelOpt FP8. + Supports loading FP8 checkpoints with static weight scale and activation scale. + Args: + quant_config: The ModelOpt quantization config. + """ + + def __new__(cls, *args, **kwargs): + """ + Dynamic class composition pattern. + This allows us to effectively "inject" FusedMoEMethodBase as a parent class + at runtime while avoiding circular import issues. 
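        A minimal, illustrative sketch of the same pattern with made-up names
        (not part of this diff): compose the subclass at runtime so the base
        class is only referenced once it is safe to import.

            class LateBase:
                def shared(self):
                    return "from base"

            class Standalone:
                def own(self):
                    return "from subclass"

            # Build a new type that injects LateBase as a parent while copying
            # Standalone's namespace (minus slots that type() must not receive).
            Composed = type(
                "Composed",
                (LateBase,),
                {k: v for k, v in Standalone.__dict__.items()
                 if k not in ("__dict__", "__weakref__")},
            )
            assert Composed().shared() == "from base"
            assert Composed().own() == "from subclass"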
+ """ + + if not hasattr(cls, "_initialized"): + original_init = cls.__init__ + new_cls = type( + cls.__name__, + (FusedMoEMethodBase,), + { + "__init__": original_init, + **{k: v for k, v in cls.__dict__.items() if k != "__dict__"}, + }, + ) + obj = super(new_cls, new_cls).__new__(new_cls) + obj.__init__(*args, **kwargs) + return obj + return super().__new__(cls) + + def __init__(self, quant_config: ModelOptFp8Config): + self.quant_config = quant_config + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + cutlass_fp8_supported) + self.cutlass_fp8_supported = cutlass_fp8_supported() + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + + # Use FP8 dtype if checkpoint is serialized, otherwise use the default dtype + weight_dtype = ( + torch.float8_e4m3fn + if self.quant_config.is_checkpoint_fp8_serialized + else params_dtype + ) + weight_loader = extra_weight_attrs.get("weight_loader") + + w13_weight = ModelWeightParameter( + data=torch.empty( + num_experts, 2 * intermediate_size, hidden_size, dtype=weight_dtype + ), + input_dim=2, + output_dim=1, + weight_loader=weight_loader, + ) + layer.register_parameter("w13_weight", w13_weight) + + w2_weight = ModelWeightParameter( + data=torch.empty( + num_experts, hidden_size, intermediate_size, dtype=weight_dtype + ), + input_dim=2, + output_dim=1, + weight_loader=weight_loader, + ) + layer.register_parameter("w2_weight", w2_weight) + + if self.quant_config.is_checkpoint_fp8_serialized: + # WEIGHT SCALES - Per-tensor scaling for ModelOpts + # Allocate 2 scales for w1 and w3 respectively. + # They will be combined to a single scale after weight loading. + w13_weight_scale = PerTensorScaleParameter( + data=torch.full( + (num_experts, 2), + torch.finfo(torch.float32).min, + dtype=torch.float32, + ), + weight_loader=weight_loader, + ) + w2_weight_scale = PerTensorScaleParameter( + data=torch.full( + (num_experts,), torch.finfo(torch.float32).min, dtype=torch.float32 + ), + weight_loader=weight_loader, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + + # Set weight loader attributes for scales + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} + ) + + # INPUT SCALES - Per-tensor scaling for ModelOpt + w13_input_scale = PerTensorScaleParameter( + data=torch.full((num_experts,), 1.0, dtype=torch.float32), + weight_loader=weight_loader, + ) + w2_input_scale = PerTensorScaleParameter( + data=torch.full((num_experts,), 1.0, dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("w13_input_scale", w13_input_scale) + layer.register_parameter("w2_input_scale", w2_input_scale) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + """Process FP8 MoE weights after loading from serialized checkpoint. + Only supports pre-quantized checkpoints with FP8 weights and scales. + """ + + layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False) + layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) + + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + per_tensor_dequantize) + from vllm._custom_ops import scaled_fp8_quant + + # Handle scale parameters + if hasattr(layer, "w13_weight_scale") and layer.w13_weight_scale is not None: + # Fp8 moe kernel needs single weight scale for w13 per expert. 
+ # We take the max of the w1 and w3 scales then dequant and requant each expert. + if layer.w13_weight_scale.dim() == 2: # Shape: (num_experts, 2) + + # Get the maximum scale across w1 and w3 for each expert + max_w13_scales = layer.w13_weight_scale.max(dim=1).values + + # Requantize each expert's weights using the combined scale + # w13_weight has shape (num_experts, 2 * intermediate_size, hidden_size) + # where the first intermediate_size rows are w1, the next are w3 + intermediate_size = layer.w13_weight.shape[1] // 2 + for expert_id in range(layer.w13_weight.shape[0]): + start = 0 + for shard_id in range(2): # w1 and w3 + # Dequantize using the original scale for this shard + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][ + start : start + intermediate_size, : + ], + layer.w13_weight_scale[expert_id][shard_id], + ) + # Requantize using the combined max scale + + ( + layer.w13_weight[expert_id][ + start : start + intermediate_size, : + ], + _, + ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id]) + + start += intermediate_size + + # Update the scale parameter to be per-expert instead of per-shard + layer.w13_weight_scale = Parameter(max_w13_scales, requires_grad=False) + else: + layer.w13_weight_scale = Parameter( + layer.w13_weight_scale.data, requires_grad=False + ) + + if hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None: + layer.w2_weight_scale = Parameter( + layer.w2_weight_scale.data, requires_grad=False + ) + if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None: + layer.w13_input_scale = Parameter( + layer.w13_input_scale.max(), requires_grad=False + ) + if hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None: + layer.w2_input_scale = Parameter( + layer.w2_input_scale.max(), requires_grad=False + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + num_fused_shared_experts: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + correction_bias: Optional[torch.Tensor] = None, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + inplace: bool = True, + no_combine: bool = False, + routed_scaling_factor: Optional[float] = None, + ) -> torch.Tensor: + + # Expert selection + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + num_fused_shared_experts=num_fused_shared_experts, + custom_routing_function=custom_routing_function, + correction_bias=correction_bias, + routed_scaling_factor=routed_scaling_factor, + ) + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + cutlass_moe_fp8) + return cutlass_moe_fp8( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=inplace, + activation=activation, + use_fp8_w8a8=True, + per_channel_quant=False, # ModelOpt uses per-tensor quantization + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + no_combine=no_combine, + ) class ModelOptNvFp4Config(QuantizationConfig): """Config class for ModelOpt FP4.""" @@ -273,7 +504,7 @@ def __init__(self, quant_config: Union[ModelOptFp8Config, class 
ModelOptNvFp4LinearMethod(LinearMethodBase): """Linear method for Model Optimizer NVFP4. Supports loading NVFP4 checkpoints with the following structure: - + input_scale: torch.float32, scalar , weight: NVFP4(represented as byte) Shape: [1, X, y/2] weight_scale: FP8-E4M3, Shape: [X, Y], aka per block scale, @@ -454,7 +685,7 @@ def apply( class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): """ MoE Method for FP4 Quantization. - Args: + Args: quant_config: NVFP4 Quant Config """ From b45972e2f568abb9b2c33b6a78a9902d2dff3fe5 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Fri, 20 Jun 2025 23:17:43 +0000 Subject: [PATCH 02/25] bugfix --- vllm/model_executor/layers/quantization/modelopt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7ac00a71343..2dae1ac3b40 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -212,7 +212,7 @@ def create_weights( layer: torch.nn.Module, num_experts: int, hidden_size: int, - intermediate_size: int, + intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs, ): @@ -227,7 +227,7 @@ def create_weights( w13_weight = ModelWeightParameter( data=torch.empty( - num_experts, 2 * intermediate_size, hidden_size, dtype=weight_dtype + num_experts, 2 * intermediate_size_per_partition, hidden_size, dtype=weight_dtype ), input_dim=2, output_dim=1, @@ -237,7 +237,7 @@ def create_weights( w2_weight = ModelWeightParameter( data=torch.empty( - num_experts, hidden_size, intermediate_size, dtype=weight_dtype + num_experts, hidden_size, intermediate_size_per_partition, dtype=weight_dtype ), input_dim=2, output_dim=1, From bf965286a4c869b281c3e24ae81c140839b958c4 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Fri, 20 Jun 2025 23:29:20 +0000 Subject: [PATCH 03/25] handle language_model. prefix --- vllm/model_executor/models/mllama4.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 1276d626a7c..9eb30fb52b3 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -918,6 +918,23 @@ def load_weights(self, weights: Iterable[tuple[str, # using llama4's load_weights routine. language_model_weights, other_weights = self.separate_weights( weights, prefix="language_model.") + + # If no language_model weights found, try with "model." 
prefix and rename + language_model_weights_list = list(language_model_weights) + if not language_model_weights_list: + # No language_model.* weights found, try model.* weights + def rename_model_weights(): + for name, weight in weights: + if name.startswith("model."): + # Rename model.* to language_model.model.* + yield (name.replace("model.", "language_model.model.", 1), weight) + else: + # Keep other weights as is + yield (name, weight) + + language_model_weights, other_weights = self.separate_weights( + rename_model_weights(), prefix="language_model.") + loader = AutoWeightsLoader(self) loaded_language_model_params = loader.load_weights( language_model_weights) From cb20cd1db219122bc5037a552d2be39330652217 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Sat, 21 Jun 2025 00:57:11 +0000 Subject: [PATCH 04/25] fix issue in fused_experts calling --- .../layers/quantization/modelopt.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 2dae1ac3b40..09133808b48 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -359,9 +359,13 @@ def apply( use_grouped_topk: bool, topk_group: Optional[int] = None, num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, num_fused_shared_experts: Optional[int] = None, custom_routing_function: Optional[Callable] = None, correction_bias: Optional[torch.Tensor] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, activation: str = "silu", apply_router_weight_on_input: bool = False, inplace: bool = True, @@ -378,14 +382,14 @@ def apply( renormalize=renormalize, topk_group=topk_group, num_expert_group=num_expert_group, - num_fused_shared_experts=num_fused_shared_experts, custom_routing_function=custom_routing_function, - correction_bias=correction_bias, - routed_scaling_factor=routed_scaling_factor, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, ) - from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - cutlass_moe_fp8) - return cutlass_moe_fp8( + # from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + # cutlass_moe_fp8) + from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts + return fused_experts( x, layer.w13_weight, layer.w2_weight, @@ -395,11 +399,13 @@ def apply( activation=activation, use_fp8_w8a8=True, per_channel_quant=False, # ModelOpt uses per-tensor quantization + global_num_experts=global_num_experts, + expert_map=expert_map, w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, - no_combine=no_combine, + apply_router_weight_on_input=apply_router_weight_on_input, ) class ModelOptNvFp4Config(QuantizationConfig): From 1f6180273bb71b064ab22cf8c6c6d32595365903 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Sat, 21 Jun 2025 05:37:33 +0000 Subject: [PATCH 05/25] minor --- vllm/model_executor/layers/quantization/modelopt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 09133808b48..13338705652 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -252,14 +252,14 @@ def create_weights( 
w13_weight_scale = PerTensorScaleParameter( data=torch.full( (num_experts, 2), - torch.finfo(torch.float32).min, + 1.0, # Initialize to reasonable default instead of -inf dtype=torch.float32, ), weight_loader=weight_loader, ) w2_weight_scale = PerTensorScaleParameter( data=torch.full( - (num_experts,), torch.finfo(torch.float32).min, dtype=torch.float32 + (num_experts,), 1.0, dtype=torch.float32 # Initialize to reasonable default instead of -inf ), weight_loader=weight_loader, ) From 0fb23e12f6c92847186e59cd14c83f0bd24c3be6 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Sun, 22 Jun 2025 08:39:59 +0000 Subject: [PATCH 06/25] update ModelOptFp8Config, handle prefix in mllama4 weight loading, debug --- .../layers/quantization/modelopt.py | 23 ++++++- vllm/model_executor/models/mllama4.py | 62 ++++++++++++------- 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 13338705652..278e747035f 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Optional, Union, List import torch from torch.nn import Module @@ -42,9 +42,13 @@ class ModelOptFp8Config(QuantizationConfig): def __init__( self, is_checkpoint_fp8_serialized: bool = False, + kv_cache_quant_method: Optional[str] = None, + exclude_modules: Optional[List[str]] = None, ) -> None: super().__init__() self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized + self.kv_cache_quant_method = kv_cache_quant_method + self.exclude_modules = exclude_modules if is_checkpoint_fp8_serialized: logger.warning("Detected ModelOpt fp8 checkpoint. Please note that" " the format is experimental and could change.") @@ -69,6 +73,13 @@ def get_config_filenames(cls) -> list[str]: def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": quant_config = cls.get_from_keys(config, ["quantization"]) quant_method = quant_config["quant_algo"] + kv_cache_quant_method = cls.get_from_keys(config, ["quantization"]).get( + "kv_cache_quant_algo" + ) + exclude_modules = cls.get_from_keys(config, ["quantization"]).get( + "exclude_modules" + ) + if quant_method not in QUANT_ALGOS: raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}" " quantizations in vLLM. 
Please check the " @@ -76,7 +87,15 @@ def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": "quant configuration.") is_checkpoint_fp8_serialized = ("FP8" in quant_method) - return cls(is_checkpoint_fp8_serialized) + # Convert exclude_modules to handle the language_model prefix that gets added by mllama4.py + converted_exclude_modules = [] + if exclude_modules: + for module in exclude_modules: + converted_exclude_modules.append(module) + if not module.startswith("language_model."): + converted_exclude_modules.append(f"language_model.{module}") + + return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, converted_exclude_modules) def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 9eb30fb52b3..e9a50b19109 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -914,30 +914,46 @@ def load_weights(self, weights: Iterable[tuple[str, params_dict = dict(self.named_parameters()) updated_params: set[str] = set() - # language_model is an Llama4ForCausalLM instance. We load it's - # using llama4's load_weights routine. - language_model_weights, other_weights = self.separate_weights( - weights, prefix="language_model.") - - # If no language_model weights found, try with "model." prefix and rename - language_model_weights_list = list(language_model_weights) - if not language_model_weights_list: - # No language_model.* weights found, try model.* weights - def rename_model_weights(): - for name, weight in weights: - if name.startswith("model."): - # Rename model.* to language_model.model.* - yield (name.replace("model.", "language_model.model.", 1), weight) - else: - # Keep other weights as is - yield (name, weight) - - language_model_weights, other_weights = self.separate_weights( - rename_model_weights(), prefix="language_model.") - + # Debug: Print first 30 parameter names from initialized model + print("=== INITIALIZED MODEL PARAMETERS ===") + print("First 30 parameter names containing 'scale':") + scale_params = [name for name in params_dict.keys() if "scale" in name] + for i, name in enumerate(scale_params[:30]): + print(f" {i+1:2d}. 
{name}") + print(f"Total parameters with 'scale': {len(scale_params)}") + print(f"Total model parameters: {len(params_dict)}") + print("=== END DEBUG ===\n") + + # Combine renaming and separation logic in a single pass + def process_and_separate_weights(): + language_model_weights = [] + other_weights = [] + + for name, weight in weights: + # Apply renaming logic + if name.startswith("model."): + # Rename model.* to language_model.model.* + renamed = name.replace("model.", "language_model.model.", 1) + elif name.startswith("lm_head.weight"): + # Rename lm_head.weight to language_model.lm_head.weight + renamed = name.replace("lm_head.weight", "language_model.lm_head.weight") + else: + # Keep other weights as is + renamed = name + + # Separate into language_model and other weights + if renamed.startswith("language_model."): + language_model_weights.append((renamed, weight)) + else: + other_weights.append((renamed, weight)) + + return language_model_weights, other_weights + + language_model_weights, other_weights = process_and_separate_weights() + + # Load language model weights loader = AutoWeightsLoader(self) - loaded_language_model_params = loader.load_weights( - language_model_weights) + loaded_language_model_params = loader.load_weights(language_model_weights) assert loaded_language_model_params is not None updated_params.update(loaded_language_model_params) From 03d2b3be9b970dfceda3674d3e3d67550f1294d8 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Mon, 23 Jun 2025 08:58:25 +0000 Subject: [PATCH 07/25] debug, handle kv scales --- vllm/model_executor/models/llama4.py | 49 +++++++++++++++++- vllm/model_executor/models/mllama4.py | 71 ++++++++++++++++++++++++++- 2 files changed, 117 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 0c9baab1f2e..c5d498b0e0a 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -35,7 +35,7 @@ RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel from .utils import (AutoWeightsLoader, extract_layer_index, fast_topk, @@ -435,6 +435,12 @@ def load_weights(self, weights: Iterable[tuple[str, name = name.replace(weight_name, param_name) if is_pp_missing_parameter(name, self): continue + if name.endswith("scale") and "expert" not in name: + # Remapping the name of FP8 kv-scale. + remapped_name = maybe_remap_kv_scale_name(name, params_dict) + if remapped_name is None: + continue + name = remapped_name param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -452,6 +458,47 @@ def load_weights(self, weights: Iterable[tuple[str, if not moe_loaded: if is_pp_missing_parameter(name, self): continue + + # Handle flat expert scale parameters that don't match per-expert patterns + if ("experts." 
in name and + ("w13_input_scale" in name or "w13_weight_scale" in name or + "w2_input_scale" in name or "w2_weight_scale" in name)): + # These are flat expert scales that apply to all experts + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + + # Check if this is a MoE-specific weight loader that needs extra arguments + if hasattr(param, 'weight_loader'): + try: + # Try to inspect the weight_loader signature + import inspect + sig = inspect.signature(weight_loader) + if 'expert_id' in sig.parameters and 'shard_id' in sig.parameters: + # This is a MoE weight loader, provide the required arguments + # Determine the appropriate shard_id based on parameter name + if "w13_" in name: + # w13 corresponds to gate_up_proj, which can be either w1 or w3 + # For scales, we typically use w1 as the representative + shard_id = "w1" + elif "w2_" in name: + # w2 corresponds to down_proj + shard_id = "w2" + else: + # Fallback - this shouldn't happen for scale parameters + shard_id = "w1" + + weight_loader(param, loaded_weight, name, shard_id=shard_id, expert_id=0) + else: + # Regular weight loader + weight_loader(param, loaded_weight) + except Exception: + # Fallback to regular loading if signature inspection fails + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight) + loaded_params.add(name) + continue + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index e9a50b19109..136a6828b09 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -717,6 +717,7 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], } @classmethod @@ -910,6 +911,12 @@ def load_weights(self, weights: Iterable[tuple[str, (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), + # Shared expert gate_up_proj stacking + (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0), + (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1), + # Feed forward gate_up_proj stacking (for non-MoE layers if any) + (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0), + (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1), ] params_dict = dict(self.named_parameters()) updated_params: set[str] = set() @@ -929,11 +936,38 @@ def process_and_separate_weights(): language_model_weights = [] other_weights = [] + # Track scale parameters for debugging + checkpoint_scales = [] + renamed_scales = [] + for name, weight in weights: + # Track scale parameters from checkpoint + if "scale" in name: + checkpoint_scales.append(name) + # Apply renaming logic if name.startswith("model."): - # Rename model.* to language_model.model.* - renamed = name.replace("model.", "language_model.model.", 1) + # Handle expert scale parameters with flat naming + if "feed_forward.experts." 
in name and ("_input_scale" in name or "_weight_scale" in name): + # Expert scales in checkpoint are single values for all experts + # e.g., "model.layers.0.feed_forward.experts.down_proj_input_scale" + # should map to "language_model.model.layers.0.feed_forward.experts.w2_input_scale" + + renamed = name.replace("model.", "language_model.model.", 1) + + # Map checkpoint naming to vLLM's expected naming + if "down_proj_input_scale" in renamed: + renamed = renamed.replace("down_proj_input_scale", "w2_input_scale") + elif "down_proj_weight_scale" in renamed: + renamed = renamed.replace("down_proj_weight_scale", "w2_weight_scale") + elif "gate_up_proj_input_scale" in renamed: + renamed = renamed.replace("gate_up_proj_input_scale", "w13_input_scale") + elif "gate_up_proj_weight_scale" in renamed: + renamed = renamed.replace("gate_up_proj_weight_scale", "w13_weight_scale") + # If none of the above patterns match, keep the renamed version as is + else: + # Standard model.* to language_model.model.* renaming + renamed = name.replace("model.", "language_model.model.", 1) elif name.startswith("lm_head.weight"): # Rename lm_head.weight to language_model.lm_head.weight renamed = name.replace("lm_head.weight", "language_model.lm_head.weight") @@ -941,12 +975,45 @@ def process_and_separate_weights(): # Keep other weights as is renamed = name + # Track renamed scale parameters + if "scale" in renamed: + renamed_scales.append(renamed) + # Separate into language_model and other weights if renamed.startswith("language_model."): language_model_weights.append((renamed, weight)) else: other_weights.append((renamed, weight)) + # Debug scale parameter mapping + print("=== SCALE PARAMETER MAPPING DEBUG ===") + print(f"Total scale parameters in checkpoint: {len(checkpoint_scales)}") + print(f"Total renamed scale parameters: {len(renamed_scales)}") + + # Categorize scale parameters + self_attn_scales = [s for s in checkpoint_scales if "self_attn" in s] + expert_scales = [s for s in checkpoint_scales if "experts" in s and "shared_expert" not in s] + shared_expert_scales = [s for s in checkpoint_scales if "shared_expert" in s] + other_scales = [s for s in checkpoint_scales if s not in self_attn_scales + expert_scales + shared_expert_scales] + + print(f"\nScale parameter categories from checkpoint:") + print(f" Self-attention scales: {len(self_attn_scales)}") + print(f" Expert scales: {len(expert_scales)}") + print(f" Shared expert scales: {len(shared_expert_scales)}") + print(f" Other scales: {len(other_scales)}") + + if expert_scales: + print(f"\nFirst 5 expert scale parameters (original):") + for i, name in enumerate(expert_scales[:5]): + print(f" {i+1}. {name}") + + print(f"\nFirst 5 expert scale parameters (renamed):") + expert_renamed = [s for s in renamed_scales if "experts" in s and "shared_expert" not in s] + for i, name in enumerate(expert_renamed[:5]): + print(f" {i+1}. 
{name}") + + print("=== END SCALE DEBUG ===\n") + return language_model_weights, other_weights language_model_weights, other_weights = process_and_separate_weights() From 93d7185b38e81058a548f95a0e7b8e3791298d80 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Mon, 23 Jun 2025 20:40:58 +0000 Subject: [PATCH 08/25] fix kv scale name matching issue --- vllm/model_executor/models/llama4.py | 5 ++--- vllm/model_executor/models/mllama4.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index c5d498b0e0a..16c8c3a411a 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -437,10 +437,9 @@ def load_weights(self, weights: Iterable[tuple[str, continue if name.endswith("scale") and "expert" not in name: # Remapping the name of FP8 kv-scale. - remapped_name = maybe_remap_kv_scale_name(name, params_dict) - if remapped_name is None: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: continue - name = remapped_name param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 136a6828b09..f750609f49f 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.utils import initialize_model -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -968,6 +968,19 @@ def process_and_separate_weights(): else: # Standard model.* to language_model.model.* renaming renamed = name.replace("model.", "language_model.model.", 1) + + # Handle FP8 scale parameters: k_proj.k_scale -> attn.k_scale, v_proj.v_scale -> attn.v_scale + if ".k_proj.k_scale" in renamed: + original_renamed = renamed + renamed = renamed.replace(".k_proj.k_scale", ".attn.k_scale") + print(f"Remapped FP8 k_scale: {original_renamed} -> {renamed}") + elif ".v_proj.v_scale" in renamed: + original_renamed = renamed + renamed = renamed.replace(".v_proj.v_scale", ".attn.v_scale") + print(f"Remapped FP8 v_scale: {original_renamed} -> {renamed}") + # Track renamed scale parameters + if "scale" in renamed: + renamed_scales.append(renamed) elif name.startswith("lm_head.weight"): # Rename lm_head.weight to language_model.lm_head.weight renamed = name.replace("lm_head.weight", "language_model.lm_head.weight") From d154fe14efb3eb4e2e7753ca7ec4e3167122dce3 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Mon, 23 Jun 2025 23:47:53 +0000 Subject: [PATCH 09/25] update, debug --- .../model_loader/weight_utils.py | 10 +++ vllm/model_executor/models/llama4.py | 36 +++++--- vllm/model_executor/models/mllama4.py | 85 +++++++++---------- 3 files changed, 74 insertions(+), 57 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 857f4bca682..b886efed0a8 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ 
b/vllm/model_executor/model_loader/weight_utils.py @@ -758,6 +758,10 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: modelopt_scale_names = [ ".self_attn.k_proj.k_scale", ".self_attn.v_proj.v_scale" ] + # Also support qkv_proj scale parameters (from stacked parameter processing) + qkv_proj_scale_names = [ + ".self_attn.qkv_proj.k_scale", ".self_attn.qkv_proj.v_scale" + ] for scale_name in possible_scale_names: if name.endswith(scale_name): if any(mo_scale_name in name @@ -765,6 +769,12 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: remapped_name = name.replace( f".self_attn.{scale_name[1]}_proj{scale_name}", f".self_attn.attn{scale_name}") + elif any(qkv_scale_name in name + for qkv_scale_name in qkv_proj_scale_names): + # Handle qkv_proj scale parameters: .self_attn.qkv_proj.k_scale -> .self_attn.attn.k_scale + remapped_name = name.replace( + f".self_attn.qkv_proj{scale_name}", + f".self_attn.attn{scale_name}") else: remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 16c8c3a411a..40c8cf1a440 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -395,11 +395,18 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), + # (".qkv_proj", ".q_proj", "q"), + # (".qkv_proj", ".k_proj", "k"), + # (".qkv_proj", ".v_proj", "v"), + # (".gate_up_proj", ".gate_proj", 0), + # (".gate_up_proj", ".up_proj", 1), + (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), + (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), + (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), + (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0), + (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1), + (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0), + (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1), ] fused_experts_params = False expert_params_mapping = FusedMoE.make_expert_params_mapping( @@ -432,17 +439,25 @@ def load_weights(self, weights: Iterable[tuple[str, for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name or "experts" in name: continue - name = name.replace(weight_name, param_name) + # Don't transform k_scale/v_scale parameter names with stacked parameter mapping + # but allow other scale parameters (input_scale, weight_scale) to be processed + if not (name.endswith((".k_scale", ".v_scale")) and "self_attn" in name): + name = name.replace(weight_name, param_name) if is_pp_missing_parameter(name, self): continue if name.endswith("scale") and "expert" not in name: # Remapping the name of FP8 kv-scale. 
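As a hedged sketch of what this remapping amounts to (the helper below is hypothetical and only mirrors the replacement rules in maybe_remap_kv_scale_name, including the qkv_proj variant added above):

    def remap_kv_scale_name_sketch(name: str) -> str:
        # Map modelopt-style kv-scale names onto vLLM's attention parameter.
        for proj in ("k", "v"):
            for src in (f".self_attn.{proj}_proj.{proj}_scale",
                        f".self_attn.qkv_proj.{proj}_scale"):
                if name.endswith(src):
                    return name.replace(src, f".self_attn.attn.{proj}_scale")
        return name

    assert remap_kv_scale_name_sketch(
        "model.layers.0.self_attn.qkv_proj.k_scale"
    ) == "model.layers.0.self_attn.attn.k_scale"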
name = maybe_remap_kv_scale_name(name, params_dict) if name is None: - continue + continue # Skip this parameter if remapping failed param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader = getattr(param, "weight_loader", default_weight_loader) + if weight_loader == default_weight_loader: + # default_weight_loader doesn't support shard_id, just load the weight directly + weight_loader(param, loaded_weight) + else: + # Custom weight loader that supports shard_id + weight_loader(param, loaded_weight, shard_id) loaded_params.add(name) break else: @@ -499,8 +514,7 @@ def load_weights(self, weights: Iterable[tuple[str, continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index f750609f49f..63195c7b2f4 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -921,15 +921,15 @@ def load_weights(self, weights: Iterable[tuple[str, params_dict = dict(self.named_parameters()) updated_params: set[str] = set() - # Debug: Print first 30 parameter names from initialized model - print("=== INITIALIZED MODEL PARAMETERS ===") - print("First 30 parameter names containing 'scale':") - scale_params = [name for name in params_dict.keys() if "scale" in name] - for i, name in enumerate(scale_params[:30]): - print(f" {i+1:2d}. {name}") - print(f"Total parameters with 'scale': {len(scale_params)}") - print(f"Total model parameters: {len(params_dict)}") - print("=== END DEBUG ===\n") + # # Debug: Print first 30 parameter names from initialized model + # print("=== INITIALIZED MODEL PARAMETERS ===") + # print("First 30 parameter names containing 'scale':") + # scale_params = [name for name in params_dict.keys() if "scale" in name] + # for i, name in enumerate(scale_params[:30]): + # print(f" {i+1:2d}. 
{name}") + # print(f"Total parameters with 'scale': {len(scale_params)}") + # print(f"Total model parameters: {len(params_dict)}") + # print("=== END DEBUG ===\n") # Combine renaming and separation logic in a single pass def process_and_separate_weights(): @@ -969,15 +969,8 @@ def process_and_separate_weights(): # Standard model.* to language_model.model.* renaming renamed = name.replace("model.", "language_model.model.", 1) - # Handle FP8 scale parameters: k_proj.k_scale -> attn.k_scale, v_proj.v_scale -> attn.v_scale - if ".k_proj.k_scale" in renamed: - original_renamed = renamed - renamed = renamed.replace(".k_proj.k_scale", ".attn.k_scale") - print(f"Remapped FP8 k_scale: {original_renamed} -> {renamed}") - elif ".v_proj.v_scale" in renamed: - original_renamed = renamed - renamed = renamed.replace(".v_proj.v_scale", ".attn.v_scale") - print(f"Remapped FP8 v_scale: {original_renamed} -> {renamed}") + # Don't do FP8 scale parameter remapping here - let Llama4Model.load_weights() handle it + # The existing logic in Llama4Model.load_weights() already has proper scale remapping via maybe_remap_kv_scale_name # Track renamed scale parameters if "scale" in renamed: renamed_scales.append(renamed) @@ -998,34 +991,34 @@ def process_and_separate_weights(): else: other_weights.append((renamed, weight)) - # Debug scale parameter mapping - print("=== SCALE PARAMETER MAPPING DEBUG ===") - print(f"Total scale parameters in checkpoint: {len(checkpoint_scales)}") - print(f"Total renamed scale parameters: {len(renamed_scales)}") - - # Categorize scale parameters - self_attn_scales = [s for s in checkpoint_scales if "self_attn" in s] - expert_scales = [s for s in checkpoint_scales if "experts" in s and "shared_expert" not in s] - shared_expert_scales = [s for s in checkpoint_scales if "shared_expert" in s] - other_scales = [s for s in checkpoint_scales if s not in self_attn_scales + expert_scales + shared_expert_scales] - - print(f"\nScale parameter categories from checkpoint:") - print(f" Self-attention scales: {len(self_attn_scales)}") - print(f" Expert scales: {len(expert_scales)}") - print(f" Shared expert scales: {len(shared_expert_scales)}") - print(f" Other scales: {len(other_scales)}") - - if expert_scales: - print(f"\nFirst 5 expert scale parameters (original):") - for i, name in enumerate(expert_scales[:5]): - print(f" {i+1}. {name}") - - print(f"\nFirst 5 expert scale parameters (renamed):") - expert_renamed = [s for s in renamed_scales if "experts" in s and "shared_expert" not in s] - for i, name in enumerate(expert_renamed[:5]): - print(f" {i+1}. 
{name}") - - print("=== END SCALE DEBUG ===\n") + # # Debug scale parameter mapping + # print("=== SCALE PARAMETER MAPPING DEBUG ===") + # print(f"Total scale parameters in checkpoint: {len(checkpoint_scales)}") + # print(f"Total renamed scale parameters: {len(renamed_scales)}") + + # # Categorize scale parameters + # self_attn_scales = [s for s in checkpoint_scales if "self_attn" in s] + # expert_scales = [s for s in checkpoint_scales if "experts" in s and "shared_expert" not in s] + # shared_expert_scales = [s for s in checkpoint_scales if "shared_expert" in s] + # other_scales = [s for s in checkpoint_scales if s not in self_attn_scales + expert_scales + shared_expert_scales] + + # print(f"\nScale parameter categories from checkpoint:") + # print(f" Self-attention scales: {len(self_attn_scales)}") + # print(f" Expert scales: {len(expert_scales)}") + # print(f" Shared expert scales: {len(shared_expert_scales)}") + # print(f" Other scales: {len(other_scales)}") + + # if expert_scales: + # print(f"\nFirst 5 expert scale parameters (original):") + # for i, name in enumerate(expert_scales[:5]): + # print(f" {i+1}. {name}") + + # print(f"\nFirst 5 expert scale parameters (renamed):") + # expert_renamed = [s for s in renamed_scales if "experts" in s and "shared_expert" not in s] + # for i, name in enumerate(expert_renamed[:5]): + # print(f" {i+1}. {name}") + + # print("=== END SCALE DEBUG ===\n") return language_model_weights, other_weights From 782c018f2c1647699e1f5a57f466fa11d38b8485 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 24 Jun 2025 07:50:15 +0000 Subject: [PATCH 10/25] cleanup --- .../layers/quantization/modelopt.py | 2 - vllm/model_executor/models/llama4.py | 17 +++------ vllm/model_executor/models/mllama4.py | 38 ------------------- 3 files changed, 5 insertions(+), 52 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 278e747035f..7617b91cc2a 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -405,8 +405,6 @@ def apply( scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, ) - # from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - # cutlass_moe_fp8) from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts return fused_experts( x, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 40c8cf1a440..4131c357ac2 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -395,18 +395,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) - # (".qkv_proj", ".q_proj", "q"), - # (".qkv_proj", ".k_proj", "k"), - # (".qkv_proj", ".v_proj", "v"), - # (".gate_up_proj", ".gate_proj", 0), - # (".gate_up_proj", ".up_proj", 1), - (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), - (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), - (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), - (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0), - (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1), - (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0), - (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1), + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), ] 
fused_experts_params = False expert_params_mapping = FusedMoE.make_expert_params_mapping( diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 63195c7b2f4..c609fbe1e91 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -921,16 +921,6 @@ def load_weights(self, weights: Iterable[tuple[str, params_dict = dict(self.named_parameters()) updated_params: set[str] = set() - # # Debug: Print first 30 parameter names from initialized model - # print("=== INITIALIZED MODEL PARAMETERS ===") - # print("First 30 parameter names containing 'scale':") - # scale_params = [name for name in params_dict.keys() if "scale" in name] - # for i, name in enumerate(scale_params[:30]): - # print(f" {i+1:2d}. {name}") - # print(f"Total parameters with 'scale': {len(scale_params)}") - # print(f"Total model parameters: {len(params_dict)}") - # print("=== END DEBUG ===\n") - # Combine renaming and separation logic in a single pass def process_and_separate_weights(): language_model_weights = [] @@ -991,34 +981,6 @@ def process_and_separate_weights(): else: other_weights.append((renamed, weight)) - # # Debug scale parameter mapping - # print("=== SCALE PARAMETER MAPPING DEBUG ===") - # print(f"Total scale parameters in checkpoint: {len(checkpoint_scales)}") - # print(f"Total renamed scale parameters: {len(renamed_scales)}") - - # # Categorize scale parameters - # self_attn_scales = [s for s in checkpoint_scales if "self_attn" in s] - # expert_scales = [s for s in checkpoint_scales if "experts" in s and "shared_expert" not in s] - # shared_expert_scales = [s for s in checkpoint_scales if "shared_expert" in s] - # other_scales = [s for s in checkpoint_scales if s not in self_attn_scales + expert_scales + shared_expert_scales] - - # print(f"\nScale parameter categories from checkpoint:") - # print(f" Self-attention scales: {len(self_attn_scales)}") - # print(f" Expert scales: {len(expert_scales)}") - # print(f" Shared expert scales: {len(shared_expert_scales)}") - # print(f" Other scales: {len(other_scales)}") - - # if expert_scales: - # print(f"\nFirst 5 expert scale parameters (original):") - # for i, name in enumerate(expert_scales[:5]): - # print(f" {i+1}. {name}") - - # print(f"\nFirst 5 expert scale parameters (renamed):") - # expert_renamed = [s for s in renamed_scales if "experts" in s and "shared_expert" not in s] - # for i, name in enumerate(expert_renamed[:5]): - # print(f" {i+1}. 
{name}") - - # print("=== END SCALE DEBUG ===\n") return language_model_weights, other_weights From b78b191398c439f7a8ceb38292e8f21ca0ade999 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 24 Jun 2025 20:20:35 +0000 Subject: [PATCH 11/25] fix format --- vllm/model_executor/layers/fused_moe/layer.py | 13 +- .../layers/quantization/modelopt.py | 128 ++++++++++-------- vllm/model_executor/models/llama4.py | 59 +++++--- vllm/model_executor/models/mllama4.py | 41 +++--- 4 files changed, 136 insertions(+), 105 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 24129484125..a44e83bfee8 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1050,16 +1050,13 @@ def weight_loader(self, # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern if "ModelOpt" in quant_method_name: # Determine per-tensor weight scale patterns based on variant - is_fp4_variant = ( - "ModelOptNvFp4FusedMoEMethod" in self.quant_method.__class__.__name__ - ) + is_fp4_variant = ("ModelOptNvFp4FusedMoEMethod" + in self.quant_method.__class__.__name__) - # FP4 uses "weight_scale_2" for per-tensor, FP8 uses "weight_scale" for per-tensor + # For per-tensor, FP4 uses "weight_scale_2", FP8 uses "weight_scale" per_tensor_conditions = ( - "weight_scale_2" in weight_name - if is_fp4_variant - else "weight_scale" in weight_name - ) or "input_scale" in weight_name + "weight_scale_2" in weight_name if is_fp4_variant else + "weight_scale" in weight_name) or "input_scale" in weight_name if per_tensor_conditions: self._load_per_tensor_weight_scale( diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7617b91cc2a..a2b9212de88 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, Optional, Union, List +from typing import Any, Callable, List, Optional, Union import torch from torch.nn import Module @@ -73,12 +73,10 @@ def get_config_filenames(cls) -> list[str]: def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": quant_config = cls.get_from_keys(config, ["quantization"]) quant_method = quant_config["quant_algo"] - kv_cache_quant_method = cls.get_from_keys(config, ["quantization"]).get( - "kv_cache_quant_algo" - ) - exclude_modules = cls.get_from_keys(config, ["quantization"]).get( - "exclude_modules" - ) + kv_cache_quant_method = cls.get_from_keys( + config, ["quantization"]).get("kv_cache_quant_algo") + exclude_modules = cls.get_from_keys( + config, ["quantization"]).get("exclude_modules") if quant_method not in QUANT_ALGOS: raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}" @@ -87,15 +85,17 @@ def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": "quant configuration.") is_checkpoint_fp8_serialized = ("FP8" in quant_method) - # Convert exclude_modules to handle the language_model prefix that gets added by mllama4.py + # Convert exclude_modules to handle the language_model prefix for llama4 converted_exclude_modules = [] if exclude_modules: for module in exclude_modules: converted_exclude_modules.append(module) if not module.startswith("language_model."): - converted_exclude_modules.append(f"language_model.{module}") + 
converted_exclude_modules.append( + f"language_model.{module}") - return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, converted_exclude_modules) + return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, + converted_exclude_modules) def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: @@ -191,6 +191,7 @@ def apply( input_scale=layer.input_scale, bias=bias) + class ModelOptFp8MoEMethod: """MoE method for ModelOpt FP8. Supports loading FP8 checkpoints with static weight scale and activation scale. @@ -209,10 +210,13 @@ def __new__(cls, *args, **kwargs): original_init = cls.__init__ new_cls = type( cls.__name__, - (FusedMoEMethodBase,), + (FusedMoEMethodBase, ), { "__init__": original_init, - **{k: v for k, v in cls.__dict__.items() if k != "__dict__"}, + **{ + k: v + for k, v in cls.__dict__.items() if k != "__dict__" + }, }, ) obj = super(new_cls, new_cls).__new__(new_cls) @@ -237,17 +241,16 @@ def create_weights( ): # Use FP8 dtype if checkpoint is serialized, otherwise use the default dtype - weight_dtype = ( - torch.float8_e4m3fn - if self.quant_config.is_checkpoint_fp8_serialized - else params_dtype - ) + weight_dtype = (torch.float8_e4m3fn + if self.quant_config.is_checkpoint_fp8_serialized else + params_dtype) weight_loader = extra_weight_attrs.get("weight_loader") w13_weight = ModelWeightParameter( - data=torch.empty( - num_experts, 2 * intermediate_size_per_partition, hidden_size, dtype=weight_dtype - ), + data=torch.empty(num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=weight_dtype), input_dim=2, output_dim=1, weight_loader=weight_loader, @@ -255,9 +258,10 @@ def create_weights( layer.register_parameter("w13_weight", w13_weight) w2_weight = ModelWeightParameter( - data=torch.empty( - num_experts, hidden_size, intermediate_size_per_partition, dtype=weight_dtype - ), + data=torch.empty(num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=weight_dtype), input_dim=2, output_dim=1, weight_loader=weight_loader, @@ -278,7 +282,10 @@ def create_weights( ) w2_weight_scale = PerTensorScaleParameter( data=torch.full( - (num_experts,), 1.0, dtype=torch.float32 # Initialize to reasonable default instead of -inf + (num_experts, ), + 1.0, + dtype=torch. + float32 # Initialize to reasonable default instead of -inf ), weight_loader=weight_loader, ) @@ -287,16 +294,15 @@ def create_weights( # Set weight loader attributes for scales extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} - ) + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) # INPUT SCALES - Per-tensor scaling for ModelOpt w13_input_scale = PerTensorScaleParameter( - data=torch.full((num_experts,), 1.0, dtype=torch.float32), + data=torch.full((num_experts, ), 1.0, dtype=torch.float32), weight_loader=weight_loader, ) w2_input_scale = PerTensorScaleParameter( - data=torch.full((num_experts,), 1.0, dtype=torch.float32), + data=torch.full((num_experts, ), 1.0, dtype=torch.float32), weight_loader=weight_loader, ) layer.register_parameter("w13_input_scale", w13_input_scale) @@ -307,24 +313,27 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: Only supports pre-quantized checkpoints with FP8 weights and scales. 
""" - layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False) + layer.w13_weight = Parameter(layer.w13_weight.data, + requires_grad=False) layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) + from vllm._custom_ops import scaled_fp8_quant from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( per_tensor_dequantize) - from vllm._custom_ops import scaled_fp8_quant # Handle scale parameters - if hasattr(layer, "w13_weight_scale") and layer.w13_weight_scale is not None: + if hasattr(layer, + "w13_weight_scale") and layer.w13_weight_scale is not None: # Fp8 moe kernel needs single weight scale for w13 per expert. - # We take the max of the w1 and w3 scales then dequant and requant each expert. + # We take the max of the w1 and w3 scales + # then dequant and requant each expert. if layer.w13_weight_scale.dim() == 2: # Shape: (num_experts, 2) # Get the maximum scale across w1 and w3 for each expert max_w13_scales = layer.w13_weight_scale.max(dim=1).values # Requantize each expert's weights using the combined scale - # w13_weight has shape (num_experts, 2 * intermediate_size, hidden_size) + # w13_weight (num_experts, 2 * intermediate_size, hidden_size) # where the first intermediate_size rows are w1, the next are w3 intermediate_size = layer.w13_weight.shape[1] // 2 for expert_id in range(layer.w13_weight.shape[0]): @@ -332,41 +341,40 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: for shard_id in range(2): # w1 and w3 # Dequantize using the original scale for this shard dq_weight = per_tensor_dequantize( - layer.w13_weight[expert_id][ - start : start + intermediate_size, : - ], + layer.w13_weight[expert_id][start:start + + intermediate_size, :], layer.w13_weight_scale[expert_id][shard_id], ) # Requantize using the combined max scale ( - layer.w13_weight[expert_id][ - start : start + intermediate_size, : - ], + layer.w13_weight[expert_id][start:start + + intermediate_size, :], _, - ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id]) + ) = scaled_fp8_quant(dq_weight, + max_w13_scales[expert_id]) start += intermediate_size - # Update the scale parameter to be per-expert instead of per-shard - layer.w13_weight_scale = Parameter(max_w13_scales, requires_grad=False) + # Update the scale parameter to be per-expert + layer.w13_weight_scale = Parameter(max_w13_scales, + requires_grad=False) else: - layer.w13_weight_scale = Parameter( - layer.w13_weight_scale.data, requires_grad=False - ) - - if hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None: - layer.w2_weight_scale = Parameter( - layer.w2_weight_scale.data, requires_grad=False - ) - if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None: - layer.w13_input_scale = Parameter( - layer.w13_input_scale.max(), requires_grad=False - ) - if hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None: - layer.w2_input_scale = Parameter( - layer.w2_input_scale.max(), requires_grad=False - ) + layer.w13_weight_scale = Parameter(layer.w13_weight_scale.data, + requires_grad=False) + + if hasattr(layer, + "w2_weight_scale") and layer.w2_weight_scale is not None: + layer.w2_weight_scale = Parameter(layer.w2_weight_scale.data, + requires_grad=False) + if hasattr(layer, + "w13_input_scale") and layer.w13_input_scale is not None: + layer.w13_input_scale = Parameter(layer.w13_input_scale.max(), + requires_grad=False) + if hasattr(layer, + "w2_input_scale") and layer.w2_input_scale is not None: + layer.w2_input_scale = 
Parameter(layer.w2_input_scale.max(), + requires_grad=False) def apply( self, @@ -405,7 +413,8 @@ def apply( scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, ) - from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts + from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_experts) return fused_experts( x, layer.w13_weight, @@ -425,6 +434,7 @@ def apply( apply_router_weight_on_input=apply_router_weight_on_input, ) + class ModelOptNvFp4Config(QuantizationConfig): """Config class for ModelOpt FP4.""" diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 4131c357ac2..d0dbae20dce 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -35,7 +35,8 @@ RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel from .utils import (AutoWeightsLoader, extract_layer_index, fast_topk, @@ -432,9 +433,11 @@ def load_weights(self, weights: Iterable[tuple[str, for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name or "experts" in name: continue - # Don't transform k_scale/v_scale parameter names with stacked parameter mapping - # but allow other scale parameters (input_scale, weight_scale) to be processed - if not (name.endswith((".k_scale", ".v_scale")) and "self_attn" in name): + # Don't transform k_scale/v_scale parameter names with + # stacked parameter mapping but allow other scale parameters + # (input_scale, weight_scale) to be processed + if not (name.endswith( + (".k_scale", ".v_scale")) and "self_attn" in name): name = name.replace(weight_name, param_name) if is_pp_missing_parameter(name, self): continue @@ -444,9 +447,10 @@ def load_weights(self, weights: Iterable[tuple[str, if name is None: continue # Skip this parameter if remapping failed param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) if weight_loader == default_weight_loader: - # default_weight_loader doesn't support shard_id, just load the weight directly + # default_weight_loader doesn't support shard_id weight_loader(param, loaded_weight) else: # Custom weight loader that supports shard_id @@ -466,40 +470,52 @@ def load_weights(self, weights: Iterable[tuple[str, if is_pp_missing_parameter(name, self): continue - # Handle flat expert scale parameters that don't match per-expert patterns - if ("experts." in name and - ("w13_input_scale" in name or "w13_weight_scale" in name or - "w2_input_scale" in name or "w2_weight_scale" in name)): + # Handle flat expert scale parameters that + # don't match per-expert patterns + if ("experts." 
in name and ("w13_input_scale" in name + or "w13_weight_scale" in name + or "w2_input_scale" in name + or "w2_weight_scale" in name)): # These are flat expert scales that apply to all experts param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) - # Check if this is a MoE-specific weight loader that needs extra arguments + # Check if this is a MoE-specific weight loader that + # needs extra arguments if hasattr(param, 'weight_loader'): try: # Try to inspect the weight_loader signature import inspect sig = inspect.signature(weight_loader) - if 'expert_id' in sig.parameters and 'shard_id' in sig.parameters: - # This is a MoE weight loader, provide the required arguments - # Determine the appropriate shard_id based on parameter name + if ('expert_id' in sig.parameters and + 'shard_id' in sig.parameters): + # This is a MoE weight loader, provide the + # required arguments + # Determine the appropriate shard_id based + # on parameter name if "w13_" in name: - # w13 corresponds to gate_up_proj, which can be either w1 or w3 - # For scales, we typically use w1 as the representative + # w13 corresponds to gate_up_proj, which + # can be either w1 or w3 shard_id = "w1" elif "w2_" in name: # w2 corresponds to down_proj shard_id = "w2" else: - # Fallback - this shouldn't happen for scale parameters + # Fallback - this shouldn't happen for + # scale parameters shard_id = "w1" - weight_loader(param, loaded_weight, name, shard_id=shard_id, expert_id=0) + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=0) else: # Regular weight loader weight_loader(param, loaded_weight) except Exception: - # Fallback to regular loading if signature inspection fails + # Fallback to regular loading weight_loader(param, loaded_weight) else: weight_loader(param, loaded_weight) @@ -507,7 +523,8 @@ def load_weights(self, weights: Iterable[tuple[str, continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index c609fbe1e91..70e379b4def 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.utils import initialize_model -from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -938,35 +938,42 @@ def process_and_separate_weights(): # Apply renaming logic if name.startswith("model."): # Handle expert scale parameters with flat naming - if "feed_forward.experts." 
in name and ("_input_scale" in name or "_weight_scale" in name): - # Expert scales in checkpoint are single values for all experts - # e.g., "model.layers.0.feed_forward.experts.down_proj_input_scale" - # should map to "language_model.model.layers.0.feed_forward.experts.w2_input_scale" + if "feed_forward.experts." in name and ( + "_input_scale" in name or "_weight_scale" in name): + # Expert scales in checkpoint are single values for all + # experts e.g., "model.layers.0.feed_forward.experts. + # down_proj_input_scale" should map to "language_model. + # model.layers.0.feed_forward.experts.w2_input_scale" - renamed = name.replace("model.", "language_model.model.", 1) + renamed = name.replace("model.", + "language_model.model.", 1) # Map checkpoint naming to vLLM's expected naming if "down_proj_input_scale" in renamed: - renamed = renamed.replace("down_proj_input_scale", "w2_input_scale") + renamed = renamed.replace("down_proj_input_scale", + "w2_input_scale") elif "down_proj_weight_scale" in renamed: - renamed = renamed.replace("down_proj_weight_scale", "w2_weight_scale") + renamed = renamed.replace("down_proj_weight_scale", + "w2_weight_scale") elif "gate_up_proj_input_scale" in renamed: - renamed = renamed.replace("gate_up_proj_input_scale", "w13_input_scale") + renamed = renamed.replace( + "gate_up_proj_input_scale", "w13_input_scale") elif "gate_up_proj_weight_scale" in renamed: - renamed = renamed.replace("gate_up_proj_weight_scale", "w13_weight_scale") - # If none of the above patterns match, keep the renamed version as is + renamed = renamed.replace( + "gate_up_proj_weight_scale", + "w13_weight_scale") else: # Standard model.* to language_model.model.* renaming - renamed = name.replace("model.", "language_model.model.", 1) + renamed = name.replace("model.", + "language_model.model.", 1) - # Don't do FP8 scale parameter remapping here - let Llama4Model.load_weights() handle it - # The existing logic in Llama4Model.load_weights() already has proper scale remapping via maybe_remap_kv_scale_name # Track renamed scale parameters if "scale" in renamed: renamed_scales.append(renamed) elif name.startswith("lm_head.weight"): # Rename lm_head.weight to language_model.lm_head.weight - renamed = name.replace("lm_head.weight", "language_model.lm_head.weight") + renamed = name.replace("lm_head.weight", + "language_model.lm_head.weight") else: # Keep other weights as is renamed = name @@ -981,14 +988,14 @@ def process_and_separate_weights(): else: other_weights.append((renamed, weight)) - return language_model_weights, other_weights language_model_weights, other_weights = process_and_separate_weights() # Load language model weights loader = AutoWeightsLoader(self) - loaded_language_model_params = loader.load_weights(language_model_weights) + loaded_language_model_params = loader.load_weights( + language_model_weights) assert loaded_language_model_params is not None updated_params.update(loaded_language_model_params) From 22745f3508b92b4418b1cde3a0ead88d6b938608 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 9 Jul 2025 06:15:26 +0000 Subject: [PATCH 12/25] resolve conflict --- vllm/attention/layer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index f0ad68b1640..ee8c452b4e1 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -314,9 +314,13 @@ def __init__( _Backend.FLEX_ATTENTION): backend = _Backend.XFORMERS - self.attn_backend = backend if backend in { - _Backend.TORCH_SDPA, 
_Backend.XFORMERS, _Backend.PALLAS_VLLM_V1 - } else _Backend.TORCH_SDPA + # self.attn_backend = backend if backend in { + # _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1 + # } else _Backend.TORCH_SDPA + + # Force TORCH_SDPA to avoid xformers-triton compatibility issues + # TODO: Remove this workaround once xformers-triton compatibility is fixed + self.attn_backend = _Backend.TORCH_SDPA def forward( self, From 1c5acec33b93cf7633eed57e00b0926e5b37e009 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 2 Jul 2025 03:42:49 +0000 Subject: [PATCH 13/25] debug --- vllm/model_executor/models/mllama4.py | 222 +++++++++++++++++++++++++- 1 file changed, 220 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 70e379b4def..1538a098af6 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -929,6 +929,7 @@ def process_and_separate_weights(): # Track scale parameters for debugging checkpoint_scales = [] renamed_scales = [] + scale_mapping = {} for name, weight in weights: # Track scale parameters from checkpoint @@ -962,14 +963,31 @@ def process_and_separate_weights(): renamed = renamed.replace( "gate_up_proj_weight_scale", "w13_weight_scale") + # Handle attention scale parameters + elif "self_attn." in name and ( + ".k_scale" in name or ".v_scale" in name): + # Map attention scale parameters for ModelOpt checkpoints + # e.g., "model.layers.0.self_attn.k_proj.k_scale" + # should map to "language_model.model.layers.0.self_attn.attn.k_scale" + + renamed = name.replace("model.", + "language_model.model.", 1) + + # Map checkpoint attention scale naming to vLLM's expected naming + if ".k_proj.k_scale" in renamed: + renamed = renamed.replace(".k_proj.k_scale", ".attn.k_scale") + elif ".v_proj.v_scale" in renamed: + renamed = renamed.replace(".v_proj.v_scale", ".attn.v_scale") else: # Standard model.* to language_model.model.* renaming renamed = name.replace("model.", "language_model.model.", 1) - # Track renamed scale parameters + # Track renamed scale parameters and mapping if "scale" in renamed: renamed_scales.append(renamed) + if "scale" in name: # Only add to mapping if original was also a scale + scale_mapping[name] = renamed elif name.startswith("lm_head.weight"): # Rename lm_head.weight to language_model.lm_head.weight renamed = name.replace("lm_head.weight", @@ -978,9 +996,11 @@ def process_and_separate_weights(): # Keep other weights as is renamed = name - # Track renamed scale parameters + # Track renamed scale parameters and mapping if "scale" in renamed: renamed_scales.append(renamed) + if "scale" in name and name not in scale_mapping: # Avoid duplicates + scale_mapping[name] = renamed # Separate into language_model and other weights if renamed.startswith("language_model."): @@ -988,6 +1008,108 @@ def process_and_separate_weights(): else: other_weights.append((renamed, weight)) + # Print debugging information for scale parameters + print(f"\n=== SCALE PARAMETER LOADING DEBUG INFO ===") + print(f"Scale parameters found in checkpoint ({len(checkpoint_scales)}):") + + # Group scale parameters by type for better readability + moe_scales = [s for s in checkpoint_scales if "experts." in s] + attn_scales = [s for s in checkpoint_scales if "self_attn." 
in s and "scale" in s] + other_scales = [s for s in checkpoint_scales if s not in moe_scales and s not in attn_scales] + + # Further categorize attention scales + kv_cache_scales = [s for s in attn_scales if ".k_scale" in s or ".v_scale" in s] + linear_scales = [s for s in attn_scales if s not in kv_cache_scales] + + if moe_scales: + print(f"\n MoE Expert Scales ({len(moe_scales)}):") + for scale_name in sorted(moe_scales): + print(f" {scale_name}") + + if attn_scales: + print(f"\n Attention Scales ({len(attn_scales)}):") + if kv_cache_scales: + print(f" KV Cache Scales ({len(kv_cache_scales)}):") + for scale_name in sorted(kv_cache_scales): + print(f" {scale_name}") + if linear_scales: + print(f" Linear Projection Scales ({len(linear_scales)}):") + for scale_name in sorted(linear_scales): + print(f" {scale_name}") + + # Note about missing q_scale and prob_scale + print(f" 📝 Note: q_scale and prob_scale not found in checkpoint") + print(f" These will use default values (1.0) as expected") + + if other_scales: + print(f"\n Other Scales ({len(other_scales)}):") + for scale_name in sorted(other_scales): + print(f" {scale_name}") + + print(f"\nScale parameter name mappings ({len(scale_mapping)}):") + + # Group mappings by type for clarity + moe_mappings = {k: v for k, v in scale_mapping.items() if "experts." in k} + attn_mappings = {k: v for k, v in scale_mapping.items() if "self_attn." in k} + other_mappings = {k: v for k, v in scale_mapping.items() if k not in moe_mappings and k not in attn_mappings} + + if moe_mappings: + print(f"\n MoE Scale Mappings ({len(moe_mappings)}):") + for orig_name, renamed_name in sorted(moe_mappings.items()): + print(f" {orig_name} → {renamed_name}") + + if attn_mappings: + print(f"\n Attention Scale Mappings ({len(attn_mappings)}):") + for orig_name, renamed_name in sorted(attn_mappings.items()): + print(f" {orig_name} → {renamed_name}") + + if other_mappings: + print(f"\n Other Scale Mappings ({len(other_mappings)}):") + for orig_name, renamed_name in sorted(other_mappings.items()): + print(f" {orig_name} → {renamed_name}") + + print(f"\nRenamed scale parameters ({len(renamed_scales)}):") + for scale_name in sorted(renamed_scales): + print(f" {scale_name}") + + # Get expected scale parameters from model + model_scale_params = [] + for param_name in params_dict.keys(): + if "scale" in param_name: + model_scale_params.append(param_name) + + print(f"\nExpected scale parameters in model ({len(model_scale_params)}):") + for param_name in sorted(model_scale_params): + print(f" {param_name}") + + # Check for missing scale parameters + missing_scales = set(model_scale_params) - set(renamed_scales) + extra_scales = set(renamed_scales) - set(model_scale_params) + + # Filter out q_scale and prob_scale as they're expected to use defaults + expected_defaults = {p for p in missing_scales if ".attn.q_scale" in p or ".attn.prob_scale" in p} + truly_missing = missing_scales - expected_defaults + + if truly_missing: + print(f"\n⚠️ MISSING scale parameters ({len(truly_missing)}):") + for param_name in sorted(truly_missing): + print(f" {param_name}") + else: + print(f"\n✅ All required scale parameters found in checkpoint!") + + if expected_defaults: + print(f"\n📋 Scale parameters using defaults ({len(expected_defaults)}):") + for param_name in sorted(expected_defaults): + print(f" {param_name} (will use default value 1.0)") + + + if extra_scales: + print(f"\n⚠️ EXTRA scale parameters in checkpoint ({len(extra_scales)}):") + for param_name in sorted(extra_scales): + print(f" 
{param_name}") + + print(f"=== END SCALE PARAMETER DEBUG INFO ===\n") + return language_model_weights, other_weights language_model_weights, other_weights = process_and_separate_weights() @@ -1019,4 +1141,100 @@ def process_and_separate_weights(): weight_loader(param, loaded_weight) updated_params.add(name) + + # Print final verification of loaded scale parameters + print(f"\n=== SCALE PARAMETER LOADING VERIFICATION ===") + + # Show parameters that were loaded from checkpoint + loaded_scale_params = {} + for param_name, param in params_dict.items(): + if "scale" in param_name and param_name in updated_params: + if hasattr(param, 'data'): + param_value = param.data + if param_value.numel() == 1: + loaded_scale_params[param_name] = float(param_value.item()) + else: + loaded_scale_params[param_name] = f"tensor{list(param_value.shape)} (first few values: {param_value.flatten()[:5].tolist()})" + else: + loaded_scale_params[param_name] = "No .data attribute" + + if loaded_scale_params: + print(f"Scale parameters loaded from checkpoint ({len(loaded_scale_params)}):") + for param_name, value in sorted(loaded_scale_params.items()): + print(f" {param_name}: {value}") + + # Show parameters that weren't loaded but exist in model (including defaults) + not_loaded_scale_params = {} + for param_name, param in params_dict.items(): + if "scale" in param_name and param_name not in updated_params: + if hasattr(param, 'data'): + param_value = param.data + if param_value.numel() == 1: + not_loaded_scale_params[param_name] = float(param_value.item()) + else: + not_loaded_scale_params[param_name] = f"tensor{list(param_value.shape)} (first few values: {param_value.flatten()[:5].tolist()})" + else: + not_loaded_scale_params[param_name] = "No .data attribute" + + if not_loaded_scale_params: + print(f"\nScale parameters using default values ({len(not_loaded_scale_params)}):") + for param_name, value in sorted(not_loaded_scale_params.items()): + # Highlight q_scale and prob_scale specifically + if ".attn.q_scale" in param_name or ".attn.prob_scale" in param_name: + print(f" {param_name}: {value} ⭐ (expected default)") + else: + print(f" {param_name}: {value}") + + # Summary + total_scale_params = len(loaded_scale_params) + len(not_loaded_scale_params) + print(f"\nScale parameter summary:") + print(f" Loaded from checkpoint: {len(loaded_scale_params)}") + print(f" Using default values: {len(not_loaded_scale_params)}") + print(f" Total scale parameters: {total_scale_params}") + + # Fix missing attention scale parameters using proper defaults + attention_params_fixed = 0 + layer_k_scales = {} # Store k_scale values for each layer + + # First pass: collect k_scale values + for param_name, param in params_dict.items(): + if ".attn.k_scale" in param_name and hasattr(param, 'data'): + layer_prefix = param_name.replace(".attn.k_scale", "") + if param.data.numel() == 1: + layer_k_scales[layer_prefix] = float(param.data.item()) + print(f"📊 Found k_scale for {layer_prefix}: {layer_k_scales[layer_prefix]}") + + # Second pass: fix missing scales with proper defaults + for param_name, param in params_dict.items(): + if hasattr(param, 'data') and param.data.numel() == 1: + current_value = float(param.data.item()) + + # Fix q_scale: use k_scale from same layer if available + if ".attn.q_scale" in param_name and current_value == 1.0: + layer_prefix = param_name.replace(".attn.q_scale", "") + if layer_prefix in layer_k_scales: + k_scale_value = layer_k_scales[layer_prefix] + print(f"🔧 Setting {param_name}: {current_value} -> 
{k_scale_value} (using k_scale)") + param.data.fill_(k_scale_value) + attention_params_fixed += 1 + else: + print(f"⚠️ No k_scale found for {param_name}, keeping default 1.0") + + # Fix prob_scale: use standard default of 1.0/448.0 for missing values + elif ".attn.prob_scale" in param_name and current_value == 1.0: + prob_scale_default = 1.0 / 448.0 + print(f"🔧 Setting {param_name}: {current_value} -> {prob_scale_default} (attention prob default)") + param.data.fill_(prob_scale_default) + attention_params_fixed += 1 + + if attention_params_fixed > 0: + print(f"Fixed {attention_params_fixed} attention scale parameters with proper defaults") + + if not_loaded_scale_params and not any(".attn.q_scale" in p or ".attn.prob_scale" in p for p in not_loaded_scale_params): + print(f"\n⚠️ Warning: Expected q_scale and prob_scale to be using defaults, but they weren't found") + elif any(".attn.q_scale" in p or ".attn.prob_scale" in p for p in not_loaded_scale_params): + print(f"\n✅ q_scale and prob_scale are correctly using default values!") + + print(f"=== END SCALE PARAMETER VERIFICATION ===\n") + return updated_params From b10782dcbf123b61504a6c1fef6a4af8f2a62e01 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 2 Jul 2025 05:36:10 +0000 Subject: [PATCH 14/25] handle eplb in ModelOptFp8MoEMethod --- vllm/model_executor/layers/quantization/modelopt.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index a2b9212de88..600f2af9468 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -398,7 +398,14 @@ def apply( inplace: bool = True, no_combine: bool = False, routed_scaling_factor: Optional[float] = None, + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `ModelOptFp8MoEMethod` yet.") # Expert selection topk_weights, topk_ids = FusedMoE.select_experts( From 826528739df218db64fe0047f4d33097015bdf8f Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 01:32:26 +0000 Subject: [PATCH 15/25] broadcasting BMM experts scales --- vllm/model_executor/models/mllama4.py | 183 +++++--------------------- 1 file changed, 34 insertions(+), 149 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 1538a098af6..a39080bc6c3 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -963,7 +963,8 @@ def process_and_separate_weights(): renamed = renamed.replace( "gate_up_proj_weight_scale", "w13_weight_scale") - # Handle attention scale parameters + + # Handle attention scale parameters elif "self_attn." in name and ( ".k_scale" in name or ".v_scale" in name): # Map attention scale parameters for ModelOpt checkpoints @@ -1008,119 +1009,48 @@ def process_and_separate_weights(): else: other_weights.append((renamed, weight)) - # Print debugging information for scale parameters - print(f"\n=== SCALE PARAMETER LOADING DEBUG INFO ===") - print(f"Scale parameters found in checkpoint ({len(checkpoint_scales)}):") - - # Group scale parameters by type for better readability - moe_scales = [s for s in checkpoint_scales if "experts." in s] - attn_scales = [s for s in checkpoint_scales if "self_attn." 
in s and "scale" in s] - other_scales = [s for s in checkpoint_scales if s not in moe_scales and s not in attn_scales] - - # Further categorize attention scales - kv_cache_scales = [s for s in attn_scales if ".k_scale" in s or ".v_scale" in s] - linear_scales = [s for s in attn_scales if s not in kv_cache_scales] - - if moe_scales: - print(f"\n MoE Expert Scales ({len(moe_scales)}):") - for scale_name in sorted(moe_scales): - print(f" {scale_name}") - - if attn_scales: - print(f"\n Attention Scales ({len(attn_scales)}):") - if kv_cache_scales: - print(f" KV Cache Scales ({len(kv_cache_scales)}):") - for scale_name in sorted(kv_cache_scales): - print(f" {scale_name}") - if linear_scales: - print(f" Linear Projection Scales ({len(linear_scales)}):") - for scale_name in sorted(linear_scales): - print(f" {scale_name}") - - # Note about missing q_scale and prob_scale - print(f" 📝 Note: q_scale and prob_scale not found in checkpoint") - print(f" These will use default values (1.0) as expected") - - if other_scales: - print(f"\n Other Scales ({len(other_scales)}):") - for scale_name in sorted(other_scales): - print(f" {scale_name}") - - print(f"\nScale parameter name mappings ({len(scale_mapping)}):") - - # Group mappings by type for clarity - moe_mappings = {k: v for k, v in scale_mapping.items() if "experts." in k} - attn_mappings = {k: v for k, v in scale_mapping.items() if "self_attn." in k} - other_mappings = {k: v for k, v in scale_mapping.items() if k not in moe_mappings and k not in attn_mappings} - - if moe_mappings: - print(f"\n MoE Scale Mappings ({len(moe_mappings)}):") - for orig_name, renamed_name in sorted(moe_mappings.items()): - print(f" {orig_name} → {renamed_name}") - - if attn_mappings: - print(f"\n Attention Scale Mappings ({len(attn_mappings)}):") - for orig_name, renamed_name in sorted(attn_mappings.items()): - print(f" {orig_name} → {renamed_name}") - - if other_mappings: - print(f"\n Other Scale Mappings ({len(other_mappings)}):") - for orig_name, renamed_name in sorted(other_mappings.items()): - print(f" {orig_name} → {renamed_name}") - - print(f"\nRenamed scale parameters ({len(renamed_scales)}):") - for scale_name in sorted(renamed_scales): - print(f" {scale_name}") - - # Get expected scale parameters from model - model_scale_params = [] - for param_name in params_dict.keys(): - if "scale" in param_name: - model_scale_params.append(param_name) - - print(f"\nExpected scale parameters in model ({len(model_scale_params)}):") - for param_name in sorted(model_scale_params): - print(f" {param_name}") - - # Check for missing scale parameters - missing_scales = set(model_scale_params) - set(renamed_scales) - extra_scales = set(renamed_scales) - set(model_scale_params) - - # Filter out q_scale and prob_scale as they're expected to use defaults - expected_defaults = {p for p in missing_scales if ".attn.q_scale" in p or ".attn.prob_scale" in p} - truly_missing = missing_scales - expected_defaults - - if truly_missing: - print(f"\n⚠️ MISSING scale parameters ({len(truly_missing)}):") - for param_name in sorted(truly_missing): - print(f" {param_name}") - else: - print(f"\n✅ All required scale parameters found in checkpoint!") - - if expected_defaults: - print(f"\n📋 Scale parameters using defaults ({len(expected_defaults)}):") - for param_name in sorted(expected_defaults): - print(f" {param_name} (will use default value 1.0)") - - - if extra_scales: - print(f"\n⚠️ EXTRA scale parameters in checkpoint ({len(extra_scales)}):") - for param_name in sorted(extra_scales): - print(f" 
{param_name}") - - print(f"=== END SCALE PARAMETER DEBUG INFO ===\n") return language_model_weights, other_weights language_model_weights, other_weights = process_and_separate_weights() - # Load language model weights + # Handle expert scale parameters separately to avoid FusedMoE weight loader issues + expert_scale_weights = [] + regular_language_model_weights = [] + + for name, weight in language_model_weights: + # Check if this is an expert scale parameter that needs broadcasting + if ("feed_forward.experts." in name and "scale" in name and + ".shared_expert" not in name): + + if name in params_dict: + param = params_dict[name] + if (hasattr(param, 'data') and param.data.numel() > 1 and + weight.numel() == 1): + # This needs broadcasting - handle it directly + # print(f"Broadcasting single scale value {weight.item()} to shape {param.data.shape} for {name}") + param.data.fill_(weight.item()) + updated_params.add(name) + continue + + # Regular expert scale loading - add to separate list + expert_scale_weights.append((name, weight)) + else: + regular_language_model_weights.append((name, weight)) + + # Load regular language model weights (excluding expert scales that need broadcasting) loader = AutoWeightsLoader(self) loaded_language_model_params = loader.load_weights( - language_model_weights) + regular_language_model_weights) assert loaded_language_model_params is not None updated_params.update(loaded_language_model_params) + # Load expert scale weights that didn't need broadcasting through normal mechanism + if expert_scale_weights: + loaded_expert_scale_params = loader.load_weights(expert_scale_weights) + if loaded_expert_scale_params: + updated_params.update(loaded_expert_scale_params) + if self.use_data_parallel: other_weights = self._consolidate_qkv_weights(other_weights) @@ -1192,49 +1122,4 @@ def process_and_separate_weights(): print(f" Using default values: {len(not_loaded_scale_params)}") print(f" Total scale parameters: {total_scale_params}") - # Fix missing attention scale parameters using proper defaults - attention_params_fixed = 0 - layer_k_scales = {} # Store k_scale values for each layer - - # First pass: collect k_scale values - for param_name, param in params_dict.items(): - if ".attn.k_scale" in param_name and hasattr(param, 'data'): - layer_prefix = param_name.replace(".attn.k_scale", "") - if param.data.numel() == 1: - layer_k_scales[layer_prefix] = float(param.data.item()) - print(f"📊 Found k_scale for {layer_prefix}: {layer_k_scales[layer_prefix]}") - - # Second pass: fix missing scales with proper defaults - for param_name, param in params_dict.items(): - if hasattr(param, 'data') and param.data.numel() == 1: - current_value = float(param.data.item()) - - # Fix q_scale: use k_scale from same layer if available - if ".attn.q_scale" in param_name and current_value == 1.0: - layer_prefix = param_name.replace(".attn.q_scale", "") - if layer_prefix in layer_k_scales: - k_scale_value = layer_k_scales[layer_prefix] - print(f"🔧 Setting {param_name}: {current_value} -> {k_scale_value} (using k_scale)") - param.data.fill_(k_scale_value) - attention_params_fixed += 1 - else: - print(f"⚠️ No k_scale found for {param_name}, keeping default 1.0") - - # Fix prob_scale: use standard default of 1.0/448.0 for missing values - elif ".attn.prob_scale" in param_name and current_value == 1.0: - prob_scale_default = 1.0 / 448.0 - print(f"🔧 Setting {param_name}: {current_value} -> {prob_scale_default} (attention prob default)") - param.data.fill_(prob_scale_default) - 
attention_params_fixed += 1 - - if attention_params_fixed > 0: - print(f"Fixed {attention_params_fixed} attention scale parameters with proper defaults") - - if not_loaded_scale_params and not any(".attn.q_scale" in p or ".attn.prob_scale" in p for p in not_loaded_scale_params): - print(f"\n⚠️ Warning: Expected q_scale and prob_scale to be using defaults, but they weren't found") - elif any(".attn.q_scale" in p or ".attn.prob_scale" in p for p in not_loaded_scale_params): - print(f"\n✅ q_scale and prob_scale are correctly using default values!") - - print(f"=== END SCALE PARAMETER VERIFICATION ===\n") - return updated_params From 59190eae74d6d7539406d9211a0fcd2b57b3c8e2 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 05:30:38 +0000 Subject: [PATCH 16/25] cleanup --- vllm/model_executor/models/mllama4.py | 51 --------------------------- 1 file changed, 51 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index a39080bc6c3..2168f9c149a 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -1028,7 +1028,6 @@ def process_and_separate_weights(): if (hasattr(param, 'data') and param.data.numel() > 1 and weight.numel() == 1): # This needs broadcasting - handle it directly - # print(f"Broadcasting single scale value {weight.item()} to shape {param.data.shape} for {name}") param.data.fill_(weight.item()) updated_params.add(name) continue @@ -1072,54 +1071,4 @@ def process_and_separate_weights(): weight_loader(param, loaded_weight) updated_params.add(name) - # Print final verification of loaded scale parameters - print(f"\n=== SCALE PARAMETER LOADING VERIFICATION ===") - - # Show parameters that were loaded from checkpoint - loaded_scale_params = {} - for param_name, param in params_dict.items(): - if "scale" in param_name and param_name in updated_params: - if hasattr(param, 'data'): - param_value = param.data - if param_value.numel() == 1: - loaded_scale_params[param_name] = float(param_value.item()) - else: - loaded_scale_params[param_name] = f"tensor{list(param_value.shape)} (first few values: {param_value.flatten()[:5].tolist()})" - else: - loaded_scale_params[param_name] = "No .data attribute" - - if loaded_scale_params: - print(f"Scale parameters loaded from checkpoint ({len(loaded_scale_params)}):") - for param_name, value in sorted(loaded_scale_params.items()): - print(f" {param_name}: {value}") - - # Show parameters that weren't loaded but exist in model (including defaults) - not_loaded_scale_params = {} - for param_name, param in params_dict.items(): - if "scale" in param_name and param_name not in updated_params: - if hasattr(param, 'data'): - param_value = param.data - if param_value.numel() == 1: - not_loaded_scale_params[param_name] = float(param_value.item()) - else: - not_loaded_scale_params[param_name] = f"tensor{list(param_value.shape)} (first few values: {param_value.flatten()[:5].tolist()})" - else: - not_loaded_scale_params[param_name] = "No .data attribute" - - if not_loaded_scale_params: - print(f"\nScale parameters using default values ({len(not_loaded_scale_params)}):") - for param_name, value in sorted(not_loaded_scale_params.items()): - # Highlight q_scale and prob_scale specifically - if ".attn.q_scale" in param_name or ".attn.prob_scale" in param_name: - print(f" {param_name}: {value} ⭐ (expected default)") - else: - print(f" {param_name}: {value}") - - # Summary - total_scale_params = len(loaded_scale_params) + len(not_loaded_scale_params) - 
print(f"\nScale parameter summary:") - print(f" Loaded from checkpoint: {len(loaded_scale_params)}") - print(f" Using default values: {len(not_loaded_scale_params)}") - print(f" Total scale parameters: {total_scale_params}") - return updated_params From 7a6fc84e3f5f5e182428e937ab7cb23e4d93a3b7 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 06:44:30 +0000 Subject: [PATCH 17/25] some refactor and cleanup --- .../layers/quantization/modelopt.py | 40 ++++++++++++------- .../model_loader/weight_utils.py | 2 +- vllm/model_executor/models/llama4.py | 17 +------- vllm/model_executor/models/mllama4.py | 37 +---------------- 4 files changed, 30 insertions(+), 66 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 600f2af9468..68aa2872c1f 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -85,22 +85,34 @@ def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": "quant configuration.") is_checkpoint_fp8_serialized = ("FP8" in quant_method) - # Convert exclude_modules to handle the language_model prefix for llama4 - converted_exclude_modules = [] - if exclude_modules: - for module in exclude_modules: - converted_exclude_modules.append(module) - if not module.startswith("language_model."): - converted_exclude_modules.append( - f"language_model.{module}") - return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, - converted_exclude_modules) + exclude_modules) + + def is_layer_excluded(self, prefix: str) -> bool: + """ + Check if a layer should be excluded from quantization. + + This method handles both regular models and multimodal models that use + the language_model prefix. For multimodal models, it checks if the + module name (without the language_model prefix) is in the exclude list. + """ + if self.exclude_modules is None: + return False + + # Check if any excluded module matches the prefix + for module in self.exclude_modules: + if (module in prefix or + (prefix.startswith("language_model.") and + module in prefix.removeprefix("language_model."))): + return True + return False def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): + if self.is_layer_excluded(prefix): + return UnquantizedLinearMethod() return ModelOptFp8LinearMethod(self) elif isinstance(layer, Attention): return ModelOptFp8KVCacheMethod(self) @@ -275,7 +287,7 @@ def create_weights( w13_weight_scale = PerTensorScaleParameter( data=torch.full( (num_experts, 2), - 1.0, # Initialize to reasonable default instead of -inf + 1.0, dtype=torch.float32, ), weight_loader=weight_loader, @@ -285,7 +297,7 @@ def create_weights( (num_experts, ), 1.0, dtype=torch. - float32 # Initialize to reasonable default instead of -inf + float32 ), weight_loader=weight_loader, ) @@ -327,7 +339,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # Fp8 moe kernel needs single weight scale for w13 per expert. # We take the max of the w1 and w3 scales # then dequant and requant each expert. 
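The comment lines above summarize the requantization step performed in this method: keep the larger of the two per-shard scales for each expert, dequantize each shard with its original scale, and requantize it with the shared one. A minimal float32 emulation of that idea, shown only as an illustration (it does not use vLLM's per_tensor_dequantize or scaled_fp8_quant helpers, and it assumes the float8_e4m3fn maximum of 448.0):

import torch

FP8_E4M3_MAX = 448.0  # largest finite float8_e4m3fn value (assumed here)

def fake_quant(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Per-tensor "quantization" emulated in float32: scale down and clamp.
    return torch.clamp(x / scale, -FP8_E4M3_MAX, FP8_E4M3_MAX)

w1 = torch.randn(4, 8)          # stand-in for one expert's w1 shard
w3 = torch.randn(4, 8) * 3.0    # w3 shard with a larger dynamic range
s1 = w1.abs().max() / FP8_E4M3_MAX
s3 = w3.abs().max() / FP8_E4M3_MAX
q1, q3 = fake_quant(w1, s1), fake_quant(w3, s3)

s_shared = torch.max(s1, s3)    # one scale per expert instead of one per shard
# Dequantize with the original shard scale, requantize with the shared scale.
q1_new = fake_quant(q1 * s1, s_shared)
q3_new = fake_quant(q3 * s3, s_shared)
assert torch.allclose(q1_new * s_shared, w1, atol=1e-5)
assert torch.allclose(q3_new * s_shared, w3, atol=1e-5)

Taking the maximum of the two scales keeps both shards inside the representable FP8 range after requantization; the shard with the smaller dynamic range simply gives up a little precision.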
- if layer.w13_weight_scale.dim() == 2: # Shape: (num_experts, 2) + if layer.w13_weight_scale.dim() == 2: # Get the maximum scale across w1 and w3 for each expert max_w13_scales = layer.w13_weight_scale.max(dim=1).values @@ -431,7 +443,7 @@ def apply( inplace=inplace, activation=activation, use_fp8_w8a8=True, - per_channel_quant=False, # ModelOpt uses per-tensor quantization + per_channel_quant=False, global_num_experts=global_num_experts, expert_map=expert_map, w1_scale=layer.w13_weight_scale, diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index b886efed0a8..a70c89f2d82 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -771,7 +771,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: f".self_attn.attn{scale_name}") elif any(qkv_scale_name in name for qkv_scale_name in qkv_proj_scale_names): - # Handle qkv_proj scale parameters: .self_attn.qkv_proj.k_scale -> .self_attn.attn.k_scale + # Handle qkv_proj scale parameters remapped_name = name.replace( f".self_attn.qkv_proj{scale_name}", f".self_attn.attn{scale_name}") diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index d0dbae20dce..409e627cd9f 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -433,9 +433,6 @@ def load_weights(self, weights: Iterable[tuple[str, for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name or "experts" in name: continue - # Don't transform k_scale/v_scale parameter names with - # stacked parameter mapping but allow other scale parameters - # (input_scale, weight_scale) to be processed if not (name.endswith( (".k_scale", ".v_scale")) and "self_attn" in name): name = name.replace(weight_name, param_name) @@ -445,15 +442,13 @@ def load_weights(self, weights: Iterable[tuple[str, # Remapping the name of FP8 kv-scale. 
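The call that follows, maybe_remap_kv_scale_name, translates checkpoint-style scale names such as .self_attn.qkv_proj.k_scale into the .self_attn.attn.k_scale name the attention layer registers, and returns None when the target parameter does not exist so the caller can skip it. A rough standalone sketch of that kind of suffix rewrite; the mapping table below is illustrative only, the real rules live in weight_utils.py:

from typing import Optional

# Hypothetical suffix table, for illustration only.
_SCALE_SUFFIX_MAP = {
    ".self_attn.qkv_proj.k_scale": ".self_attn.attn.k_scale",
    ".self_attn.qkv_proj.v_scale": ".self_attn.attn.v_scale",
}

def remap_scale_name(name: str, params_dict: dict) -> Optional[str]:
    for old, new in _SCALE_SUFFIX_MAP.items():
        if name.endswith(old):
            candidate = name[:-len(old)] + new
            # Only remap when the model actually registers the target name.
            return candidate if candidate in params_dict else None
    return name  # not a scale name we know about; leave it untouched

params = {"model.layers.0.self_attn.attn.k_scale": object()}
print(remap_scale_name("model.layers.0.self_attn.qkv_proj.k_scale", params))
# -> model.layers.0.self_attn.attn.k_scale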
name = maybe_remap_kv_scale_name(name, params_dict) if name is None: - continue # Skip this parameter if remapping failed + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) if weight_loader == default_weight_loader: - # default_weight_loader doesn't support shard_id weight_loader(param, loaded_weight) else: - # Custom weight loader that supports shard_id weight_loader(param, loaded_weight, shard_id) loaded_params.add(name) break @@ -490,20 +485,12 @@ def load_weights(self, weights: Iterable[tuple[str, sig = inspect.signature(weight_loader) if ('expert_id' in sig.parameters and 'shard_id' in sig.parameters): - # This is a MoE weight loader, provide the - # required arguments - # Determine the appropriate shard_id based - # on parameter name + # This is a MoE weight loader if "w13_" in name: - # w13 corresponds to gate_up_proj, which - # can be either w1 or w3 shard_id = "w1" elif "w2_" in name: - # w2 corresponds to down_proj shard_id = "w2" else: - # Fallback - this shouldn't happen for - # scale parameters shard_id = "w1" weight_loader(param, diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 2168f9c149a..415e99ba3d0 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -926,25 +926,13 @@ def process_and_separate_weights(): language_model_weights = [] other_weights = [] - # Track scale parameters for debugging - checkpoint_scales = [] - renamed_scales = [] - scale_mapping = {} - for name, weight in weights: - # Track scale parameters from checkpoint - if "scale" in name: - checkpoint_scales.append(name) - # Apply renaming logic + # Apply renaming logic for ModelOpt llama4 fp8 checkpoints if name.startswith("model."): # Handle expert scale parameters with flat naming if "feed_forward.experts." in name and ( "_input_scale" in name or "_weight_scale" in name): - # Expert scales in checkpoint are single values for all - # experts e.g., "model.layers.0.feed_forward.experts. - # down_proj_input_scale" should map to "language_model. - # model.layers.0.feed_forward.experts.w2_input_scale" renamed = name.replace("model.", "language_model.model.", 1) @@ -967,42 +955,23 @@ def process_and_separate_weights(): # Handle attention scale parameters elif "self_attn." 
in name and ( ".k_scale" in name or ".v_scale" in name): - # Map attention scale parameters for ModelOpt checkpoints - # e.g., "model.layers.0.self_attn.k_proj.k_scale" - # should map to "language_model.model.layers.0.self_attn.attn.k_scale" renamed = name.replace("model.", "language_model.model.", 1) - # Map checkpoint attention scale naming to vLLM's expected naming if ".k_proj.k_scale" in renamed: renamed = renamed.replace(".k_proj.k_scale", ".attn.k_scale") elif ".v_proj.v_scale" in renamed: renamed = renamed.replace(".v_proj.v_scale", ".attn.v_scale") else: - # Standard model.* to language_model.model.* renaming renamed = name.replace("model.", "language_model.model.", 1) - - # Track renamed scale parameters and mapping - if "scale" in renamed: - renamed_scales.append(renamed) - if "scale" in name: # Only add to mapping if original was also a scale - scale_mapping[name] = renamed elif name.startswith("lm_head.weight"): - # Rename lm_head.weight to language_model.lm_head.weight renamed = name.replace("lm_head.weight", "language_model.lm_head.weight") else: - # Keep other weights as is renamed = name - # Track renamed scale parameters and mapping - if "scale" in renamed: - renamed_scales.append(renamed) - if "scale" in name and name not in scale_mapping: # Avoid duplicates - scale_mapping[name] = renamed - # Separate into language_model and other weights if renamed.startswith("language_model."): language_model_weights.append((renamed, weight)) @@ -1014,7 +983,6 @@ def process_and_separate_weights(): language_model_weights, other_weights = process_and_separate_weights() - # Handle expert scale parameters separately to avoid FusedMoE weight loader issues expert_scale_weights = [] regular_language_model_weights = [] @@ -1032,19 +1000,16 @@ def process_and_separate_weights(): updated_params.add(name) continue - # Regular expert scale loading - add to separate list expert_scale_weights.append((name, weight)) else: regular_language_model_weights.append((name, weight)) - # Load regular language model weights (excluding expert scales that need broadcasting) loader = AutoWeightsLoader(self) loaded_language_model_params = loader.load_weights( regular_language_model_weights) assert loaded_language_model_params is not None updated_params.update(loaded_language_model_params) - # Load expert scale weights that didn't need broadcasting through normal mechanism if expert_scale_weights: loaded_expert_scale_params = loader.load_weights(expert_scale_weights) if loaded_expert_scale_params: From 47a47a918503358c66386e150da0ffad53b71d4d Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 07:03:48 +0000 Subject: [PATCH 18/25] refactor Llama4ForConditionalGeneration.load_weights --- vllm/model_executor/models/mllama4.py | 206 +++++++++++++------------- 1 file changed, 106 insertions(+), 100 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 415e99ba3d0..be3caf829f8 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -903,122 +903,88 @@ def _consolidate_qkv_weights( qkv_weight = torch.cat(weight, dim=0) yield key, qkv_weight - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), - (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), - (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), - # Shared expert gate_up_proj stacking - 
(".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0), - (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1), - # Feed forward gate_up_proj stacking (for non-MoE layers if any) - (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0), - (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - updated_params: set[str] = set() + def _rename_weight_for_checkpoint(self, name: str) -> str: + """Rename weights from ModelOpt llama4 fp8 checkpoints to vLLM format.""" + if name.startswith("model."): + # Handle expert scale parameters with flat naming + if "feed_forward.experts." in name and ("_input_scale" in name or "_weight_scale" in name): + renamed = name.replace("model.", "language_model.model.", 1) + # Map checkpoint naming to vLLM's expected naming + if "down_proj_input_scale" in renamed: + return renamed.replace("down_proj_input_scale", "w2_input_scale") + elif "down_proj_weight_scale" in renamed: + return renamed.replace("down_proj_weight_scale", "w2_weight_scale") + elif "gate_up_proj_input_scale" in renamed: + return renamed.replace("gate_up_proj_input_scale", "w13_input_scale") + elif "gate_up_proj_weight_scale" in renamed: + return renamed.replace("gate_up_proj_weight_scale", "w13_weight_scale") + return renamed + + # Handle attention scale parameters + elif "self_attn." in name and (".k_scale" in name or ".v_scale" in name): + renamed = name.replace("model.", "language_model.model.", 1) + if ".k_proj.k_scale" in renamed: + return renamed.replace(".k_proj.k_scale", ".attn.k_scale") + elif ".v_proj.v_scale" in renamed: + return renamed.replace(".v_proj.v_scale", ".attn.v_scale") + return renamed + + # Standard model.* to language_model.model.* renaming + return name.replace("model.", "language_model.model.", 1) + + elif name.startswith("lm_head.weight"): + return name.replace("lm_head.weight", "language_model.lm_head.weight") + + return name + + def _separate_and_rename_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> tuple[list[tuple[str, torch.Tensor]], list[tuple[str, torch.Tensor]]]: + """Rename weights and separate them into language_model and other weights.""" + language_model_weights = [] + other_weights = [] + + for name, weight in weights: + renamed = self._rename_weight_for_checkpoint(name) + + if renamed.startswith("language_model."): + language_model_weights.append((renamed, weight)) + else: + other_weights.append((renamed, weight)) - # Combine renaming and separation logic in a single pass - def process_and_separate_weights(): - language_model_weights = [] - other_weights = [] - - for name, weight in weights: - - # Apply renaming logic for ModelOpt llama4 fp8 checkpoints - if name.startswith("model."): - # Handle expert scale parameters with flat naming - if "feed_forward.experts." 
in name and ( - "_input_scale" in name or "_weight_scale" in name): - - renamed = name.replace("model.", - "language_model.model.", 1) - - # Map checkpoint naming to vLLM's expected naming - if "down_proj_input_scale" in renamed: - renamed = renamed.replace("down_proj_input_scale", - "w2_input_scale") - elif "down_proj_weight_scale" in renamed: - renamed = renamed.replace("down_proj_weight_scale", - "w2_weight_scale") - elif "gate_up_proj_input_scale" in renamed: - renamed = renamed.replace( - "gate_up_proj_input_scale", "w13_input_scale") - elif "gate_up_proj_weight_scale" in renamed: - renamed = renamed.replace( - "gate_up_proj_weight_scale", - "w13_weight_scale") - - # Handle attention scale parameters - elif "self_attn." in name and ( - ".k_scale" in name or ".v_scale" in name): - - renamed = name.replace("model.", - "language_model.model.", 1) - - if ".k_proj.k_scale" in renamed: - renamed = renamed.replace(".k_proj.k_scale", ".attn.k_scale") - elif ".v_proj.v_scale" in renamed: - renamed = renamed.replace(".v_proj.v_scale", ".attn.v_scale") - else: - renamed = name.replace("model.", - "language_model.model.", 1) - elif name.startswith("lm_head.weight"): - renamed = name.replace("lm_head.weight", - "language_model.lm_head.weight") - else: - renamed = name - - # Separate into language_model and other weights - if renamed.startswith("language_model."): - language_model_weights.append((renamed, weight)) - else: - other_weights.append((renamed, weight)) - - - return language_model_weights, other_weights - - language_model_weights, other_weights = process_and_separate_weights() + return language_model_weights, other_weights + def _handle_expert_scale_broadcasting(self, weights: list[tuple[str, torch.Tensor]], params_dict: dict) -> tuple[list[tuple[str, torch.Tensor]], set[str]]: + """Handle expert scale parameters that need broadcasting.""" + regular_weights = [] expert_scale_weights = [] - regular_language_model_weights = [] + updated_params = set() - for name, weight in language_model_weights: + for name, weight in weights: # Check if this is an expert scale parameter that needs broadcasting - if ("feed_forward.experts." in name and "scale" in name and - ".shared_expert" not in name): - + if ("feed_forward.experts." 
in name and "scale" in name and ".shared_expert" not in name): if name in params_dict: param = params_dict[name] - if (hasattr(param, 'data') and param.data.numel() > 1 and - weight.numel() == 1): - # This needs broadcasting - handle it directly + if (hasattr(param, 'data') and param.data.numel() > 1 and weight.numel() == 1): + # Broadcast single value to all experts param.data.fill_(weight.item()) updated_params.add(name) continue expert_scale_weights.append((name, weight)) else: - regular_language_model_weights.append((name, weight)) + regular_weights.append((name, weight)) - loader = AutoWeightsLoader(self) - loaded_language_model_params = loader.load_weights( - regular_language_model_weights) - assert loaded_language_model_params is not None - updated_params.update(loaded_language_model_params) + return regular_weights, expert_scale_weights, updated_params - if expert_scale_weights: - loaded_expert_scale_params = loader.load_weights(expert_scale_weights) - if loaded_expert_scale_params: - updated_params.update(loaded_expert_scale_params) + def _load_other_weights(self, other_weights: Iterable[tuple[str, torch.Tensor]], params_dict: dict, stacked_params_mapping: list) -> set[str]: + """Load non-language-model weights with stacking support.""" + updated_params = set() if self.use_data_parallel: other_weights = self._consolidate_qkv_weights(other_weights) for name, loaded_weight in other_weights: + # Try stacked parameter mapping first + mapped = False for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name or self.use_data_parallel: continue @@ -1027,13 +993,53 @@ def process_and_separate_weights(): updated_params.add(name) weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) + mapped = True break - else: - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + if not mapped: + # Use regular weight loading + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) updated_params.add(name) return updated_params + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), + (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), + (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), + # Shared expert gate_up_proj stacking + (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0), + (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1), + # Feed forward gate_up_proj stacking (for non-MoE layers if any) + (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0), + (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + updated_params: set[str] = set() + + # Separate and rename weights + language_model_weights, other_weights = self._separate_and_rename_weights(weights) + + # Handle expert scale parameters + regular_weights, expert_scale_weights, updated_params_from_experts = self._handle_expert_scale_broadcasting(language_model_weights, params_dict) + updated_params.update(updated_params_from_experts) + + loader = AutoWeightsLoader(self) + loaded_language_model_params = loader.load_weights(regular_weights) + assert loaded_language_model_params is not None + updated_params.update(loaded_language_model_params) + + if expert_scale_weights: + loaded_expert_scale_params = 
loader.load_weights(expert_scale_weights) + if loaded_expert_scale_params: + updated_params.update(loaded_expert_scale_params) + + updated_params.update(self._load_other_weights(other_weights, params_dict, stacked_params_mapping)) + + return updated_params From eec1daf66d2e0df80780e35c89e3bcad74cb98e5 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 9 Jul 2025 06:16:04 +0000 Subject: [PATCH 19/25] resolve conflict --- vllm/attention/layer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index ee8c452b4e1..f0ad68b1640 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -314,13 +314,9 @@ def __init__( _Backend.FLEX_ATTENTION): backend = _Backend.XFORMERS - # self.attn_backend = backend if backend in { - # _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1 - # } else _Backend.TORCH_SDPA - - # Force TORCH_SDPA to avoid xformers-triton compatibility issues - # TODO: Remove this workaround once xformers-triton compatibility is fixed - self.attn_backend = _Backend.TORCH_SDPA + self.attn_backend = backend if backend in { + _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1 + } else _Backend.TORCH_SDPA def forward( self, From 770bc24e376a5075a46327fc4a2d21df1d13f7db Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 07:17:12 +0000 Subject: [PATCH 20/25] format and linter error fix --- .../layers/quantization/modelopt.py | 26 ++++---- vllm/model_executor/models/llama4.py | 4 +- vllm/model_executor/models/mllama4.py | 63 +++++++++++++------ 3 files changed, 57 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 68aa2872c1f..7f6903e9b90 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, List, Optional, Union +from typing import Any, Callable, Optional, Union import torch from torch.nn import Module @@ -43,7 +43,7 @@ def __init__( self, is_checkpoint_fp8_serialized: bool = False, kv_cache_quant_method: Optional[str] = None, - exclude_modules: Optional[List[str]] = None, + exclude_modules: Optional[list[str]] = None, ) -> None: super().__init__() self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized @@ -101,9 +101,9 @@ def is_layer_excluded(self, prefix: str) -> bool: # Check if any excluded module matches the prefix for module in self.exclude_modules: - if (module in prefix or - (prefix.startswith("language_model.") and - module in prefix.removeprefix("language_model."))): + if (module in prefix + or (prefix.startswith("language_model.") + and module in prefix.removeprefix("language_model."))): return True return False @@ -206,7 +206,8 @@ def apply( class ModelOptFp8MoEMethod: """MoE method for ModelOpt FP8. - Supports loading FP8 checkpoints with static weight scale and activation scale. + Supports loading FP8 checkpoints with static weight scale and + activation scale. Args: quant_config: The ModelOpt quantization config. """ @@ -214,8 +215,8 @@ class ModelOptFp8MoEMethod: def __new__(cls, *args, **kwargs): """ Dynamic class composition pattern. - This allows us to effectively "inject" FusedMoEMethodBase as a parent class - at runtime while avoiding circular import issues. 
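The docstring being reflowed here is the one that explains the __new__-time composition trick: a new type is built on the fly so that FusedMoEMethodBase becomes a base class only at construction time, avoiding a circular import. A self-contained sketch of the same pattern with stand-in class names (LateBase and Method are illustrative, not vLLM types):

class LateBase:
    """Stand-in for a base class that must be attached late."""

    def hello(self) -> str:
        return "from LateBase"


class Method:
    """Stand-in for the runtime-composition pattern."""

    def __new__(cls, *args, **kwargs):
        if LateBase not in cls.__mro__:
            # Build a new class that adds LateBase as a parent, copying this
            # class's own attributes (minus the slots type() would reject).
            composed = type(
                cls.__name__,
                (LateBase, ),
                {
                    k: v
                    for k, v in cls.__dict__.items()
                    if k not in ("__dict__", "__weakref__")
                },
            )
            obj = super(composed, composed).__new__(composed)
            obj.__init__(*args, **kwargs)
            return obj
        return object.__new__(cls)

    def __init__(self, tag: str):
        self.tag = tag


m = Method("fp8")
print(isinstance(m, LateBase), m.hello(), m.tag)  # True from LateBase fp8

Because the returned object is an instance of the composed class rather than of Method itself, Python does not run Method.__init__ a second time, which is why the sketch invokes obj.__init__ explicitly.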
+ This allows us to effectively "inject" FusedMoEMethodBase as a parent + class at runtime while avoiding circular import issues. """ if not hasattr(cls, "_initialized"): @@ -252,7 +253,7 @@ def create_weights( **extra_weight_attrs, ): - # Use FP8 dtype if checkpoint is serialized, otherwise use the default dtype + # Use FP8 dtype if checkpoint is serialized weight_dtype = (torch.float8_e4m3fn if self.quant_config.is_checkpoint_fp8_serialized else params_dtype) @@ -293,12 +294,7 @@ def create_weights( weight_loader=weight_loader, ) w2_weight_scale = PerTensorScaleParameter( - data=torch.full( - (num_experts, ), - 1.0, - dtype=torch. - float32 - ), + data=torch.full((num_experts, ), 1.0, dtype=torch.float32), weight_loader=weight_loader, ) layer.register_parameter("w13_weight_scale", w13_weight_scale) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 409e627cd9f..7696b84bf3f 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -483,8 +483,8 @@ def load_weights(self, weights: Iterable[tuple[str, # Try to inspect the weight_loader signature import inspect sig = inspect.signature(weight_loader) - if ('expert_id' in sig.parameters and - 'shard_id' in sig.parameters): + if ('expert_id' in sig.parameters + and 'shard_id' in sig.parameters): # This is a MoE weight loader if "w13_" in name: shard_id = "w1" diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index be3caf829f8..b7d0a1ddafd 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -904,24 +904,31 @@ def _consolidate_qkv_weights( yield key, qkv_weight def _rename_weight_for_checkpoint(self, name: str) -> str: - """Rename weights from ModelOpt llama4 fp8 checkpoints to vLLM format.""" + """Rename weights from ModelOpt llama4 fp8 checkpoints to vLLM + format.""" if name.startswith("model."): # Handle expert scale parameters with flat naming - if "feed_forward.experts." in name and ("_input_scale" in name or "_weight_scale" in name): + if "feed_forward.experts." in name and ("_input_scale" in name or + "_weight_scale" in name): renamed = name.replace("model.", "language_model.model.", 1) # Map checkpoint naming to vLLM's expected naming if "down_proj_input_scale" in renamed: - return renamed.replace("down_proj_input_scale", "w2_input_scale") + return renamed.replace("down_proj_input_scale", + "w2_input_scale") elif "down_proj_weight_scale" in renamed: - return renamed.replace("down_proj_weight_scale", "w2_weight_scale") + return renamed.replace("down_proj_weight_scale", + "w2_weight_scale") elif "gate_up_proj_input_scale" in renamed: - return renamed.replace("gate_up_proj_input_scale", "w13_input_scale") + return renamed.replace("gate_up_proj_input_scale", + "w13_input_scale") elif "gate_up_proj_weight_scale" in renamed: - return renamed.replace("gate_up_proj_weight_scale", "w13_weight_scale") + return renamed.replace("gate_up_proj_weight_scale", + "w13_weight_scale") return renamed # Handle attention scale parameters - elif "self_attn." in name and (".k_scale" in name or ".v_scale" in name): + elif "self_attn." 
in name and (".k_scale" in name + or ".v_scale" in name): renamed = name.replace("model.", "language_model.model.", 1) if ".k_proj.k_scale" in renamed: return renamed.replace(".k_proj.k_scale", ".attn.k_scale") @@ -933,12 +940,16 @@ def _rename_weight_for_checkpoint(self, name: str) -> str: return name.replace("model.", "language_model.model.", 1) elif name.startswith("lm_head.weight"): - return name.replace("lm_head.weight", "language_model.lm_head.weight") + return name.replace("lm_head.weight", + "language_model.lm_head.weight") return name - def _separate_and_rename_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> tuple[list[tuple[str, torch.Tensor]], list[tuple[str, torch.Tensor]]]: - """Rename weights and separate them into language_model and other weights.""" + def _separate_and_rename_weights( + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> tuple[list[tuple[str, torch.Tensor]], list[tuple[str, torch.Tensor]]]: + """Rename weights and separate them into language_model and other + weights.""" language_model_weights = [] other_weights = [] @@ -952,7 +963,9 @@ def _separate_and_rename_weights(self, weights: Iterable[tuple[str, torch.Tensor return language_model_weights, other_weights - def _handle_expert_scale_broadcasting(self, weights: list[tuple[str, torch.Tensor]], params_dict: dict) -> tuple[list[tuple[str, torch.Tensor]], set[str]]: + def _handle_expert_scale_broadcasting( + self, weights: list[tuple[str, torch.Tensor]], params_dict: dict + ) -> tuple[list[tuple[str, torch.Tensor]], set[str]]: """Handle expert scale parameters that need broadcasting.""" regular_weights = [] expert_scale_weights = [] @@ -960,10 +973,12 @@ def _handle_expert_scale_broadcasting(self, weights: list[tuple[str, torch.Tenso for name, weight in weights: # Check if this is an expert scale parameter that needs broadcasting - if ("feed_forward.experts." in name and "scale" in name and ".shared_expert" not in name): + if ("feed_forward.experts." 
in name and "scale" in name + and ".shared_expert" not in name): if name in params_dict: param = params_dict[name] - if (hasattr(param, 'data') and param.data.numel() > 1 and weight.numel() == 1): + if (hasattr(param, 'data') and param.data.numel() > 1 + and weight.numel() == 1): # Broadcast single value to all experts param.data.fill_(weight.item()) updated_params.add(name) @@ -975,7 +990,10 @@ def _handle_expert_scale_broadcasting(self, weights: list[tuple[str, torch.Tenso return regular_weights, expert_scale_weights, updated_params - def _load_other_weights(self, other_weights: Iterable[tuple[str, torch.Tensor]], params_dict: dict, stacked_params_mapping: list) -> set[str]: + def _load_other_weights(self, other_weights: Iterable[tuple[str, + torch.Tensor]], + params_dict: dict, + stacked_params_mapping: list) -> set[str]: """Load non-language-model weights with stacking support.""" updated_params = set() @@ -999,7 +1017,8 @@ def _load_other_weights(self, other_weights: Iterable[tuple[str, torch.Tensor]], if not mapped: # Use regular weight loading param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) updated_params.add(name) @@ -1024,10 +1043,13 @@ def load_weights(self, weights: Iterable[tuple[str, updated_params: set[str] = set() # Separate and rename weights - language_model_weights, other_weights = self._separate_and_rename_weights(weights) + language_model_weights, other_weights = ( + self._separate_and_rename_weights(weights)) # Handle expert scale parameters - regular_weights, expert_scale_weights, updated_params_from_experts = self._handle_expert_scale_broadcasting(language_model_weights, params_dict) + regular_weights, expert_scale_weights, updated_params_from_experts = ( + self._handle_expert_scale_broadcasting(language_model_weights, + params_dict)) updated_params.update(updated_params_from_experts) loader = AutoWeightsLoader(self) @@ -1036,10 +1058,13 @@ def load_weights(self, weights: Iterable[tuple[str, updated_params.update(loaded_language_model_params) if expert_scale_weights: - loaded_expert_scale_params = loader.load_weights(expert_scale_weights) + loaded_expert_scale_params = loader.load_weights( + expert_scale_weights) if loaded_expert_scale_params: updated_params.update(loaded_expert_scale_params) - updated_params.update(self._load_other_weights(other_weights, params_dict, stacked_params_mapping)) + updated_params.update( + self._load_other_weights(other_weights, params_dict, + stacked_params_mapping)) return updated_params From 770890abb124415d2e020976d7da00c577bdc1b4 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 07:38:09 +0000 Subject: [PATCH 21/25] simplify ModelOptFp8MoEMethod to avoid mypy error --- .../layers/quantization/modelopt.py | 27 +------------------ 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7f6903e9b90..5482a686874 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -204,7 +204,7 @@ def apply( bias=bias) -class ModelOptFp8MoEMethod: +class ModelOptFp8MoEMethod(FusedMoEMethodBase): """MoE method for ModelOpt FP8. Supports loading FP8 checkpoints with static weight scale and activation scale. 
@@ -212,31 +212,6 @@ class ModelOptFp8MoEMethod: quant_config: The ModelOpt quantization config. """ - def __new__(cls, *args, **kwargs): - """ - Dynamic class composition pattern. - This allows us to effectively "inject" FusedMoEMethodBase as a parent - class at runtime while avoiding circular import issues. - """ - - if not hasattr(cls, "_initialized"): - original_init = cls.__init__ - new_cls = type( - cls.__name__, - (FusedMoEMethodBase, ), - { - "__init__": original_init, - **{ - k: v - for k, v in cls.__dict__.items() if k != "__dict__" - }, - }, - ) - obj = super(new_cls, new_cls).__new__(new_cls) - obj.__init__(*args, **kwargs) - return obj - return super().__new__(cls) - def __init__(self, quant_config: ModelOptFp8Config): self.quant_config = quant_config from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( From 1beecbff9d8c29e75ae855e65a81c193a287f684 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Fri, 4 Jul 2025 00:28:48 +0000 Subject: [PATCH 22/25] resolve conflict --- vllm/model_executor/layers/fused_moe/layer.py | 20 +++++++-- .../layers/quantization/experts_int8.py | 4 +- .../layers/quantization/modelopt.py | 18 ++++---- .../layers/quantization/moe_wna16.py | 2 + vllm/model_executor/models/llama4.py | 43 +++++++++---------- 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a44e83bfee8..9e33d70ebd8 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -81,6 +81,16 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, params_dtype: torch.dtype, **extra_weight_attrs): raise NotImplementedError + def uses_weight_scale_2_pattern(self) -> bool: + """ + Returns True if this quantization method uses 'weight_scale_2' pattern + for per-tensor weight scales (e.g., FP4 variants), False otherwise. + + This method should be overridden by subclasses that use the + 'weight_scale_2' pattern instead of the standard 'weight_scale' pattern. 
+ """ + return False + def init_prepare_finalize(self, moe: FusedMoEConfig, quant_config: Optional[QuantizationConfig]): all2all_manager = get_ep_group().device_communicator.all2all_manager @@ -1050,12 +1060,12 @@ def weight_loader(self, # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern if "ModelOpt" in quant_method_name: # Determine per-tensor weight scale patterns based on variant - is_fp4_variant = ("ModelOptNvFp4FusedMoEMethod" - in self.quant_method.__class__.__name__) + # Use the dedicated method instead of brittle string matching + uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern() # For per-tensor, FP4 uses "weight_scale_2", FP8 uses "weight_scale" per_tensor_conditions = ( - "weight_scale_2" in weight_name if is_fp4_variant else + "weight_scale_2" in weight_name if uses_weight_scale_2 else "weight_scale" in weight_name) or "input_scale" in weight_name if per_tensor_conditions: @@ -1536,3 +1546,7 @@ def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor, dispatch_key=current_platform.dispatch_key, tags=(torch.Tag.needs_fixed_stride_order, ), ) + +# Mark the FusedMoE weight_loader as supporting MoE-specific parameters +# to avoid expensive runtime reflection in model loading code +FusedMoE.weight_loader.supports_moe_loading = True diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 47eca80609e..d7acb1fbd39 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -188,8 +188,10 @@ def quantize_and_call_weight_loader(param: torch.nn.Parameter, raise ValueError( f"Shard id must be in [0,1,2] but got {shard_id}") weight_loader(param, loaded_weight, weight_name, shard_id, - expert_id) + expert_id) + # Mark as supporting MoE-specific loading to avoid expensive reflection + quantize_and_call_weight_loader.supports_moe_loading = True return quantize_and_call_weight_loader diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 5482a686874..e283b2bb4db 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -350,6 +350,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: "w2_weight_scale") and layer.w2_weight_scale is not None: layer.w2_weight_scale = Parameter(layer.w2_weight_scale.data, requires_grad=False) + # Input scales must be equal for each expert in fp8 MoE layers. 
if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None: layer.w13_input_scale = Parameter(layer.w13_input_scale.max(), @@ -366,21 +367,16 @@ def apply( router_logits: torch.Tensor, top_k: int, renormalize: bool, - use_grouped_topk: bool, + use_grouped_topk: bool = False, topk_group: Optional[int] = None, num_expert_group: Optional[int] = None, global_num_experts: int = -1, expert_map: Optional[torch.Tensor] = None, - num_fused_shared_experts: Optional[int] = None, custom_routing_function: Optional[Callable] = None, - correction_bias: Optional[torch.Tensor] = None, scoring_func: str = "softmax", e_score_correction_bias: Optional[torch.Tensor] = None, - activation: str = "silu", apply_router_weight_on_input: bool = False, - inplace: bool = True, - no_combine: bool = False, - routed_scaling_factor: Optional[float] = None, + activation: str = "silu", enable_eplb: bool = False, expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, @@ -411,7 +407,7 @@ def apply( layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - inplace=inplace, + inplace=True, activation=activation, use_fp8_w8a8=True, per_channel_quant=False, @@ -725,6 +721,12 @@ def __init__(self, quant_config: ModelOptNvFp4Config): " quantization. Please use Blackwell and" " above.") + def uses_weight_scale_2_pattern(self) -> bool: + """ + FP4 variants use 'weight_scale_2' pattern for per-tensor weight scales. + """ + return True + def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index c5055a02fa3..86d3f4c7a1a 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -454,4 +454,6 @@ def moe_wna16_weight_loader(param: torch.nn.Parameter, weight_loader(param, loaded_weight, weight_name, shard_id, expert_id) + # Mark as supporting MoE-specific loading to avoid expensive reflection + moe_wna16_weight_loader.supports_moe_loading = True return moe_wna16_weight_loader diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 7696b84bf3f..8f7d25be541 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -479,30 +479,27 @@ def load_weights(self, weights: Iterable[tuple[str, # Check if this is a MoE-specific weight loader that # needs extra arguments if hasattr(param, 'weight_loader'): - try: - # Try to inspect the weight_loader signature - import inspect - sig = inspect.signature(weight_loader) - if ('expert_id' in sig.parameters - and 'shard_id' in sig.parameters): - # This is a MoE weight loader - if "w13_" in name: - shard_id = "w1" - elif "w2_" in name: - shard_id = "w2" - else: - shard_id = "w1" - - weight_loader(param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=0) + # Check for MoE-specific loading support via + # attribute instead of expensive runtime reflection + supports_moe = getattr(weight_loader, + 'supports_moe_loading', False) + + if supports_moe: + # This is a MoE weight loader + if "w13_" in name: + shard_id = "w1" + elif "w2_" in name: + shard_id = "w2" else: - # Regular weight loader - weight_loader(param, loaded_weight) - except Exception: - # Fallback to regular loading + shard_id = "w1" + + weight_loader(param, + loaded_weight, + name, + 
shard_id=shard_id, + expert_id=0) + else: + # Regular weight loader weight_loader(param, loaded_weight) else: weight_loader(param, loaded_weight) From 0b98a7fa153111ce96c4d6352b4b7b3573e79783 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Fri, 4 Jul 2025 00:20:36 +0000 Subject: [PATCH 23/25] format fix --- vllm/model_executor/layers/fused_moe/layer.py | 3 ++- vllm/model_executor/layers/quantization/experts_int8.py | 2 +- vllm/model_executor/models/llama4.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 9e33d70ebd8..b814ee956c7 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1061,7 +1061,8 @@ def weight_loader(self, if "ModelOpt" in quant_method_name: # Determine per-tensor weight scale patterns based on variant # Use the dedicated method instead of brittle string matching - uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern() + uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern( + ) # For per-tensor, FP4 uses "weight_scale_2", FP8 uses "weight_scale" per_tensor_conditions = ( diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index d7acb1fbd39..f7d28e3bdf7 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -188,7 +188,7 @@ def quantize_and_call_weight_loader(param: torch.nn.Parameter, raise ValueError( f"Shard id must be in [0,1,2] but got {shard_id}") weight_loader(param, loaded_weight, weight_name, shard_id, - expert_id) + expert_id) # Mark as supporting MoE-specific loading to avoid expensive reflection quantize_and_call_weight_loader.supports_moe_loading = True diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 8f7d25be541..e740e00c3cd 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -482,7 +482,8 @@ def load_weights(self, weights: Iterable[tuple[str, # Check for MoE-specific loading support via # attribute instead of expensive runtime reflection supports_moe = getattr(weight_loader, - 'supports_moe_loading', False) + 'supports_moe_loading', + False) if supports_moe: # This is a MoE weight loader From cc44385e7492fdee4766773cbdedce18d93b6d17 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 9 Jul 2025 18:58:49 +0000 Subject: [PATCH 24/25] fix mypy error --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- vllm/model_executor/layers/quantization/experts_int8.py | 2 +- vllm/model_executor/layers/quantization/moe_wna16.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b814ee956c7..2129de083a7 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1550,4 +1550,4 @@ def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor, # Mark the FusedMoE weight_loader as supporting MoE-specific parameters # to avoid expensive runtime reflection in model loading code -FusedMoE.weight_loader.supports_moe_loading = True +FusedMoE.weight_loader.supports_moe_loading = True # type: ignore[attr-defined] diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 
f7d28e3bdf7..67083c3b4b5 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -191,7 +191,7 @@ def quantize_and_call_weight_loader(param: torch.nn.Parameter, expert_id) # Mark as supporting MoE-specific loading to avoid expensive reflection - quantize_and_call_weight_loader.supports_moe_loading = True + quantize_and_call_weight_loader.supports_moe_loading = True # type: ignore[attr-defined] return quantize_and_call_weight_loader diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 86d3f4c7a1a..f03c7b3d501 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -455,5 +455,5 @@ def moe_wna16_weight_loader(param: torch.nn.Parameter, expert_id) # Mark as supporting MoE-specific loading to avoid expensive reflection - moe_wna16_weight_loader.supports_moe_loading = True + moe_wna16_weight_loader.supports_moe_loading = True # type: ignore[attr-defined] return moe_wna16_weight_loader From 1206f330a2564bcd92ddd48a2143b6fad27c9f40 Mon Sep 17 00:00:00 2001 From: jingyu Date: Thu, 10 Jul 2025 05:11:16 +0000 Subject: [PATCH 25/25] add qwen fp8 modelopt support --- vllm/config.py | 4 ++++ .../layers/quantization/modelopt.py | 17 +++++++++++------ vllm/model_executor/models/qwen2.py | 12 ++++++++++-- vllm/model_executor/models/qwen3_moe.py | 16 +++++++++++++--- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 508e09174cc..c065bdb4158 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -887,6 +887,10 @@ def _parse_quant_hf_config(self): if quant_cfg is None: # compressed-tensors uses a "compression_config" key quant_cfg = getattr(self.hf_config, "compression_config", None) + if quant_cfg is not None: + if quant_cfg["producer"]["name"].lower() == "modelopt": + if "quant_algo" in quant_cfg.keys() and quant_cfg["quant_algo"].lower() == "fp8": + quant_cfg = {"quant_method": "modelopt"} return quant_cfg def _verify_quantization(self) -> None: diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 3853938ba1c..6cdcf3f781b 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -71,12 +71,17 @@ def get_config_filenames(cls) -> list[str]: @classmethod def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": - quant_config = cls.get_from_keys(config, ["quantization"]) - quant_method = quant_config["quant_algo"] - kv_cache_quant_method = cls.get_from_keys( - config, ["quantization"]).get("kv_cache_quant_algo") - exclude_modules = cls.get_from_keys( - config, ["quantization"]).get("exclude_modules") + try: + quant_method = cls.get_from_keys(config, ["quant_algo"]) + kv_cache_quant_method = cls.get_from_keys(config, ["kv_cache_scheme"]) + exclude_modules = cls.get_from_keys(config, ["ignore"]) + except: + quant_config = cls.get_from_keys(config, ["quantization"]) + quant_method = quant_config["quant_algo"] + kv_cache_quant_method = cls.get_from_keys( + config, ["quantization"]).get("kv_cache_quant_algo") + exclude_modules = cls.get_from_keys( + config, ["quantization"]).get("exclude_modules") if quant_method not in QUANT_ALGOS: raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}" diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 
7ef9d248da4..1186d65425f 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -400,9 +400,17 @@ def load_weights(self, weights: Iterable[tuple[str, continue if is_pp_missing_parameter(name, self): continue + if name.endswith("scale"): + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) break else: # Skip loading extra bias for GPTQ models. diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index ff182aadf73..09b32d16038 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -46,7 +46,9 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -406,6 +408,10 @@ def load_weights(self, weights: Iterable[tuple[str, # Skip non-stacked layers and experts (experts handled below). if weight_name not in name: continue + if name.endswith("scale"): + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue # We have mlp.experts[0].gate_proj in the checkpoint. # Since we handle the experts below in expert_params_mapping, # we need to skip here BEFORE we update the name, otherwise @@ -427,8 +433,12 @@ def load_weights(self, weights: Iterable[tuple[str, continue param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) break else: for mapping in expert_params_mapping:
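
Note on the per-tensor scale dispatch introduced in the FusedMoE weight loader above: it reduces to a small predicate — NVFP4 checkpoints name their per-tensor weight scale "weight_scale_2" (plain "weight_scale" is the block/group scale), FP8 checkpoints use "weight_scale" itself as the per-tensor scale, and "input_scale" is per-tensor in both cases. The standalone Python sketch below restates that condition outside vLLM; the function name and the assert-style usage are illustrative and not part of the patch.

def is_per_tensor_scale(weight_name: str, uses_weight_scale_2: bool) -> bool:
    # FP4 (NVFP4) checkpoints: "weight_scale_2" is the per-tensor scale and
    # plain "weight_scale" is the block/group scale. FP8 checkpoints:
    # "weight_scale" itself is per-tensor. "input_scale" is per-tensor
    # for both variants.
    scale_key = "weight_scale_2" if uses_weight_scale_2 else "weight_scale"
    return scale_key in weight_name or "input_scale" in weight_name


assert is_per_tensor_scale("w13_weight_scale_2", uses_weight_scale_2=True)
assert is_per_tensor_scale("w2_input_scale", uses_weight_scale_2=False)
# For FP4, plain "weight_scale" falls through to the group-scale path:
assert not is_per_tensor_scale("w13_weight_scale", uses_weight_scale_2=True)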
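
Patches 22-24 also replace the inspect.signature() probe in llama4.py with a supports_moe_loading attribute set on MoE-aware weight loaders and read back with getattr. The sketch below shows that pattern in isolation under simplified, hypothetical names (Param, load_one, the toy loaders); only the attribute tagging and the getattr dispatch mirror the patch.

from typing import Any


class Param:
    """Toy stand-in for a parameter that may carry a custom weight_loader."""

    def __init__(self, weight_loader=None):
        if weight_loader is not None:
            self.weight_loader = weight_loader


def default_weight_loader(param: Param, loaded_weight: Any) -> None:
    print("regular load")


def moe_weight_loader(param: Param, loaded_weight: Any, weight_name: str,
                      shard_id: str, expert_id: int) -> None:
    print(f"MoE load: {weight_name} shard={shard_id} expert={expert_id}")


# Tag the loader once at definition time; callers dispatch on a cheap
# attribute lookup instead of inspecting the signature for every weight.
moe_weight_loader.supports_moe_loading = True  # type: ignore[attr-defined]


def load_one(param: Param, loaded_weight: Any, name: str) -> None:
    weight_loader = getattr(param, "weight_loader", default_weight_loader)
    if getattr(weight_loader, "supports_moe_loading", False):
        shard_id = "w2" if "w2_" in name else "w1"
        weight_loader(param, loaded_weight, name,
                      shard_id=shard_id, expert_id=0)
    else:
        weight_loader(param, loaded_weight)


load_one(Param(moe_weight_loader), 0, "experts.w13_weight_scale")  # MoE path
load_one(Param(), 0, "lm_head.weight")                             # regular path

The attribute check is a single dictionary lookup per weight, whereas inspect.signature() builds a Signature object on every call, which is the reflection overhead the patch comments refer to.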
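
Finally, patch 25 extends _parse_quant_hf_config so that a ModelOpt-produced quantization_config embedded in the HF config (producer "modelopt", quant_algo "FP8") is collapsed to {"quant_method": "modelopt"}. The sketch below is a defensive restatement of that check, not the patch itself: it assumes the same checkpoint schema shown in the diff (a "producer" entry with a "name" field, plus "quant_algo") but uses .get() so configs from other producers, which may omit those keys, fall through unchanged instead of raising KeyError as direct indexing would.

from typing import Any, Optional


def normalize_modelopt_quant_cfg(
        quant_cfg: Optional[dict[str, Any]]) -> Optional[dict[str, Any]]:
    # Collapse a ModelOpt FP8 quantization_config to the marker dict that,
    # as in the patch, routes the model to the "modelopt" quant method.
    if quant_cfg is None:
        return None
    producer = (quant_cfg.get("producer") or {}).get("name", "")
    quant_algo = quant_cfg.get("quant_algo") or ""
    if producer.lower() == "modelopt" and quant_algo.lower() == "fp8":
        return {"quant_method": "modelopt"}
    return quant_cfg


print(normalize_modelopt_quant_cfg(
    {"producer": {"name": "modelopt"}, "quant_algo": "FP8"}))
# {'quant_method': 'modelopt'}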