From 6927b02554530925d9b0fc1b33ce37f2afed5e14 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 2 Jul 2025 04:51:51 +0000 Subject: [PATCH 01/25] resolve conflict --- vllm/model_executor/layers/fused_moe/layer.py | 25 +- .../layers/quantization/modelopt.py | 239 +++++++++++++++++- 2 files changed, 254 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 36ac75a8df4..24129484125 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1049,12 +1049,25 @@ def weight_loader(self, # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern if "ModelOpt" in quant_method_name: - if ('weight_scale_2' in weight_name - or 'input_scale' in weight_name): - self._load_per_tensor_weight_scale(shard_id=shard_id, - param=param, - loaded_weight=loaded_weight, - expert_id=expert_id) + # Determine per-tensor weight scale patterns based on variant + is_fp4_variant = ( + "ModelOptNvFp4FusedMoEMethod" in self.quant_method.__class__.__name__ + ) + + # FP4 uses "weight_scale_2" for per-tensor, FP8 uses "weight_scale" for per-tensor + per_tensor_conditions = ( + "weight_scale_2" in weight_name + if is_fp4_variant + else "weight_scale" in weight_name + ) or "input_scale" in weight_name + + if per_tensor_conditions: + self._load_per_tensor_weight_scale( + shard_id=shard_id, + param=param, + loaded_weight=loaded_weight, + expert_id=expert_id, + ) elif "weight" in weight_name: self._load_model_weight_or_group_weight_scale( shard_id=shard_id, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 9db87533023..7ac00a71343 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -85,18 +85,20 @@ def get_quant_method(self, layer: torch.nn.Module, return ModelOptFp8LinearMethod(self) elif isinstance(layer, Attention): return ModelOptFp8KVCacheMethod(self) + elif isinstance(layer, FusedMoE): + return ModelOptFp8MoEMethod(self) return None class ModelOptFp8LinearMethod(LinearMethodBase): """Linear method for Model Optimizer static quantization. Supports loading FP8 checkpoints with static weight scale and - activation scale. Future support might be added for dynamic + activation scale. Future support might be added for dynamic scales. Limitations: 1. Only support per-tensor quantization due to torch._scaled_mm support. - 2. Only support float8_e4m3fn datatype + 2. Only support float8_e4m3fn datatype Args: quant_config: The ModelOpt quantization config. """ @@ -170,6 +172,235 @@ def apply( input_scale=layer.input_scale, bias=bias) +class ModelOptFp8MoEMethod: + """MoE method for ModelOpt FP8. + Supports loading FP8 checkpoints with static weight scale and activation scale. + Args: + quant_config: The ModelOpt quantization config. + """ + + def __new__(cls, *args, **kwargs): + """ + Dynamic class composition pattern. + This allows us to effectively "inject" FusedMoEMethodBase as a parent class + at runtime while avoiding circular import issues. 
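        A minimal, illustrative sketch of the same pattern with made-up names
        (not part of this diff): compose the subclass at runtime so the base
        class is only referenced once it is safe to import.

            class LateBase:
                def shared(self):
                    return "from base"

            class Standalone:
                def own(self):
                    return "from subclass"

            # Build a new type that injects LateBase as a parent while copying
            # Standalone's namespace (minus slots that type() must not receive).
            Composed = type(
                "Composed",
                (LateBase,),
                {k: v for k, v in Standalone.__dict__.items()
                 if k not in ("__dict__", "__weakref__")},
            )
            assert Composed().shared() == "from base"
            assert Composed().own() == "from subclass"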
+ """ + + if not hasattr(cls, "_initialized"): + original_init = cls.__init__ + new_cls = type( + cls.__name__, + (FusedMoEMethodBase,), + { + "__init__": original_init, + **{k: v for k, v in cls.__dict__.items() if k != "__dict__"}, + }, + ) + obj = super(new_cls, new_cls).__new__(new_cls) + obj.__init__(*args, **kwargs) + return obj + return super().__new__(cls) + + def __init__(self, quant_config: ModelOptFp8Config): + self.quant_config = quant_config + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + cutlass_fp8_supported) + self.cutlass_fp8_supported = cutlass_fp8_supported() + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + + # Use FP8 dtype if checkpoint is serialized, otherwise use the default dtype + weight_dtype = ( + torch.float8_e4m3fn + if self.quant_config.is_checkpoint_fp8_serialized + else params_dtype + ) + weight_loader = extra_weight_attrs.get("weight_loader") + + w13_weight = ModelWeightParameter( + data=torch.empty( + num_experts, 2 * intermediate_size, hidden_size, dtype=weight_dtype + ), + input_dim=2, + output_dim=1, + weight_loader=weight_loader, + ) + layer.register_parameter("w13_weight", w13_weight) + + w2_weight = ModelWeightParameter( + data=torch.empty( + num_experts, hidden_size, intermediate_size, dtype=weight_dtype + ), + input_dim=2, + output_dim=1, + weight_loader=weight_loader, + ) + layer.register_parameter("w2_weight", w2_weight) + + if self.quant_config.is_checkpoint_fp8_serialized: + # WEIGHT SCALES - Per-tensor scaling for ModelOpts + # Allocate 2 scales for w1 and w3 respectively. + # They will be combined to a single scale after weight loading. + w13_weight_scale = PerTensorScaleParameter( + data=torch.full( + (num_experts, 2), + torch.finfo(torch.float32).min, + dtype=torch.float32, + ), + weight_loader=weight_loader, + ) + w2_weight_scale = PerTensorScaleParameter( + data=torch.full( + (num_experts,), torch.finfo(torch.float32).min, dtype=torch.float32 + ), + weight_loader=weight_loader, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + + # Set weight loader attributes for scales + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} + ) + + # INPUT SCALES - Per-tensor scaling for ModelOpt + w13_input_scale = PerTensorScaleParameter( + data=torch.full((num_experts,), 1.0, dtype=torch.float32), + weight_loader=weight_loader, + ) + w2_input_scale = PerTensorScaleParameter( + data=torch.full((num_experts,), 1.0, dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("w13_input_scale", w13_input_scale) + layer.register_parameter("w2_input_scale", w2_input_scale) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + """Process FP8 MoE weights after loading from serialized checkpoint. + Only supports pre-quantized checkpoints with FP8 weights and scales. + """ + + layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False) + layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) + + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + per_tensor_dequantize) + from vllm._custom_ops import scaled_fp8_quant + + # Handle scale parameters + if hasattr(layer, "w13_weight_scale") and layer.w13_weight_scale is not None: + # Fp8 moe kernel needs single weight scale for w13 per expert. 
+ # We take the max of the w1 and w3 scales then dequant and requant each expert. + if layer.w13_weight_scale.dim() == 2: # Shape: (num_experts, 2) + + # Get the maximum scale across w1 and w3 for each expert + max_w13_scales = layer.w13_weight_scale.max(dim=1).values + + # Requantize each expert's weights using the combined scale + # w13_weight has shape (num_experts, 2 * intermediate_size, hidden_size) + # where the first intermediate_size rows are w1, the next are w3 + intermediate_size = layer.w13_weight.shape[1] // 2 + for expert_id in range(layer.w13_weight.shape[0]): + start = 0 + for shard_id in range(2): # w1 and w3 + # Dequantize using the original scale for this shard + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][ + start : start + intermediate_size, : + ], + layer.w13_weight_scale[expert_id][shard_id], + ) + # Requantize using the combined max scale + + ( + layer.w13_weight[expert_id][ + start : start + intermediate_size, : + ], + _, + ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id]) + + start += intermediate_size + + # Update the scale parameter to be per-expert instead of per-shard + layer.w13_weight_scale = Parameter(max_w13_scales, requires_grad=False) + else: + layer.w13_weight_scale = Parameter( + layer.w13_weight_scale.data, requires_grad=False + ) + + if hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None: + layer.w2_weight_scale = Parameter( + layer.w2_weight_scale.data, requires_grad=False + ) + if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None: + layer.w13_input_scale = Parameter( + layer.w13_input_scale.max(), requires_grad=False + ) + if hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None: + layer.w2_input_scale = Parameter( + layer.w2_input_scale.max(), requires_grad=False + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + num_fused_shared_experts: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + correction_bias: Optional[torch.Tensor] = None, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + inplace: bool = True, + no_combine: bool = False, + routed_scaling_factor: Optional[float] = None, + ) -> torch.Tensor: + + # Expert selection + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + num_fused_shared_experts=num_fused_shared_experts, + custom_routing_function=custom_routing_function, + correction_bias=correction_bias, + routed_scaling_factor=routed_scaling_factor, + ) + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + cutlass_moe_fp8) + return cutlass_moe_fp8( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=inplace, + activation=activation, + use_fp8_w8a8=True, + per_channel_quant=False, # ModelOpt uses per-tensor quantization + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + no_combine=no_combine, + ) class ModelOptNvFp4Config(QuantizationConfig): """Config class for ModelOpt FP4.""" @@ -273,7 +504,7 @@ def __init__(self, quant_config: Union[ModelOptFp8Config, class 
ModelOptNvFp4LinearMethod(LinearMethodBase): """Linear method for Model Optimizer NVFP4. Supports loading NVFP4 checkpoints with the following structure: - + input_scale: torch.float32, scalar , weight: NVFP4(represented as byte) Shape: [1, X, y/2] weight_scale: FP8-E4M3, Shape: [X, Y], aka per block scale, @@ -454,7 +685,7 @@ def apply( class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): """ MoE Method for FP4 Quantization. - Args: + Args: quant_config: NVFP4 Quant Config """ From b45972e2f568abb9b2c33b6a78a9902d2dff3fe5 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Fri, 20 Jun 2025 23:17:43 +0000 Subject: [PATCH 02/25] bugfix --- vllm/model_executor/layers/quantization/modelopt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7ac00a71343..2dae1ac3b40 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -212,7 +212,7 @@ def create_weights( layer: torch.nn.Module, num_experts: int, hidden_size: int, - intermediate_size: int, + intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs, ): @@ -227,7 +227,7 @@ def create_weights( w13_weight = ModelWeightParameter( data=torch.empty( - num_experts, 2 * intermediate_size, hidden_size, dtype=weight_dtype + num_experts, 2 * intermediate_size_per_partition, hidden_size, dtype=weight_dtype ), input_dim=2, output_dim=1, @@ -237,7 +237,7 @@ def create_weights( w2_weight = ModelWeightParameter( data=torch.empty( - num_experts, hidden_size, intermediate_size, dtype=weight_dtype + num_experts, hidden_size, intermediate_size_per_partition, dtype=weight_dtype ), input_dim=2, output_dim=1, From bf965286a4c869b281c3e24ae81c140839b958c4 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Fri, 20 Jun 2025 23:29:20 +0000 Subject: [PATCH 03/25] handle language_model. prefix --- vllm/model_executor/models/mllama4.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 1276d626a7c..9eb30fb52b3 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -918,6 +918,23 @@ def load_weights(self, weights: Iterable[tuple[str, # using llama4's load_weights routine. language_model_weights, other_weights = self.separate_weights( weights, prefix="language_model.") + + # If no language_model weights found, try with "model." 
prefix and rename + language_model_weights_list = list(language_model_weights) + if not language_model_weights_list: + # No language_model.* weights found, try model.* weights + def rename_model_weights(): + for name, weight in weights: + if name.startswith("model."): + # Rename model.* to language_model.model.* + yield (name.replace("model.", "language_model.model.", 1), weight) + else: + # Keep other weights as is + yield (name, weight) + + language_model_weights, other_weights = self.separate_weights( + rename_model_weights(), prefix="language_model.") + loader = AutoWeightsLoader(self) loaded_language_model_params = loader.load_weights( language_model_weights) From cb20cd1db219122bc5037a552d2be39330652217 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Sat, 21 Jun 2025 00:57:11 +0000 Subject: [PATCH 04/25] fix issue in fused_experts calling --- .../layers/quantization/modelopt.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 2dae1ac3b40..09133808b48 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -359,9 +359,13 @@ def apply( use_grouped_topk: bool, topk_group: Optional[int] = None, num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, num_fused_shared_experts: Optional[int] = None, custom_routing_function: Optional[Callable] = None, correction_bias: Optional[torch.Tensor] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, activation: str = "silu", apply_router_weight_on_input: bool = False, inplace: bool = True, @@ -378,14 +382,14 @@ def apply( renormalize=renormalize, topk_group=topk_group, num_expert_group=num_expert_group, - num_fused_shared_experts=num_fused_shared_experts, custom_routing_function=custom_routing_function, - correction_bias=correction_bias, - routed_scaling_factor=routed_scaling_factor, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, ) - from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - cutlass_moe_fp8) - return cutlass_moe_fp8( + # from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + # cutlass_moe_fp8) + from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts + return fused_experts( x, layer.w13_weight, layer.w2_weight, @@ -395,11 +399,13 @@ def apply( activation=activation, use_fp8_w8a8=True, per_channel_quant=False, # ModelOpt uses per-tensor quantization + global_num_experts=global_num_experts, + expert_map=expert_map, w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, - no_combine=no_combine, + apply_router_weight_on_input=apply_router_weight_on_input, ) class ModelOptNvFp4Config(QuantizationConfig): From 1f6180273bb71b064ab22cf8c6c6d32595365903 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Sat, 21 Jun 2025 05:37:33 +0000 Subject: [PATCH 05/25] minor --- vllm/model_executor/layers/quantization/modelopt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 09133808b48..13338705652 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -252,14 +252,14 @@ def create_weights( 
w13_weight_scale = PerTensorScaleParameter( data=torch.full( (num_experts, 2), - torch.finfo(torch.float32).min, + 1.0, # Initialize to reasonable default instead of -inf dtype=torch.float32, ), weight_loader=weight_loader, ) w2_weight_scale = PerTensorScaleParameter( data=torch.full( - (num_experts,), torch.finfo(torch.float32).min, dtype=torch.float32 + (num_experts,), 1.0, dtype=torch.float32 # Initialize to reasonable default instead of -inf ), weight_loader=weight_loader, ) From 0fb23e12f6c92847186e59cd14c83f0bd24c3be6 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Sun, 22 Jun 2025 08:39:59 +0000 Subject: [PATCH 06/25] update ModelOptFp8Config, handle prefix in mllama4 weight loading, debug --- .../layers/quantization/modelopt.py | 23 ++++++- vllm/model_executor/models/mllama4.py | 62 ++++++++++++------- 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 13338705652..278e747035f 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Optional, Union, List import torch from torch.nn import Module @@ -42,9 +42,13 @@ class ModelOptFp8Config(QuantizationConfig): def __init__( self, is_checkpoint_fp8_serialized: bool = False, + kv_cache_quant_method: Optional[str] = None, + exclude_modules: Optional[List[str]] = None, ) -> None: super().__init__() self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized + self.kv_cache_quant_method = kv_cache_quant_method + self.exclude_modules = exclude_modules if is_checkpoint_fp8_serialized: logger.warning("Detected ModelOpt fp8 checkpoint. Please note that" " the format is experimental and could change.") @@ -69,6 +73,13 @@ def get_config_filenames(cls) -> list[str]: def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": quant_config = cls.get_from_keys(config, ["quantization"]) quant_method = quant_config["quant_algo"] + kv_cache_quant_method = cls.get_from_keys(config, ["quantization"]).get( + "kv_cache_quant_algo" + ) + exclude_modules = cls.get_from_keys(config, ["quantization"]).get( + "exclude_modules" + ) + if quant_method not in QUANT_ALGOS: raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}" " quantizations in vLLM. 
Please check the " @@ -76,7 +87,15 @@ def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": "quant configuration.") is_checkpoint_fp8_serialized = ("FP8" in quant_method) - return cls(is_checkpoint_fp8_serialized) + # Convert exclude_modules to handle the language_model prefix that gets added by mllama4.py + converted_exclude_modules = [] + if exclude_modules: + for module in exclude_modules: + converted_exclude_modules.append(module) + if not module.startswith("language_model."): + converted_exclude_modules.append(f"language_model.{module}") + + return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, converted_exclude_modules) def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 9eb30fb52b3..e9a50b19109 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -914,30 +914,46 @@ def load_weights(self, weights: Iterable[tuple[str, params_dict = dict(self.named_parameters()) updated_params: set[str] = set() - # language_model is an Llama4ForCausalLM instance. We load it's - # using llama4's load_weights routine. - language_model_weights, other_weights = self.separate_weights( - weights, prefix="language_model.") - - # If no language_model weights found, try with "model." prefix and rename - language_model_weights_list = list(language_model_weights) - if not language_model_weights_list: - # No language_model.* weights found, try model.* weights - def rename_model_weights(): - for name, weight in weights: - if name.startswith("model."): - # Rename model.* to language_model.model.* - yield (name.replace("model.", "language_model.model.", 1), weight) - else: - # Keep other weights as is - yield (name, weight) - - language_model_weights, other_weights = self.separate_weights( - rename_model_weights(), prefix="language_model.") - + # Debug: Print first 30 parameter names from initialized model + print("=== INITIALIZED MODEL PARAMETERS ===") + print("First 30 parameter names containing 'scale':") + scale_params = [name for name in params_dict.keys() if "scale" in name] + for i, name in enumerate(scale_params[:30]): + print(f" {i+1:2d}. 
{name}") + print(f"Total parameters with 'scale': {len(scale_params)}") + print(f"Total model parameters: {len(params_dict)}") + print("=== END DEBUG ===\n") + + # Combine renaming and separation logic in a single pass + def process_and_separate_weights(): + language_model_weights = [] + other_weights = [] + + for name, weight in weights: + # Apply renaming logic + if name.startswith("model."): + # Rename model.* to language_model.model.* + renamed = name.replace("model.", "language_model.model.", 1) + elif name.startswith("lm_head.weight"): + # Rename lm_head.weight to language_model.lm_head.weight + renamed = name.replace("lm_head.weight", "language_model.lm_head.weight") + else: + # Keep other weights as is + renamed = name + + # Separate into language_model and other weights + if renamed.startswith("language_model."): + language_model_weights.append((renamed, weight)) + else: + other_weights.append((renamed, weight)) + + return language_model_weights, other_weights + + language_model_weights, other_weights = process_and_separate_weights() + + # Load language model weights loader = AutoWeightsLoader(self) - loaded_language_model_params = loader.load_weights( - language_model_weights) + loaded_language_model_params = loader.load_weights(language_model_weights) assert loaded_language_model_params is not None updated_params.update(loaded_language_model_params) From 03d2b3be9b970dfceda3674d3e3d67550f1294d8 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Mon, 23 Jun 2025 08:58:25 +0000 Subject: [PATCH 07/25] debug, handle kv scales --- vllm/model_executor/models/llama4.py | 49 +++++++++++++++++- vllm/model_executor/models/mllama4.py | 71 ++++++++++++++++++++++++++- 2 files changed, 117 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 0c9baab1f2e..c5d498b0e0a 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -35,7 +35,7 @@ RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel from .utils import (AutoWeightsLoader, extract_layer_index, fast_topk, @@ -435,6 +435,12 @@ def load_weights(self, weights: Iterable[tuple[str, name = name.replace(weight_name, param_name) if is_pp_missing_parameter(name, self): continue + if name.endswith("scale") and "expert" not in name: + # Remapping the name of FP8 kv-scale. + remapped_name = maybe_remap_kv_scale_name(name, params_dict) + if remapped_name is None: + continue + name = remapped_name param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -452,6 +458,47 @@ def load_weights(self, weights: Iterable[tuple[str, if not moe_loaded: if is_pp_missing_parameter(name, self): continue + + # Handle flat expert scale parameters that don't match per-expert patterns + if ("experts." 
in name and + ("w13_input_scale" in name or "w13_weight_scale" in name or + "w2_input_scale" in name or "w2_weight_scale" in name)): + # These are flat expert scales that apply to all experts + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + + # Check if this is a MoE-specific weight loader that needs extra arguments + if hasattr(param, 'weight_loader'): + try: + # Try to inspect the weight_loader signature + import inspect + sig = inspect.signature(weight_loader) + if 'expert_id' in sig.parameters and 'shard_id' in sig.parameters: + # This is a MoE weight loader, provide the required arguments + # Determine the appropriate shard_id based on parameter name + if "w13_" in name: + # w13 corresponds to gate_up_proj, which can be either w1 or w3 + # For scales, we typically use w1 as the representative + shard_id = "w1" + elif "w2_" in name: + # w2 corresponds to down_proj + shard_id = "w2" + else: + # Fallback - this shouldn't happen for scale parameters + shard_id = "w1" + + weight_loader(param, loaded_weight, name, shard_id=shard_id, expert_id=0) + else: + # Regular weight loader + weight_loader(param, loaded_weight) + except Exception: + # Fallback to regular loading if signature inspection fails + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight) + loaded_params.add(name) + continue + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index e9a50b19109..136a6828b09 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -717,6 +717,7 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], } @classmethod @@ -910,6 +911,12 @@ def load_weights(self, weights: Iterable[tuple[str, (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), + # Shared expert gate_up_proj stacking + (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0), + (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1), + # Feed forward gate_up_proj stacking (for non-MoE layers if any) + (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0), + (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1), ] params_dict = dict(self.named_parameters()) updated_params: set[str] = set() @@ -929,11 +936,38 @@ def process_and_separate_weights(): language_model_weights = [] other_weights = [] + # Track scale parameters for debugging + checkpoint_scales = [] + renamed_scales = [] + for name, weight in weights: + # Track scale parameters from checkpoint + if "scale" in name: + checkpoint_scales.append(name) + # Apply renaming logic if name.startswith("model."): - # Rename model.* to language_model.model.* - renamed = name.replace("model.", "language_model.model.", 1) + # Handle expert scale parameters with flat naming + if "feed_forward.experts." 
in name and ("_input_scale" in name or "_weight_scale" in name): + # Expert scales in checkpoint are single values for all experts + # e.g., "model.layers.0.feed_forward.experts.down_proj_input_scale" + # should map to "language_model.model.layers.0.feed_forward.experts.w2_input_scale" + + renamed = name.replace("model.", "language_model.model.", 1) + + # Map checkpoint naming to vLLM's expected naming + if "down_proj_input_scale" in renamed: + renamed = renamed.replace("down_proj_input_scale", "w2_input_scale") + elif "down_proj_weight_scale" in renamed: + renamed = renamed.replace("down_proj_weight_scale", "w2_weight_scale") + elif "gate_up_proj_input_scale" in renamed: + renamed = renamed.replace("gate_up_proj_input_scale", "w13_input_scale") + elif "gate_up_proj_weight_scale" in renamed: + renamed = renamed.replace("gate_up_proj_weight_scale", "w13_weight_scale") + # If none of the above patterns match, keep the renamed version as is + else: + # Standard model.* to language_model.model.* renaming + renamed = name.replace("model.", "language_model.model.", 1) elif name.startswith("lm_head.weight"): # Rename lm_head.weight to language_model.lm_head.weight renamed = name.replace("lm_head.weight", "language_model.lm_head.weight") @@ -941,12 +975,45 @@ def process_and_separate_weights(): # Keep other weights as is renamed = name + # Track renamed scale parameters + if "scale" in renamed: + renamed_scales.append(renamed) + # Separate into language_model and other weights if renamed.startswith("language_model."): language_model_weights.append((renamed, weight)) else: other_weights.append((renamed, weight)) + # Debug scale parameter mapping + print("=== SCALE PARAMETER MAPPING DEBUG ===") + print(f"Total scale parameters in checkpoint: {len(checkpoint_scales)}") + print(f"Total renamed scale parameters: {len(renamed_scales)}") + + # Categorize scale parameters + self_attn_scales = [s for s in checkpoint_scales if "self_attn" in s] + expert_scales = [s for s in checkpoint_scales if "experts" in s and "shared_expert" not in s] + shared_expert_scales = [s for s in checkpoint_scales if "shared_expert" in s] + other_scales = [s for s in checkpoint_scales if s not in self_attn_scales + expert_scales + shared_expert_scales] + + print(f"\nScale parameter categories from checkpoint:") + print(f" Self-attention scales: {len(self_attn_scales)}") + print(f" Expert scales: {len(expert_scales)}") + print(f" Shared expert scales: {len(shared_expert_scales)}") + print(f" Other scales: {len(other_scales)}") + + if expert_scales: + print(f"\nFirst 5 expert scale parameters (original):") + for i, name in enumerate(expert_scales[:5]): + print(f" {i+1}. {name}") + + print(f"\nFirst 5 expert scale parameters (renamed):") + expert_renamed = [s for s in renamed_scales if "experts" in s and "shared_expert" not in s] + for i, name in enumerate(expert_renamed[:5]): + print(f" {i+1}. 
{name}") + + print("=== END SCALE DEBUG ===\n") + return language_model_weights, other_weights language_model_weights, other_weights = process_and_separate_weights() From 93d7185b38e81058a548f95a0e7b8e3791298d80 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Mon, 23 Jun 2025 20:40:58 +0000 Subject: [PATCH 08/25] fix kv scale name matching issue --- vllm/model_executor/models/llama4.py | 5 ++--- vllm/model_executor/models/mllama4.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index c5d498b0e0a..16c8c3a411a 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -437,10 +437,9 @@ def load_weights(self, weights: Iterable[tuple[str, continue if name.endswith("scale") and "expert" not in name: # Remapping the name of FP8 kv-scale. - remapped_name = maybe_remap_kv_scale_name(name, params_dict) - if remapped_name is None: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: continue - name = remapped_name param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 136a6828b09..f750609f49f 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.utils import initialize_model -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -968,6 +968,19 @@ def process_and_separate_weights(): else: # Standard model.* to language_model.model.* renaming renamed = name.replace("model.", "language_model.model.", 1) + + # Handle FP8 scale parameters: k_proj.k_scale -> attn.k_scale, v_proj.v_scale -> attn.v_scale + if ".k_proj.k_scale" in renamed: + original_renamed = renamed + renamed = renamed.replace(".k_proj.k_scale", ".attn.k_scale") + print(f"Remapped FP8 k_scale: {original_renamed} -> {renamed}") + elif ".v_proj.v_scale" in renamed: + original_renamed = renamed + renamed = renamed.replace(".v_proj.v_scale", ".attn.v_scale") + print(f"Remapped FP8 v_scale: {original_renamed} -> {renamed}") + # Track renamed scale parameters + if "scale" in renamed: + renamed_scales.append(renamed) elif name.startswith("lm_head.weight"): # Rename lm_head.weight to language_model.lm_head.weight renamed = name.replace("lm_head.weight", "language_model.lm_head.weight") From d154fe14efb3eb4e2e7753ca7ec4e3167122dce3 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Mon, 23 Jun 2025 23:47:53 +0000 Subject: [PATCH 09/25] update, debug --- .../model_loader/weight_utils.py | 10 +++ vllm/model_executor/models/llama4.py | 36 +++++--- vllm/model_executor/models/mllama4.py | 85 +++++++++---------- 3 files changed, 74 insertions(+), 57 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 857f4bca682..b886efed0a8 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ 
b/vllm/model_executor/model_loader/weight_utils.py @@ -758,6 +758,10 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: modelopt_scale_names = [ ".self_attn.k_proj.k_scale", ".self_attn.v_proj.v_scale" ] + # Also support qkv_proj scale parameters (from stacked parameter processing) + qkv_proj_scale_names = [ + ".self_attn.qkv_proj.k_scale", ".self_attn.qkv_proj.v_scale" + ] for scale_name in possible_scale_names: if name.endswith(scale_name): if any(mo_scale_name in name @@ -765,6 +769,12 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: remapped_name = name.replace( f".self_attn.{scale_name[1]}_proj{scale_name}", f".self_attn.attn{scale_name}") + elif any(qkv_scale_name in name + for qkv_scale_name in qkv_proj_scale_names): + # Handle qkv_proj scale parameters: .self_attn.qkv_proj.k_scale -> .self_attn.attn.k_scale + remapped_name = name.replace( + f".self_attn.qkv_proj{scale_name}", + f".self_attn.attn{scale_name}") else: remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 16c8c3a411a..40c8cf1a440 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -395,11 +395,18 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), + # (".qkv_proj", ".q_proj", "q"), + # (".qkv_proj", ".k_proj", "k"), + # (".qkv_proj", ".v_proj", "v"), + # (".gate_up_proj", ".gate_proj", 0), + # (".gate_up_proj", ".up_proj", 1), + (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), + (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), + (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), + (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0), + (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1), + (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0), + (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1), ] fused_experts_params = False expert_params_mapping = FusedMoE.make_expert_params_mapping( @@ -432,17 +439,25 @@ def load_weights(self, weights: Iterable[tuple[str, for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name or "experts" in name: continue - name = name.replace(weight_name, param_name) + # Don't transform k_scale/v_scale parameter names with stacked parameter mapping + # but allow other scale parameters (input_scale, weight_scale) to be processed + if not (name.endswith((".k_scale", ".v_scale")) and "self_attn" in name): + name = name.replace(weight_name, param_name) if is_pp_missing_parameter(name, self): continue if name.endswith("scale") and "expert" not in name: # Remapping the name of FP8 kv-scale. 
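As a hedged sketch of what this remapping amounts to (the helper below is hypothetical and only mirrors the replacement rules in maybe_remap_kv_scale_name, including the qkv_proj variant added above):

    def remap_kv_scale_name_sketch(name: str) -> str:
        # Map modelopt-style kv-scale names onto vLLM's attention parameter.
        for proj in ("k", "v"):
            for src in (f".self_attn.{proj}_proj.{proj}_scale",
                        f".self_attn.qkv_proj.{proj}_scale"):
                if name.endswith(src):
                    return name.replace(src, f".self_attn.attn.{proj}_scale")
        return name

    assert remap_kv_scale_name_sketch(
        "model.layers.0.self_attn.qkv_proj.k_scale"
    ) == "model.layers.0.self_attn.attn.k_scale"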
name = maybe_remap_kv_scale_name(name, params_dict) if name is None: - continue + continue # Skip this parameter if remapping failed param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader = getattr(param, "weight_loader", default_weight_loader) + if weight_loader == default_weight_loader: + # default_weight_loader doesn't support shard_id, just load the weight directly + weight_loader(param, loaded_weight) + else: + # Custom weight loader that supports shard_id + weight_loader(param, loaded_weight, shard_id) loaded_params.add(name) break else: @@ -499,8 +514,7 @@ def load_weights(self, weights: Iterable[tuple[str, continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index f750609f49f..63195c7b2f4 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -921,15 +921,15 @@ def load_weights(self, weights: Iterable[tuple[str, params_dict = dict(self.named_parameters()) updated_params: set[str] = set() - # Debug: Print first 30 parameter names from initialized model - print("=== INITIALIZED MODEL PARAMETERS ===") - print("First 30 parameter names containing 'scale':") - scale_params = [name for name in params_dict.keys() if "scale" in name] - for i, name in enumerate(scale_params[:30]): - print(f" {i+1:2d}. {name}") - print(f"Total parameters with 'scale': {len(scale_params)}") - print(f"Total model parameters: {len(params_dict)}") - print("=== END DEBUG ===\n") + # # Debug: Print first 30 parameter names from initialized model + # print("=== INITIALIZED MODEL PARAMETERS ===") + # print("First 30 parameter names containing 'scale':") + # scale_params = [name for name in params_dict.keys() if "scale" in name] + # for i, name in enumerate(scale_params[:30]): + # print(f" {i+1:2d}. 
{name}") + # print(f"Total parameters with 'scale': {len(scale_params)}") + # print(f"Total model parameters: {len(params_dict)}") + # print("=== END DEBUG ===\n") # Combine renaming and separation logic in a single pass def process_and_separate_weights(): @@ -969,15 +969,8 @@ def process_and_separate_weights(): # Standard model.* to language_model.model.* renaming renamed = name.replace("model.", "language_model.model.", 1) - # Handle FP8 scale parameters: k_proj.k_scale -> attn.k_scale, v_proj.v_scale -> attn.v_scale - if ".k_proj.k_scale" in renamed: - original_renamed = renamed - renamed = renamed.replace(".k_proj.k_scale", ".attn.k_scale") - print(f"Remapped FP8 k_scale: {original_renamed} -> {renamed}") - elif ".v_proj.v_scale" in renamed: - original_renamed = renamed - renamed = renamed.replace(".v_proj.v_scale", ".attn.v_scale") - print(f"Remapped FP8 v_scale: {original_renamed} -> {renamed}") + # Don't do FP8 scale parameter remapping here - let Llama4Model.load_weights() handle it + # The existing logic in Llama4Model.load_weights() already has proper scale remapping via maybe_remap_kv_scale_name # Track renamed scale parameters if "scale" in renamed: renamed_scales.append(renamed) @@ -998,34 +991,34 @@ def process_and_separate_weights(): else: other_weights.append((renamed, weight)) - # Debug scale parameter mapping - print("=== SCALE PARAMETER MAPPING DEBUG ===") - print(f"Total scale parameters in checkpoint: {len(checkpoint_scales)}") - print(f"Total renamed scale parameters: {len(renamed_scales)}") - - # Categorize scale parameters - self_attn_scales = [s for s in checkpoint_scales if "self_attn" in s] - expert_scales = [s for s in checkpoint_scales if "experts" in s and "shared_expert" not in s] - shared_expert_scales = [s for s in checkpoint_scales if "shared_expert" in s] - other_scales = [s for s in checkpoint_scales if s not in self_attn_scales + expert_scales + shared_expert_scales] - - print(f"\nScale parameter categories from checkpoint:") - print(f" Self-attention scales: {len(self_attn_scales)}") - print(f" Expert scales: {len(expert_scales)}") - print(f" Shared expert scales: {len(shared_expert_scales)}") - print(f" Other scales: {len(other_scales)}") - - if expert_scales: - print(f"\nFirst 5 expert scale parameters (original):") - for i, name in enumerate(expert_scales[:5]): - print(f" {i+1}. {name}") - - print(f"\nFirst 5 expert scale parameters (renamed):") - expert_renamed = [s for s in renamed_scales if "experts" in s and "shared_expert" not in s] - for i, name in enumerate(expert_renamed[:5]): - print(f" {i+1}. 
{name}") - - print("=== END SCALE DEBUG ===\n") + # # Debug scale parameter mapping + # print("=== SCALE PARAMETER MAPPING DEBUG ===") + # print(f"Total scale parameters in checkpoint: {len(checkpoint_scales)}") + # print(f"Total renamed scale parameters: {len(renamed_scales)}") + + # # Categorize scale parameters + # self_attn_scales = [s for s in checkpoint_scales if "self_attn" in s] + # expert_scales = [s for s in checkpoint_scales if "experts" in s and "shared_expert" not in s] + # shared_expert_scales = [s for s in checkpoint_scales if "shared_expert" in s] + # other_scales = [s for s in checkpoint_scales if s not in self_attn_scales + expert_scales + shared_expert_scales] + + # print(f"\nScale parameter categories from checkpoint:") + # print(f" Self-attention scales: {len(self_attn_scales)}") + # print(f" Expert scales: {len(expert_scales)}") + # print(f" Shared expert scales: {len(shared_expert_scales)}") + # print(f" Other scales: {len(other_scales)}") + + # if expert_scales: + # print(f"\nFirst 5 expert scale parameters (original):") + # for i, name in enumerate(expert_scales[:5]): + # print(f" {i+1}. {name}") + + # print(f"\nFirst 5 expert scale parameters (renamed):") + # expert_renamed = [s for s in renamed_scales if "experts" in s and "shared_expert" not in s] + # for i, name in enumerate(expert_renamed[:5]): + # print(f" {i+1}. {name}") + + # print("=== END SCALE DEBUG ===\n") return language_model_weights, other_weights From 782c018f2c1647699e1f5a57f466fa11d38b8485 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 24 Jun 2025 07:50:15 +0000 Subject: [PATCH 10/25] cleanup --- .../layers/quantization/modelopt.py | 2 - vllm/model_executor/models/llama4.py | 17 +++------ vllm/model_executor/models/mllama4.py | 38 ------------------- 3 files changed, 5 insertions(+), 52 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 278e747035f..7617b91cc2a 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -405,8 +405,6 @@ def apply( scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, ) - # from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - # cutlass_moe_fp8) from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts return fused_experts( x, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 40c8cf1a440..4131c357ac2 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -395,18 +395,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) - # (".qkv_proj", ".q_proj", "q"), - # (".qkv_proj", ".k_proj", "k"), - # (".qkv_proj", ".v_proj", "v"), - # (".gate_up_proj", ".gate_proj", 0), - # (".gate_up_proj", ".up_proj", 1), - (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), - (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), - (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), - (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0), - (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1), - (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0), - (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1), + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), ] 
fused_experts_params = False expert_params_mapping = FusedMoE.make_expert_params_mapping( diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 63195c7b2f4..c609fbe1e91 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -921,16 +921,6 @@ def load_weights(self, weights: Iterable[tuple[str, params_dict = dict(self.named_parameters()) updated_params: set[str] = set() - # # Debug: Print first 30 parameter names from initialized model - # print("=== INITIALIZED MODEL PARAMETERS ===") - # print("First 30 parameter names containing 'scale':") - # scale_params = [name for name in params_dict.keys() if "scale" in name] - # for i, name in enumerate(scale_params[:30]): - # print(f" {i+1:2d}. {name}") - # print(f"Total parameters with 'scale': {len(scale_params)}") - # print(f"Total model parameters: {len(params_dict)}") - # print("=== END DEBUG ===\n") - # Combine renaming and separation logic in a single pass def process_and_separate_weights(): language_model_weights = [] @@ -991,34 +981,6 @@ def process_and_separate_weights(): else: other_weights.append((renamed, weight)) - # # Debug scale parameter mapping - # print("=== SCALE PARAMETER MAPPING DEBUG ===") - # print(f"Total scale parameters in checkpoint: {len(checkpoint_scales)}") - # print(f"Total renamed scale parameters: {len(renamed_scales)}") - - # # Categorize scale parameters - # self_attn_scales = [s for s in checkpoint_scales if "self_attn" in s] - # expert_scales = [s for s in checkpoint_scales if "experts" in s and "shared_expert" not in s] - # shared_expert_scales = [s for s in checkpoint_scales if "shared_expert" in s] - # other_scales = [s for s in checkpoint_scales if s not in self_attn_scales + expert_scales + shared_expert_scales] - - # print(f"\nScale parameter categories from checkpoint:") - # print(f" Self-attention scales: {len(self_attn_scales)}") - # print(f" Expert scales: {len(expert_scales)}") - # print(f" Shared expert scales: {len(shared_expert_scales)}") - # print(f" Other scales: {len(other_scales)}") - - # if expert_scales: - # print(f"\nFirst 5 expert scale parameters (original):") - # for i, name in enumerate(expert_scales[:5]): - # print(f" {i+1}. {name}") - - # print(f"\nFirst 5 expert scale parameters (renamed):") - # expert_renamed = [s for s in renamed_scales if "experts" in s and "shared_expert" not in s] - # for i, name in enumerate(expert_renamed[:5]): - # print(f" {i+1}. 
{name}") - - # print("=== END SCALE DEBUG ===\n") return language_model_weights, other_weights From b78b191398c439f7a8ceb38292e8f21ca0ade999 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 24 Jun 2025 20:20:35 +0000 Subject: [PATCH 11/25] fix format --- vllm/model_executor/layers/fused_moe/layer.py | 13 +- .../layers/quantization/modelopt.py | 128 ++++++++++-------- vllm/model_executor/models/llama4.py | 59 +++++--- vllm/model_executor/models/mllama4.py | 41 +++--- 4 files changed, 136 insertions(+), 105 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 24129484125..a44e83bfee8 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1050,16 +1050,13 @@ def weight_loader(self, # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern if "ModelOpt" in quant_method_name: # Determine per-tensor weight scale patterns based on variant - is_fp4_variant = ( - "ModelOptNvFp4FusedMoEMethod" in self.quant_method.__class__.__name__ - ) + is_fp4_variant = ("ModelOptNvFp4FusedMoEMethod" + in self.quant_method.__class__.__name__) - # FP4 uses "weight_scale_2" for per-tensor, FP8 uses "weight_scale" for per-tensor + # For per-tensor, FP4 uses "weight_scale_2", FP8 uses "weight_scale" per_tensor_conditions = ( - "weight_scale_2" in weight_name - if is_fp4_variant - else "weight_scale" in weight_name - ) or "input_scale" in weight_name + "weight_scale_2" in weight_name if is_fp4_variant else + "weight_scale" in weight_name) or "input_scale" in weight_name if per_tensor_conditions: self._load_per_tensor_weight_scale( diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7617b91cc2a..a2b9212de88 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, Optional, Union, List +from typing import Any, Callable, List, Optional, Union import torch from torch.nn import Module @@ -73,12 +73,10 @@ def get_config_filenames(cls) -> list[str]: def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": quant_config = cls.get_from_keys(config, ["quantization"]) quant_method = quant_config["quant_algo"] - kv_cache_quant_method = cls.get_from_keys(config, ["quantization"]).get( - "kv_cache_quant_algo" - ) - exclude_modules = cls.get_from_keys(config, ["quantization"]).get( - "exclude_modules" - ) + kv_cache_quant_method = cls.get_from_keys( + config, ["quantization"]).get("kv_cache_quant_algo") + exclude_modules = cls.get_from_keys( + config, ["quantization"]).get("exclude_modules") if quant_method not in QUANT_ALGOS: raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}" @@ -87,15 +85,17 @@ def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": "quant configuration.") is_checkpoint_fp8_serialized = ("FP8" in quant_method) - # Convert exclude_modules to handle the language_model prefix that gets added by mllama4.py + # Convert exclude_modules to handle the language_model prefix for llama4 converted_exclude_modules = [] if exclude_modules: for module in exclude_modules: converted_exclude_modules.append(module) if not module.startswith("language_model."): - converted_exclude_modules.append(f"language_model.{module}") + 
converted_exclude_modules.append( + f"language_model.{module}") - return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, converted_exclude_modules) + return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, + converted_exclude_modules) def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: @@ -191,6 +191,7 @@ def apply( input_scale=layer.input_scale, bias=bias) + class ModelOptFp8MoEMethod: """MoE method for ModelOpt FP8. Supports loading FP8 checkpoints with static weight scale and activation scale. @@ -209,10 +210,13 @@ def __new__(cls, *args, **kwargs): original_init = cls.__init__ new_cls = type( cls.__name__, - (FusedMoEMethodBase,), + (FusedMoEMethodBase, ), { "__init__": original_init, - **{k: v for k, v in cls.__dict__.items() if k != "__dict__"}, + **{ + k: v + for k, v in cls.__dict__.items() if k != "__dict__" + }, }, ) obj = super(new_cls, new_cls).__new__(new_cls) @@ -237,17 +241,16 @@ def create_weights( ): # Use FP8 dtype if checkpoint is serialized, otherwise use the default dtype - weight_dtype = ( - torch.float8_e4m3fn - if self.quant_config.is_checkpoint_fp8_serialized - else params_dtype - ) + weight_dtype = (torch.float8_e4m3fn + if self.quant_config.is_checkpoint_fp8_serialized else + params_dtype) weight_loader = extra_weight_attrs.get("weight_loader") w13_weight = ModelWeightParameter( - data=torch.empty( - num_experts, 2 * intermediate_size_per_partition, hidden_size, dtype=weight_dtype - ), + data=torch.empty(num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=weight_dtype), input_dim=2, output_dim=1, weight_loader=weight_loader, @@ -255,9 +258,10 @@ def create_weights( layer.register_parameter("w13_weight", w13_weight) w2_weight = ModelWeightParameter( - data=torch.empty( - num_experts, hidden_size, intermediate_size_per_partition, dtype=weight_dtype - ), + data=torch.empty(num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=weight_dtype), input_dim=2, output_dim=1, weight_loader=weight_loader, @@ -278,7 +282,10 @@ def create_weights( ) w2_weight_scale = PerTensorScaleParameter( data=torch.full( - (num_experts,), 1.0, dtype=torch.float32 # Initialize to reasonable default instead of -inf + (num_experts, ), + 1.0, + dtype=torch. + float32 # Initialize to reasonable default instead of -inf ), weight_loader=weight_loader, ) @@ -287,16 +294,15 @@ def create_weights( # Set weight loader attributes for scales extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} - ) + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) # INPUT SCALES - Per-tensor scaling for ModelOpt w13_input_scale = PerTensorScaleParameter( - data=torch.full((num_experts,), 1.0, dtype=torch.float32), + data=torch.full((num_experts, ), 1.0, dtype=torch.float32), weight_loader=weight_loader, ) w2_input_scale = PerTensorScaleParameter( - data=torch.full((num_experts,), 1.0, dtype=torch.float32), + data=torch.full((num_experts, ), 1.0, dtype=torch.float32), weight_loader=weight_loader, ) layer.register_parameter("w13_input_scale", w13_input_scale) @@ -307,24 +313,27 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: Only supports pre-quantized checkpoints with FP8 weights and scales. 
""" - layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False) + layer.w13_weight = Parameter(layer.w13_weight.data, + requires_grad=False) layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) + from vllm._custom_ops import scaled_fp8_quant from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( per_tensor_dequantize) - from vllm._custom_ops import scaled_fp8_quant # Handle scale parameters - if hasattr(layer, "w13_weight_scale") and layer.w13_weight_scale is not None: + if hasattr(layer, + "w13_weight_scale") and layer.w13_weight_scale is not None: # Fp8 moe kernel needs single weight scale for w13 per expert. - # We take the max of the w1 and w3 scales then dequant and requant each expert. + # We take the max of the w1 and w3 scales + # then dequant and requant each expert. if layer.w13_weight_scale.dim() == 2: # Shape: (num_experts, 2) # Get the maximum scale across w1 and w3 for each expert max_w13_scales = layer.w13_weight_scale.max(dim=1).values # Requantize each expert's weights using the combined scale - # w13_weight has shape (num_experts, 2 * intermediate_size, hidden_size) + # w13_weight (num_experts, 2 * intermediate_size, hidden_size) # where the first intermediate_size rows are w1, the next are w3 intermediate_size = layer.w13_weight.shape[1] // 2 for expert_id in range(layer.w13_weight.shape[0]): @@ -332,41 +341,40 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: for shard_id in range(2): # w1 and w3 # Dequantize using the original scale for this shard dq_weight = per_tensor_dequantize( - layer.w13_weight[expert_id][ - start : start + intermediate_size, : - ], + layer.w13_weight[expert_id][start:start + + intermediate_size, :], layer.w13_weight_scale[expert_id][shard_id], ) # Requantize using the combined max scale ( - layer.w13_weight[expert_id][ - start : start + intermediate_size, : - ], + layer.w13_weight[expert_id][start:start + + intermediate_size, :], _, - ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id]) + ) = scaled_fp8_quant(dq_weight, + max_w13_scales[expert_id]) start += intermediate_size - # Update the scale parameter to be per-expert instead of per-shard - layer.w13_weight_scale = Parameter(max_w13_scales, requires_grad=False) + # Update the scale parameter to be per-expert + layer.w13_weight_scale = Parameter(max_w13_scales, + requires_grad=False) else: - layer.w13_weight_scale = Parameter( - layer.w13_weight_scale.data, requires_grad=False - ) - - if hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None: - layer.w2_weight_scale = Parameter( - layer.w2_weight_scale.data, requires_grad=False - ) - if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None: - layer.w13_input_scale = Parameter( - layer.w13_input_scale.max(), requires_grad=False - ) - if hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None: - layer.w2_input_scale = Parameter( - layer.w2_input_scale.max(), requires_grad=False - ) + layer.w13_weight_scale = Parameter(layer.w13_weight_scale.data, + requires_grad=False) + + if hasattr(layer, + "w2_weight_scale") and layer.w2_weight_scale is not None: + layer.w2_weight_scale = Parameter(layer.w2_weight_scale.data, + requires_grad=False) + if hasattr(layer, + "w13_input_scale") and layer.w13_input_scale is not None: + layer.w13_input_scale = Parameter(layer.w13_input_scale.max(), + requires_grad=False) + if hasattr(layer, + "w2_input_scale") and layer.w2_input_scale is not None: + layer.w2_input_scale = 
Parameter(layer.w2_input_scale.max(), + requires_grad=False) def apply( self, @@ -405,7 +413,8 @@ def apply( scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, ) - from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts + from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_experts) return fused_experts( x, layer.w13_weight, @@ -425,6 +434,7 @@ def apply( apply_router_weight_on_input=apply_router_weight_on_input, ) + class ModelOptNvFp4Config(QuantizationConfig): """Config class for ModelOpt FP4.""" diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 4131c357ac2..d0dbae20dce 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -35,7 +35,8 @@ RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel from .utils import (AutoWeightsLoader, extract_layer_index, fast_topk, @@ -432,9 +433,11 @@ def load_weights(self, weights: Iterable[tuple[str, for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name or "experts" in name: continue - # Don't transform k_scale/v_scale parameter names with stacked parameter mapping - # but allow other scale parameters (input_scale, weight_scale) to be processed - if not (name.endswith((".k_scale", ".v_scale")) and "self_attn" in name): + # Don't transform k_scale/v_scale parameter names with + # stacked parameter mapping but allow other scale parameters + # (input_scale, weight_scale) to be processed + if not (name.endswith( + (".k_scale", ".v_scale")) and "self_attn" in name): name = name.replace(weight_name, param_name) if is_pp_missing_parameter(name, self): continue @@ -444,9 +447,10 @@ def load_weights(self, weights: Iterable[tuple[str, if name is None: continue # Skip this parameter if remapping failed param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) if weight_loader == default_weight_loader: - # default_weight_loader doesn't support shard_id, just load the weight directly + # default_weight_loader doesn't support shard_id weight_loader(param, loaded_weight) else: # Custom weight loader that supports shard_id @@ -466,40 +470,52 @@ def load_weights(self, weights: Iterable[tuple[str, if is_pp_missing_parameter(name, self): continue - # Handle flat expert scale parameters that don't match per-expert patterns - if ("experts." in name and - ("w13_input_scale" in name or "w13_weight_scale" in name or - "w2_input_scale" in name or "w2_weight_scale" in name)): + # Handle flat expert scale parameters that + # don't match per-expert patterns + if ("experts." 
in name and ("w13_input_scale" in name + or "w13_weight_scale" in name + or "w2_input_scale" in name + or "w2_weight_scale" in name)): # These are flat expert scales that apply to all experts param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) - # Check if this is a MoE-specific weight loader that needs extra arguments + # Check if this is a MoE-specific weight loader that + # needs extra arguments if hasattr(param, 'weight_loader'): try: # Try to inspect the weight_loader signature import inspect sig = inspect.signature(weight_loader) - if 'expert_id' in sig.parameters and 'shard_id' in sig.parameters: - # This is a MoE weight loader, provide the required arguments - # Determine the appropriate shard_id based on parameter name + if ('expert_id' in sig.parameters and + 'shard_id' in sig.parameters): + # This is a MoE weight loader, provide the + # required arguments + # Determine the appropriate shard_id based + # on parameter name if "w13_" in name: - # w13 corresponds to gate_up_proj, which can be either w1 or w3 - # For scales, we typically use w1 as the representative + # w13 corresponds to gate_up_proj, which + # can be either w1 or w3 shard_id = "w1" elif "w2_" in name: # w2 corresponds to down_proj shard_id = "w2" else: - # Fallback - this shouldn't happen for scale parameters + # Fallback - this shouldn't happen for + # scale parameters shard_id = "w1" - weight_loader(param, loaded_weight, name, shard_id=shard_id, expert_id=0) + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=0) else: # Regular weight loader weight_loader(param, loaded_weight) except Exception: - # Fallback to regular loading if signature inspection fails + # Fallback to regular loading weight_loader(param, loaded_weight) else: weight_loader(param, loaded_weight) @@ -507,7 +523,8 @@ def load_weights(self, weights: Iterable[tuple[str, continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index c609fbe1e91..70e379b4def 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.utils import initialize_model -from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -938,35 +938,42 @@ def process_and_separate_weights(): # Apply renaming logic if name.startswith("model."): # Handle expert scale parameters with flat naming - if "feed_forward.experts." 
in name and ("_input_scale" in name or "_weight_scale" in name): - # Expert scales in checkpoint are single values for all experts - # e.g., "model.layers.0.feed_forward.experts.down_proj_input_scale" - # should map to "language_model.model.layers.0.feed_forward.experts.w2_input_scale" + if "feed_forward.experts." in name and ( + "_input_scale" in name or "_weight_scale" in name): + # Expert scales in checkpoint are single values for all + # experts e.g., "model.layers.0.feed_forward.experts. + # down_proj_input_scale" should map to "language_model. + # model.layers.0.feed_forward.experts.w2_input_scale" - renamed = name.replace("model.", "language_model.model.", 1) + renamed = name.replace("model.", + "language_model.model.", 1) # Map checkpoint naming to vLLM's expected naming if "down_proj_input_scale" in renamed: - renamed = renamed.replace("down_proj_input_scale", "w2_input_scale") + renamed = renamed.replace("down_proj_input_scale", + "w2_input_scale") elif "down_proj_weight_scale" in renamed: - renamed = renamed.replace("down_proj_weight_scale", "w2_weight_scale") + renamed = renamed.replace("down_proj_weight_scale", + "w2_weight_scale") elif "gate_up_proj_input_scale" in renamed: - renamed = renamed.replace("gate_up_proj_input_scale", "w13_input_scale") + renamed = renamed.replace( + "gate_up_proj_input_scale", "w13_input_scale") elif "gate_up_proj_weight_scale" in renamed: - renamed = renamed.replace("gate_up_proj_weight_scale", "w13_weight_scale") - # If none of the above patterns match, keep the renamed version as is + renamed = renamed.replace( + "gate_up_proj_weight_scale", + "w13_weight_scale") else: # Standard model.* to language_model.model.* renaming - renamed = name.replace("model.", "language_model.model.", 1) + renamed = name.replace("model.", + "language_model.model.", 1) - # Don't do FP8 scale parameter remapping here - let Llama4Model.load_weights() handle it - # The existing logic in Llama4Model.load_weights() already has proper scale remapping via maybe_remap_kv_scale_name # Track renamed scale parameters if "scale" in renamed: renamed_scales.append(renamed) elif name.startswith("lm_head.weight"): # Rename lm_head.weight to language_model.lm_head.weight - renamed = name.replace("lm_head.weight", "language_model.lm_head.weight") + renamed = name.replace("lm_head.weight", + "language_model.lm_head.weight") else: # Keep other weights as is renamed = name @@ -981,14 +988,14 @@ def process_and_separate_weights(): else: other_weights.append((renamed, weight)) - return language_model_weights, other_weights language_model_weights, other_weights = process_and_separate_weights() # Load language model weights loader = AutoWeightsLoader(self) - loaded_language_model_params = loader.load_weights(language_model_weights) + loaded_language_model_params = loader.load_weights( + language_model_weights) assert loaded_language_model_params is not None updated_params.update(loaded_language_model_params) From 22745f3508b92b4418b1cde3a0ead88d6b938608 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 9 Jul 2025 06:15:26 +0000 Subject: [PATCH 12/25] resolve conflict --- vllm/attention/layer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index f0ad68b1640..ee8c452b4e1 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -314,9 +314,13 @@ def __init__( _Backend.FLEX_ATTENTION): backend = _Backend.XFORMERS - self.attn_backend = backend if backend in { - _Backend.TORCH_SDPA, 
_Backend.XFORMERS, _Backend.PALLAS_VLLM_V1 - } else _Backend.TORCH_SDPA + # self.attn_backend = backend if backend in { + # _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1 + # } else _Backend.TORCH_SDPA + + # Force TORCH_SDPA to avoid xformers-triton compatibility issues + # TODO: Remove this workaround once xformers-triton compatibility is fixed + self.attn_backend = _Backend.TORCH_SDPA def forward( self, From 1c5acec33b93cf7633eed57e00b0926e5b37e009 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 2 Jul 2025 03:42:49 +0000 Subject: [PATCH 13/25] debug --- vllm/model_executor/models/mllama4.py | 222 +++++++++++++++++++++++++- 1 file changed, 220 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 70e379b4def..1538a098af6 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -929,6 +929,7 @@ def process_and_separate_weights(): # Track scale parameters for debugging checkpoint_scales = [] renamed_scales = [] + scale_mapping = {} for name, weight in weights: # Track scale parameters from checkpoint @@ -962,14 +963,31 @@ def process_and_separate_weights(): renamed = renamed.replace( "gate_up_proj_weight_scale", "w13_weight_scale") + # Handle attention scale parameters + elif "self_attn." in name and ( + ".k_scale" in name or ".v_scale" in name): + # Map attention scale parameters for ModelOpt checkpoints + # e.g., "model.layers.0.self_attn.k_proj.k_scale" + # should map to "language_model.model.layers.0.self_attn.attn.k_scale" + + renamed = name.replace("model.", + "language_model.model.", 1) + + # Map checkpoint attention scale naming to vLLM's expected naming + if ".k_proj.k_scale" in renamed: + renamed = renamed.replace(".k_proj.k_scale", ".attn.k_scale") + elif ".v_proj.v_scale" in renamed: + renamed = renamed.replace(".v_proj.v_scale", ".attn.v_scale") else: # Standard model.* to language_model.model.* renaming renamed = name.replace("model.", "language_model.model.", 1) - # Track renamed scale parameters + # Track renamed scale parameters and mapping if "scale" in renamed: renamed_scales.append(renamed) + if "scale" in name: # Only add to mapping if original was also a scale + scale_mapping[name] = renamed elif name.startswith("lm_head.weight"): # Rename lm_head.weight to language_model.lm_head.weight renamed = name.replace("lm_head.weight", @@ -978,9 +996,11 @@ def process_and_separate_weights(): # Keep other weights as is renamed = name - # Track renamed scale parameters + # Track renamed scale parameters and mapping if "scale" in renamed: renamed_scales.append(renamed) + if "scale" in name and name not in scale_mapping: # Avoid duplicates + scale_mapping[name] = renamed # Separate into language_model and other weights if renamed.startswith("language_model."): @@ -988,6 +1008,108 @@ def process_and_separate_weights(): else: other_weights.append((renamed, weight)) + # Print debugging information for scale parameters + print(f"\n=== SCALE PARAMETER LOADING DEBUG INFO ===") + print(f"Scale parameters found in checkpoint ({len(checkpoint_scales)}):") + + # Group scale parameters by type for better readability + moe_scales = [s for s in checkpoint_scales if "experts." in s] + attn_scales = [s for s in checkpoint_scales if "self_attn." 
in s and "scale" in s] + other_scales = [s for s in checkpoint_scales if s not in moe_scales and s not in attn_scales] + + # Further categorize attention scales + kv_cache_scales = [s for s in attn_scales if ".k_scale" in s or ".v_scale" in s] + linear_scales = [s for s in attn_scales if s not in kv_cache_scales] + + if moe_scales: + print(f"\n MoE Expert Scales ({len(moe_scales)}):") + for scale_name in sorted(moe_scales): + print(f" {scale_name}") + + if attn_scales: + print(f"\n Attention Scales ({len(attn_scales)}):") + if kv_cache_scales: + print(f" KV Cache Scales ({len(kv_cache_scales)}):") + for scale_name in sorted(kv_cache_scales): + print(f" {scale_name}") + if linear_scales: + print(f" Linear Projection Scales ({len(linear_scales)}):") + for scale_name in sorted(linear_scales): + print(f" {scale_name}") + + # Note about missing q_scale and prob_scale + print(f" 📝 Note: q_scale and prob_scale not found in checkpoint") + print(f" These will use default values (1.0) as expected") + + if other_scales: + print(f"\n Other Scales ({len(other_scales)}):") + for scale_name in sorted(other_scales): + print(f" {scale_name}") + + print(f"\nScale parameter name mappings ({len(scale_mapping)}):") + + # Group mappings by type for clarity + moe_mappings = {k: v for k, v in scale_mapping.items() if "experts." in k} + attn_mappings = {k: v for k, v in scale_mapping.items() if "self_attn." in k} + other_mappings = {k: v for k, v in scale_mapping.items() if k not in moe_mappings and k not in attn_mappings} + + if moe_mappings: + print(f"\n MoE Scale Mappings ({len(moe_mappings)}):") + for orig_name, renamed_name in sorted(moe_mappings.items()): + print(f" {orig_name} → {renamed_name}") + + if attn_mappings: + print(f"\n Attention Scale Mappings ({len(attn_mappings)}):") + for orig_name, renamed_name in sorted(attn_mappings.items()): + print(f" {orig_name} → {renamed_name}") + + if other_mappings: + print(f"\n Other Scale Mappings ({len(other_mappings)}):") + for orig_name, renamed_name in sorted(other_mappings.items()): + print(f" {orig_name} → {renamed_name}") + + print(f"\nRenamed scale parameters ({len(renamed_scales)}):") + for scale_name in sorted(renamed_scales): + print(f" {scale_name}") + + # Get expected scale parameters from model + model_scale_params = [] + for param_name in params_dict.keys(): + if "scale" in param_name: + model_scale_params.append(param_name) + + print(f"\nExpected scale parameters in model ({len(model_scale_params)}):") + for param_name in sorted(model_scale_params): + print(f" {param_name}") + + # Check for missing scale parameters + missing_scales = set(model_scale_params) - set(renamed_scales) + extra_scales = set(renamed_scales) - set(model_scale_params) + + # Filter out q_scale and prob_scale as they're expected to use defaults + expected_defaults = {p for p in missing_scales if ".attn.q_scale" in p or ".attn.prob_scale" in p} + truly_missing = missing_scales - expected_defaults + + if truly_missing: + print(f"\n⚠️ MISSING scale parameters ({len(truly_missing)}):") + for param_name in sorted(truly_missing): + print(f" {param_name}") + else: + print(f"\n✅ All required scale parameters found in checkpoint!") + + if expected_defaults: + print(f"\n📋 Scale parameters using defaults ({len(expected_defaults)}):") + for param_name in sorted(expected_defaults): + print(f" {param_name} (will use default value 1.0)") + + + if extra_scales: + print(f"\n⚠️ EXTRA scale parameters in checkpoint ({len(extra_scales)}):") + for param_name in sorted(extra_scales): + print(f" 
{param_name}") + + print(f"=== END SCALE PARAMETER DEBUG INFO ===\n") + return language_model_weights, other_weights language_model_weights, other_weights = process_and_separate_weights() @@ -1019,4 +1141,100 @@ def process_and_separate_weights(): weight_loader(param, loaded_weight) updated_params.add(name) + + # Print final verification of loaded scale parameters + print(f"\n=== SCALE PARAMETER LOADING VERIFICATION ===") + + # Show parameters that were loaded from checkpoint + loaded_scale_params = {} + for param_name, param in params_dict.items(): + if "scale" in param_name and param_name in updated_params: + if hasattr(param, 'data'): + param_value = param.data + if param_value.numel() == 1: + loaded_scale_params[param_name] = float(param_value.item()) + else: + loaded_scale_params[param_name] = f"tensor{list(param_value.shape)} (first few values: {param_value.flatten()[:5].tolist()})" + else: + loaded_scale_params[param_name] = "No .data attribute" + + if loaded_scale_params: + print(f"Scale parameters loaded from checkpoint ({len(loaded_scale_params)}):") + for param_name, value in sorted(loaded_scale_params.items()): + print(f" {param_name}: {value}") + + # Show parameters that weren't loaded but exist in model (including defaults) + not_loaded_scale_params = {} + for param_name, param in params_dict.items(): + if "scale" in param_name and param_name not in updated_params: + if hasattr(param, 'data'): + param_value = param.data + if param_value.numel() == 1: + not_loaded_scale_params[param_name] = float(param_value.item()) + else: + not_loaded_scale_params[param_name] = f"tensor{list(param_value.shape)} (first few values: {param_value.flatten()[:5].tolist()})" + else: + not_loaded_scale_params[param_name] = "No .data attribute" + + if not_loaded_scale_params: + print(f"\nScale parameters using default values ({len(not_loaded_scale_params)}):") + for param_name, value in sorted(not_loaded_scale_params.items()): + # Highlight q_scale and prob_scale specifically + if ".attn.q_scale" in param_name or ".attn.prob_scale" in param_name: + print(f" {param_name}: {value} ⭐ (expected default)") + else: + print(f" {param_name}: {value}") + + # Summary + total_scale_params = len(loaded_scale_params) + len(not_loaded_scale_params) + print(f"\nScale parameter summary:") + print(f" Loaded from checkpoint: {len(loaded_scale_params)}") + print(f" Using default values: {len(not_loaded_scale_params)}") + print(f" Total scale parameters: {total_scale_params}") + + # Fix missing attention scale parameters using proper defaults + attention_params_fixed = 0 + layer_k_scales = {} # Store k_scale values for each layer + + # First pass: collect k_scale values + for param_name, param in params_dict.items(): + if ".attn.k_scale" in param_name and hasattr(param, 'data'): + layer_prefix = param_name.replace(".attn.k_scale", "") + if param.data.numel() == 1: + layer_k_scales[layer_prefix] = float(param.data.item()) + print(f"📊 Found k_scale for {layer_prefix}: {layer_k_scales[layer_prefix]}") + + # Second pass: fix missing scales with proper defaults + for param_name, param in params_dict.items(): + if hasattr(param, 'data') and param.data.numel() == 1: + current_value = float(param.data.item()) + + # Fix q_scale: use k_scale from same layer if available + if ".attn.q_scale" in param_name and current_value == 1.0: + layer_prefix = param_name.replace(".attn.q_scale", "") + if layer_prefix in layer_k_scales: + k_scale_value = layer_k_scales[layer_prefix] + print(f"🔧 Setting {param_name}: {current_value} -> 
{k_scale_value} (using k_scale)") + param.data.fill_(k_scale_value) + attention_params_fixed += 1 + else: + print(f"⚠️ No k_scale found for {param_name}, keeping default 1.0") + + # Fix prob_scale: use standard default of 1.0/448.0 for missing values + elif ".attn.prob_scale" in param_name and current_value == 1.0: + prob_scale_default = 1.0 / 448.0 + print(f"🔧 Setting {param_name}: {current_value} -> {prob_scale_default} (attention prob default)") + param.data.fill_(prob_scale_default) + attention_params_fixed += 1 + + if attention_params_fixed > 0: + print(f"Fixed {attention_params_fixed} attention scale parameters with proper defaults") + + if not_loaded_scale_params and not any(".attn.q_scale" in p or ".attn.prob_scale" in p for p in not_loaded_scale_params): + print(f"\n⚠️ Warning: Expected q_scale and prob_scale to be using defaults, but they weren't found") + elif any(".attn.q_scale" in p or ".attn.prob_scale" in p for p in not_loaded_scale_params): + print(f"\n✅ q_scale and prob_scale are correctly using default values!") + + print(f"=== END SCALE PARAMETER VERIFICATION ===\n") + return updated_params From b10782dcbf123b61504a6c1fef6a4af8f2a62e01 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 2 Jul 2025 05:36:10 +0000 Subject: [PATCH 14/25] handle eplb in ModelOptFp8MoEMethod --- vllm/model_executor/layers/quantization/modelopt.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index a2b9212de88..600f2af9468 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -398,7 +398,14 @@ def apply( inplace: bool = True, no_combine: bool = False, routed_scaling_factor: Optional[float] = None, + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `ModelOptFp8MoEMethod` yet.") # Expert selection topk_weights, topk_ids = FusedMoE.select_experts( From 826528739df218db64fe0047f4d33097015bdf8f Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 01:32:26 +0000 Subject: [PATCH 15/25] broadcasting BMM experts scales --- vllm/model_executor/models/mllama4.py | 183 +++++--------------------- 1 file changed, 34 insertions(+), 149 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 1538a098af6..a39080bc6c3 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -963,7 +963,8 @@ def process_and_separate_weights(): renamed = renamed.replace( "gate_up_proj_weight_scale", "w13_weight_scale") - # Handle attention scale parameters + + # Handle attention scale parameters elif "self_attn." in name and ( ".k_scale" in name or ".v_scale" in name): # Map attention scale parameters for ModelOpt checkpoints @@ -1008,119 +1009,48 @@ def process_and_separate_weights(): else: other_weights.append((renamed, weight)) - # Print debugging information for scale parameters - print(f"\n=== SCALE PARAMETER LOADING DEBUG INFO ===") - print(f"Scale parameters found in checkpoint ({len(checkpoint_scales)}):") - - # Group scale parameters by type for better readability - moe_scales = [s for s in checkpoint_scales if "experts." in s] - attn_scales = [s for s in checkpoint_scales if "self_attn." 
in s and "scale" in s] - other_scales = [s for s in checkpoint_scales if s not in moe_scales and s not in attn_scales] - - # Further categorize attention scales - kv_cache_scales = [s for s in attn_scales if ".k_scale" in s or ".v_scale" in s] - linear_scales = [s for s in attn_scales if s not in kv_cache_scales] - - if moe_scales: - print(f"\n MoE Expert Scales ({len(moe_scales)}):") - for scale_name in sorted(moe_scales): - print(f" {scale_name}") - - if attn_scales: - print(f"\n Attention Scales ({len(attn_scales)}):") - if kv_cache_scales: - print(f" KV Cache Scales ({len(kv_cache_scales)}):") - for scale_name in sorted(kv_cache_scales): - print(f" {scale_name}") - if linear_scales: - print(f" Linear Projection Scales ({len(linear_scales)}):") - for scale_name in sorted(linear_scales): - print(f" {scale_name}") - - # Note about missing q_scale and prob_scale - print(f" 📝 Note: q_scale and prob_scale not found in checkpoint") - print(f" These will use default values (1.0) as expected") - - if other_scales: - print(f"\n Other Scales ({len(other_scales)}):") - for scale_name in sorted(other_scales): - print(f" {scale_name}") - - print(f"\nScale parameter name mappings ({len(scale_mapping)}):") - - # Group mappings by type for clarity - moe_mappings = {k: v for k, v in scale_mapping.items() if "experts." in k} - attn_mappings = {k: v for k, v in scale_mapping.items() if "self_attn." in k} - other_mappings = {k: v for k, v in scale_mapping.items() if k not in moe_mappings and k not in attn_mappings} - - if moe_mappings: - print(f"\n MoE Scale Mappings ({len(moe_mappings)}):") - for orig_name, renamed_name in sorted(moe_mappings.items()): - print(f" {orig_name} → {renamed_name}") - - if attn_mappings: - print(f"\n Attention Scale Mappings ({len(attn_mappings)}):") - for orig_name, renamed_name in sorted(attn_mappings.items()): - print(f" {orig_name} → {renamed_name}") - - if other_mappings: - print(f"\n Other Scale Mappings ({len(other_mappings)}):") - for orig_name, renamed_name in sorted(other_mappings.items()): - print(f" {orig_name} → {renamed_name}") - - print(f"\nRenamed scale parameters ({len(renamed_scales)}):") - for scale_name in sorted(renamed_scales): - print(f" {scale_name}") - - # Get expected scale parameters from model - model_scale_params = [] - for param_name in params_dict.keys(): - if "scale" in param_name: - model_scale_params.append(param_name) - - print(f"\nExpected scale parameters in model ({len(model_scale_params)}):") - for param_name in sorted(model_scale_params): - print(f" {param_name}") - - # Check for missing scale parameters - missing_scales = set(model_scale_params) - set(renamed_scales) - extra_scales = set(renamed_scales) - set(model_scale_params) - - # Filter out q_scale and prob_scale as they're expected to use defaults - expected_defaults = {p for p in missing_scales if ".attn.q_scale" in p or ".attn.prob_scale" in p} - truly_missing = missing_scales - expected_defaults - - if truly_missing: - print(f"\n⚠️ MISSING scale parameters ({len(truly_missing)}):") - for param_name in sorted(truly_missing): - print(f" {param_name}") - else: - print(f"\n✅ All required scale parameters found in checkpoint!") - - if expected_defaults: - print(f"\n📋 Scale parameters using defaults ({len(expected_defaults)}):") - for param_name in sorted(expected_defaults): - print(f" {param_name} (will use default value 1.0)") - - - if extra_scales: - print(f"\n⚠️ EXTRA scale parameters in checkpoint ({len(extra_scales)}):") - for param_name in sorted(extra_scales): - print(f" 
{param_name}") - - print(f"=== END SCALE PARAMETER DEBUG INFO ===\n") return language_model_weights, other_weights language_model_weights, other_weights = process_and_separate_weights() - # Load language model weights + # Handle expert scale parameters separately to avoid FusedMoE weight loader issues + expert_scale_weights = [] + regular_language_model_weights = [] + + for name, weight in language_model_weights: + # Check if this is an expert scale parameter that needs broadcasting + if ("feed_forward.experts." in name and "scale" in name and + ".shared_expert" not in name): + + if name in params_dict: + param = params_dict[name] + if (hasattr(param, 'data') and param.data.numel() > 1 and + weight.numel() == 1): + # This needs broadcasting - handle it directly + # print(f"Broadcasting single scale value {weight.item()} to shape {param.data.shape} for {name}") + param.data.fill_(weight.item()) + updated_params.add(name) + continue + + # Regular expert scale loading - add to separate list + expert_scale_weights.append((name, weight)) + else: + regular_language_model_weights.append((name, weight)) + + # Load regular language model weights (excluding expert scales that need broadcasting) loader = AutoWeightsLoader(self) loaded_language_model_params = loader.load_weights( - language_model_weights) + regular_language_model_weights) assert loaded_language_model_params is not None updated_params.update(loaded_language_model_params) + # Load expert scale weights that didn't need broadcasting through normal mechanism + if expert_scale_weights: + loaded_expert_scale_params = loader.load_weights(expert_scale_weights) + if loaded_expert_scale_params: + updated_params.update(loaded_expert_scale_params) + if self.use_data_parallel: other_weights = self._consolidate_qkv_weights(other_weights) @@ -1192,49 +1122,4 @@ def process_and_separate_weights(): print(f" Using default values: {len(not_loaded_scale_params)}") print(f" Total scale parameters: {total_scale_params}") - # Fix missing attention scale parameters using proper defaults - attention_params_fixed = 0 - layer_k_scales = {} # Store k_scale values for each layer - - # First pass: collect k_scale values - for param_name, param in params_dict.items(): - if ".attn.k_scale" in param_name and hasattr(param, 'data'): - layer_prefix = param_name.replace(".attn.k_scale", "") - if param.data.numel() == 1: - layer_k_scales[layer_prefix] = float(param.data.item()) - print(f"📊 Found k_scale for {layer_prefix}: {layer_k_scales[layer_prefix]}") - - # Second pass: fix missing scales with proper defaults - for param_name, param in params_dict.items(): - if hasattr(param, 'data') and param.data.numel() == 1: - current_value = float(param.data.item()) - - # Fix q_scale: use k_scale from same layer if available - if ".attn.q_scale" in param_name and current_value == 1.0: - layer_prefix = param_name.replace(".attn.q_scale", "") - if layer_prefix in layer_k_scales: - k_scale_value = layer_k_scales[layer_prefix] - print(f"🔧 Setting {param_name}: {current_value} -> {k_scale_value} (using k_scale)") - param.data.fill_(k_scale_value) - attention_params_fixed += 1 - else: - print(f"⚠️ No k_scale found for {param_name}, keeping default 1.0") - - # Fix prob_scale: use standard default of 1.0/448.0 for missing values - elif ".attn.prob_scale" in param_name and current_value == 1.0: - prob_scale_default = 1.0 / 448.0 - print(f"🔧 Setting {param_name}: {current_value} -> {prob_scale_default} (attention prob default)") - param.data.fill_(prob_scale_default) - 
attention_params_fixed += 1 - - if attention_params_fixed > 0: - print(f"Fixed {attention_params_fixed} attention scale parameters with proper defaults") - - if not_loaded_scale_params and not any(".attn.q_scale" in p or ".attn.prob_scale" in p for p in not_loaded_scale_params): - print(f"\n⚠️ Warning: Expected q_scale and prob_scale to be using defaults, but they weren't found") - elif any(".attn.q_scale" in p or ".attn.prob_scale" in p for p in not_loaded_scale_params): - print(f"\n✅ q_scale and prob_scale are correctly using default values!") - - print(f"=== END SCALE PARAMETER VERIFICATION ===\n") - return updated_params From 59190eae74d6d7539406d9211a0fcd2b57b3c8e2 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 05:30:38 +0000 Subject: [PATCH 16/25] cleanup --- vllm/model_executor/models/mllama4.py | 51 --------------------------- 1 file changed, 51 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index a39080bc6c3..2168f9c149a 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -1028,7 +1028,6 @@ def process_and_separate_weights(): if (hasattr(param, 'data') and param.data.numel() > 1 and weight.numel() == 1): # This needs broadcasting - handle it directly - # print(f"Broadcasting single scale value {weight.item()} to shape {param.data.shape} for {name}") param.data.fill_(weight.item()) updated_params.add(name) continue @@ -1072,54 +1071,4 @@ def process_and_separate_weights(): weight_loader(param, loaded_weight) updated_params.add(name) - # Print final verification of loaded scale parameters - print(f"\n=== SCALE PARAMETER LOADING VERIFICATION ===") - - # Show parameters that were loaded from checkpoint - loaded_scale_params = {} - for param_name, param in params_dict.items(): - if "scale" in param_name and param_name in updated_params: - if hasattr(param, 'data'): - param_value = param.data - if param_value.numel() == 1: - loaded_scale_params[param_name] = float(param_value.item()) - else: - loaded_scale_params[param_name] = f"tensor{list(param_value.shape)} (first few values: {param_value.flatten()[:5].tolist()})" - else: - loaded_scale_params[param_name] = "No .data attribute" - - if loaded_scale_params: - print(f"Scale parameters loaded from checkpoint ({len(loaded_scale_params)}):") - for param_name, value in sorted(loaded_scale_params.items()): - print(f" {param_name}: {value}") - - # Show parameters that weren't loaded but exist in model (including defaults) - not_loaded_scale_params = {} - for param_name, param in params_dict.items(): - if "scale" in param_name and param_name not in updated_params: - if hasattr(param, 'data'): - param_value = param.data - if param_value.numel() == 1: - not_loaded_scale_params[param_name] = float(param_value.item()) - else: - not_loaded_scale_params[param_name] = f"tensor{list(param_value.shape)} (first few values: {param_value.flatten()[:5].tolist()})" - else: - not_loaded_scale_params[param_name] = "No .data attribute" - - if not_loaded_scale_params: - print(f"\nScale parameters using default values ({len(not_loaded_scale_params)}):") - for param_name, value in sorted(not_loaded_scale_params.items()): - # Highlight q_scale and prob_scale specifically - if ".attn.q_scale" in param_name or ".attn.prob_scale" in param_name: - print(f" {param_name}: {value} ⭐ (expected default)") - else: - print(f" {param_name}: {value}") - - # Summary - total_scale_params = len(loaded_scale_params) + len(not_loaded_scale_params) - 
print(f"\nScale parameter summary:") - print(f" Loaded from checkpoint: {len(loaded_scale_params)}") - print(f" Using default values: {len(not_loaded_scale_params)}") - print(f" Total scale parameters: {total_scale_params}") - return updated_params From 7a6fc84e3f5f5e182428e937ab7cb23e4d93a3b7 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 06:44:30 +0000 Subject: [PATCH 17/25] some refactor and cleanup --- .../layers/quantization/modelopt.py | 40 ++++++++++++------- .../model_loader/weight_utils.py | 2 +- vllm/model_executor/models/llama4.py | 17 +------- vllm/model_executor/models/mllama4.py | 37 +---------------- 4 files changed, 30 insertions(+), 66 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 600f2af9468..68aa2872c1f 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -85,22 +85,34 @@ def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": "quant configuration.") is_checkpoint_fp8_serialized = ("FP8" in quant_method) - # Convert exclude_modules to handle the language_model prefix for llama4 - converted_exclude_modules = [] - if exclude_modules: - for module in exclude_modules: - converted_exclude_modules.append(module) - if not module.startswith("language_model."): - converted_exclude_modules.append( - f"language_model.{module}") - return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, - converted_exclude_modules) + exclude_modules) + + def is_layer_excluded(self, prefix: str) -> bool: + """ + Check if a layer should be excluded from quantization. + + This method handles both regular models and multimodal models that use + the language_model prefix. For multimodal models, it checks if the + module name (without the language_model prefix) is in the exclude list. + """ + if self.exclude_modules is None: + return False + + # Check if any excluded module matches the prefix + for module in self.exclude_modules: + if (module in prefix or + (prefix.startswith("language_model.") and + module in prefix.removeprefix("language_model."))): + return True + return False def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): + if self.is_layer_excluded(prefix): + return UnquantizedLinearMethod() return ModelOptFp8LinearMethod(self) elif isinstance(layer, Attention): return ModelOptFp8KVCacheMethod(self) @@ -275,7 +287,7 @@ def create_weights( w13_weight_scale = PerTensorScaleParameter( data=torch.full( (num_experts, 2), - 1.0, # Initialize to reasonable default instead of -inf + 1.0, dtype=torch.float32, ), weight_loader=weight_loader, @@ -285,7 +297,7 @@ def create_weights( (num_experts, ), 1.0, dtype=torch. - float32 # Initialize to reasonable default instead of -inf + float32 ), weight_loader=weight_loader, ) @@ -327,7 +339,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # Fp8 moe kernel needs single weight scale for w13 per expert. # We take the max of the w1 and w3 scales # then dequant and requant each expert. 
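The comment lines above summarize the requantization step performed in this method: keep the larger of the two per-shard scales for each expert, dequantize each shard with its original scale, and requantize it with the shared one. A minimal float32 emulation of that idea, shown only as an illustration (it does not use vLLM's per_tensor_dequantize or scaled_fp8_quant helpers, and it assumes the float8_e4m3fn maximum of 448.0):

import torch

FP8_E4M3_MAX = 448.0  # largest finite float8_e4m3fn value (assumed here)

def fake_quant(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Per-tensor "quantization" emulated in float32: scale down and clamp.
    return torch.clamp(x / scale, -FP8_E4M3_MAX, FP8_E4M3_MAX)

w1 = torch.randn(4, 8)          # stand-in for one expert's w1 shard
w3 = torch.randn(4, 8) * 3.0    # w3 shard with a larger dynamic range
s1 = w1.abs().max() / FP8_E4M3_MAX
s3 = w3.abs().max() / FP8_E4M3_MAX
q1, q3 = fake_quant(w1, s1), fake_quant(w3, s3)

s_shared = torch.max(s1, s3)    # one scale per expert instead of one per shard
# Dequantize with the original shard scale, requantize with the shared scale.
q1_new = fake_quant(q1 * s1, s_shared)
q3_new = fake_quant(q3 * s3, s_shared)
assert torch.allclose(q1_new * s_shared, w1, atol=1e-5)
assert torch.allclose(q3_new * s_shared, w3, atol=1e-5)

Taking the maximum of the two scales keeps both shards inside the representable FP8 range after requantization; the shard with the smaller dynamic range simply gives up a little precision.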
- if layer.w13_weight_scale.dim() == 2: # Shape: (num_experts, 2) + if layer.w13_weight_scale.dim() == 2: # Get the maximum scale across w1 and w3 for each expert max_w13_scales = layer.w13_weight_scale.max(dim=1).values @@ -431,7 +443,7 @@ def apply( inplace=inplace, activation=activation, use_fp8_w8a8=True, - per_channel_quant=False, # ModelOpt uses per-tensor quantization + per_channel_quant=False, global_num_experts=global_num_experts, expert_map=expert_map, w1_scale=layer.w13_weight_scale, diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index b886efed0a8..a70c89f2d82 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -771,7 +771,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: f".self_attn.attn{scale_name}") elif any(qkv_scale_name in name for qkv_scale_name in qkv_proj_scale_names): - # Handle qkv_proj scale parameters: .self_attn.qkv_proj.k_scale -> .self_attn.attn.k_scale + # Handle qkv_proj scale parameters remapped_name = name.replace( f".self_attn.qkv_proj{scale_name}", f".self_attn.attn{scale_name}") diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index d0dbae20dce..409e627cd9f 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -433,9 +433,6 @@ def load_weights(self, weights: Iterable[tuple[str, for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name or "experts" in name: continue - # Don't transform k_scale/v_scale parameter names with - # stacked parameter mapping but allow other scale parameters - # (input_scale, weight_scale) to be processed if not (name.endswith( (".k_scale", ".v_scale")) and "self_attn" in name): name = name.replace(weight_name, param_name) @@ -445,15 +442,13 @@ def load_weights(self, weights: Iterable[tuple[str, # Remapping the name of FP8 kv-scale. 
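The call that follows, maybe_remap_kv_scale_name, translates checkpoint-style scale names such as .self_attn.qkv_proj.k_scale into the .self_attn.attn.k_scale name the attention layer registers, and returns None when the target parameter does not exist so the caller can skip it. A rough standalone sketch of that kind of suffix rewrite; the mapping table below is illustrative only, the real rules live in weight_utils.py:

from typing import Optional

# Hypothetical suffix table, for illustration only.
_SCALE_SUFFIX_MAP = {
    ".self_attn.qkv_proj.k_scale": ".self_attn.attn.k_scale",
    ".self_attn.qkv_proj.v_scale": ".self_attn.attn.v_scale",
}

def remap_scale_name(name: str, params_dict: dict) -> Optional[str]:
    for old, new in _SCALE_SUFFIX_MAP.items():
        if name.endswith(old):
            candidate = name[:-len(old)] + new
            # Only remap when the model actually registers the target name.
            return candidate if candidate in params_dict else None
    return name  # not a scale name we know about; leave it untouched

params = {"model.layers.0.self_attn.attn.k_scale": object()}
print(remap_scale_name("model.layers.0.self_attn.qkv_proj.k_scale", params))
# -> model.layers.0.self_attn.attn.k_scale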
name = maybe_remap_kv_scale_name(name, params_dict) if name is None: - continue # Skip this parameter if remapping failed + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) if weight_loader == default_weight_loader: - # default_weight_loader doesn't support shard_id weight_loader(param, loaded_weight) else: - # Custom weight loader that supports shard_id weight_loader(param, loaded_weight, shard_id) loaded_params.add(name) break @@ -490,20 +485,12 @@ def load_weights(self, weights: Iterable[tuple[str, sig = inspect.signature(weight_loader) if ('expert_id' in sig.parameters and 'shard_id' in sig.parameters): - # This is a MoE weight loader, provide the - # required arguments - # Determine the appropriate shard_id based - # on parameter name + # This is a MoE weight loader if "w13_" in name: - # w13 corresponds to gate_up_proj, which - # can be either w1 or w3 shard_id = "w1" elif "w2_" in name: - # w2 corresponds to down_proj shard_id = "w2" else: - # Fallback - this shouldn't happen for - # scale parameters shard_id = "w1" weight_loader(param, diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 2168f9c149a..415e99ba3d0 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -926,25 +926,13 @@ def process_and_separate_weights(): language_model_weights = [] other_weights = [] - # Track scale parameters for debugging - checkpoint_scales = [] - renamed_scales = [] - scale_mapping = {} - for name, weight in weights: - # Track scale parameters from checkpoint - if "scale" in name: - checkpoint_scales.append(name) - # Apply renaming logic + # Apply renaming logic for ModelOpt llama4 fp8 checkpoints if name.startswith("model."): # Handle expert scale parameters with flat naming if "feed_forward.experts." in name and ( "_input_scale" in name or "_weight_scale" in name): - # Expert scales in checkpoint are single values for all - # experts e.g., "model.layers.0.feed_forward.experts. - # down_proj_input_scale" should map to "language_model. - # model.layers.0.feed_forward.experts.w2_input_scale" renamed = name.replace("model.", "language_model.model.", 1) @@ -967,42 +955,23 @@ def process_and_separate_weights(): # Handle attention scale parameters elif "self_attn." 
in name and ( ".k_scale" in name or ".v_scale" in name): - # Map attention scale parameters for ModelOpt checkpoints - # e.g., "model.layers.0.self_attn.k_proj.k_scale" - # should map to "language_model.model.layers.0.self_attn.attn.k_scale" renamed = name.replace("model.", "language_model.model.", 1) - # Map checkpoint attention scale naming to vLLM's expected naming if ".k_proj.k_scale" in renamed: renamed = renamed.replace(".k_proj.k_scale", ".attn.k_scale") elif ".v_proj.v_scale" in renamed: renamed = renamed.replace(".v_proj.v_scale", ".attn.v_scale") else: - # Standard model.* to language_model.model.* renaming renamed = name.replace("model.", "language_model.model.", 1) - - # Track renamed scale parameters and mapping - if "scale" in renamed: - renamed_scales.append(renamed) - if "scale" in name: # Only add to mapping if original was also a scale - scale_mapping[name] = renamed elif name.startswith("lm_head.weight"): - # Rename lm_head.weight to language_model.lm_head.weight renamed = name.replace("lm_head.weight", "language_model.lm_head.weight") else: - # Keep other weights as is renamed = name - # Track renamed scale parameters and mapping - if "scale" in renamed: - renamed_scales.append(renamed) - if "scale" in name and name not in scale_mapping: # Avoid duplicates - scale_mapping[name] = renamed - # Separate into language_model and other weights if renamed.startswith("language_model."): language_model_weights.append((renamed, weight)) @@ -1014,7 +983,6 @@ def process_and_separate_weights(): language_model_weights, other_weights = process_and_separate_weights() - # Handle expert scale parameters separately to avoid FusedMoE weight loader issues expert_scale_weights = [] regular_language_model_weights = [] @@ -1032,19 +1000,16 @@ def process_and_separate_weights(): updated_params.add(name) continue - # Regular expert scale loading - add to separate list expert_scale_weights.append((name, weight)) else: regular_language_model_weights.append((name, weight)) - # Load regular language model weights (excluding expert scales that need broadcasting) loader = AutoWeightsLoader(self) loaded_language_model_params = loader.load_weights( regular_language_model_weights) assert loaded_language_model_params is not None updated_params.update(loaded_language_model_params) - # Load expert scale weights that didn't need broadcasting through normal mechanism if expert_scale_weights: loaded_expert_scale_params = loader.load_weights(expert_scale_weights) if loaded_expert_scale_params: From 47a47a918503358c66386e150da0ffad53b71d4d Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 07:03:48 +0000 Subject: [PATCH 18/25] refactor Llama4ForConditionalGeneration.load_weights --- vllm/model_executor/models/mllama4.py | 206 +++++++++++++------------- 1 file changed, 106 insertions(+), 100 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 415e99ba3d0..be3caf829f8 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -903,122 +903,88 @@ def _consolidate_qkv_weights( qkv_weight = torch.cat(weight, dim=0) yield key, qkv_weight - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), - (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), - (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), - # Shared expert gate_up_proj stacking - 
(".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0), - (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1), - # Feed forward gate_up_proj stacking (for non-MoE layers if any) - (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0), - (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - updated_params: set[str] = set() + def _rename_weight_for_checkpoint(self, name: str) -> str: + """Rename weights from ModelOpt llama4 fp8 checkpoints to vLLM format.""" + if name.startswith("model."): + # Handle expert scale parameters with flat naming + if "feed_forward.experts." in name and ("_input_scale" in name or "_weight_scale" in name): + renamed = name.replace("model.", "language_model.model.", 1) + # Map checkpoint naming to vLLM's expected naming + if "down_proj_input_scale" in renamed: + return renamed.replace("down_proj_input_scale", "w2_input_scale") + elif "down_proj_weight_scale" in renamed: + return renamed.replace("down_proj_weight_scale", "w2_weight_scale") + elif "gate_up_proj_input_scale" in renamed: + return renamed.replace("gate_up_proj_input_scale", "w13_input_scale") + elif "gate_up_proj_weight_scale" in renamed: + return renamed.replace("gate_up_proj_weight_scale", "w13_weight_scale") + return renamed + + # Handle attention scale parameters + elif "self_attn." in name and (".k_scale" in name or ".v_scale" in name): + renamed = name.replace("model.", "language_model.model.", 1) + if ".k_proj.k_scale" in renamed: + return renamed.replace(".k_proj.k_scale", ".attn.k_scale") + elif ".v_proj.v_scale" in renamed: + return renamed.replace(".v_proj.v_scale", ".attn.v_scale") + return renamed + + # Standard model.* to language_model.model.* renaming + return name.replace("model.", "language_model.model.", 1) + + elif name.startswith("lm_head.weight"): + return name.replace("lm_head.weight", "language_model.lm_head.weight") + + return name + + def _separate_and_rename_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> tuple[list[tuple[str, torch.Tensor]], list[tuple[str, torch.Tensor]]]: + """Rename weights and separate them into language_model and other weights.""" + language_model_weights = [] + other_weights = [] + + for name, weight in weights: + renamed = self._rename_weight_for_checkpoint(name) + + if renamed.startswith("language_model."): + language_model_weights.append((renamed, weight)) + else: + other_weights.append((renamed, weight)) - # Combine renaming and separation logic in a single pass - def process_and_separate_weights(): - language_model_weights = [] - other_weights = [] - - for name, weight in weights: - - # Apply renaming logic for ModelOpt llama4 fp8 checkpoints - if name.startswith("model."): - # Handle expert scale parameters with flat naming - if "feed_forward.experts." 
in name and ( - "_input_scale" in name or "_weight_scale" in name): - - renamed = name.replace("model.", - "language_model.model.", 1) - - # Map checkpoint naming to vLLM's expected naming - if "down_proj_input_scale" in renamed: - renamed = renamed.replace("down_proj_input_scale", - "w2_input_scale") - elif "down_proj_weight_scale" in renamed: - renamed = renamed.replace("down_proj_weight_scale", - "w2_weight_scale") - elif "gate_up_proj_input_scale" in renamed: - renamed = renamed.replace( - "gate_up_proj_input_scale", "w13_input_scale") - elif "gate_up_proj_weight_scale" in renamed: - renamed = renamed.replace( - "gate_up_proj_weight_scale", - "w13_weight_scale") - - # Handle attention scale parameters - elif "self_attn." in name and ( - ".k_scale" in name or ".v_scale" in name): - - renamed = name.replace("model.", - "language_model.model.", 1) - - if ".k_proj.k_scale" in renamed: - renamed = renamed.replace(".k_proj.k_scale", ".attn.k_scale") - elif ".v_proj.v_scale" in renamed: - renamed = renamed.replace(".v_proj.v_scale", ".attn.v_scale") - else: - renamed = name.replace("model.", - "language_model.model.", 1) - elif name.startswith("lm_head.weight"): - renamed = name.replace("lm_head.weight", - "language_model.lm_head.weight") - else: - renamed = name - - # Separate into language_model and other weights - if renamed.startswith("language_model."): - language_model_weights.append((renamed, weight)) - else: - other_weights.append((renamed, weight)) - - - return language_model_weights, other_weights - - language_model_weights, other_weights = process_and_separate_weights() + return language_model_weights, other_weights + def _handle_expert_scale_broadcasting(self, weights: list[tuple[str, torch.Tensor]], params_dict: dict) -> tuple[list[tuple[str, torch.Tensor]], set[str]]: + """Handle expert scale parameters that need broadcasting.""" + regular_weights = [] expert_scale_weights = [] - regular_language_model_weights = [] + updated_params = set() - for name, weight in language_model_weights: + for name, weight in weights: # Check if this is an expert scale parameter that needs broadcasting - if ("feed_forward.experts." in name and "scale" in name and - ".shared_expert" not in name): - + if ("feed_forward.experts." 
in name and "scale" in name and ".shared_expert" not in name): if name in params_dict: param = params_dict[name] - if (hasattr(param, 'data') and param.data.numel() > 1 and - weight.numel() == 1): - # This needs broadcasting - handle it directly + if (hasattr(param, 'data') and param.data.numel() > 1 and weight.numel() == 1): + # Broadcast single value to all experts param.data.fill_(weight.item()) updated_params.add(name) continue expert_scale_weights.append((name, weight)) else: - regular_language_model_weights.append((name, weight)) + regular_weights.append((name, weight)) - loader = AutoWeightsLoader(self) - loaded_language_model_params = loader.load_weights( - regular_language_model_weights) - assert loaded_language_model_params is not None - updated_params.update(loaded_language_model_params) + return regular_weights, expert_scale_weights, updated_params - if expert_scale_weights: - loaded_expert_scale_params = loader.load_weights(expert_scale_weights) - if loaded_expert_scale_params: - updated_params.update(loaded_expert_scale_params) + def _load_other_weights(self, other_weights: Iterable[tuple[str, torch.Tensor]], params_dict: dict, stacked_params_mapping: list) -> set[str]: + """Load non-language-model weights with stacking support.""" + updated_params = set() if self.use_data_parallel: other_weights = self._consolidate_qkv_weights(other_weights) for name, loaded_weight in other_weights: + # Try stacked parameter mapping first + mapped = False for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name or self.use_data_parallel: continue @@ -1027,13 +993,53 @@ def process_and_separate_weights(): updated_params.add(name) weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) + mapped = True break - else: - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + if not mapped: + # Use regular weight loading + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) updated_params.add(name) return updated_params + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), + (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), + (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), + # Shared expert gate_up_proj stacking + (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0), + (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1), + # Feed forward gate_up_proj stacking (for non-MoE layers if any) + (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0), + (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + updated_params: set[str] = set() + + # Separate and rename weights + language_model_weights, other_weights = self._separate_and_rename_weights(weights) + + # Handle expert scale parameters + regular_weights, expert_scale_weights, updated_params_from_experts = self._handle_expert_scale_broadcasting(language_model_weights, params_dict) + updated_params.update(updated_params_from_experts) + + loader = AutoWeightsLoader(self) + loaded_language_model_params = loader.load_weights(regular_weights) + assert loaded_language_model_params is not None + updated_params.update(loaded_language_model_params) + + if expert_scale_weights: + loaded_expert_scale_params = 
loader.load_weights(expert_scale_weights) + if loaded_expert_scale_params: + updated_params.update(loaded_expert_scale_params) + + updated_params.update(self._load_other_weights(other_weights, params_dict, stacked_params_mapping)) + + return updated_params From eec1daf66d2e0df80780e35c89e3bcad74cb98e5 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 9 Jul 2025 06:16:04 +0000 Subject: [PATCH 19/25] resolve conflict --- vllm/attention/layer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index ee8c452b4e1..f0ad68b1640 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -314,13 +314,9 @@ def __init__( _Backend.FLEX_ATTENTION): backend = _Backend.XFORMERS - # self.attn_backend = backend if backend in { - # _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1 - # } else _Backend.TORCH_SDPA - - # Force TORCH_SDPA to avoid xformers-triton compatibility issues - # TODO: Remove this workaround once xformers-triton compatibility is fixed - self.attn_backend = _Backend.TORCH_SDPA + self.attn_backend = backend if backend in { + _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1 + } else _Backend.TORCH_SDPA def forward( self, From 770bc24e376a5075a46327fc4a2d21df1d13f7db Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 07:17:12 +0000 Subject: [PATCH 20/25] format and linter error fix --- .../layers/quantization/modelopt.py | 26 ++++---- vllm/model_executor/models/llama4.py | 4 +- vllm/model_executor/models/mllama4.py | 63 +++++++++++++------ 3 files changed, 57 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 68aa2872c1f..7f6903e9b90 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, List, Optional, Union +from typing import Any, Callable, Optional, Union import torch from torch.nn import Module @@ -43,7 +43,7 @@ def __init__( self, is_checkpoint_fp8_serialized: bool = False, kv_cache_quant_method: Optional[str] = None, - exclude_modules: Optional[List[str]] = None, + exclude_modules: Optional[list[str]] = None, ) -> None: super().__init__() self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized @@ -101,9 +101,9 @@ def is_layer_excluded(self, prefix: str) -> bool: # Check if any excluded module matches the prefix for module in self.exclude_modules: - if (module in prefix or - (prefix.startswith("language_model.") and - module in prefix.removeprefix("language_model."))): + if (module in prefix + or (prefix.startswith("language_model.") + and module in prefix.removeprefix("language_model."))): return True return False @@ -206,7 +206,8 @@ def apply( class ModelOptFp8MoEMethod: """MoE method for ModelOpt FP8. - Supports loading FP8 checkpoints with static weight scale and activation scale. + Supports loading FP8 checkpoints with static weight scale and + activation scale. Args: quant_config: The ModelOpt quantization config. """ @@ -214,8 +215,8 @@ class ModelOptFp8MoEMethod: def __new__(cls, *args, **kwargs): """ Dynamic class composition pattern. - This allows us to effectively "inject" FusedMoEMethodBase as a parent class - at runtime while avoiding circular import issues. 
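The docstring being reflowed here is the one that explains the __new__-time composition trick: a new type is built on the fly so that FusedMoEMethodBase becomes a base class only at construction time, avoiding a circular import. A self-contained sketch of the same pattern with stand-in class names (LateBase and Method are illustrative, not vLLM types):

class LateBase:
    """Stand-in for a base class that must be attached late."""

    def hello(self) -> str:
        return "from LateBase"


class Method:
    """Stand-in for the runtime-composition pattern."""

    def __new__(cls, *args, **kwargs):
        if LateBase not in cls.__mro__:
            # Build a new class that adds LateBase as a parent, copying this
            # class's own attributes (minus the slots type() would reject).
            composed = type(
                cls.__name__,
                (LateBase, ),
                {
                    k: v
                    for k, v in cls.__dict__.items()
                    if k not in ("__dict__", "__weakref__")
                },
            )
            obj = super(composed, composed).__new__(composed)
            obj.__init__(*args, **kwargs)
            return obj
        return object.__new__(cls)

    def __init__(self, tag: str):
        self.tag = tag


m = Method("fp8")
print(isinstance(m, LateBase), m.hello(), m.tag)  # True from LateBase fp8

Because the returned object is an instance of the composed class rather than of Method itself, Python does not run Method.__init__ a second time, which is why the sketch invokes obj.__init__ explicitly.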
+ This allows us to effectively "inject" FusedMoEMethodBase as a parent + class at runtime while avoiding circular import issues. """ if not hasattr(cls, "_initialized"): @@ -252,7 +253,7 @@ def create_weights( **extra_weight_attrs, ): - # Use FP8 dtype if checkpoint is serialized, otherwise use the default dtype + # Use FP8 dtype if checkpoint is serialized weight_dtype = (torch.float8_e4m3fn if self.quant_config.is_checkpoint_fp8_serialized else params_dtype) @@ -293,12 +294,7 @@ def create_weights( weight_loader=weight_loader, ) w2_weight_scale = PerTensorScaleParameter( - data=torch.full( - (num_experts, ), - 1.0, - dtype=torch. - float32 - ), + data=torch.full((num_experts, ), 1.0, dtype=torch.float32), weight_loader=weight_loader, ) layer.register_parameter("w13_weight_scale", w13_weight_scale) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 409e627cd9f..7696b84bf3f 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -483,8 +483,8 @@ def load_weights(self, weights: Iterable[tuple[str, # Try to inspect the weight_loader signature import inspect sig = inspect.signature(weight_loader) - if ('expert_id' in sig.parameters and - 'shard_id' in sig.parameters): + if ('expert_id' in sig.parameters + and 'shard_id' in sig.parameters): # This is a MoE weight loader if "w13_" in name: shard_id = "w1" diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index be3caf829f8..b7d0a1ddafd 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -904,24 +904,31 @@ def _consolidate_qkv_weights( yield key, qkv_weight def _rename_weight_for_checkpoint(self, name: str) -> str: - """Rename weights from ModelOpt llama4 fp8 checkpoints to vLLM format.""" + """Rename weights from ModelOpt llama4 fp8 checkpoints to vLLM + format.""" if name.startswith("model."): # Handle expert scale parameters with flat naming - if "feed_forward.experts." in name and ("_input_scale" in name or "_weight_scale" in name): + if "feed_forward.experts." in name and ("_input_scale" in name or + "_weight_scale" in name): renamed = name.replace("model.", "language_model.model.", 1) # Map checkpoint naming to vLLM's expected naming if "down_proj_input_scale" in renamed: - return renamed.replace("down_proj_input_scale", "w2_input_scale") + return renamed.replace("down_proj_input_scale", + "w2_input_scale") elif "down_proj_weight_scale" in renamed: - return renamed.replace("down_proj_weight_scale", "w2_weight_scale") + return renamed.replace("down_proj_weight_scale", + "w2_weight_scale") elif "gate_up_proj_input_scale" in renamed: - return renamed.replace("gate_up_proj_input_scale", "w13_input_scale") + return renamed.replace("gate_up_proj_input_scale", + "w13_input_scale") elif "gate_up_proj_weight_scale" in renamed: - return renamed.replace("gate_up_proj_weight_scale", "w13_weight_scale") + return renamed.replace("gate_up_proj_weight_scale", + "w13_weight_scale") return renamed # Handle attention scale parameters - elif "self_attn." in name and (".k_scale" in name or ".v_scale" in name): + elif "self_attn." 
in name and (".k_scale" in name + or ".v_scale" in name): renamed = name.replace("model.", "language_model.model.", 1) if ".k_proj.k_scale" in renamed: return renamed.replace(".k_proj.k_scale", ".attn.k_scale") @@ -933,12 +940,16 @@ def _rename_weight_for_checkpoint(self, name: str) -> str: return name.replace("model.", "language_model.model.", 1) elif name.startswith("lm_head.weight"): - return name.replace("lm_head.weight", "language_model.lm_head.weight") + return name.replace("lm_head.weight", + "language_model.lm_head.weight") return name - def _separate_and_rename_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> tuple[list[tuple[str, torch.Tensor]], list[tuple[str, torch.Tensor]]]: - """Rename weights and separate them into language_model and other weights.""" + def _separate_and_rename_weights( + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> tuple[list[tuple[str, torch.Tensor]], list[tuple[str, torch.Tensor]]]: + """Rename weights and separate them into language_model and other + weights.""" language_model_weights = [] other_weights = [] @@ -952,7 +963,9 @@ def _separate_and_rename_weights(self, weights: Iterable[tuple[str, torch.Tensor return language_model_weights, other_weights - def _handle_expert_scale_broadcasting(self, weights: list[tuple[str, torch.Tensor]], params_dict: dict) -> tuple[list[tuple[str, torch.Tensor]], set[str]]: + def _handle_expert_scale_broadcasting( + self, weights: list[tuple[str, torch.Tensor]], params_dict: dict + ) -> tuple[list[tuple[str, torch.Tensor]], set[str]]: """Handle expert scale parameters that need broadcasting.""" regular_weights = [] expert_scale_weights = [] @@ -960,10 +973,12 @@ def _handle_expert_scale_broadcasting(self, weights: list[tuple[str, torch.Tenso for name, weight in weights: # Check if this is an expert scale parameter that needs broadcasting - if ("feed_forward.experts." in name and "scale" in name and ".shared_expert" not in name): + if ("feed_forward.experts." 
in name and "scale" in name + and ".shared_expert" not in name): if name in params_dict: param = params_dict[name] - if (hasattr(param, 'data') and param.data.numel() > 1 and weight.numel() == 1): + if (hasattr(param, 'data') and param.data.numel() > 1 + and weight.numel() == 1): # Broadcast single value to all experts param.data.fill_(weight.item()) updated_params.add(name) @@ -975,7 +990,10 @@ def _handle_expert_scale_broadcasting(self, weights: list[tuple[str, torch.Tenso return regular_weights, expert_scale_weights, updated_params - def _load_other_weights(self, other_weights: Iterable[tuple[str, torch.Tensor]], params_dict: dict, stacked_params_mapping: list) -> set[str]: + def _load_other_weights(self, other_weights: Iterable[tuple[str, + torch.Tensor]], + params_dict: dict, + stacked_params_mapping: list) -> set[str]: """Load non-language-model weights with stacking support.""" updated_params = set() @@ -999,7 +1017,8 @@ def _load_other_weights(self, other_weights: Iterable[tuple[str, torch.Tensor]], if not mapped: # Use regular weight loading param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) updated_params.add(name) @@ -1024,10 +1043,13 @@ def load_weights(self, weights: Iterable[tuple[str, updated_params: set[str] = set() # Separate and rename weights - language_model_weights, other_weights = self._separate_and_rename_weights(weights) + language_model_weights, other_weights = ( + self._separate_and_rename_weights(weights)) # Handle expert scale parameters - regular_weights, expert_scale_weights, updated_params_from_experts = self._handle_expert_scale_broadcasting(language_model_weights, params_dict) + regular_weights, expert_scale_weights, updated_params_from_experts = ( + self._handle_expert_scale_broadcasting(language_model_weights, + params_dict)) updated_params.update(updated_params_from_experts) loader = AutoWeightsLoader(self) @@ -1036,10 +1058,13 @@ def load_weights(self, weights: Iterable[tuple[str, updated_params.update(loaded_language_model_params) if expert_scale_weights: - loaded_expert_scale_params = loader.load_weights(expert_scale_weights) + loaded_expert_scale_params = loader.load_weights( + expert_scale_weights) if loaded_expert_scale_params: updated_params.update(loaded_expert_scale_params) - updated_params.update(self._load_other_weights(other_weights, params_dict, stacked_params_mapping)) + updated_params.update( + self._load_other_weights(other_weights, params_dict, + stacked_params_mapping)) return updated_params From 770890abb124415d2e020976d7da00c577bdc1b4 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 3 Jul 2025 07:38:09 +0000 Subject: [PATCH 21/25] simplify ModelOptFp8MoEMethod to avoid mypy error --- .../layers/quantization/modelopt.py | 27 +------------------ 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7f6903e9b90..5482a686874 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -204,7 +204,7 @@ def apply( bias=bias) -class ModelOptFp8MoEMethod: +class ModelOptFp8MoEMethod(FusedMoEMethodBase): """MoE method for ModelOpt FP8. Supports loading FP8 checkpoints with static weight scale and activation scale. 
@@ -212,31 +212,6 @@ class ModelOptFp8MoEMethod: quant_config: The ModelOpt quantization config. """ - def __new__(cls, *args, **kwargs): - """ - Dynamic class composition pattern. - This allows us to effectively "inject" FusedMoEMethodBase as a parent - class at runtime while avoiding circular import issues. - """ - - if not hasattr(cls, "_initialized"): - original_init = cls.__init__ - new_cls = type( - cls.__name__, - (FusedMoEMethodBase, ), - { - "__init__": original_init, - **{ - k: v - for k, v in cls.__dict__.items() if k != "__dict__" - }, - }, - ) - obj = super(new_cls, new_cls).__new__(new_cls) - obj.__init__(*args, **kwargs) - return obj - return super().__new__(cls) - def __init__(self, quant_config: ModelOptFp8Config): self.quant_config = quant_config from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( From 1beecbff9d8c29e75ae855e65a81c193a287f684 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Fri, 4 Jul 2025 00:28:48 +0000 Subject: [PATCH 22/25] resolve conflict --- vllm/model_executor/layers/fused_moe/layer.py | 20 +++++++-- .../layers/quantization/experts_int8.py | 4 +- .../layers/quantization/modelopt.py | 18 ++++---- .../layers/quantization/moe_wna16.py | 2 + vllm/model_executor/models/llama4.py | 43 +++++++++---------- 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a44e83bfee8..9e33d70ebd8 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -81,6 +81,16 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, params_dtype: torch.dtype, **extra_weight_attrs): raise NotImplementedError + def uses_weight_scale_2_pattern(self) -> bool: + """ + Returns True if this quantization method uses 'weight_scale_2' pattern + for per-tensor weight scales (e.g., FP4 variants), False otherwise. + + This method should be overridden by subclasses that use the + 'weight_scale_2' pattern instead of the standard 'weight_scale' pattern. 
+ """ + return False + def init_prepare_finalize(self, moe: FusedMoEConfig, quant_config: Optional[QuantizationConfig]): all2all_manager = get_ep_group().device_communicator.all2all_manager @@ -1050,12 +1060,12 @@ def weight_loader(self, # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern if "ModelOpt" in quant_method_name: # Determine per-tensor weight scale patterns based on variant - is_fp4_variant = ("ModelOptNvFp4FusedMoEMethod" - in self.quant_method.__class__.__name__) + # Use the dedicated method instead of brittle string matching + uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern() # For per-tensor, FP4 uses "weight_scale_2", FP8 uses "weight_scale" per_tensor_conditions = ( - "weight_scale_2" in weight_name if is_fp4_variant else + "weight_scale_2" in weight_name if uses_weight_scale_2 else "weight_scale" in weight_name) or "input_scale" in weight_name if per_tensor_conditions: @@ -1536,3 +1546,7 @@ def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor, dispatch_key=current_platform.dispatch_key, tags=(torch.Tag.needs_fixed_stride_order, ), ) + +# Mark the FusedMoE weight_loader as supporting MoE-specific parameters +# to avoid expensive runtime reflection in model loading code +FusedMoE.weight_loader.supports_moe_loading = True diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 47eca80609e..d7acb1fbd39 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -188,8 +188,10 @@ def quantize_and_call_weight_loader(param: torch.nn.Parameter, raise ValueError( f"Shard id must be in [0,1,2] but got {shard_id}") weight_loader(param, loaded_weight, weight_name, shard_id, - expert_id) + expert_id) + # Mark as supporting MoE-specific loading to avoid expensive reflection + quantize_and_call_weight_loader.supports_moe_loading = True return quantize_and_call_weight_loader diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 5482a686874..e283b2bb4db 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -350,6 +350,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: "w2_weight_scale") and layer.w2_weight_scale is not None: layer.w2_weight_scale = Parameter(layer.w2_weight_scale.data, requires_grad=False) + # Input scales must be equal for each expert in fp8 MoE layers. 
if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None: layer.w13_input_scale = Parameter(layer.w13_input_scale.max(), @@ -366,21 +367,16 @@ def apply( router_logits: torch.Tensor, top_k: int, renormalize: bool, - use_grouped_topk: bool, + use_grouped_topk: bool = False, topk_group: Optional[int] = None, num_expert_group: Optional[int] = None, global_num_experts: int = -1, expert_map: Optional[torch.Tensor] = None, - num_fused_shared_experts: Optional[int] = None, custom_routing_function: Optional[Callable] = None, - correction_bias: Optional[torch.Tensor] = None, scoring_func: str = "softmax", e_score_correction_bias: Optional[torch.Tensor] = None, - activation: str = "silu", apply_router_weight_on_input: bool = False, - inplace: bool = True, - no_combine: bool = False, - routed_scaling_factor: Optional[float] = None, + activation: str = "silu", enable_eplb: bool = False, expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, @@ -411,7 +407,7 @@ def apply( layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - inplace=inplace, + inplace=True, activation=activation, use_fp8_w8a8=True, per_channel_quant=False, @@ -725,6 +721,12 @@ def __init__(self, quant_config: ModelOptNvFp4Config): " quantization. Please use Blackwell and" " above.") + def uses_weight_scale_2_pattern(self) -> bool: + """ + FP4 variants use 'weight_scale_2' pattern for per-tensor weight scales. + """ + return True + def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index c5055a02fa3..86d3f4c7a1a 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -454,4 +454,6 @@ def moe_wna16_weight_loader(param: torch.nn.Parameter, weight_loader(param, loaded_weight, weight_name, shard_id, expert_id) + # Mark as supporting MoE-specific loading to avoid expensive reflection + moe_wna16_weight_loader.supports_moe_loading = True return moe_wna16_weight_loader diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 7696b84bf3f..8f7d25be541 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -479,30 +479,27 @@ def load_weights(self, weights: Iterable[tuple[str, # Check if this is a MoE-specific weight loader that # needs extra arguments if hasattr(param, 'weight_loader'): - try: - # Try to inspect the weight_loader signature - import inspect - sig = inspect.signature(weight_loader) - if ('expert_id' in sig.parameters - and 'shard_id' in sig.parameters): - # This is a MoE weight loader - if "w13_" in name: - shard_id = "w1" - elif "w2_" in name: - shard_id = "w2" - else: - shard_id = "w1" - - weight_loader(param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=0) + # Check for MoE-specific loading support via + # attribute instead of expensive runtime reflection + supports_moe = getattr(weight_loader, + 'supports_moe_loading', False) + + if supports_moe: + # This is a MoE weight loader + if "w13_" in name: + shard_id = "w1" + elif "w2_" in name: + shard_id = "w2" else: - # Regular weight loader - weight_loader(param, loaded_weight) - except Exception: - # Fallback to regular loading + shard_id = "w1" + + weight_loader(param, + loaded_weight, + name, + 
shard_id=shard_id, + expert_id=0) + else: + # Regular weight loader weight_loader(param, loaded_weight) else: weight_loader(param, loaded_weight) From 0b98a7fa153111ce96c4d6352b4b7b3573e79783 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Fri, 4 Jul 2025 00:20:36 +0000 Subject: [PATCH 23/25] format fix --- vllm/model_executor/layers/fused_moe/layer.py | 3 ++- vllm/model_executor/layers/quantization/experts_int8.py | 2 +- vllm/model_executor/models/llama4.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 9e33d70ebd8..b814ee956c7 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1061,7 +1061,8 @@ def weight_loader(self, if "ModelOpt" in quant_method_name: # Determine per-tensor weight scale patterns based on variant # Use the dedicated method instead of brittle string matching - uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern() + uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern( + ) # For per-tensor, FP4 uses "weight_scale_2", FP8 uses "weight_scale" per_tensor_conditions = ( diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index d7acb1fbd39..f7d28e3bdf7 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -188,7 +188,7 @@ def quantize_and_call_weight_loader(param: torch.nn.Parameter, raise ValueError( f"Shard id must be in [0,1,2] but got {shard_id}") weight_loader(param, loaded_weight, weight_name, shard_id, - expert_id) + expert_id) # Mark as supporting MoE-specific loading to avoid expensive reflection quantize_and_call_weight_loader.supports_moe_loading = True diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 8f7d25be541..e740e00c3cd 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -482,7 +482,8 @@ def load_weights(self, weights: Iterable[tuple[str, # Check for MoE-specific loading support via # attribute instead of expensive runtime reflection supports_moe = getattr(weight_loader, - 'supports_moe_loading', False) + 'supports_moe_loading', + False) if supports_moe: # This is a MoE weight loader From cc44385e7492fdee4766773cbdedce18d93b6d17 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 9 Jul 2025 18:58:49 +0000 Subject: [PATCH 24/25] fix mypy error --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- vllm/model_executor/layers/quantization/experts_int8.py | 2 +- vllm/model_executor/layers/quantization/moe_wna16.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b814ee956c7..2129de083a7 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1550,4 +1550,4 @@ def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor, # Mark the FusedMoE weight_loader as supporting MoE-specific parameters # to avoid expensive runtime reflection in model loading code -FusedMoE.weight_loader.supports_moe_loading = True +FusedMoE.weight_loader.supports_moe_loading = True # type: ignore[attr-defined] diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 
f7d28e3bdf7..67083c3b4b5 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -191,7 +191,7 @@ def quantize_and_call_weight_loader(param: torch.nn.Parameter, expert_id) # Mark as supporting MoE-specific loading to avoid expensive reflection - quantize_and_call_weight_loader.supports_moe_loading = True + quantize_and_call_weight_loader.supports_moe_loading = True # type: ignore[attr-defined] return quantize_and_call_weight_loader diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 86d3f4c7a1a..f03c7b3d501 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -455,5 +455,5 @@ def moe_wna16_weight_loader(param: torch.nn.Parameter, expert_id) # Mark as supporting MoE-specific loading to avoid expensive reflection - moe_wna16_weight_loader.supports_moe_loading = True + moe_wna16_weight_loader.supports_moe_loading = True # type: ignore[attr-defined] return moe_wna16_weight_loader From 1206f330a2564bcd92ddd48a2143b6fad27c9f40 Mon Sep 17 00:00:00 2001 From: jingyu Date: Thu, 10 Jul 2025 05:11:16 +0000 Subject: [PATCH 25/25] add qwen fp8 modelopt support --- vllm/config.py | 4 ++++ .../layers/quantization/modelopt.py | 17 +++++++++++------ vllm/model_executor/models/qwen2.py | 12 ++++++++++-- vllm/model_executor/models/qwen3_moe.py | 16 +++++++++++++--- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 508e09174cc..c065bdb4158 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -887,6 +887,10 @@ def _parse_quant_hf_config(self): if quant_cfg is None: # compressed-tensors uses a "compression_config" key quant_cfg = getattr(self.hf_config, "compression_config", None) + if quant_cfg is not None: + if quant_cfg["producer"]["name"].lower() == "modelopt": + if "quant_algo" in quant_cfg.keys() and quant_cfg["quant_algo"].lower() == "fp8": + quant_cfg = {"quant_method": "modelopt"} return quant_cfg def _verify_quantization(self) -> None: diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 3853938ba1c..6cdcf3f781b 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -71,12 +71,17 @@ def get_config_filenames(cls) -> list[str]: @classmethod def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": - quant_config = cls.get_from_keys(config, ["quantization"]) - quant_method = quant_config["quant_algo"] - kv_cache_quant_method = cls.get_from_keys( - config, ["quantization"]).get("kv_cache_quant_algo") - exclude_modules = cls.get_from_keys( - config, ["quantization"]).get("exclude_modules") + try: + quant_method = cls.get_from_keys(config, ["quant_algo"]) + kv_cache_quant_method = cls.get_from_keys(config, ["kv_cache_scheme"]) + exclude_modules = cls.get_from_keys(config, ["ignore"]) + except: + quant_config = cls.get_from_keys(config, ["quantization"]) + quant_method = quant_config["quant_algo"] + kv_cache_quant_method = cls.get_from_keys( + config, ["quantization"]).get("kv_cache_quant_algo") + exclude_modules = cls.get_from_keys( + config, ["quantization"]).get("exclude_modules") if quant_method not in QUANT_ALGOS: raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}" diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 
7ef9d248da4..1186d65425f 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -400,9 +400,17 @@ def load_weights(self, weights: Iterable[tuple[str, continue if is_pp_missing_parameter(name, self): continue + if name.endswith("scale"): + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) break else: # Skip loading extra bias for GPTQ models. diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index ff182aadf73..09b32d16038 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -46,7 +46,9 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -406,6 +408,10 @@ def load_weights(self, weights: Iterable[tuple[str, # Skip non-stacked layers and experts (experts handled below). if weight_name not in name: continue + if name.endswith("scale"): + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue # We have mlp.experts[0].gate_proj in the checkpoint. # Since we handle the experts below in expert_params_mapping, # we need to skip here BEFORE we update the name, otherwise @@ -427,8 +433,12 @@ def load_weights(self, weights: Iterable[tuple[str, continue param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) break else: for mapping in expert_params_mapping:
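
Note on the per-tensor scale dispatch introduced in the FusedMoE weight loader above: it reduces to a small predicate — NVFP4 checkpoints name their per-tensor weight scale "weight_scale_2" (plain "weight_scale" is the block/group scale), FP8 checkpoints use "weight_scale" itself as the per-tensor scale, and "input_scale" is per-tensor in both cases. The standalone Python sketch below restates that condition outside vLLM; the function name and the assert-style usage are illustrative and not part of the patch.

def is_per_tensor_scale(weight_name: str, uses_weight_scale_2: bool) -> bool:
    # FP4 (NVFP4) checkpoints: "weight_scale_2" is the per-tensor scale and
    # plain "weight_scale" is the block/group scale. FP8 checkpoints:
    # "weight_scale" itself is per-tensor. "input_scale" is per-tensor
    # for both variants.
    scale_key = "weight_scale_2" if uses_weight_scale_2 else "weight_scale"
    return scale_key in weight_name or "input_scale" in weight_name


assert is_per_tensor_scale("w13_weight_scale_2", uses_weight_scale_2=True)
assert is_per_tensor_scale("w2_input_scale", uses_weight_scale_2=False)
# For FP4, plain "weight_scale" falls through to the group-scale path:
assert not is_per_tensor_scale("w13_weight_scale", uses_weight_scale_2=True)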
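
Patches 22-24 also replace the inspect.signature() probe in llama4.py with a supports_moe_loading attribute set on MoE-aware weight loaders and read back with getattr. The sketch below shows that pattern in isolation under simplified, hypothetical names (Param, load_one, the toy loaders); only the attribute tagging and the getattr dispatch mirror the patch.

from typing import Any


class Param:
    """Toy stand-in for a parameter that may carry a custom weight_loader."""

    def __init__(self, weight_loader=None):
        if weight_loader is not None:
            self.weight_loader = weight_loader


def default_weight_loader(param: Param, loaded_weight: Any) -> None:
    print("regular load")


def moe_weight_loader(param: Param, loaded_weight: Any, weight_name: str,
                      shard_id: str, expert_id: int) -> None:
    print(f"MoE load: {weight_name} shard={shard_id} expert={expert_id}")


# Tag the loader once at definition time; callers dispatch on a cheap
# attribute lookup instead of inspecting the signature for every weight.
moe_weight_loader.supports_moe_loading = True  # type: ignore[attr-defined]


def load_one(param: Param, loaded_weight: Any, name: str) -> None:
    weight_loader = getattr(param, "weight_loader", default_weight_loader)
    if getattr(weight_loader, "supports_moe_loading", False):
        shard_id = "w2" if "w2_" in name else "w1"
        weight_loader(param, loaded_weight, name,
                      shard_id=shard_id, expert_id=0)
    else:
        weight_loader(param, loaded_weight)


load_one(Param(moe_weight_loader), 0, "experts.w13_weight_scale")  # MoE path
load_one(Param(), 0, "lm_head.weight")                             # regular path

The attribute check is a single dictionary lookup per weight, whereas inspect.signature() builds a Signature object on every call, which is the reflection overhead the patch comments refer to.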
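
Finally, patch 25 extends _parse_quant_hf_config so that a ModelOpt-produced quantization_config embedded in the HF config (producer "modelopt", quant_algo "FP8") is collapsed to {"quant_method": "modelopt"}. The sketch below is a defensive restatement of that check, not the patch itself: it assumes the same checkpoint schema shown in the diff (a "producer" entry with a "name" field, plus "quant_algo") but uses .get() so configs from other producers, which may omit those keys, fall through unchanged instead of raising KeyError as direct indexing would.

from typing import Any, Optional


def normalize_modelopt_quant_cfg(
        quant_cfg: Optional[dict[str, Any]]) -> Optional[dict[str, Any]]:
    # Collapse a ModelOpt FP8 quantization_config to the marker dict that,
    # as in the patch, routes the model to the "modelopt" quant method.
    if quant_cfg is None:
        return None
    producer = (quant_cfg.get("producer") or {}).get("name", "")
    quant_algo = quant_cfg.get("quant_algo") or ""
    if producer.lower() == "modelopt" and quant_algo.lower() == "fp8":
        return {"quant_method": "modelopt"}
    return quant_cfg


print(normalize_modelopt_quant_cfg(
    {"producer": {"name": "modelopt"}, "quant_algo": "FP8"}))
# {'quant_method': 'modelopt'}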