
Commit a103303

Edwardf0t1 authored and Chen-zexi committed
Enable ModelOpt Llama4 fp8 checkpoint deployment (vllm-project#20419)
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
1 parent 1d5c616 commit a103303

File tree

5 files changed: +501, -35 lines changed


vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 31 additions & 6 deletions
@@ -81,6 +81,16 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
         raise NotImplementedError
 
+    def uses_weight_scale_2_pattern(self) -> bool:
+        """
+        Returns True if this quantization method uses 'weight_scale_2' pattern
+        for per-tensor weight scales (e.g., FP4 variants), False otherwise.
+
+        This method should be overridden by subclasses that use the
+        'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
+        """
+        return False
+
     @staticmethod
     def maybe_make_prepare_finalize(
             moe: FusedMoEConfig) -> Optional[FusedMoEPrepareAndFinalize]:
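
For context, a minimal standalone sketch (not part of the diff, no vLLM imports; the class names are illustrative only) of the dispatch this hook enables:

class _BaseMethod:
    def uses_weight_scale_2_pattern(self) -> bool:
        return False  # FP8-style checkpoints use plain "weight_scale"


class _Fp4LikeMethod(_BaseMethod):
    def uses_weight_scale_2_pattern(self) -> bool:
        return True  # FP4-style checkpoints use "weight_scale_2"


def per_tensor_scale_suffix(method: _BaseMethod) -> str:
    # Pick the checkpoint tensor suffix treated as the per-tensor weight scale.
    return ("weight_scale_2"
            if method.uses_weight_scale_2_pattern() else "weight_scale")


assert per_tensor_scale_suffix(_BaseMethod()) == "weight_scale"
assert per_tensor_scale_suffix(_Fp4LikeMethod()) == "weight_scale_2"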
@@ -1081,12 +1091,23 @@ def weight_loader(self,
 
         # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern
         if "ModelOpt" in quant_method_name:
-            if ('weight_scale_2' in weight_name
-                    or 'input_scale' in weight_name):
-                self._load_per_tensor_weight_scale(shard_id=shard_id,
-                                                   param=param,
-                                                   loaded_weight=loaded_weight,
-                                                   expert_id=expert_id)
+            # Determine per-tensor weight scale patterns based on variant
+            # Use the dedicated method instead of brittle string matching
+            uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern(
+            )
+
+            # For per-tensor, FP4 uses "weight_scale_2", FP8 uses "weight_scale"
+            per_tensor_conditions = (
+                "weight_scale_2" in weight_name if uses_weight_scale_2 else
+                "weight_scale" in weight_name) or "input_scale" in weight_name
+
+            if per_tensor_conditions:
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id,
+                )
         elif "weight" in weight_name:
             self._load_model_weight_or_group_weight_scale(
                 shard_id=shard_id,
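
To make the new branch concrete, a small check of which names end up in _load_per_tensor_weight_scale under each variant (the tensor names below are illustrative, not taken from a specific checkpoint):

def is_per_tensor(weight_name: str, uses_weight_scale_2: bool) -> bool:
    # Same boolean as per_tensor_conditions in the hunk above.
    return ("weight_scale_2" in weight_name if uses_weight_scale_2 else
            "weight_scale" in weight_name) or "input_scale" in weight_name


# FP8 (uses_weight_scale_2=False): plain weight scales and input scales match.
assert is_per_tensor("experts.w13_weight_scale", False)
assert is_per_tensor("experts.w2_input_scale", False)
# FP4 (uses_weight_scale_2=True): only "*_scale_2" tensors and input scales match.
assert is_per_tensor("experts.w13_weight_scale_2", True)
assert not is_per_tensor("experts.w13_weight_scale", True)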
@@ -1558,3 +1579,7 @@ def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor,
     dispatch_key=current_platform.dispatch_key,
     tags=(torch.Tag.needs_fixed_stride_order, ),
 )
+
+# Mark the FusedMoE weight_loader as supporting MoE-specific parameters
+# to avoid expensive runtime reflection in model loading code
+FusedMoE.weight_loader.supports_moe_loading = True  # type: ignore[attr-defined]
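
The loader-side counterpart is not shown in this diff; the snippet below is only an assumed usage pattern for how the marker could be consumed instead of inspecting the function signature:

def supports_moe_params(weight_loader) -> bool:
    # Cheap attribute lookup; ordinary (non-MoE) loaders fall back to False.
    return getattr(weight_loader, "supports_moe_loading", False)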

vllm/model_executor/layers/quantization/modelopt.py

Lines changed: 261 additions & 5 deletions
@@ -42,9 +42,13 @@ class ModelOptFp8Config(QuantizationConfig):
     def __init__(
         self,
         is_checkpoint_fp8_serialized: bool = False,
+        kv_cache_quant_method: Optional[str] = None,
+        exclude_modules: Optional[list[str]] = None,
     ) -> None:
         super().__init__()
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+        self.kv_cache_quant_method = kv_cache_quant_method
+        self.exclude_modules = exclude_modules
         if is_checkpoint_fp8_serialized:
             logger.warning("Detected ModelOpt fp8 checkpoint. Please note that"
                            " the format is experimental and could change.")
@@ -69,34 +73,63 @@ def get_config_filenames(cls) -> list[str]:
     def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config":
         quant_config = cls.get_from_keys(config, ["quantization"])
         quant_method = quant_config["quant_algo"]
+        kv_cache_quant_method = cls.get_from_keys(
+            config, ["quantization"]).get("kv_cache_quant_algo")
+        exclude_modules = cls.get_from_keys(
+            config, ["quantization"]).get("exclude_modules")
+
         if quant_method not in QUANT_ALGOS:
             raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}"
                              " quantizations in vLLM. Please check the "
                              "`hf_quant_config.json` file for your model's "
                              "quant configuration.")
         is_checkpoint_fp8_serialized = ("FP8" in quant_method)
 
-        return cls(is_checkpoint_fp8_serialized)
+        return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method,
+                   exclude_modules)
+
+    def is_layer_excluded(self, prefix: str) -> bool:
+        """
+        Check if a layer should be excluded from quantization.
+
+        This method handles both regular models and multimodal models that use
+        the language_model prefix. For multimodal models, it checks if the
+        module name (without the language_model prefix) is in the exclude list.
+        """
+        if self.exclude_modules is None:
+            return False
+
+        # Check if any excluded module matches the prefix
+        for module in self.exclude_modules:
+            if (module in prefix
+                    or (prefix.startswith("language_model.")
+                        and module in prefix.removeprefix("language_model."))):
+                return True
+        return False
 
     def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         from vllm.attention.layer import Attention  # Avoid circular import
         if isinstance(layer, LinearBase):
+            if self.is_layer_excluded(prefix):
+                return UnquantizedLinearMethod()
             return ModelOptFp8LinearMethod(self)
         elif isinstance(layer, Attention):
             return ModelOptFp8KVCacheMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return ModelOptFp8MoEMethod(self)
         return None
 
 
 class ModelOptFp8LinearMethod(LinearMethodBase):
     """Linear method for Model Optimizer static quantization.
     Supports loading FP8 checkpoints with static weight scale and
-    activation scale. Future support might be added for dynamic
+    activation scale. Future support might be added for dynamic
     scales.
 
     Limitations:
     1. Only support per-tensor quantization due to torch._scaled_mm support.
-    2. Only support float8_e4m3fn datatype
+    2. Only support float8_e4m3fn datatype
     Args: quant_config: The ModelOpt quantization config.
     """
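
For illustration, here is how the new fields and the exclusion check behave (the constructor values and module names are made up; in practice they come from the checkpoint's hf_quant_config.json via from_config above):

cfg = ModelOptFp8Config(
    is_checkpoint_fp8_serialized=True,
    kv_cache_quant_method="FP8",            # maps to "kv_cache_quant_algo"
    exclude_modules=["lm_head", "router"],  # modules left unquantized
)

# Substring match, with or without the multimodal "language_model." prefix:
assert cfg.is_layer_excluded("language_model.model.layers.0.feed_forward.router")
assert not cfg.is_layer_excluded("model.layers.0.self_attn.qkv_proj")
# With exclude_modules=None (the default), nothing is excluded.
assert not ModelOptFp8Config().is_layer_excluded("lm_head")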

@@ -172,6 +205,223 @@ def apply(
                                 bias=bias)
 
 
+class ModelOptFp8MoEMethod(FusedMoEMethodBase):
+    """MoE method for ModelOpt FP8.
+    Supports loading FP8 checkpoints with static weight scale and
+    activation scale.
+    Args:
+        quant_config: The ModelOpt quantization config.
+    """
+
+    def __init__(self, quant_config: ModelOptFp8Config):
+        self.quant_config = quant_config
+        from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+            cutlass_fp8_supported)
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+
+        # Use FP8 dtype if checkpoint is serialized
+        weight_dtype = (torch.float8_e4m3fn
+                        if self.quant_config.is_checkpoint_fp8_serialized else
+                        params_dtype)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        w13_weight = ModelWeightParameter(
+            data=torch.empty(num_experts,
+                             2 * intermediate_size_per_partition,
+                             hidden_size,
+                             dtype=weight_dtype),
+            input_dim=2,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        w2_weight = ModelWeightParameter(
+            data=torch.empty(num_experts,
+                             hidden_size,
+                             intermediate_size_per_partition,
+                             dtype=weight_dtype),
+            input_dim=2,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # WEIGHT SCALES - Per-tensor scaling for ModelOpts
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_weight_scale = PerTensorScaleParameter(
+                data=torch.full(
+                    (num_experts, 2),
+                    1.0,
+                    dtype=torch.float32,
+                ),
+                weight_loader=weight_loader,
+            )
+            w2_weight_scale = PerTensorScaleParameter(
+                data=torch.full((num_experts, ), 1.0, dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("w13_weight_scale", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+            # Set weight loader attributes for scales
+            extra_weight_attrs.update(
+                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+
+            # INPUT SCALES - Per-tensor scaling for ModelOpt
+            w13_input_scale = PerTensorScaleParameter(
+                data=torch.full((num_experts, ), 1.0, dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            w2_input_scale = PerTensorScaleParameter(
+                data=torch.full((num_experts, ), 1.0, dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("w13_input_scale", w13_input_scale)
+            layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Process FP8 MoE weights after loading from serialized checkpoint.
+        Only supports pre-quantized checkpoints with FP8 weights and scales.
+        """
+
+        layer.w13_weight = Parameter(layer.w13_weight.data,
+                                     requires_grad=False)
+        layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
+
+        from vllm._custom_ops import scaled_fp8_quant
+        from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+            per_tensor_dequantize)
+
+        # Handle scale parameters
+        if hasattr(layer,
+                   "w13_weight_scale") and layer.w13_weight_scale is not None:
+            # Fp8 moe kernel needs single weight scale for w13 per expert.
+            # We take the max of the w1 and w3 scales
+            # then dequant and requant each expert.
+            if layer.w13_weight_scale.dim() == 2:
+
+                # Get the maximum scale across w1 and w3 for each expert
+                max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+
+                # Requantize each expert's weights using the combined scale
+                # w13_weight (num_experts, 2 * intermediate_size, hidden_size)
+                # where the first intermediate_size rows are w1, the next are w3
+                intermediate_size = layer.w13_weight.shape[1] // 2
+                for expert_id in range(layer.w13_weight.shape[0]):
+                    start = 0
+                    for shard_id in range(2):  # w1 and w3
+                        # Dequantize using the original scale for this shard
+                        dq_weight = per_tensor_dequantize(
+                            layer.w13_weight[expert_id][start:start +
+                                                        intermediate_size, :],
+                            layer.w13_weight_scale[expert_id][shard_id],
+                        )
+                        # Requantize using the combined max scale
+
+                        (
+                            layer.w13_weight[expert_id][start:start +
+                                                        intermediate_size, :],
+                            _,
+                        ) = scaled_fp8_quant(dq_weight,
+                                             max_w13_scales[expert_id])
+
+                        start += intermediate_size
+
+                # Update the scale parameter to be per-expert
+                layer.w13_weight_scale = Parameter(max_w13_scales,
+                                                   requires_grad=False)
+            else:
+                layer.w13_weight_scale = Parameter(
+                    layer.w13_weight_scale.data, requires_grad=False)
+
+        if hasattr(layer,
+                   "w2_weight_scale") and layer.w2_weight_scale is not None:
+            layer.w2_weight_scale = Parameter(layer.w2_weight_scale.data,
+                                              requires_grad=False)
+        # Input scales must be equal for each expert in fp8 MoE layers.
+        if hasattr(layer,
+                   "w13_input_scale") and layer.w13_input_scale is not None:
+            layer.w13_input_scale = Parameter(layer.w13_input_scale.max(),
+                                              requires_grad=False)
+        if hasattr(layer,
+                   "w2_input_scale") and layer.w2_input_scale is not None:
+            layer.w2_input_scale = Parameter(layer.w2_input_scale.max(),
+                                             requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if enable_eplb:
+            raise NotImplementedError(
+                "EPLB not supported for `ModelOptFp8MoEMethod` yet.")
+
+        # Expert selection
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias,
+        )
+        from vllm.model_executor.layers.fused_moe.fused_moe import (
+            fused_experts)
+        return fused_experts(
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            inplace=True,
+            activation=activation,
+            use_fp8_w8a8=True,
+            per_channel_quant=False,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            a1_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+        )
+
+
 class ModelOptNvFp4Config(QuantizationConfig):
     """Config class for ModelOpt FP4."""
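
The per-expert requantization above relies on vLLM's per_tensor_dequantize and scaled_fp8_quant helpers. A self-contained sketch of the same idea in plain torch, assuming simple symmetric per-tensor scaling, shows why the max of the w1/w3 scales is the one that survives:

import torch


def combine_w13_scales(w1_q: torch.Tensor, s1: torch.Tensor,
                       w3_q: torch.Tensor, s3: torch.Tensor):
    # Dequantize each shard with its original per-tensor scale.
    w1 = w1_q.to(torch.float32) * s1
    w3 = w3_q.to(torch.float32) * s3
    # The fused kernel expects one scale per expert, so the shared scale must
    # cover the larger dynamic range of the two shards: take the max.
    s = torch.maximum(s1, s3)
    # Requantize both shards with the shared scale.
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    w1_q = (w1 / s).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    w3_q = (w3 / s).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return w1_q, w3_q, s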

@@ -274,7 +524,7 @@ def __init__(self, quant_config: Union[ModelOptFp8Config,
 class ModelOptNvFp4LinearMethod(LinearMethodBase):
     """Linear method for Model Optimizer NVFP4.
     Supports loading NVFP4 checkpoints with the following structure:
-
+
     input_scale: torch.float32, scalar ,
     weight: NVFP4(represented as byte) Shape: [1, X, y/2]
     weight_scale: FP8-E4M3, Shape: [X, Y], aka per block scale,
@@ -455,7 +705,7 @@ def apply(
 class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
     """
     MoE Method for FP4 Quantization.
-    Args:
+    Args:
     quant_config: NVFP4 Quant Config
     """
 
@@ -472,6 +722,12 @@ def __init__(self, quant_config: ModelOptNvFp4Config):
                 " quantization. Please use Blackwell and"
                 " above.")
 
+    def uses_weight_scale_2_pattern(self) -> bool:
+        """
+        FP4 variants use 'weight_scale_2' pattern for per-tensor weight scales.
+        """
+        return True
+
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
