vllm-project · jingyu-ml · Jul 2, 2025 · Jun 20, 2025 · Jun 20, 2025 · Jun 21, 2025
@@ -887,6 +887,10 @@
         if quant_cfg is None:
             # compressed-tensors uses a "compression_config" key
             quant_cfg = getattr(self.hf_config, "compression_config", None)
+        if quant_cfg is not None:
+            if quant_cfg["producer"]["name"].lower() == "modelopt":
+                if "quant_algo" in quant_cfg.keys() and quant_cfg["quant_algo"].lower() == "fp8":
+                    quant_cfg = {"quant_method": "modelopt"}
-            if quant_cfg["producer"]["name"].lower() == "modelopt":
-                if "quant_algo" in quant_cfg.keys() and quant_cfg["quant_algo"].lower() == "fp8":
-                    quant_cfg = {"quant_method": "modelopt"}
+        if (
+            quant_cfg is not None
+            and quant_cfg.get("producer", {}).get("name", "").lower() == "modelopt"
+            and quant_cfg.get("quant_algo", "").lower() == "fp8"
+        ):
+            quant_cfg = {"quant_method": "modelopt"}
-            if quant_cfg["producer"]["name"].lower() == "modelopt":
-                if "quant_algo" in quant_cfg.keys() and quant_cfg["quant_algo"].lower() == "fp8":
-                    quant_cfg = {"quant_method": "modelopt"}
+        if (
+            quant_cfg is not None
+            and quant_cfg.get("producer", {}).get("name", "").lower() == "modelopt"
+            and quant_cfg.get("quant_algo", "").lower() == "fp8"
+        ):
+            quant_cfg = {"quant_method": "modelopt"}
         return quant_cfg
 
     def _verify_quantization(self) -> None:

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
@@ -81,6 +81,16 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
         raise NotImplementedError
 
+    def uses_weight_scale_2_pattern(self) -> bool:
+        """Report whether per-tensor weight scales are named 'weight_scale_2'.
+
+        The ModelOpt branch of the MoE ``weight_loader`` calls this to decide
+        which substring of a weight name marks a per-tensor weight scale:
+        'weight_scale_2' when this returns True (e.g., FP4 variants), or the
+        standard 'weight_scale' when it returns False.
+
+        Returns:
+            False in this default implementation. Subclasses that use the
+            'weight_scale_2' pattern instead of the standard 'weight_scale'
+            pattern should override this method and return True.
+        """
+        return False
+
     def init_prepare_finalize(self, moe: FusedMoEConfig,
                               quant_config: Optional[QuantizationConfig]):
         all2all_manager = get_ep_group().device_communicator.all2all_manager
@@ -1049,12 +1059,23 @@ def weight_loader(self,
 
         # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern
         if "ModelOpt" in quant_method_name:
-            if ('weight_scale_2' in weight_name
-                    or 'input_scale' in weight_name):
-                self._load_per_tensor_weight_scale(shard_id=shard_id,
-                                                   param=param,
-                                                   loaded_weight=loaded_weight,
-                                                   expert_id=expert_id)
+            # Determine per-tensor weight scale patterns based on variant
+            # Use the dedicated method instead of brittle string matching
+            uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern(
+            )
+
+            # For per-tensor, FP4 uses "weight_scale_2", FP8 uses "weight_scale"
+            per_tensor_conditions = (
+                "weight_scale_2" in weight_name if uses_weight_scale_2 else
+                "weight_scale" in weight_name) or "input_scale" in weight_name
+
+            if per_tensor_conditions:
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id,
+                )
             elif "weight" in weight_name:
                 self._load_model_weight_or_group_weight_scale(
                     shard_id=shard_id,
@@ -1526,3 +1547,7 @@ def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor,
     dispatch_key=current_platform.dispatch_key,
     tags=(torch.Tag.needs_fixed_stride_order, ),
 )
+
+# Mark the FusedMoE weight_loader as supporting MoE-specific parameters
+# to avoid expensive runtime reflection in model loading code
+FusedMoE.weight_loader.supports_moe_loading = True  # type: ignore[attr-defined]
-FusedMoE.weight_loader.supports_moe_loading = True  # type: ignore[attr-defined]
+with torch.inference_mode():
+    FusedMoE.weight_loader.supports_moe_loading = True  # type: ignore[attr-defined]
-FusedMoE.weight_loader.supports_moe_loading = True  # type: ignore[attr-defined]
+with torch.inference_mode():
+    FusedMoE.weight_loader.supports_moe_loading = True  # type: ignore[attr-defined]
@@ -190,6 +190,8 @@ def quantize_and_call_weight_loader(param: torch.nn.Parameter,
             weight_loader(param, loaded_weight, weight_name, shard_id,
                           expert_id)
 
+        # Mark as supporting MoE-specific loading to avoid expensive reflection
+        quantize_and_call_weight_loader.supports_moe_loading = True  # type: ignore[attr-defined]
         return quantize_and_call_weight_loader