fix LM Eval Small Models test failure

bnellnm · bnellnm · commit fc61195f009c · 2025-07-01T17:22:30.000Z
Signed-off-by: Bill Nell <bnell@redhat.com>
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
@@ -382,14 +382,20 @@ def make(
                 per_out_ch_quant = (
                     weight_quant.strategy == QuantizationStrategy.CHANNEL)
 
-            assert quant_dtype is not None
-
-            _quant_config = FusedMoEQuantConfig(
-                quant_dtype=quant_dtype,
-                per_act_token_quant=per_act_token_quant,
-                per_out_ch_quant=per_out_ch_quant,
-                block_shape=block_shape,
-            )
+            if quant_dtype is not None:
+                _quant_config = FusedMoEQuantConfig(
+                    quant_dtype=quant_dtype,
+                    per_act_token_quant=per_act_token_quant,
+                    per_out_ch_quant=per_out_ch_quant,
+                    block_shape=block_shape,
+                )
+            else:
+                logger.warning_once("MoE DP setup unable to determine "
+                                    "quantization scheme or unsupported "
+                                    "quantization type. This model will "
+                                    "not run with DP enabled.")
+
+            _quant_config = FusedMoEQuantConfig()
         else:
             _quant_config = quant_config
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
@@ -799,8 +799,8 @@ def select_gemm_impl(
                 self.quant_config.weight_block_size, False)
             return BatchedTritonOrDeepGemmExperts(
                 max_num_tokens=max_num_tokens_per_rank,
-                world_size=moe.world_size,
-                dp_size=moe.dp_size,
+                world_size=prepare_finalize.world_size,
+                dp_size=prepare_finalize.dp_size,
                 use_fp8_w8a8=True,
                 block_shape=self.quant_config.weight_block_size,
                 per_act_token_quant=False,