Fix enable_multistream_moe for unquantized scenario

sdmyzlp · sdmyzlp · commit 57d570aa1650 · 2025-07-07T10:35:34.000+08:00
Signed-off-by: sdmyzlp <lrwei2@petalmail.com>
diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
@@ -40,6 +40,7 @@
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import FusedMoEState
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
+from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
 from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
                                get_ascend_soc_version, npu_stream_switch,
                                npu_wait_tensor)
@@ -1144,9 +1145,10 @@ def forward(self,
         if self.enable_multistream_moe:
             assert gate is not None
             router_logits, _ = gate(hidden_states)
-            if isinstance(self.quant_method.quant_method,
-                          AscendW8A8DynamicFusedMoEMethod
-                          ) and fused_moe_state == FusedMoEState.MC2:
+            quant_method = self.quant_method
+            if isinstance(quant_method, AscendFusedMoEMethod) and isinstance(
+                    quant_method.quant_method, AscendW8A8DynamicFusedMoEMethod
+            ) and fused_moe_state == FusedMoEState.MC2:
                 with npu_stream_switch("moe_secondary", 0):
                     quantized_x_for_share, dynamic_scale_for_share = torch_npu.npu_dynamic_quant(
                         hidden_states)