
Commit 105e655

Re-add 2stage moe
1 parent a9af7a9 commit 105e655

File tree: 5 files changed, +62 -15 lines changed

vllm/envs.py
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/utils.py

vllm/envs.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -85,6 +85,7 @@
     VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
     VLLM_ROCM_USE_AITER_LINEAR: bool = True
     VLLM_ROCM_USE_AITER_MOE: bool = True
+    VLLM_ROCM_USE_AITER_2STAGE_MOE: bool = True
     VLLM_ROCM_USE_AITER_RMSNORM: bool = True
     VLLM_ROCM_USE_AITER_MLA: bool = True
     VLLM_ROCM_USE_SKINNY_GEMM: bool = True
@@ -598,6 +599,11 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     lambda: (os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in
              ("true", "1")),
 
+    # use aiter ck fused moe op if aiter ops are enabled
+    "VLLM_ROCM_USE_AITER_2STAGE_MOE":
+    lambda: (os.getenv("VLLM_ROCM_USE_AITER_2STAGE_MOE", "True").lower() in
+             ("true", "1")),
+
     # use aiter rms norm op if aiter ops are enabled.
     "VLLM_ROCM_USE_AITER_RMSNORM":
     lambda: (os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in
```

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 7 additions & 3 deletions
```diff
@@ -121,11 +121,15 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                                               requires_grad=False)
         # Lazy import to avoid importing triton.
         from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            is_rocm_aiter_moe_enabled, shuffle_weights)
+            is_rocm_aiter_2stage_moe_enabled, is_rocm_aiter_moe_enabled,
+            shuffle_weights)
         if is_rocm_aiter_moe_enabled():
             # reshaping weights is required for aiter moe kernel.
-            shuffled_w13, shuffled_w2 = shuffle_weights(
-                layer.w13_weight.data, layer.w2_weight.data)
+            layout = (32, 32) if is_rocm_aiter_2stage_moe_enabled() else (16,
+                                                                          16)
+            shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data,
+                                                        layer.w2_weight.data,
+                                                        layout=layout)
 
             layer.w13_weight = torch.nn.Parameter(shuffled_w13,
                                                   requires_grad=False)
```
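
The only functional change here is that the weight shuffle now depends on which AITER kernel will run: (32, 32) for the new 2-stage ck kernel, (16, 16) for the existing path. A stand-alone sketch of that selection, with plain booleans standing in for is_rocm_aiter_moe_enabled() / is_rocm_aiter_2stage_moe_enabled(), which need a ROCm build:

```python
# Reading aid only, not vLLM code: picks the shuffle layout the same way the
# diff above does once the AITER MoE path is active.
def pick_shuffle_layout(aiter_moe: bool, two_stage: bool):
    if not aiter_moe:
        return None  # weights are left as loaded, no shuffle happens
    return (32, 32) if two_stage else (16, 16)

assert pick_shuffle_layout(aiter_moe=True, two_stage=True) == (32, 32)
assert pick_shuffle_layout(aiter_moe=True, two_stage=False) == (16, 16)
assert pick_shuffle_layout(aiter_moe=False, two_stage=True) is None
```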

vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py

Lines changed: 31 additions & 3 deletions
```diff
@@ -13,6 +13,12 @@ def is_rocm_aiter_moe_enabled() -> bool:
         and envs.VLLM_ROCM_USE_AITER
 
 
+def is_rocm_aiter_2stage_moe_enabled() -> bool:
+    return current_platform.is_rocm() \
+        and envs.VLLM_ROCM_USE_AITER_2STAGE_MOE \
+        and envs.VLLM_ROCM_USE_AITER
+
+
 def rocm_aiter_asm_moe_tkw1(hidden_states,
                             w1,
                             w2,
@@ -165,6 +171,17 @@ def rocm_aiter_fused_experts(
     elif use_fp8_w8a8:
         assert not apply_router_weight_on_input, (
             "apply_router_weight_on_input is not supported for fp8_w8a8")
+        if is_rocm_aiter_2stage_moe_enabled():
+            from aiter.fused_moe_bf16_asm import ck_moe_2stages
+            return ck_moe_2stages(a1=hidden_states,
+                                  w1=w1,
+                                  w2=w2,
+                                  topk_weight=topk_weights,
+                                  topk_ids=topk_ids,
+                                  fc1_scale=w1_scale,
+                                  fc2_scale=w2_scale,
+                                  a1_scale=a1_scale,
+                                  a2_scale=a2_scale)
         return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states,
                                            w1=w1,
                                            w2=w2,
@@ -187,7 +204,17 @@
         hidden_states = hidden_states * topk_weights.to(hidden_states.dtype)
         topk_ids = topk_ids.to(torch.int32)
         topk_weights = torch.ones_like(topk_weights, dtype=torch.float32)
-
+    if is_rocm_aiter_2stage_moe_enabled():
+        from aiter.fused_moe_bf16_asm import ck_moe_2stages
+        return ck_moe_2stages(a1=hidden_states,
+                              w1=w1,
+                              w2=w2,
+                              topk_weight=topk_weights,
+                              topk_ids=topk_ids,
+                              fc1_scale=w1_scale,
+                              fc2_scale=w2_scale,
+                              a1_scale=a1_scale,
+                              a2_scale=a2_scale)
     return rocm_aiter.ck_moe(hidden_states=hidden_states,
                              w1=w1,
                              w2=w2,
@@ -207,7 +234,8 @@ def rocm_aiter_topk_softmax(topk_weights: torch.Tensor,
     return topk_weights, topk_indices
 
 
-def shuffle_weights(*tensors: torch.Tensor) -> tuple[torch.Tensor, ...]:
+def shuffle_weights(*tensors: torch.Tensor,
+                    layout: tuple[int, int]) -> tuple[torch.Tensor, ...]:
     """
     Applies shuffle_weight function from AITER to each
     input tensor and returns them.
@@ -220,7 +248,7 @@ def shuffle_weights(*tensors: torch.Tensor) -> tuple[torch.Tensor, ...]:
     """
    from aiter.ops.shuffle import shuffle_weight
 
-    return tuple(shuffle_weight(tensor) for tensor in tensors)
+    return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors)
 
 
 def expand_weights(*tensors: torch.Tensor,
```
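
One detail of the shuffle_weights change: because the new layout parameter comes after *tensors, it is keyword-only, which is why every call site in this commit spells layout=... explicitly. A small stand-alone illustration of that Python rule (no aiter install needed; the stub only mimics the parameter shape):

```python
# Stub with the same parameter shape as the updated shuffle_weights:
# *tensors absorbs all positional arguments, so layout must be a keyword.
def shuffle_weights_stub(*tensors, layout):
    return tuple((t, layout) for t in tensors)

shuffle_weights_stub("w13", "w2", layout=(16, 16))  # fine
try:
    shuffle_weights_stub("w13", "w2", (16, 16))  # tuple is absorbed into *tensors
except TypeError as err:
    print(err)  # missing 1 required keyword-only argument: 'layout'
```

The ck_moe_2stages calls themselves require the aiter package on ROCm, so they are only reached when is_rocm_aiter_2stage_moe_enabled() is true.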

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 17 additions & 8 deletions
```diff
@@ -585,7 +585,8 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
     def process_weights_after_loading(self, layer: Module) -> None:
         # Lazy import to avoid importing triton too early.
         from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            expand_weights, is_rocm_aiter_moe_enabled, shuffle_weights)
+            expand_weights, is_rocm_aiter_2stage_moe_enabled,
+            is_rocm_aiter_moe_enabled, shuffle_weights)
 
         # TODO (rob): refactor block quant into separate class.
         if self.block_quant:
@@ -615,7 +616,9 @@ def process_weights_after_loading(self, layer: Module) -> None:
             if is_rocm_aiter_moe_enabled():
                 # reshaping weights is required for aiter moe kernel.
                 shuffled_w13, shuffled_w2 = shuffle_weights(
-                    layer.w13_weight.data, layer.w2_weight.data)
+                    layer.w13_weight.data,
+                    layer.w2_weight.data,
+                    layout=(16, 16))
 
                 layer.w13_weight = torch.nn.Parameter(shuffled_w13,
                                                       requires_grad=False)
@@ -673,9 +676,12 @@ def process_weights_after_loading(self, layer: Module) -> None:
                 w13_scales.contiguous(), requires_grad=False)
             layer.w2_weight_scale = torch.nn.Parameter(
                 w2_scales.contiguous(), requires_grad=False)
-
-            shuffled_w13, shuffled_w2 = shuffle_weights(
-                layer.w13_weight, layer.w2_weight)
+            layout = (32,
+                      32) if is_rocm_aiter_2stage_moe_enabled() else (16,
+                                                                      16)
+            shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight,
+                                                        layer.w2_weight,
+                                                        layout=layout)
 
             layer.w13_weight = torch.nn.Parameter(shuffled_w13,
                                                   requires_grad=False)
@@ -759,9 +765,12 @@ def process_weights_after_loading(self, layer: Module) -> None:
                     expansion_dims=expansion_dims)
             layer.w2_weight_scale = torch.nn.Parameter(
                 w2_scales.contiguous(), requires_grad=False)
-
-            shuffled_w13, shuffled_w2 = shuffle_weights(
-                layer.w13_weight, layer.w2_weight)
+            layout = (32,
+                      32) if is_rocm_aiter_2stage_moe_enabled() else (16,
+                                                                      16)
+            shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight,
+                                                        layer.w2_weight,
+                                                        layout=layout)
 
             layer.w13_weight = torch.nn.Parameter(shuffled_w13,
                                                   requires_grad=False)
```
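
Of the three shuffle_weights call sites touched here, one keeps a fixed layout=(16, 16) regardless of the flag, while the other two switch to (32, 32) when the 2-stage kernel is enabled. A hedged reading aid of that decision (the boolean names are stand-ins, not vLLM API):

```python
# Summarises the layout choices made at the fp8 MoE shuffle sites above.
def fp8_moe_shuffle_layout(fixed_site: bool, two_stage: bool) -> tuple[int, int]:
    if fixed_site:
        return (16, 16)  # this call site ignores the 2-stage flag entirely
    return (32, 32) if two_stage else (16, 16)

assert fp8_moe_shuffle_layout(fixed_site=True, two_stage=True) == (16, 16)
assert fp8_moe_shuffle_layout(fixed_site=False, two_stage=True) == (32, 32)
assert fp8_moe_shuffle_layout(fixed_site=False, two_stage=False) == (16, 16)
```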

vllm/model_executor/layers/utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -87,7 +87,7 @@ def rocm_unquantized_gemm(x: torch.Tensor,
         out = ops.wvSplitK(weight, x_view, cu_count)
         return out.view(*x.shape[:-1], weight.shape[0])
     elif m % 4 == 0 and n == 1 and k <= 8192:
-        out = ops.LLMM1(weight, x_view, out, 4)
+        out = ops.LLMM1(weight, x_view, 4)
         return out.view(*x.shape[:-1], weight.shape[0])
     return torch.nn.functional.linear(x, weight, bias)
```
9393
