
Commit 31b66d8

scale hacking
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent e5f2e2b commit 31b66d8

File tree: 7 files changed (+172, -67 lines)

tests/kernels/moe/test_batched_moe.py

Lines changed: 67 additions & 27 deletions
@@ -204,29 +204,69 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
     config_block_shape = [16, 16, 32]  # 16 for k if not fp8
 
     #print(f"A {use_fp8_w8a8} {A_q.dtype} {B_q.dtype} {A_scale.shape} {B_scale.shape}")
-
-    invoke_moe_batched_triton_kernel(
-        A_q,
-        B_q,
-        test_output,
-        num_expert_tokens,
-        compute_tl_dtype,
-        # Quantization data
-        A_scale,
-        B_scale,
-        None,
-        # Quantization schemes
-        use_fp8_w8a8,
-        False,
-        False,
-        config={
-            "BLOCK_SIZE_M": config_block_shape[0],
-            "BLOCK_SIZE_N": config_block_shape[1],
-            "BLOCK_SIZE_K": config_block_shape[2],
-        },
-        per_act_token_quant=False,
-        block_shape=block_shape,
-    )
+    if False:
+        from vllm.model_executor.layers.fused_moe.batched_moe2 import fused_moe_kernel2
+        fused_moe_kernel2(
+            A_q,
+            B_q,
+            test_output,
+            A_scale,
+            B_scale,
+            num_expert_tokens,
+            N,
+            K,
+            max_tokens_per_expert,
+            max_tokens_per_expert,
+            A_q.stride(0),
+            A_q.stride(1),
+            A_q.stride(2),
+            B_q.stride(0),
+            B_q.stride(1),
+            B_q.stride(2),
+            test_output.stride(0),
+            test_output.stride(1),
+            A_scale.stride(0),
+            A_scale.stride(1),
+            A_scale.stride(2),
+            B_scale.stride(0),
+            B_scale.stride(1),
+            B_scale.stride(2),
+            block_shape[0] if block_shape is not None else 0,
+            block_shape[1] if block_shape is not None else 0,
+            config_block_shape[0],
+            config_block_shape[1],
+            config_block_shape[2],
+            1,
+            1,  # topk hack
+            compute_tl_dtype,
+            use_fp8_w8a8,
+            False,
+            False,
+            per_channel_quant=False,
+        )
+    else:
+        invoke_moe_batched_triton_kernel(
+            A_q,
+            B_q,
+            test_output,
+            num_expert_tokens,
+            compute_tl_dtype,
+            # Quantization data
+            A_scale,
+            B_scale,
+            None,
+            # Quantization schemes
+            use_fp8_w8a8,
+            False,
+            False,
+            config={
+                "BLOCK_SIZE_M": config_block_shape[0],
+                "BLOCK_SIZE_N": config_block_shape[1],
+                "BLOCK_SIZE_K": config_block_shape[2],
+            },
+            per_act_token_quant=False,
+            block_shape=block_shape,
+        )
 
     ref_output = ref_impl(
         A,
@@ -283,7 +323,7 @@ def per_block_cast_to_fp8(
     return x_scaled_sub, scales
 
 
-def make_test_weights(
+def _make_test_weights(
     e: int,
     n: int,
     k: int,
@@ -298,10 +338,10 @@ def make_test_weights(
     fp8_info = torch.finfo(torch.float8_e4m3fn)
     fp8_max, fp8_min = fp8_info.max, fp8_info.min
 
-    w1_bf16 = torch.randn((e, 2 * n, k), dtype=dtype) / 10
+    w1_bf16 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
     w1_bf16 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
 
-    w2_bf16 = torch.randn((e, k, n), dtype=dtype) / 10
+    w2_bf16 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
     w2_bf16 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
 
     block_n, block_k = block_size[0], block_size[1]
@@ -330,7 +370,7 @@ def make_test_weights(
     return w1, w2, w1_s, w2_s, w1_bf16, w2_bf16
 
 
-def _make_test_weights(e, n, k, block_shape, dtype):
+def make_test_weights(e, n, k, block_shape, dtype):
     use_fp8_w8a8 = dtype == torch.torch.float8_e4m3fn
     w_dtype = torch.bfloat16 if use_fp8_w8a8 else dtype

vllm/distributed/device_communicators/all2all.py

Lines changed: 0 additions & 3 deletions
@@ -84,9 +84,6 @@ def __init__(self, cpu_group):
         assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels."  # noqa
         super().__init__(cpu_group)
 
-        # Intranode doesn't work yet.
-        self.internode = True
-
         if self.internode:
             # inter-node communication needs nvshmem,
             # intra-node communication uses p2p mapping directly

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 49 additions & 18 deletions
@@ -29,8 +29,8 @@ def moe_mmk(
         stride_ak,
         stride_bk,
         stride_ase,
-        stride_ask,
         stride_asm,
+        stride_ask,
         stride_bse,
         stride_bsk,
         stride_bsn,
@@ -156,8 +156,8 @@ def expert_triton_kernel(
         stride_cm,
         stride_cn,
         stride_ase,
-        stride_ask,
         stride_asm,
+        stride_ask,
         stride_bse,
         stride_bsk,
         stride_bsn,
@@ -196,8 +196,8 @@ def expert_triton_kernel(
         stride_ak,
         stride_bk,
         stride_ase,
-        stride_ask,
         stride_asm,
+        stride_ask,
         stride_bse,
         stride_bsk,
         stride_bsn,
@@ -253,8 +253,8 @@ def batched_triton_kernel(
         stride_cm,
         stride_cn,
         stride_ase,
-        stride_ask,
         stride_asm,
+        stride_ask,
         stride_bse,
         stride_bsk,
         stride_bsn,
@@ -297,11 +297,11 @@ def batched_triton_kernel(
 
     if use_fp8_w8a8:
         # block-wise
-        if group_k > 0 and group_n > 0:
+        if (group_k > 0 and group_n > 0) or per_channel_quant:
             a_scale_ptr = a_scale_ptr + (expert_id * stride_ase) + cta_m_start * stride_asm
             #b_scale_ptr = b_scale_ptr + (expert_id * stride_bse)  # + cta_n_start * stride_bsn?
-        # channel-wise
-        elif per_channel_quant:
+        # channel-wise or tensor-wise
+        else:
             a_scale_ptr = a_scale_ptr + (expert_id * stride_ase)
             #b_scale_ptr = b_scale_ptr + (expert_id * stride_bse)
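Note on the branch change above: block-wise and per-channel activation scales both vary along the token dimension, so they now share the per-row offset, while the remaining per-tensor case only offsets by expert. A minimal host-side sketch of that selection, with a hypothetical helper name (`a_scale_offset` is not part of the kernel):

```python
# Illustrative sketch only: restates the pointer-offset branch in
# batched_triton_kernel with a hypothetical helper. Offsets are in elements.
def a_scale_offset(expert_id: int, cta_m_start: int,
                   stride_ase: int, stride_asm: int,
                   group_k: int, group_n: int,
                   per_channel_quant: bool) -> int:
    if (group_k > 0 and group_n > 0) or per_channel_quant:
        # block-wise or channel-wise: scales vary per row, so offset by the
        # expert and by the first row this CTA handles.
        return expert_id * stride_ase + cta_m_start * stride_asm
    # tensor-wise: one scale per expert, offset by expert only.
    return expert_id * stride_ase
```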

@@ -325,8 +325,8 @@ def batched_triton_kernel(
         stride_cm,
         stride_cn,
         stride_ase,
-        stride_ask,
         stride_asm,
+        stride_ask,
         stride_bse,
         stride_bsk,
         stride_bsn,
@@ -373,6 +373,36 @@ def invoke_moe_batched_triton_kernel(
     grid = (expert_num_tokens.size(0), triton.cdiv(max_num_tokens, BLOCK_M) *
             triton.cdiv(B.size(1), BLOCK_N))
 
+    assert A_scale is None or A_scale.ndim == 1 or A_scale.ndim == 3, f"{0 if A_scale is None else (A_scale.ndim, A_scale.shape)}"
+    assert B_scale is None or B_scale.ndim == 1 or B_scale.ndim == 3, f"{0 if B_scale is None else (B_scale.ndim, B_scale.shape)}"
+
+    #print(f"SCALES {A_scale.shape}, {B_scale.shape}")
+
+    stride_bse = 0
+    stride_bsk = 0
+    stride_bsn = 0
+    if B_scale is not None:
+        if B_scale.ndim == 1:
+            stride_bsk = B_scale.stride(0)
+        else:
+            assert B_scale.ndim == 3
+            stride_bse = B_scale.stride(0)
+            stride_bsn = B_scale.stride(1)
+            stride_bsk = B_scale.stride(2)
+
+    stride_ase = 0
+    stride_asm = 0
+    stride_ask = 0
+    if A_scale is not None:
+        if A_scale.ndim == 1:
+            stride_ask = A_scale.stride(0)
+        else:
+            assert A_scale.ndim == 3
+            stride_ase = A_scale.stride(0)
+            stride_asm = A_scale.stride(1)
+            stride_ask = A_scale.stride(2)
+
+
     batched_triton_kernel[grid](
         A,
         B,
@@ -397,15 +427,12 @@ def invoke_moe_batched_triton_kernel(
         C.stride(0),
         C.stride(1),
         C.stride(2),
-
-        A_scale.stride(0) if A_scale is not None and A_scale.ndim >= 2 else 0, #E
-        A_scale.stride(2) if A_scale is not None and A_scale.ndim == 3 else 0, #K
-        A_scale.stride(1) if A_scale is not None and A_scale.ndim >= 2 else 0, #M
-
-        B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0, #E
-        B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0, #K
-        B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, #N
-
+        stride_ase,
+        stride_asm,
+        stride_ask,
+        stride_bse,
+        stride_bsk,
+        stride_bsn,
         # Blockwise quantization data
         0 if block_shape is None else block_shape[0],
         0 if block_shape is None else block_shape[1],
@@ -537,7 +564,11 @@ def prepare(
 
             tokens_per_expert[idx] = rows
 
-        return b_a1, b_a1_scale, tokens_per_expert, None, None
+        #b_a1_scale.fill_(0.0001)
+        #print(f"A1Q_scale = {b_a1_scale.shape}\n{b_a1_scale}")
+        assert b_a1_scale is None or b_a1_scale.ndim == 3
+
+        return b_a1, b_a1_scale, tokens_per_expert
 
     def finalize(
         self,
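The stride setup added to `invoke_moe_batched_triton_kernel` accepts scale tensors that are either absent, 1-D (a single per-tensor scale), or 3-D (per the comments in the removed code: experts x rows x groups for A, experts x N x groups for B). A standalone sketch of that logic, factored into a hypothetical `scale_strides` helper that is not part of the patch:

```python
from typing import Optional

import torch


def scale_strides(scale: Optional[torch.Tensor]) -> tuple[int, int, int]:
    """Hypothetical restatement of the inlined stride setup: returns
    (expert, row, group) strides, with 0 for dimensions that are absent."""
    stride_e = stride_row = stride_grp = 0
    if scale is not None:
        if scale.ndim == 1:
            # per-tensor scale kept as a flat (often 1-element) vector
            stride_grp = scale.stride(0)
        else:
            assert scale.ndim == 3  # (experts, rows, groups)
            stride_e = scale.stride(0)
            stride_row = scale.stride(1)
            stride_grp = scale.stride(2)
    return stride_e, stride_row, stride_grp


# e.g. stride_ase, stride_asm, stride_ask = scale_strides(A_scale)
#      stride_bse, stride_bsn, stride_bsk = scale_strides(B_scale)
```

The argument reordering in the kernel signatures above (stride_asm now precedes stride_ask) puts the A-scale strides in that same (expert, row, group) order.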

vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

Lines changed: 35 additions & 16 deletions
@@ -97,27 +97,32 @@ def prepare(
                 "apply_router_weight_on_input is only implemented for topk=1")
             a1 = a1 * rank_topk_weights.to(a1.dtype)
 
-
-        repeat_cols = 4
-        repeat_rows = 1 if self.per_act_token_quant else a1.shape[0]
         a1q, a1q_scale = moe_kernel_quantize_input(
             a1, (None if self.per_act_token_quant else a1_scale), self.quant_dtype,
             self.per_act_token_quant, self.block_shape)
 
+        # pplx requires 2-d scales even for scalars
         if a1q_scale is not None:
-            a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols)
+            if a1q_scale.dim() <= 1:
+                assert a1q_scale.numel() == 1
+                a1q_scale = a1q_scale.view(1, 1)
+
+            #print(f"ORIG {a1q_scale.shape}, {a1q_scale}")
+
+            orig_scale = a1q_scale
+            orig_a1q_scale_shape = a1q_scale.shape
 
-        # per_act_token_quant = a1_scale.numel() != 1 if a1_scale is not None else (
-        #     a2_scale.numel() != 1 if a2_scale is not None else False)
+            # pad out scales if needed
+            if a1q_scale.numel() == 1:
+                a1q_scale = a1q_scale.repeat(a1q.shape[1], 4)
 
-        # a1q, a1q_scale = moe_kernel_quantize_input(a1, a1_scale,
-        #                                            self.quant_dtype,
-        #                                            per_act_token,
-        #                                            self.block_shape)
+            assert a1q_scale.shape[0] == a1q.shape[1]
 
-        if a1q_scale is not None and a1q_scale.dim() == 1:
-            assert a1q_scale.numel() == 1
-            a1q_scale = a1q_scale.view(1, 1)
+            #print(f"FINAL {a1q_scale.shape}, {a1q_scale}")
+
+
+        assert a1q_scale is None or a1q_scale.ndim == 2, \
+            f"{0 if a1q_scale is None else (a1q_scale.ndim, a1q_scale.shape)}"
 
         # rem_experts need to be 0 for pplx to work properly.
         rem_experts = num_experts % self.world_size
@@ -147,7 +152,8 @@ def prepare(
         expert_x_scale_shape = (
             num_local_experts,
             expert_x.size(1),
-            (expert_x.size(2) + block_size - 1) // block_size,
+            #(expert_x.size(2) + block_size - 1) // block_size,
+            orig_a1q_scale_shape[-1],
        )
 
         #print(f"XXXXXXXXXX {block_size} {expert_x_scale_shape}")
@@ -176,9 +182,22 @@ def prepare(
         if expert_x_scale is not None:
             expert_x_scale = expert_x_scale[:, :, 0:1]
 
-        #print(f"ZZZZZZZZZZZZZZ")
+        #print(f"ZZZZZZZZZZZZZZ {expert_x_scale.shape}")
         if expert_x_scale is not None:
-            expert_x_scale = expert_x_scale[:, :, 0:1]
+            expert_x_scale = expert_x_scale[:, :, :orig_a1q_scale_shape[-1]]
+            from math import prod
+            if prod(orig_a1q_scale_shape) == 1:
+                expert_x_scale = expert_x_scale[:, :1, :1]
+                #print(f"EPT {expert_num_tokens.flatten()}")
+                #print(f"SCALARIZING!!! {expert_x_scale.shape}, {expert_x_scale.flatten()}")
+                idx = expert_num_tokens.flatten() != 0
+                assert torch.all(expert_x_scale.flatten()[idx] != 0)
+                #zidx = expert_num_tokens.flatten() == 0
+                #assert torch.all(expert_x_scale.flatten()[zidx] == 0)
+                assert expert_x_scale.ndim == 3
+            #expert_x_scale = orig_scale.view(1)
+
+        assert expert_x_scale.ndim == 1 or expert_x_scale.ndim == 3
 
         return expert_x, expert_x_scale, expert_num_tokens, None, None
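The prepare() changes above revolve around one constraint: pplx dispatch expects a 2-D activation scale even when quantization is per-tensor, so a scalar scale is reshaped and padded before dispatch and then sliced back down afterwards (`expert_x_scale[:, :, :orig_a1q_scale_shape[-1]]`, or `[:, :1, :1]` in the scalar case). A minimal sketch of the pre-dispatch step, assuming `max_tokens` stands in for `a1q.shape[1]` and the helper name is hypothetical:

```python
import torch


def pad_scale_for_pplx(a1q_scale: torch.Tensor, max_tokens: int) -> torch.Tensor:
    """Sketch of the scale normalization in prepare(): pplx requires 2-D
    scales even for scalars, so a 0-D/1-D per-tensor scale is viewed as
    (1, 1) and repeated across the token dimension (and 4 columns, matching
    the patch) before dispatch."""
    if a1q_scale.dim() <= 1:
        assert a1q_scale.numel() == 1
        a1q_scale = a1q_scale.view(1, 1)
    if a1q_scale.numel() == 1:
        a1q_scale = a1q_scale.repeat(max_tokens, 4)
    assert a1q_scale.ndim == 2
    return a1q_scale
```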

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 13 additions & 2 deletions
@@ -788,7 +788,12 @@ def select_gemm_impl(self, prepare_finalize, moe):
         use_batched_experts = max_num_tokens_per_rank is not None
 
         if use_batched_experts:
-            logger.debug("BatchedTritonExperts(fp8)")
+            logger.debug(
+                "BatchedTritonOrDeepGemmExperts(%s): block_size=%s, per_act_token=%s",
+                self.__class__.__name__,
+                self.quant_config.weight_block_size,
+                False
+            )
             return BatchedTritonOrDeepGemmExperts(
                 max_num_tokens=max_num_tokens_per_rank,
                 world_size=prepare_finalize.world_size,
@@ -799,10 +804,16 @@ def select_gemm_impl(self, prepare_finalize, moe):
                 allow_deep_gemm=self.allow_deep_gemm,
             )
         else:
-            logger.debug("TritonOrDeepGemmExperts(fp8)")
+            logger.debug(
+                "TritonOrDeepGemmExperts(%s): block_size=%s, per_act_token=%s",
+                self.__class__.__name__,
+                self.quant_config.weight_block_size,
+                False
+            )
             return TritonOrDeepGemmExperts(
                 use_fp8_w8a8=True,
                 block_shape=self.quant_config.weight_block_size,
+                per_act_token=False, #?
                 allow_deep_gemm=self.allow_deep_gemm,
             )
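For context, the dispatch these log messages describe hinges on a single predicate: batched experts are chosen whenever the prepare/finalize object reports a per-rank token budget. A condensed sketch (construction details omitted; `per_act_token` is hard-coded to False here, which the `#?` comment flags as provisional):

```python
from typing import Optional


def fp8_experts_choice(max_num_tokens_per_rank: Optional[int]) -> str:
    """Condensed sketch of the branch structure in select_gemm_impl; the
    real method constructs BatchedTritonOrDeepGemmExperts or
    TritonOrDeepGemmExperts with the arguments shown in the diff."""
    use_batched_experts = max_num_tokens_per_rank is not None
    if use_batched_experts:
        # fixed per-rank token capacity -> batched expert kernels
        return "BatchedTritonOrDeepGemmExperts"
    # no per-rank cap -> contiguous (non-batched) path
    return "TritonOrDeepGemmExperts"
```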
