
Commit addd937

wip hacking
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 31b66d8 commit addd937

3 files changed: 33 additions and 42 deletions


vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 2 additions & 0 deletions
@@ -922,6 +922,8 @@ def apply(
 
         intermediate_cache1.fill_(0)
 
+        #print(f"A1_SCALES {a1q_scale.shape}")
+
         # MM1
         invoke_moe_batched_triton_kernel(A=hidden_states,
                                          B=w1,

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 9 additions & 1 deletion
@@ -340,8 +340,16 @@ def init_prepare_finalize(self, moe: MoEConfig,
 
             input_activations = get_quant_config_input_activations(
                 quant_config)
+            block_shape = quant_config.weight_block_size if quant_config is not None else None
 
             logger.debug("PplxPrepareAndFinalize")
+
+            # XXXXXXXXXXXXXXXXXXXXXXXXX TODO
+            # Remove quant flags from PrepareAndFinalize ctor and
+            # pass them in as arguments to prepare(). Get them
+            # from the FusedExperts as attributes or arguments
+
+
             prepare_finalize = PplxPrepareAndFinalize(
                 handle,
                 max_num_tokens=moe.max_num_tokens,
@@ -353,7 +361,7 @@ def init_prepare_finalize(self, moe: MoEConfig,
                 per_act_token=(input_activations.strategy
                                == QuantizationStrategy.TOKEN
                                if input_activations is not None else False),
-                block_shape=None, # TODO (bnell): quantization
+                block_shape=None, #block_shape
             )
         elif moe.use_deepep_ht_kernels:
             assert moe.dp_size == all2all_manager.dp_world_size
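
Note on the layer.py hunks above: the new block_shape local is read from the quantization config but is not yet forwarded to PplxPrepareAndFinalize (the call still passes block_shape=None, now annotated with #block_shape). Below is a minimal sketch of how that value could be resolved, assuming a quant config object exposing a weight_block_size attribute such as [128, 128]; the helper name is illustrative, not part of the commit.

from typing import Optional

def resolve_block_shape(quant_config) -> Optional[list[int]]:
    # weight_block_size is e.g. [128, 128] for block-quantized fp8 checkpoints;
    # None means per-tensor or per-token activation scaling.
    if quant_config is None:
        return None
    return getattr(quant_config, "weight_block_size", None)

# Per the TODO in the hunk, such quantization flags would eventually be passed
# to prepare() rather than to the PplxPrepareAndFinalize constructor.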

vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

Lines changed: 22 additions & 41 deletions
@@ -22,13 +22,17 @@ def pplx_hidden_dim_scale_bytes(
     # ceil_div(hidden_dim, block_size) * sizeof(float32)
     # For per-token: set to 4 * sizeof(float32) (x4 for alignment)
     if quant_dtype is not None and quant_dtype.itemsize == 1:
-        block_size = block_shape[0] if block_shape is not None else 128
         hidden_dim_bytes = hidden_dim * quant_dtype.itemsize
-        if per_act_token_quant:
-            hidden_scale_bytes = 4 * torch.float32.itemsize #?
-        else:
+        elem_size = torch.float32.itemsize
+        if block_shape is not None:
+            assert not per_act_token_quant
+            block_size = block_shape[1]
             hidden_scale_bytes = round_up(
-                (cdiv(hidden_dim, block_size) * torch.float32.itemsize), 16)
+                (cdiv(hidden_dim, block_size) * elem_size), elem_size)
+        elif per_act_token_quant:
+            hidden_scale_bytes = hidden_dim * elem_size
+        else:
+            hidden_scale_bytes = 4 * elem_size
     else:
         hidden_dim_bytes = hidden_dim * in_dtype.itemsize
         hidden_scale_bytes = 0
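
The reworked fp8 branch in pplx_hidden_dim_scale_bytes now sizes the scale portion of each dispatched row for three cases: block quantization (one float32 scale per block_shape[1] columns of the hidden dimension, rounded up to the element size), per-token quantization (hidden_dim float32 values, as written in this commit), and per-tensor quantization (4 float32 values for alignment). A self-contained sketch of the arithmetic with an illustrative hidden_dim of 2048; cdiv and round_up below simply mirror the helpers used in this file.

import torch

def cdiv(a: int, b: int) -> int:
    return -(-a // b)

def round_up(x: int, y: int) -> int:
    return cdiv(x, y) * y

hidden_dim = 2048
elem_size = torch.float32.itemsize  # 4 bytes per float32 scale

# Block quantization, e.g. block_shape == [128, 128]:
block_scale_bytes = round_up(cdiv(hidden_dim, 128) * elem_size, elem_size)
assert block_scale_bytes == 64       # 16 blocks * 4 bytes

# Per-token activation quantization (sizing as written in this commit):
per_token_scale_bytes = hidden_dim * elem_size   # 8192 bytes

# Per-tensor quantization:
per_tensor_scale_bytes = 4 * elem_size           # 16 bytes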
@@ -101,25 +105,21 @@ def prepare(
             a1, (None if self.per_act_token_quant else a1_scale), self.quant_dtype,
             self.per_act_token_quant, self.block_shape)
 
-        # pplx requires 2-d scales even for scalars
         if a1q_scale is not None:
+            scalar_scales = a1q_scale.numel() == 1
+
+            # pplx requires 2-d scales even for scalar scales
             if a1q_scale.dim() <= 1:
-                assert a1q_scale.numel() == 1
+                assert scalar_scales
                 a1q_scale = a1q_scale.view(1, 1)
 
-            #print(f"ORIG {a1q_scale.shape}, {a1q_scale}")
-
-            orig_scale = a1q_scale
-            orig_a1q_scale_shape = a1q_scale.shape
+            # pad out scales if needed. TODO (bnell): do for non-scalar scales?
+            if scalar_scales:
+                a1q_scale = a1q_scale.repeat(a1q.shape[1], torch.float32.itemsize)
 
-            # pad out scales if needed
-            if a1q_scale.numel() == 1:
-                a1q_scale = a1q_scale.repeat(a1q.shape[1], 4)
-
-            assert a1q_scale.shape[0] == a1q.shape[1]
-
-            #print(f"FINAL {a1q_scale.shape}, {a1q_scale}")
+            orig_a_scale_block_shape = a1q_scale.shape[-1]
 
+        #assert a1_scale is None or a1_scale.shape[0] == a1q.shape[1], f"{a1_scale.shape}, {a1q_scale.shape}"
 
         assert a1q_scale is None or a1q_scale.ndim == 2, \
             f"{0 if a1q_scale is None else (a1q_scale.ndim, a1q_scale.shape)}"
@@ -146,26 +146,20 @@ def prepare(
         expert_x_scale: Optional[torch.Tensor] = None
         if a1q.dtype.itemsize == 1:
             float32_size = torch.float32.itemsize
-            block_size = (self.block_shape[0] if self.block_shape is not None
-                          else 1) * float32_size
+            block_size = (self.block_shape[1] if self.block_shape is not None else 1) * float32_size
 
             expert_x_scale_shape = (
                 num_local_experts,
                 expert_x.size(1),
-                #(expert_x.size(2) + block_size - 1) // block_size,
-                orig_a1q_scale_shape[-1],
+                (expert_x.size(2) + block_size - 1) // block_size if not scalar_scales else 1,
             )
 
-            #print(f"XXXXXXXXXX {block_size} {expert_x_scale_shape}")
-
             expert_x_scale = torch.zeros(
                 expert_x_scale_shape,
                 dtype=torch.float32,
                 device=expert_x.device,
             )
 
-            #print(f"YYYYYYYYYYYYYYY {expert_x.shape}")
-
         # This argument is optional, defaults to indices.size(0)
         # There's not much point setting this unless it is != indices.size(0)
         bound_m: Optional[torch.Tensor] = None
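
In the hunk above, the third dimension of the receive-side expert_x_scale buffer is now derived from self.block_shape[1] (scaled by the float32 element size, exactly as the code does) instead of the original scale tensor's last dimension, collapsing to a single column when the scales are scalar. A small sketch of the shape arithmetic with illustrative sizes (none of these numbers come from the commit):

import torch

num_local_experts = 4
max_tokens_per_expert = 64
hidden_dim = 2048                        # plays the role of expert_x.size(2)
float32_size = torch.float32.itemsize    # 4
block_shape = [128, 128]
scalar_scales = False

# Mirrors the hunk: the float32 element size is folded into block_size.
block_size = (block_shape[1] if block_shape is not None else 1) * float32_size

expert_x_scale_shape = (
    num_local_experts,
    max_tokens_per_expert,
    (hidden_dim + block_size - 1) // block_size if not scalar_scales else 1,
)
assert expert_x_scale_shape == (4, 64, 4)    # ceil(2048 / 512) = 4 columns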
@@ -182,22 +176,9 @@ def prepare(
         if expert_x_scale is not None:
             expert_x_scale = expert_x_scale[:, :, 0:1]
 
-        #print(f"ZZZZZZZZZZZZZZ {expert_x_scale.shape}")
         if expert_x_scale is not None:
-            expert_x_scale = expert_x_scale[:, :, :orig_a1q_scale_shape[-1]]
-            from math import prod
-            if prod(orig_a1q_scale_shape) == 1:
-                expert_x_scale = expert_x_scale[:, :1, :1]
-                #print(f"EPT {expert_num_tokens.flatten()}")
-                #print(f"SCALARIZING!!! {expert_x_scale.shape}, {expert_x_scale.flatten()}")
-                idx = expert_num_tokens.flatten() != 0
-                assert torch.all(expert_x_scale.flatten()[idx] != 0)
-                #zidx = expert_num_tokens.flatten() == 0
-                #assert torch.all(expert_x_scale.flatten()[zidx] == 0)
-                assert expert_x_scale.ndim == 3
-            #expert_x_scale = orig_scale.view(1)
-
-            assert expert_x_scale.ndim == 1 or expert_x_scale.ndim == 3
+            expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
+            assert expert_x_scale.ndim == 3
 
         return expert_x, expert_x_scale, expert_num_tokens, None, None
