
Commit 3d750ab

some quantization tweaks
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 337320f


2 files changed: +11 −7 lines changed


vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py

Lines changed: 3 additions & 4 deletions
@@ -147,10 +147,7 @@ def prepare(
             # quantization. Fallback to per_token_dynamic quant.
             per_token_quant = True
         else:
-            per_token_quant = ((quant_config.block_shape is None) or
-                               (a1_scale is not None and a1_scale.numel() != 1)
-                               or (a2_scale is not None
-                                   and a2_scale.numel() != 1))
+            per_token_quant = False
 
         if per_token_quant:
             a1q, a1q_scale = moe_kernel_quantize_input(
@@ -160,6 +157,8 @@ def prepare(
                 per_act_token_quant=True,
                 block_shape=quant_config.block_shape,
             )
+            if a1q_scale is not None and a1q_scale.numel() == 1:
+                a1q_scale = a1q_scale.view(1, 1)
         (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids,
          expert_topk_weights) = self._do_dispatch(
             tokens=a1q,
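
Net effect of this file's changes: the fallback path no longer infers per-token quantization from the scale shapes, and a scalar (per-tensor) scale coming back from moe_kernel_quantize_input is reshaped to 2-D so dispatch always sees a consistent scale layout. A minimal sketch of that normalization in plain PyTorch (the helper name is illustrative, not vLLM API):

import torch
from typing import Optional

# Illustrative sketch only: mirrors the a1q_scale tweak above by making
# a quantization scale always 2-D before dispatch. A scalar per-tensor
# scale becomes shape (1, 1); a per-token scale (num_tokens, 1) is kept.
def normalize_scale(scale: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
    if scale is not None and scale.numel() == 1:
        scale = scale.view(1, 1)
    return scale

assert normalize_scale(torch.tensor(0.5)).shape == (1, 1)
assert normalize_scale(torch.ones(8, 1)).shape == (8, 1)
assert normalize_scale(None) is None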

vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

Lines changed: 8 additions & 3 deletions
@@ -119,6 +119,10 @@ def prepare(
             block_shape=quant_config.block_shape)
 
         if a1q_scale is not None:
+            if a1q_scale.numel() == 1:
+                orig_a_scale_block_shape = 1
+            else:
+                orig_a_scale_block_shape = a1q_scale.shape[-1]
             a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols)
 
         # rem_experts need to be 0 for pplx to work properly.
@@ -143,8 +147,9 @@ def prepare(
         expert_x_scale: Optional[torch.Tensor] = None
         if a1q.dtype.itemsize == 1:
             float32_size = torch.float32.itemsize
-            block_size = (quant_config.block_shape[1] if quant_config.
-                          block_shape is not None else 1) * float32_size
+            block_size = (quant_config.block_shape[1]
+                          if quant_config.block_shape is not None else
+                          float32_size)
             expert_x_scale = torch.empty(
                 (
                     num_local_experts,
@@ -169,7 +174,7 @@ def prepare(
             bound_m=bound_m,
         )
         if expert_x_scale is not None:
-            expert_x_scale = expert_x_scale[:, :, 0:1]
+            expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
 
         return expert_x, expert_x_scale, expert_num_tokens, None, None
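
Here the width of the original scale tensor is recorded before it is repeated for dispatch (1 for a per-tensor scale, otherwise the number of blocks in its last dimension), and the gathered expert_x_scale is sliced back to that width instead of being hard-coded to one column; the block_size expression also drops a stray multiplication by float32_size. A small self-contained sketch of the record-then-slice pattern (names are illustrative, not vLLM API):

import torch
from typing import Optional

# Illustrative sketch only: remember the original scale width before the
# scale is repeated/padded for dispatch, then trim the gathered result
# back to that width, as orig_a_scale_block_shape does above.
def scale_width(a1q_scale: Optional[torch.Tensor]) -> int:
    # A scalar per-tensor scale collapses to width 1; a block-quantized
    # scale keeps its last-dimension width (blocks along the hidden dim).
    if a1q_scale is None or a1q_scale.numel() == 1:
        return 1
    return a1q_scale.shape[-1]

scale = torch.ones(16, 4)          # e.g. 16 tokens, 4 scale blocks
width = scale_width(scale)         # -> 4
padded = scale.repeat(1, 2)        # repeat to a padded dispatch layout
trimmed = padded[..., :width]      # recover the original width
assert trimmed.shape == (16, 4)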
