Commit 22e19a4

Enabling MOE Quantization using linear decomposition [WIP]
Summary: This PR is a first step toward optimizing MoE inference with torchao. The goal of this step is to let existing quantization kernels and workflows work for MoE quantization by decomposing the grouped gemm into a sequence of unbalanced linear ops that can reuse the existing quantized kernels. To enable this, we had to add support for quantizing these 3D tensors as well as for slicing and indexing them. Tests are currently running locally and will be added once they pass. int8wo and int8dq currently work for multi-token and single-token MoE inference, while int4wo is being finished up. TODO: move the test set into ao, move the quantizable MoE module code into ao, and test on an HF model definition.

Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
1 parent a81322e commit 22e19a4
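
For illustration only (not part of this commit): a minimal sketch of the decomposition described in the summary, with made-up sizes and plain float tensors. The grouped expert computation is replaced by one ordinary F.linear per expert over that expert's (unbalanced) batch of tokens, which is the form the existing 2D quantized linear kernels can serve once the stacked 3D weight supports indexing/select.

import torch
import torch.nn.functional as F

# hypothetical sizes: E experts, T routed tokens, D model dim, I intermediate dim
E, T, D, I = 8, 16, 64, 128
w = torch.randn(E, I, D)                 # stacked expert weight, like the w1/w2/w3 parameters further down
tokens = torch.randn(T, D)
expert_ids = torch.randint(0, E, (T,))   # expert chosen for each token

# grouped-gemm-as-a-loop: each expert runs a plain (unbalanced) linear over its tokens
out = torch.empty(T, I)
for e in range(E):
    mask = expert_ids == e
    if mask.any():
        out[mask] = F.linear(tokens[mask], w[e])  # w[e] is where quantized select/index support is needed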

File tree

7 files changed: +231 -43 lines changed


torchao/dtypes/affine_quantized_tensor_ops.py

Lines changed: 40 additions & 0 deletions
@@ -477,6 +477,46 @@ def _(func, types, args, kwargs):
     )
     return return_and_correct_aliasing(func, args, kwargs, new)
 
+@implements(aten.index.Tensor)
+def _(func, types, args, kwargs):
+    self, indices = args
+    assert len(indices) == 1, f"op {func} currently only implemented for single dimensional indexing but got indices: {indices}"
+
+    new_tensor_impl = aten.index.Tensor(self.tensor_impl, indices)
+    shape = tuple([indices[0].numel(), *self.shape[1:]])
+
+    block_size = self.block_size
+    new = self.__class__(
+        new_tensor_impl,
+        block_size,
+        shape,
+        self.quant_min,
+        self.quant_max,
+        self.zero_point_domain,
+        dtype=self.dtype,
+    )
+    return return_and_correct_aliasing(func, args, kwargs, new)
+
+@implements(aten.select.int)
+def _(func, types, args, kwargs):
+    self, dim, index = fill_defaults(args, 3, [0, 0])
+    assert dim == 0, f"op {func} currently only implemented for dim=0 but got dim={dim}"
+    assert self.dim() == 3, f"op {func} currently only implemented for 3 dimensional tensors but got shape={self.shape}"
+
+    new_tensor_impl = aten.select.int(self.tensor_impl, dim, index)
+
+    shape = self.shape[1:]
+    block_size = self.block_size[1:]
+    new = self.__class__(
+        new_tensor_impl,
+        block_size,
+        shape,
+        self.quant_min,
+        self.quant_max,
+        self.zero_point_domain,
+        dtype=self.dtype,
+    )
+    return return_and_correct_aliasing(func, args, kwargs, new)
 
 # this is needed for DTensor.from_local() and for flattening tensor
 @implements(aten.view.default)
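
For illustration (not from the commit): a plain-tensor sketch of the shape bookkeeping the two overrides above implement; the quantized versions additionally carry block_size and the quantization parameters through the same slicing.

import torch

E, I, D = 4, 8, 16
w = torch.randn(E, I, D)            # stand-in for a quantized 3D expert weight
idx = torch.tensor([0, 2])

# aten.index.Tensor: leading dim becomes the number of gathered experts
assert w[idx].shape == (idx.numel(), *w.shape[1:])   # (2, 8, 16)

# aten.select.int with dim=0: leading dim (and its block_size entry) is dropped
assert w[1].shape == tuple(w.shape[1:])              # (8, 16)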

torchao/dtypes/uintx/plain_layout.py

Lines changed: 11 additions & 0 deletions
@@ -154,6 +154,17 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
             )
             return return_and_correct_aliasing(func, args, kwargs, new)
 
+
+        elif func in [aten.select.int, aten.index.Tensor]:
+            return return_and_correct_aliasing(
+                func,
+                args,
+                kwargs,
+                args[0]._apply_fn_to_data(
+                    lambda x: func(x, *args[1:], **kwargs)
+                ),
+            )
+
         elif func is aten.slice.Tensor:
             self, dim, start, end, step = fill_defaults(args, 5, [0, None, None, 1])
             if dim == 0:
torchao/dtypes/uintx/tensor_core_tiled_layout.py

Lines changed: 79 additions & 35 deletions
@@ -75,7 +75,6 @@ def _linear_bf16_act_uint4_weight_impl(input_tensor, weight_tensor, bias):
         f"need input_tensor shape: {input_tensor.shape} final"
         f"dim to match weight_tensor shape: {weight_tensor.shape} second dim "
     )
-
     # TODO: check groupsize quantization
     # avoid circular dep, TODO: move this to a common util.py
     act_mat = input_tensor
@@ -97,7 +96,6 @@ def _linear_bf16_act_uint4_weight_impl(input_tensor, weight_tensor, bias):
     y = torch.ops.aten._weight_int4pack_mm(
         act_mat.contiguous(), packed_weight, groupsize, scale_and_zero
     )
-
     # remove out_feature padding
     orig_out_features = weight_tensor.shape[-2]
     y = y[:, :orig_out_features]
@@ -119,7 +117,7 @@ class TensorCoreTiledLayout(Layout):
     inner_k_tiles: int = 8
 
     def pre_process(self, input: torch.Tensor) -> torch.Tensor:
-        orig_out_features, orig_in_features = input.shape
+        orig_out_features, orig_in_features = input.shape[-2:]
         in_features = find_multiple(orig_in_features, 1024)
         out_features = find_multiple(orig_out_features, 8)
         input = torch.nn.functional.pad(
@@ -160,18 +158,18 @@ def post_process(
         zero_point: torch.Tensor,
         block_size: Tuple[int, ...],
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        orig_out_features, orig_in_features = input.shape
+        orig_out_features, orig_in_features = input.shape[-2:]
         in_features = find_multiple(orig_in_features, 1024)
         out_features = find_multiple(orig_out_features, 8)
         input = torch.nn.functional.pad(
             input,
             (0, in_features - orig_in_features, 0, out_features - orig_out_features),
         )
         assert (
-            len(block_size) == 2
-        ), f"TensorCoreTiledLayout only supports len(block_size) == 2, got: {block_size}"
-        scale_pad_dim_0 = (out_features - orig_out_features) // block_size[0]
-        scale_pad_dim_1 = (in_features - orig_in_features) // block_size[1]
+            len(block_size) == 2 or len(block_size) == 3
+        ), f"TensorCoreTiledLayout only supports len(block_size) == 2 or 3, got: {block_size}"
+        scale_pad_dim_0 = (out_features - orig_out_features) // block_size[-2]
+        scale_pad_dim_1 = (in_features - orig_in_features) // block_size[-1]
         scale = torch.nn.functional.pad(scale, (0, scale_pad_dim_1, 0, scale_pad_dim_0))
         zero_point = torch.nn.functional.pad(
             zero_point, (0, scale_pad_dim_1, 0, scale_pad_dim_0)
@@ -272,11 +270,22 @@ def from_plain(
         assert (
             int_data.dtype == torch.int32
         ), "torch.ops.aten._convert_weight_to_int4pack in torch 2.4 expects `int32` dtype"
-        packed_weight = torch.ops.aten._convert_weight_to_int4pack(
-            int_data, _layout.inner_k_tiles
-        )
-        scale = scale.reshape(int_data.shape[0], -1)
-        zero_point = zero_point.reshape(int_data.shape[0], -1)
+        def quant_2d(mat):
+            return torch.ops.aten._convert_weight_to_int4pack(
+                mat, _layout.inner_k_tiles
+            )
+        if int_data.dim() == 3:  # for moe quant
+            num_experts = int_data.shape[0]
+            packed_weight_list = []
+            for expert in range(num_experts):
+                packed_weight_list.append(quant_2d(int_data[expert]).unsqueeze(0))
+            packed_weight = torch.cat(packed_weight_list, dim=0)
+            scale = scale.reshape(int_data.shape[0], int_data.shape[-2], -1)
+            zero_point = zero_point.reshape(int_data.shape[0], int_data.shape[-2], -1)
+        else:
+            packed_weight = quant_2d(int_data)
+            scale = scale.reshape(int_data.shape[0], -1)
+            zero_point = zero_point.reshape(int_data.shape[0], -1)
         from torchao.quantization.utils import pack_tinygemm_scales_and_zeros
 
         scale_and_zero = pack_tinygemm_scales_and_zeros(scale, zero_point, scale.dtype)
@@ -336,6 +345,18 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
                 f"Not supported args for copy_ due to metadata mistach: {args[0], args[1]}"
             )
 
+        if func in [aten.select.int, aten.index.Tensor]:
+            assert not (func is aten.select.int and args[1] != 0), "aten.select.int currently only has support for dim=0"
+            return return_and_correct_aliasing(
+                func,
+                args,
+                kwargs,
+                args[0]._apply_fn_to_data(
+                    lambda x: func(x, *args[1:], **kwargs)
+                ),
+            )
+
+
         if func is aten.t.default:
             """we don't need to repack the weight and just rely on external
             shape being changed and record the status of transpose/no-transpose
@@ -386,11 +407,15 @@ def block_size(self):
 
         scale, zero = unpack_tinygemm_scales_and_zeros(self.scale_and_zero)
         cur_shape = self.shape
-        assert len(cur_shape) == 4
+        if len(cur_shape) == 5:
+            ones = [1, 1]
+            cur_shape = cur_shape[1:]
+        elif len(cur_shape) == 4:
+            ones = [1]
         inner_k_tiles = cur_shape[-1] * 2
         original_shape = (cur_shape[0] * 8, cur_shape[1] * (inner_k_tiles * 16))
         groupsize = int(original_shape[1] / scale.shape[-2])
-        return (1, groupsize)
+        return tuple([*ones, groupsize])
 
     def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         from torchao.quantization.quant_primitives import (
@@ -399,35 +424,54 @@ def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         )
         from torchao.quantization.utils import unpack_tinygemm_scales_and_zeros
 
+        def dequant_4d(self):
+            cur_shape = self.shape
+            scale, zero = unpack_tinygemm_scales_and_zeros(self.scale_and_zero)
+            assert len(cur_shape) == 4
+            inner_k_tiles = cur_shape[-1] * 2
+            original_shape = (cur_shape[0] * 8, cur_shape[1] * (inner_k_tiles * 16))
+            eye_shape = original_shape[1]
+            groupsize = int(original_shape[1] / scale.shape[-2])
+            block_size = (1, groupsize)
+            original_dtype = torch.bfloat16
+            assert len(block_size) == 2 and block_size[0] == 1
+            dequantized = torch.ops.aten._weight_int4pack_mm(
+                torch.eye(eye_shape, device=self.device, dtype=original_dtype),
+                self.packed_weight,
+                groupsize,
+                self.scale_and_zero,
+            )
+            dequantized = dequantized.t().contiguous()
+            return dequantized
+
+        cur_shape = self.shape
+
+        if len(cur_shape) == 4:
+            dequantized = dequant_4d(self)
+        else:
+            assert len(cur_shape) == 5
+            num_experts = cur_shape[0]
+            dequantized_list = []
+            for expert in range(num_experts):
+                dequantized_list.append(dequant_4d(self[expert]).unsqueeze(0))
+            dequantized = torch.cat(dequantized_list, dim=0)
+
         scale, zero = unpack_tinygemm_scales_and_zeros(self.scale_and_zero)
+        # TODO: move this to `unpack_tinygemm_scales_and_zeros`?
+        scale = scale.reshape(scale.shape[:-1]).contiguous()
+        zero = zero.reshape(zero.shape[:-1]).contiguous()
 
-        cur_shape = self.shape
-        assert len(cur_shape) == 4
-        inner_k_tiles = cur_shape[-1] * 2
-        original_shape = (cur_shape[0] * 8, cur_shape[1] * (inner_k_tiles * 16))
-        eye_shape = original_shape[1]
-        groupsize = int(original_shape[1] / scale.shape[-2])
-        block_size = (1, groupsize)
         device = self.device
-        original_dtype = torch.bfloat16
+
         target_dtype = torch.int32
         quant_min = 0
         quant_max = 15
         zero_point_domain = ZeroPointDomain.FLOAT
-        assert len(block_size) == 2 and block_size[0] == 1
-        dequantized = torch.ops.aten._weight_int4pack_mm(
-            torch.eye(eye_shape, device=device, dtype=original_dtype),
-            self.packed_weight,
-            groupsize,
-            self.scale_and_zero,
-        )
-        dequantized = dequantized.t().contiguous()
-        # TODO: move this to `unpack_tinygemm_scales_and_zeros`?
-        scale = scale.reshape(scale.shape[:-1]).contiguous()
-        zero = zero.reshape(zero.shape[:-1]).contiguous()
         int_data = quantize_affine(
             dequantized,
-            block_size,
+            self.block_size,
             scale,
             zero,
             target_dtype,
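
For illustration (not from the commit): the from_plain/get_plain changes above reuse the existing 2D tinygemm pack/dequant path per expert instead of adding a 3D kernel. A toy sketch of that per-expert pattern, with a made-up stand-in for the 2D packing kernel (the real _convert_weight_to_int4pack needs real int4 data and a supported device):

import torch

def apply_per_expert(fn_2d, stacked):  # stacked: (num_experts, out, in)
    # apply an existing 2D kernel expert-by-expert, then re-stack along dim 0
    return torch.cat([fn_2d(stacked[e]).unsqueeze(0) for e in range(stacked.shape[0])], dim=0)

fn_2d = lambda m: m.t().contiguous()   # toy stand-in for the 2D packing kernel
packed = apply_per_expert(fn_2d, torch.randn(4, 8, 16))
assert packed.shape == (4, 16, 8)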
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+class MOEFeedForwardAOQuantizable(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.gate = nn.Linear(config.dim, config.num_experts, bias=False)
+        self.cond_ffn = ConditionalFeedForwardAOQuantizable(config)
+        self.dim = config.dim
+        self.num_activated_experts = config.num_activated_experts
+
+    def forward(self, x: Tensor) -> Tensor:
+        batch_size = x.shape[0]
+        x = x.view(-1, self.dim)  # x: [T, D]
+        scores = self.gate(x)  # [T, E]
+        expert_weights = F.softmax(scores, dim=-1)
+        expert_weights, expert_indices = torch.topk(expert_weights, self.num_activated_experts, dim=-1)  # [T, A], [T, A]
+        expert_weights /= expert_weights.sum(dim=-1, keepdim=True).to(x.dtype)  # [T, A]
+        out = self.cond_ffn(x, expert_indices, expert_weights, self.num_activated_experts)
+        return out.reshape(batch_size, -1, self.dim)
+
+
+class ConditionalFeedForwardAOQuantizable(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.w1 = nn.Parameter(torch.empty(config.num_experts, config.intermediate_size, config.dim))  # E, I, D
+        self.w2 = nn.Parameter(torch.empty(config.num_experts, config.dim, config.intermediate_size))  # E, D, I
+        self.w3 = nn.Parameter(torch.empty(config.num_experts, config.intermediate_size, config.dim))  # E, I, D
+        self.num_experts = config.num_experts
+
+    def forward(
+        self,
+        x: Tensor,  # T, D
+        expert_indices: Tensor,  # T, A
+        expert_weights: Tensor,  # T, A
+        num_activated_experts: int,
+    ) -> Tensor:
+        num_tokens, dim = x.shape
+        num_token_activations = num_tokens * num_activated_experts
+
+        if x.shape[0] == 1:  # only 1 token (can be done without graph breaks when compiled)
+            outs = []
+            expert_indices = expert_indices.squeeze()
+            # collect used experts
+            w1 = self.w1[expert_indices]
+            w2 = self.w2[expert_indices]
+            w3 = self.w3[expert_indices]
+
+            # run token through each expert
+            for index in range(num_activated_experts):
+                cur_out = F.linear(F.silu(F.linear(x, w1[index])) * F.linear(x, w3[index]), w2[index])
+                outs.append(cur_out)
+
+            # combine outputs
+            final_out = (torch.cat(outs, dim=0) * expert_weights.view(-1, 1)).sum(dim=0).unsqueeze(-1)
+            return final_out
+        else:
+            expert_list = [x for x in range(self.num_experts)]
+
+            # shuffle tokens into groups for each expert
+            ordered_token_activations = expert_indices.view(-1).argsort(stable=True)  # [A]
+            ordered_token_indices = ordered_token_activations.div(num_activated_experts).floor().to(torch.int64)  # [T]
+
+            num_tokens_per_expert = torch.histc(expert_indices, bins=self.num_experts + 1, min=-1, max=self.num_experts)  # [E+1] (added leading 0 so can be used for indexing)
+            cum_tokens_per_expert = num_tokens_per_expert.cumsum(0).to(torch.int64)  # [E+1]
+
+            # without quant this is compilable, with quant it throws an error.
+            # Even without quant there's a graph break here so not a huge loss
+            @torch._dynamo.disable()
+            def group_tokens_by_expert(ordered_token_indices, cum_tokens_per_expert, expert_list):
+                token_indices_per_expert = [ordered_token_indices[cum_tokens_per_expert[expert]:cum_tokens_per_expert[expert + 1]] for expert in expert_list]  # [T'(e1)], [T'(e2)] ...
+                return token_indices_per_expert
+
+            token_indices_per_expert = group_tokens_by_expert(ordered_token_indices, cum_tokens_per_expert, expert_list)
+            tokens_grouped_by_expert = [x[indices] for indices in token_indices_per_expert]
+
+            # calculate outputs for each expert
+            outs = []
+            for cur_x, expert in zip(tokens_grouped_by_expert, expert_list):
+                w1 = self.w1[expert]  # I, D
+                w2 = self.w2[expert]  # D, I
+                w3 = self.w3[expert]  # I, D
+
+                cur_out = F.linear(F.silu(F.linear(cur_x, w1)) * F.linear(cur_x, w3), w2)  # [T'(e), D]
+                outs.append(cur_out)
+
+            # weigh outputs
+            ordered_outs = torch.cat(outs, dim=0)  # [T*A, D]
+            ordered_token_activation_weights = expert_weights.view(-1, 1)[ordered_token_activations].view(-1, 1)  # [T*A, 1]
+            weighted_ordered_outs = ordered_outs * ordered_token_activation_weights  # [T*A, D]
+
+            # sum weighted token-activation outputs together for each token
+            final_out = torch.zeros_like(x)  # [T, D]
+            final_out = final_out.scatter_add(dim=0, index=ordered_token_indices.unsqueeze(-1).expand(num_token_activations, dim).to(torch.int64), src=weighted_ordered_outs)
+            return final_out
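
For illustration (not from the commit): a hypothetical smoke test for the module above, assuming the two classes are in scope together with the imports the file implies (torch, torch.nn as nn, torch.nn.functional as F, Tensor). ToyConfig is made up and only carries the fields the module reads; a single token exercises the graph-break-free path.

from dataclasses import dataclass

import torch
import torch.nn as nn

@dataclass
class ToyConfig:
    dim: int = 64
    intermediate_size: int = 128
    num_experts: int = 8
    num_activated_experts: int = 2

config = ToyConfig()
moe = MOEFeedForwardAOQuantizable(config)
for p in moe.parameters():
    nn.init.normal_(p, std=0.02)   # w1/w2/w3 are created with torch.empty

x = torch.randn(1, 1, config.dim)  # [batch, seq, dim]; flattened to tokens internally
y = moe(x)
assert y.shape == (1, 1, config.dim)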

torchao/quantization/quant_api.py

Lines changed: 1 addition & 1 deletion
@@ -300,7 +300,7 @@ def _replace_with_custom_fn_if_matches_filter(
             device,
             extra_args,
         )
-        if new_child is not child:
+        if new_child is not child and new_child is not None:
             setattr(model, name, new_child)
         if device is not None:
             model.to(device=device)  # move parent module to device

torchao/quantization/utils.py

Lines changed: 7 additions & 6 deletions
@@ -366,22 +366,23 @@ def get_groupwise_affine_qparams(
 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16):
     guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size())
     guard_dtype_size(zeros, "zeros", dtype=dtype)
+    dim = scales.dim()
     return (
         torch.cat(
             [
-                scales.reshape(scales.size(0), scales.size(1), 1),
-                zeros.reshape(zeros.size(0), zeros.size(1), 1),
+                scales.unsqueeze(-1),
+                zeros.unsqueeze(-1),
             ],
-            2,
+            dim,
         )
-        .transpose(0, 1)
+        .transpose(-3, -2)
         .contiguous()
     )
 
 
 def unpack_tinygemm_scales_and_zeros(scales_and_zeros):
-    assert len(scales_and_zeros.shape) == 3 and scales_and_zeros.shape[2] == 2
-    return torch.split(scales_and_zeros.transpose(0, 1), 1, 2)
+    assert scales_and_zeros.shape[-1] == 2
+    return torch.split(scales_and_zeros.transpose(-3, -2), 1, -1)
 
 
 def convert_weight_to_int4pack_xpu(weight, zero_point_domain_is_int=False):
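
For illustration (not from the commit): a self-contained sketch of the now shape-agnostic pack/unpack behaviour above. Scales/zeros of shape (..., out_features, n_groups) pack to (..., n_groups, out_features, 2) and unpack back, for both the 2D case and the 3D stacked-expert case.

import torch

def pack(scales, zeros):
    # mirrors pack_tinygemm_scales_and_zeros without the dtype guards
    return (
        torch.cat([scales.unsqueeze(-1), zeros.unsqueeze(-1)], dim=scales.dim())
        .transpose(-3, -2)
        .contiguous()
    )

def unpack(scales_and_zeros):
    # mirrors unpack_tinygemm_scales_and_zeros
    assert scales_and_zeros.shape[-1] == 2
    return torch.split(scales_and_zeros.transpose(-3, -2), 1, -1)

for shape in [(32, 8), (4, 32, 8)]:      # 2D weight vs 3D stacked-expert weight
    s, z = torch.rand(shape), torch.rand(shape)
    s2, z2 = unpack(pack(s, z))
    assert torch.equal(s2.squeeze(-1), s) and torch.equal(z2.squeeze(-1), z)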
