Commit 54bcd5a

Making subclass BC with torch version
Summary: there was a change to __tensor_flatten__ and __tensor_unflatten__ in PyTorch core; this change allows the subclass to work with both the 2.2 branch cut and main.

Test Plan: python test/test.py on branch cut 2.2 and main

ghstack-source-id: afa090f
Pull Request resolved: #27
1 parent 9aaf3ec commit 54bcd5a

File tree

4 files changed: +21 −16 lines


torchao/quantization/dynamic_quant.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ def __init__(
     ) -> None:
         super().__init__(in_features, out_features, bias)

-    def forward(self, X: torch.Tensor) -> torch.Tensor:
+    def forward(self, X: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         """
         Performs the forward pass of the quantized linear layer which consists
         of int8 dynamic symmetric per-token activation and int8 symmetric per-channel weight
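
This, together with the matching change to weight_only.py below, relaxes the forward signature so the swapped-in module tolerates call sites that pass extra positional or keyword arguments. A minimal sketch of the pattern with a toy class (not the torchao module):

import torch

class ToyQuantLinear(torch.nn.Linear):
    # Mirrors the diff: accept and ignore extra arguments so callers that
    # forward additional args do not break after the module swap.
    def forward(self, X: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        return super().forward(X)

lin = ToyQuantLinear(16, 8)
x = torch.randn(2, 16)
print(lin(x).shape)                        # torch.Size([2, 8])
print(lin(x, None, unused_kwarg=1).shape)  # extra arguments are now tolerated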

torchao/quantization/quant_api.py

Lines changed: 11 additions & 7 deletions
@@ -54,7 +54,8 @@ def _replace_with_custom_fn_if_matches_filter(
         new_child = _replace_with_custom_fn_if_matches_filter(
             child, replacement_fn, filter_fn, f"{cur_fqn}{name}."
         )
-        setattr(model, name, new_child)
+        if new_child is not child:
+            setattr(model, name, new_child)
     return model


@@ -68,15 +69,15 @@ def _is_linear(mod, *args):
 def _in_features_greater_than_16(mod, *args):
     return hasattr(mod, "in_features") and mod.in_features > 16

-def apply_weight_only_int8_quant(model):
+def apply_weight_only_int8_quant(model, filter_fn=None):
     """
     Applies weight-only symmetric per-channel int8 quantization to all linear layers
     in the given model using module swaps.
     """
     _replace_with_custom_fn_if_matches_filter(
         model,
         WeightOnlyInt8QuantLinear.from_float,
-        _is_linear,
+        _is_linear if filter_fn is None else filter_fn,
     )


@@ -123,7 +124,7 @@ def change_linear_weights_to_int8_dqtensors(model, filter_fn=None):
     )


-def change_linear_weights_to_int8_woqtensors(model):
+def change_linear_weights_to_int8_woqtensors(model, filter_fn=None):
     """
     Converts all linear weight tensors to the
     `Int8WeightOnlyQuantizedLinearWeight` tensor subclass,
@@ -133,7 +134,7 @@ def change_linear_weights_to_int8_woqtensors(model):
     _replace_with_custom_fn_if_matches_filter(
         model,
         _get_subclass_inserter(Int8WeightOnlyQuantizedLinearWeight),
-        _is_linear,
+        _is_linear if filter_fn is None else filter_fn,
     )


@@ -152,7 +153,7 @@ def change_linear_weights_to_int4_woqtensors(model, **kwargs):
         filter_fn,
     )

-def swap_conv2d_1x1_to_linear(model):
+def swap_conv2d_1x1_to_linear(model, filter_fn=None):
     """
     Changes all conv2d 1x1 modules to equivalent linear modules so that they can then be quantized.
     """
@@ -172,8 +173,11 @@ def replace_conv2d_1x1(conv):
         lin.bias = conv.bias
         return PermuteSandwich(lin)

+    if filter_fn is None:
+        filter_fn=lambda mod, *args: isinstance(mod, torch.nn.Conv2d) and mod.kernel_size==(1,1)
+
     _replace_with_custom_fn_if_matches_filter(
         model,
         replace_conv2d_1x1,
-        filter_fn=lambda mod, *args: isinstance(mod, torch.nn.Conv2d) and mod.kernel_size==(1,1)
+        filter_fn=filter_fn
     )
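
The new filter_fn parameter lets callers override which modules get swapped; passing nothing preserves the old _is_linear behavior. A usage sketch (the model is hypothetical; the import path is inferred from the file path above):

import torch
from torchao.quantization.quant_api import apply_weight_only_int8_quant

model = torch.nn.Sequential(
    torch.nn.Linear(8, 32),   # in_features <= 16: skipped by the custom filter
    torch.nn.ReLU(),
    torch.nn.Linear(32, 32),  # in_features > 16: swapped for the quantized module
)

# Quantize only the larger linear layers, mirroring _in_features_greater_than_16.
apply_weight_only_int8_quant(
    model,
    filter_fn=lambda mod, *args: isinstance(mod, torch.nn.Linear) and mod.in_features > 16,
)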

torchao/quantization/subclass.py

Lines changed: 8 additions & 7 deletions
@@ -240,13 +240,13 @@ def _change_shape(self, shape):
         )

     def __tensor_flatten__(self):
-        return ["int_data", "q_scales"], [self.transposed, self.dtype]
+        return ["int_data", "q_scales"], [self.transposed, self.dtype, self.shape]

     @classmethod
-    def __tensor_unflatten__(cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride):
+    def __tensor_unflatten__(cls, tensor_data_dict, tensor_attributes, outer_size=None, outer_stride=None):
         int_data, q_scales = tensor_data_dict["int_data"], tensor_data_dict["q_scales"]
-        transposed, dtype = tensor_attributes
-        return cls(int_data, q_scales, transposed, outer_size, dtype=dtype, strides=outer_stride)
+        transposed, dtype, shape = tensor_attributes
+        return cls(int_data, q_scales, transposed, shape if outer_size is None else outer_size, dtype=dtype, strides=outer_stride)

     @classmethod
     def from_float(cls, input_float, qmin=-128, qmax=127):
@@ -416,20 +416,21 @@ def __tensor_flatten__(self):
             self.groupsize,
             self.inner_k_tiles,
             self.dtype,
+            self.shape
         )

     @classmethod
-    def __tensor_unflatten__(cls, tensor_data_dict, attributes, outer_size, outer_stride):
+    def __tensor_unflatten__(cls, tensor_data_dict, attributes, outer_size=None, outer_stride=None):
         int_data, scales_and_zeros = (
             tensor_data_dict["int_data"],
             tensor_data_dict["scales_and_zeros"],
         )
-        transposed, groupsize, inner_k_tiles, dtype = attributes
+        transposed, groupsize, inner_k_tiles, dtype, shape = attributes
         return cls(
             int_data,
             scales_and_zeros,
             transposed,
-            outer_size,
+            shape if outer_size is None else outer_size,
             groupsize,
             inner_k_tiles,
             dtype=dtype,
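
Both hunks apply the same fix: record self.shape in the flatten metadata and default outer_size/outer_stride to None, so __tensor_unflatten__ works under both the older two-argument and the newer four-argument calling convention in PyTorch core. A self-contained sketch of the idea with a toy subclass (hypothetical class, built on the private _make_wrapper_subclass helper; not the torchao classes):

import torch

class ToyQuantTensor(torch.Tensor):
    @staticmethod
    def __new__(cls, int_data, scale, shape):
        return torch.Tensor._make_wrapper_subclass(cls, shape, dtype=torch.float32)

    def __init__(self, int_data, scale, shape):
        self.int_data = int_data
        self.scale = scale

    def __tensor_flatten__(self):
        # Stash the outer shape in the metadata so unflatten can recover it
        # even when the caller does not pass outer_size.
        return ["int_data", "scale"], [self.shape]

    @classmethod
    def __tensor_unflatten__(cls, tensor_data_dict, tensor_attributes, outer_size=None, outer_stride=None):
        (shape,) = tensor_attributes
        return cls(tensor_data_dict["int_data"], tensor_data_dict["scale"],
                   shape if outer_size is None else outer_size)

t = ToyQuantTensor(torch.zeros(4, 4, dtype=torch.int8), torch.tensor(0.1), torch.Size([4, 4]))
names, meta = t.__tensor_flatten__()
inner = {name: getattr(t, name) for name in names}
# Both calling conventions now round-trip:
old_style = ToyQuantTensor.__tensor_unflatten__(inner, meta)                        # pre-change core
new_style = ToyQuantTensor.__tensor_unflatten__(inner, meta, t.shape, t.stride())  # newer core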

torchao/quantization/weight_only.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ def __init__(self, *args, **kwargs):
         self.w_int8 = w_int8
         self.scales = scales

-    def forward(self, x):
+    def forward(self, x, *args, **kwargs):
         """
         Performs the forward pass of the quantized linear layer which consists
         of mixed dtype matmul using int8 symmetric per-channel weight quantization
