
Commit b56f51d

Subclass fixes for torchbench
Summary: A PyTorch update changed flatten behavior; the subclasses are now updated to match. This commit also handles a number of behaviors needed to work with the torchbench dynamo userbenchmark: the transpose and detach code was removed and refactored into _change_shape and _apply_fn_to_data methods, and the subclasses now override __torch_function__ for torch.nn.functional.linear rather than mm and addmm individually, since linear can also hit expand, view, bmm, etc.

Test Plan: python test/test.py

Reviewers:
Subscribers:
Tasks:
Tags:

ghstack-source-id: a5c0146
Pull Request resolved: #24
1 parent 1b92d57 commit b56f51d
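The commit message describes moving interception from mm/addmm to torch.nn.functional.linear via __torch_function__, and funneling data-only ops through an _apply_fn_to_data helper. Below is a minimal, hypothetical sketch of that pattern; FakeQuantizedLinearWeight and its fields are illustrative, not the repo's actual subclass.

import torch
import torch.nn.functional as F


class FakeQuantizedLinearWeight(torch.Tensor):
    """Illustrative wrapper subclass; not the torchao implementation."""

    @staticmethod
    def __new__(cls, int_data, scale):
        # The outer tensor mimics the shape/dtype of the original float weight.
        return torch.Tensor._make_wrapper_subclass(
            cls, int_data.shape, dtype=scale.dtype, device=int_data.device
        )

    def __init__(self, int_data, scale):
        self.int_data = int_data
        self.scale = scale

    def dequantize(self):
        return self.int_data.to(self.scale.dtype) * self.scale

    def _apply_fn_to_data(self, fn):
        # One helper for data-only ops (detach, transpose, ...) instead of
        # separate per-op code paths.
        return type(self)(fn(self.int_data), fn(self.scale))

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        if func is F.linear:
            x, w = args[0], args[1]
            bias = args[2] if len(args) > 2 else kwargs.get("bias", None)
            # Intercepting F.linear in one place covers the mm/addmm/bmm/
            # expand/view paths it may otherwise decompose into.
            return F.linear(x, w.dequantize(), bias)
        if func is torch.Tensor.detach:
            return args[0]._apply_fn_to_data(torch.Tensor.detach)
        with torch._C.DisableTorchFunctionSubclass():
            return func(*args, **kwargs)


w = FakeQuantizedLinearWeight(
    torch.randint(-8, 8, (4, 8), dtype=torch.int8), torch.full((4, 1), 0.05)
)
print(F.linear(torch.randn(2, 8), w).shape)  # torch.Size([2, 4])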

4 files changed: +112, -69 lines


test/test.py

Lines changed: 12 additions & 10 deletions

@@ -832,11 +832,12 @@ def test_dequantize_int4_weight_only_quant_subclass(self):
         for groupsize in [256, 128]:
             for inner_k_tiles in [8, 2]:
                 for m in [1, 256]:
-                    self._test_dequantize_impl(
-                        lambda w: Int4WeightOnlyQuantizedLinearWeight.from_float(w, groupsize, inner_k_tiles),
-                        15,
-                        test_shape=[m, 256, 8]
-                    )
+                    for n in [8, 13]:
+                        self._test_dequantize_impl(
+                            lambda w: Int4WeightOnlyQuantizedLinearWeight.from_float(w, groupsize, inner_k_tiles),
+                            15,
+                            test_shape=[m, 256, n]
+                        )
 
     def _test_lin_weight_subclass_impl(
         self,
@@ -886,11 +887,12 @@ def test_int4_weight_only_quant_subclass(self):
         for groupsize in [128, 64]:
             for inner_k_tiles in [4, 2]:
                 for m in [1, 256]:
-                    self._test_lin_weight_subclass_impl(
-                        lambda w: Int4WeightOnlyQuantizedLinearWeight.from_float(w, groupsize, inner_k_tiles),
-                        10,
-                        test_shape=[m, 256, 8]
-                    )
+                    for n in [8, 13]:
+                        self._test_lin_weight_subclass_impl(
+                            lambda w: Int4WeightOnlyQuantizedLinearWeight.from_float(w, groupsize, inner_k_tiles),
+                            10,
+                            test_shape=[m, 256, n]
+                        )
 
     @torch.no_grad()
     def _test_lin_weight_subclass_api_impl(
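The new inner loop sweeps the output dimension n over 8 and a non-multiple-of-8 value (13), so the int4 kernels are exercised with shapes like [1, 256, 13]. A rough, hypothetical sketch of what a test_shape of [m, k, n] corresponds to (the helper name and SQNR check below are illustrative, not the repo's exact test code):

import torch
import torch.nn.functional as F

def run_shape_case(m, k, n, quantize_weight, min_sqnr):
    # test_shape = [m, k, n]: activations are (m, k), the linear weight is (n, k).
    x = torch.randn(m, k)
    w = torch.randn(n, k)
    ref = F.linear(x, w)
    out = F.linear(x, quantize_weight(w))
    # SQNR-style closeness check, analogous to the 15 / 10 dB thresholds in the diff.
    sqnr = 10 * torch.log10(ref.pow(2).mean() / (ref - out).pow(2).mean())
    assert sqnr >= min_sqnr, f"SQNR {sqnr:.1f} dB below {min_sqnr} dB"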

torchao/quantization/quant_api.py

Lines changed: 17 additions & 5 deletions

@@ -20,6 +20,7 @@
     DynamicallyPerAxisQuantizedLinear,
 )
 from .subclass import (
+    QuantizedLinearWeightBase,
     Int8DynamicallyQuantizedLinearWeight,
     Int8WeightOnlyQuantizedLinearWeight,
     Int4WeightOnlyQuantizedLinearWeight,
@@ -58,6 +59,15 @@ def _replace_with_custom_fn_if_matches_filter(
             child, replacement_fn, filter_fn, new_fqn
         )
 
+def _is_linear(mod, *args):
+    return (
+        isinstance(mod, torch.nn.Linear) and
+        hasattr(mod, "weight") and
+        not isinstance(mod.weight, QuantizedLinearWeightBase)
+    )
+
+def _in_features_greater_than_16(mod, *args):
+    return hasattr(mod, "in_features") and mod.in_features > 16
 
 def apply_weight_only_int8_quant(model):
     """
@@ -67,7 +77,7 @@ def apply_weight_only_int8_quant(model):
     _replace_with_custom_fn_if_matches_filter(
         model,
         WeightOnlyInt8QuantLinear.from_float,
-        lambda mod, fqn: isinstance(mod, torch.nn.Linear),
+        _is_linear,
     )
 
 
@@ -80,7 +90,7 @@ def apply_dynamic_quant(model):
     _replace_with_custom_fn_if_matches_filter(
         model,
         lambda mod: DynamicallyPerAxisQuantizedLinear.from_float(mod),
-        lambda mod, fqn: isinstance(mod, torch.nn.Linear),
+        _is_linear,
     )
 
 
@@ -103,7 +113,9 @@ def change_linear_weights_to_int8_dqtensors(model):
     _replace_with_custom_fn_if_matches_filter(
         model,
         _get_subclass_inserter(Int8DynamicallyQuantizedLinearWeight),
-        lambda mod, fqn: isinstance(mod, torch.nn.Linear),
+        lambda *args:
+            _is_linear(*args) and
+            _in_features_greater_than_16(*args)
    )
 
 
@@ -117,7 +129,7 @@ def change_linear_weights_to_int8_woqtensors(model):
     _replace_with_custom_fn_if_matches_filter(
         model,
         _get_subclass_inserter(Int8WeightOnlyQuantizedLinearWeight),
-        lambda mod, fqn: isinstance(mod, torch.nn.Linear),
+        _is_linear,
    )
 
 
@@ -131,5 +143,5 @@ def change_linear_weights_to_int4_woqtensors(model, **kwargs):
     _replace_with_custom_fn_if_matches_filter(
         model,
         _get_subclass_inserter(Int4WeightOnlyQuantizedLinearWeight, **kwargs),
-        lambda mod, fqn: isinstance(mod, torch.nn.Linear),
+        _is_linear,
    )
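A small usage sketch of the new filters (the model layout here is made up): an nn.Linear passes _is_linear only while its weight is still a plain float tensor, and the int8 dynamic-quant path additionally requires in_features > 16, so tiny linears are left untouched and re-running the transform is a no-op.

import torch
from torchao.quantization.quant_api import (
    _is_linear,
    _in_features_greater_than_16,
    change_linear_weights_to_int8_dqtensors,
)

model = torch.nn.Sequential(
    torch.nn.Linear(512, 256),  # passes both filters
    torch.nn.Linear(8, 4),      # fails the in_features > 16 filter
)
print(_is_linear(model[0]), _in_features_greater_than_16(model[0]))  # True True
print(_is_linear(model[1]), _in_features_greater_than_16(model[1]))  # True False

# Only model[0] gets its weight swapped for Int8DynamicallyQuantizedLinearWeight;
# a second call is a no-op because _is_linear rejects weights that are already
# QuantizedLinearWeightBase instances.
change_linear_weights_to_int8_dqtensors(model)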

torchao/quantization/quant_primitives.py

Lines changed: 2 additions & 5 deletions

@@ -351,9 +351,6 @@ def quant_int8_per_token_matmul(
     assert (
         w_vals_int8_t.dtype == torch.int8
     ), f"w dtype {w_vals_int8_t.dtype} not yet supported"
-    assert (
-        w_scales.dtype == output_dtype
-    ), f"{w_scales.dtype} does not match {output_dtype}"
 
     #
     # 1. do the matrix form of dot(X_i, W_j)
@@ -375,8 +372,8 @@ def quant_int8_per_token_matmul(
         torch.bfloat16,
     ], f"x_scales needs to be a torch.float32 or torch.bfloat16 but got {x_scales.dtype}"
 
-    y = (y_dot_int32 * x_scales.view(-1, 1) * w_scales).reshape(
-        *x_vals_int8.shape[:-1], -1
+    y = (y_dot_int32 * x_scales.reshape(-1, 1) * w_scales).reshape(
+        *x_vals_int8.shape[:-1], y_dot_int32.shape[-1]
     )
 
     # can downcast only at the very end