Commit e670872

smoothquant fixes
Summary: Certain custom linear modules add additional inputs to the forward that need to be handled, but can otherwise be ignored. Additionally, swap_linear_with_smooth_fq_linear had a bug where linear subclasses would get past the if statement and then error on the dict key lookup, since the actual class was not one of the expected keys. This change (#30) enables NonDynamicallyQuantizableLinear to work with smoothquant and fixes the bug for other subclasses. At some point this should be brought in line with the other APIs if it's getting use.

Test Plan: python test/test.py

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: 059b392
Pull Request resolved: #28
1 parent 54bcd5a commit e670872
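
To make the subclass bug from the Summary concrete, here is a minimal standalone sketch. It is not part of the commit; the table below is a simplified stand-in for source_cls_to_target_cls in torchao/quantization/smoothquant.py. It shows why the old isinstance check admitted Linear subclasses that then failed the exact-type dict lookup, and why the new type(child) membership test avoids that:

import torch
from torch.nn.modules.linear import NonDynamicallyQuantizableLinear

# Simplified stand-in for the swap table in smoothquant.py; the real table
# maps source classes to SmoothFakeDynamicallyQuantizedLinear.
source_cls_to_target_cls = {torch.nn.Linear: "target"}

child = NonDynamicallyQuantizableLinear(8, 8)

# Old check: isinstance() is True for any torch.nn.Linear subclass ...
assert isinstance(child, tuple(source_cls_to_target_cls.keys()))
# ... but the swap then looks up the exact type, which is not a key,
# so source_cls_to_target_cls[type(child)] raised KeyError.
assert type(child) not in source_cls_to_target_cls

# New check: only swap when the exact type is registered; unregistered
# subclasses fall through to the recursive branch instead of erroring.
should_swap = type(child) in source_cls_to_target_cls.keys()
print(should_swap)  # False unless the subclass is added to the table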

File tree

2 files changed: +21 -3 lines changed

test/test.py

Lines changed: 17 additions & 0 deletions
@@ -1124,6 +1124,23 @@ def test_shape_logger(self):
 
 
 class SmoothquantIntegrationTest(unittest.TestCase):
+    @torch.no_grad()
+    def test_non_dynamically_quantizable_linear(self):
+        model = torch.nn.Sequential(
+            torch.nn.modules.linear.NonDynamicallyQuantizableLinear(32,32),
+            torch.nn.ReLU()
+        ).to("cuda").to(torch.bfloat16)
+        example_input = torch.randn(32,32, device="cuda", dtype=torch.bfloat16)
+        ref = model(example_input)
+        swap_linear_with_smooth_fq_linear(model)
+        model(ref)
+        smooth_fq_linear_to_inference(model)
+        model_c = torch.compile(model, mode="max-autotune")
+        out = model_c(example_input)
+        sqnr = SQNR(ref, out)
+        self.assertTrue(sqnr >= 25)
+        self.assertTrue(isinstance(model[0], SmoothFakeDynamicallyQuantizedLinear))
+
     @torch.inference_mode()
     def test_on_dummy_distilbert(self):
         # https://huggingface.co/distilbert-base-uncased#how-to-use
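
The assertion sqnr >= 25 in the new test is a decibel threshold comparing the compiled smooth-fq output against the bfloat16 reference. The SQNR helper itself is defined elsewhere in test/test.py and is not part of this diff; a typical definition (an assumption for illustration, not copied from the repo) looks like:

import torch

def sqnr(ref: torch.Tensor, out: torch.Tensor) -> torch.Tensor:
    # Signal-to-quantization-noise ratio in dB; larger values mean the
    # quantized output tracks the reference more closely.
    noise = ref - out
    return 20 * torch.log10(torch.linalg.norm(ref) / torch.linalg.norm(noise))

ref = torch.randn(32, 32)
out = ref + 0.01 * torch.randn(32, 32)  # small perturbation standing in for quantization error
print(sqnr(ref, out))  # roughly 40 dB for this noise level, comfortably above 25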

torchao/quantization/smoothquant.py

Lines changed: 4 additions & 3 deletions
@@ -137,7 +137,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.init_smoothquant_variables(alpha)
 
-    def forward(self, X):
+    def forward(self, X, *args, **kwargs):
         if self.calibrating:
             self.update_x_running_abs_max(X)
             Y = F.linear(X, self.weight, self.bias)
@@ -199,6 +199,7 @@ def set_debug_x_absmax(self):
 
 source_cls_to_target_cls = {
     torch.nn.Linear: SmoothFakeDynamicallyQuantizedLinear,
+    torch.nn.modules.linear.NonDynamicallyQuantizableLinear: SmoothFakeDynamicallyQuantizedLinear,
 }
 
 
@@ -212,8 +213,8 @@ def swap_linear_with_smooth_fq_linear(
             new_fqn = name
         else:
             new_fqn = f"{cur_fqn}.{name}"
-        if ((skip_fqn_list is None) or (new_fqn not in skip_fqn_list)) and isinstance(
-            child, tuple(source_cls_to_target_cls.keys())
+        if ((skip_fqn_list is None) or (new_fqn not in skip_fqn_list)) and (
+            type(child) in source_cls_to_target_cls.keys()
         ):
             target_cls = source_cls_to_target_cls[type(child)]
             new_child = target_cls.from_float(child, alpha=alpha)
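
The forward signature change above is what lets "additional inputs" from custom call sites be accepted and ignored. A minimal sketch of the pattern, using a hypothetical ExtraArgLinear that is not in the repo, standing in for callers that pass extra arguments to a linear's forward:

import torch

class ExtraArgLinear(torch.nn.Linear):
    # Hypothetical custom linear whose callers pass an extra flag the layer
    # itself ignores; stands in for the "additional inputs" case in the Summary.
    def forward(self, x, some_flag=False):
        return super().forward(x)

class SwappedLinear(torch.nn.Linear):
    # Sketch of the fixed forward signature: accept and ignore extra inputs so
    # existing call sites keep working after the module is swapped in.
    def forward(self, x, *args, **kwargs):
        return super().forward(x)

x = torch.randn(2, 4)
ExtraArgLinear(4, 4)(x, True)   # caller passes an extra positional input
SwappedLinear(4, 4)(x, True)    # still accepted after the swap, and ignored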
