Commit 1c0ea5b

Fix float related autoquant options (#1562)
* Fix float related autoquant options

  Summary: Forgot to add a test for previous changes; this fixes some implementations for the quantized model.

  Test Plan: python test/integration/test_integration.py -k test_autoquant_float

* skip non-cuda runs

* update torch version requirement

* typo
1 parent 71c6231 commit 1c0ea5b
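For reference, the float autoquant option exercised by this commit can be driven as in the minimal sketch below. The model and shapes are illustrative stand-ins; torchao.autoquant and DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST are taken directly from the new test.

import torch
import torchao

# Illustrative model and input; any float32 CUDA model would do.
model = torch.nn.Sequential(torch.nn.Linear(128, 128)).to("cuda").to(torch.float32)
example_input = torch.randn(128, 128, device="cuda", dtype=torch.float32)

# Restrict autoquant's search space to the float (non-integer) options.
torchao.autoquant(
    model,
    qtensor_class_list=torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST,
)
out = model(example_input)  # running the model completes selection, as in the test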

File tree

3 files changed: +47, -3 lines changed

test/integration/test_integration.py
torchao/_models/utils.py
torchao/quantization/autoquant.py


test/integration/test_integration.py

Lines changed: 36 additions & 0 deletions

@@ -1747,6 +1747,42 @@ def test_autoquant_min_sqnr(self, device, dtype):
         # setting min_sqnr for individual linear to be 60 allows us to achieve >= 50 final sqnr
         self.assertTrue(sqnr >= 50, f"sqnr: {sqnr}")
 
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_4, "autoquant float option requires 2.4+."
+    )
+    def test_autoquant_float(self):
+        device = "cuda"
+        dtype = torch.float32
+        m, k, n = 128, 128, 128
+        example_input = torch.randn(m, k, device=device, dtype=dtype)
+        model = (
+            torch.nn.Sequential(
+                torch.nn.ReLU(),
+                torch.nn.Linear(k, n),
+                torch.nn.ReLU(),
+            )
+            .to(device)
+            .to(dtype)
+        )
+        ref = model(example_input)
+        torchao.autoquant(
+            model,
+            qtensor_class_list=torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST,
+        )
+        out = model(example_input)
+        from torchao.quantization.autoquant import (
+            BFloat16Tensor,
+            Float16Tensor,
+            Float32Tensor,
+        )
+
+        self.assertIn(
+            type(model[1].weight), [Float32Tensor, Float16Tensor, BFloat16Tensor]
+        )
+        print(compute_error(out, ref))
+        self.assertGreater(compute_error(out, ref), 60)
+
 
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "requires 2.5+.")
     @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
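The final assertion treats compute_error as an SQNR measurement in decibels, so requiring > 60 dB means the float-autoquantized output stays very close to the float32 reference. As a sketch of the kind of check involved (an assumed form, not copied from torchao's helper):

import torch

def sqnr_db(ref: torch.Tensor, out: torch.Tensor) -> torch.Tensor:
    # Signal-to-quantization-noise ratio in dB: signal norm over error norm.
    # Assumed definition for illustration; see torchao's compute_error for the real one.
    return 20 * torch.log10(torch.linalg.norm(ref) / torch.linalg.norm(ref - out))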

torchao/_models/utils.py

Lines changed: 2 additions & 2 deletions

@@ -35,7 +35,7 @@ def write_json_result_ossci(output_json_path, headers, row):
                 "arch": mapping_headers["arch"],
                 "min_sqnr": mapping_headers["min_sqnr"],
                 # True means compile is enabled, False means eager mode
-                "complie": mapping_headers["compile"],
+                "compile": mapping_headers["compile"],
             },
         },
         "model": {
@@ -87,7 +87,7 @@ def write_json_result_local(output_json_path, headers, row):
                 "arch": mapping_headers["arch"],
                 "min_sqnr": mapping_headers["min_sqnr"],
                 # True means compile is enabled, False means eager mode
-                "complie": mapping_headers["compile"],
+                "compile": mapping_headers["compile"],
             },
         },
         "model": {

torchao/quantization/autoquant.py

Lines changed: 9 additions & 1 deletion

@@ -778,7 +778,7 @@ def _apply_fn_to_data(self, fn):
 
     @classmethod
     def from_float(cls, weight):
-        return cls(weight)
+        return Float32Tensor(weight)
 
 
 @Float32Tensor.implements([torch.nn.functional.linear, aten.linear.default])
@@ -829,6 +829,10 @@ def _quantized_linear_op(act_mat, w_qtensor, bias):
             bias.to(_DTYPE) if bias is not None else bias,
         ).to(dtype=orig_dtype)
 
+    @classmethod
+    def from_float(cls, weight):
+        return BFloat16Tensor(weight)
+
 
 class Float16Tensor(Float32Tensor):
     def __init__(self, weight):
@@ -844,6 +848,10 @@ def _quantized_linear_op(act_mat, w_qtensor, bias):
             bias.to(_DTYPE) if bias is not None else bias,
         ).to(dtype=orig_dtype)
 
+    @classmethod
+    def from_float(cls, weight):
+        return Float16Tensor(weight)
+
 
 class AQFloat32LinearWeight(Float32Tensor, AQMixin):
     """
