 import torch
 from torch.testing._internal.common_utils import (
     TestCase,
+    instantiate_parametrized_tests,
+    parametrize,
     run_tests,
 )
 
-from torchao.float8.config import e4m3_dtype
 from torchao.quantization import (
-    FbgemmConfig,
+    Float8DynamicActivationFloat8WeightConfig,
+    PerRow,
     quantize_,
 )
 from torchao.quantization.utils import compute_error
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
     is_sm_at_least_90,
 )
 
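+# Two flavors of the same per-row float8 dynamic activation + weight
+# quantization scheme; they differ only in the kernel that backs the
+# quantized matmul ("fbgemm" vs "aten").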
+FBGEMM_CONFIG = Float8DynamicActivationFloat8WeightConfig(
+    granularity=PerRow(), kernel="fbgemm"
+)
+ATEN_CONFIG = Float8DynamicActivationFloat8WeightConfig(
+    granularity=PerRow(), kernel="aten"
+)
+
 
 @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Need pytorch 2.8+")
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
 @unittest.skipIf(not is_sm_at_least_90(), "Need sm90+")
 class TestFbgemmFp8Tensor(TestCase):
     def setUp(self):
-        self.config = FbgemmConfig(
-            input_dtype=e4m3_dtype,
-            weight_dtype=e4m3_dtype,
-            output_dtype=torch.bfloat16,
-        )
-        self.bmm_config = FbgemmConfig(
-            input_dtype=e4m3_dtype,
-            weight_dtype=e4m3_dtype,
-            output_dtype=torch.bfloat16,
-            transpose_input=True,
-        )
         self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []
 
-    def test_linear(self):
+    @parametrize("config", [FBGEMM_CONFIG, ATEN_CONFIG])
+    def test_linear(self, config):
         dtype = torch.bfloat16
         device = "cuda"
         input = torch.randn(1, 128, dtype=dtype, device=device)
         linear = torch.nn.Linear(128, 256, dtype=dtype, device=device)
         original = linear(input)
-        quantize_(linear, self.config)
+        quantize_(linear, config)
         quantized = linear(input)
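+        # compute_error returns SQNR in dB; > 20 dB means the quantized
+        # output stays close to the bf16 baseline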
-        self.assertTrue(compute_error(original, quantized) > 20)
+        sqnr = compute_error(original, quantized)
+        self.assertTrue(sqnr > 20, f"sqnr: {sqnr}")
 
-    def test_slice(self):
+    @parametrize("config", [FBGEMM_CONFIG, ATEN_CONFIG])
+    def test_slice(self, config):
         dtype = torch.bfloat16
         device = "cuda"
         dummy = torch.nn.Linear(256, 256, bias=False, dtype=dtype, device=device)
@@ -65,7 +66,7 @@ def test_slice(self):
             dummy.weight.narrow(1, 0, 128), requires_grad=False
         )
 
-        quantize_(dummy, self.config)
+        quantize_(dummy, config)
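+        # slicing the quantized weight should slice the underlying
+        # float8_data the same way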
         weight1 = dummy.weight.narrow(0, 0, 64)
         weight2 = dummy.weight.narrow(1, 0, 128)
         self.assertEqual(weight1.float8_data, dummy.weight.float8_data.narrow(0, 0, 64))
@@ -81,20 +82,23 @@ def test_slice(self):
         res_ref = dummy1(input)
         dummy.weight = torch.nn.Parameter(weight1, requires_grad=False)
         res = dummy(input)
-        assert compute_error(res, res_ref) > 25
+        sqnr = compute_error(res, res_ref)
+        self.assertTrue(sqnr > 25, f"sqnr: {sqnr}")
 
         input = torch.randn(2, 128, dtype=dtype, device=device)
         res_ref = dummy2(input)
         dummy.weight = torch.nn.Parameter(weight2, requires_grad=False)
         res = dummy(input)
-        assert compute_error(res, res_ref) > 15
+        sqnr = compute_error(res, res_ref)
+        self.assertTrue(sqnr > 15, f"sqnr: {sqnr}")
 
-    def test_slice_and_copy_(self):
+    @parametrize("config", [FBGEMM_CONFIG, ATEN_CONFIG])
+    def test_slice_and_copy_(self, config):
         l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
         l.weight = torch.nn.Parameter(
             torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
         )
-        quantize_(l, self.config)
+        quantize_(l, config)
         param = l.weight
         param_data = param.data
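+        # narrow gives a view into the quantized tensor; the copy_ later in
+        # this test must be reflected in the original param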
         param_data = param_data.narrow(0, 0, 512)
@@ -104,7 +108,7 @@ def test_slice_and_copy_(self):
 
         # dummy_l has random weights (shouldn't be 0)
         dummy_l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
-        quantize_(dummy_l, self.config)
+        quantize_(dummy_l, config)
         quantized = dummy_l.weight
         quantized = quantized.narrow(0, 0, 512)
 
@@ -113,7 +117,8 @@ def test_slice_and_copy_(self):
         # making sure param.data is updated
         assert param.data.float8_data[0][0] != orig_value
 
-    def test_bmm(self):
+    @parametrize("config", [FBGEMM_CONFIG])
+    def test_bmm(self, config):
         class M(torch.nn.Module):
             def __init__(self, weight):
                 super().__init__()
@@ -128,24 +133,80 @@ def forward(self, x):
         weight = torch.randn(10, 128, 256, dtype=dtype, device=device)
         m = M(weight).eval()
         original = m(input)
-        quantize_(m, self.bmm_config, filter_fn=lambda x, fqn: True)
+        # we need to transpose the weight first for bmm
+        m.weight = torch.nn.Parameter(m.weight.transpose(1, 2).contiguous())
+        quantize_(m, config, filter_fn=lambda x, fqn: True)
         quantized = m(input)
         self.assertTrue(compute_error(original, quantized) > 20)
 
-    def test_to_device(self):
+    @parametrize("config", [FBGEMM_CONFIG, ATEN_CONFIG])
+    def test_to_device(self, config):
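+        # exercise the different Module.to call signatures on a quantized module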
         for device in self.GPU_DEVICES:
             linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            quantize_(linear, config)
             linear.to(device)
 
             linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            quantize_(linear, config)
             linear.to(device=device)
 
             linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            quantize_(linear, config)
             linear.to(device)
 
+    @parametrize("config", [FBGEMM_CONFIG, ATEN_CONFIG])
+    def test_cat(self, config):
+        dtype = torch.bfloat16
+        device = "cuda"
+        # weight: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=dtype)
+        # weight: (256, 128)
+        linear2 = torch.nn.Linear(128, 256, dtype=dtype)
+
+        cat_weight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        dummy1 = torch.nn.Linear(128, 512, bias=False, dtype=dtype, device=device)
+
+        dummy1.weight = torch.nn.Parameter(cat_weight1)
+        quantize_(dummy1, config)
+
+        quantize_(linear1, config)
+        quantize_(linear2, config)
+
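+        # with per-row scales, concatenating independently quantized weights
+        # along dim 0 should match quantizing the concatenated weight directly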
+        cat_qweight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        self.assertEqual(cat_qweight1.shape, (512, 128))
+        self.assertEqual(dummy1.weight.float8_data, cat_qweight1.float8_data)
+        self.assertEqual(dummy1.weight.scale, cat_qweight1.scale)
+
+        # concat along dim == 1 is not really correct (it just keeps linear1's
+        # scale) and will be fixed later when we support distributed checkpointing
+        cat_qweight2 = torch.cat([linear1.weight, linear2.weight], dim=1)
+        self.assertEqual(cat_qweight2.shape, (256, 256))
+        ref_float8_data = torch.cat(
+            [linear1.weight.float8_data, linear2.weight.float8_data], dim=1
+        )
+        ref_scale = linear1.weight.scale
+        self.assertEqual(cat_qweight2.float8_data, ref_float8_data)
+        self.assertEqual(cat_qweight2.scale, ref_scale)
+
+    @parametrize("config", [FBGEMM_CONFIG])
+    def test_transpose(self, config):
+        dtype = torch.bfloat16
+        device = "cuda"
+        # weight: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=dtype, device=device)
+        quantize_(linear1, config)
+        linear1.weight = torch.nn.Parameter(linear1.weight.transpose(0, 1).contiguous())
+        linear1.bias = torch.nn.Parameter(torch.randn(128, dtype=dtype, device=device))
+        self.assertEqual(linear1.weight.shape, (128, 256))
+
+        input = torch.randn(32, 256, dtype=dtype, device=device)
+        # make sure it runs
+        res = linear1(input)
+        self.assertEqual(res.shape, (32, 128))
+
+
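+# generate a concrete test for each (test, config) pair declared via @parametrize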
+instantiate_parametrized_tests(TestFbgemmFp8Tensor)
+
 
 if __name__ == "__main__":
     run_tests()