pytorch
diff --git a/test/dtypes/test_fbgemm_fp8.py b/test/dtypes/test_fbgemm_fp8.py
Lines changed: 52 additions & 5 deletions
diff --git a/test/dtypes/test_fbgemm_int4.py b/test/dtypes/test_fbgemm_int4.py
Lines changed: 47 additions & 0 deletions
diff --git a/torchao/core/config.py b/torchao/core/config.py
Lines changed: 9 additions & 0 deletions
@@ -12,7 +12,6 @@
     run_tests,
 )
 
-from torchao.float8.config import e4m3_dtype
 from torchao.quantization import (
     FbgemmConfig,
     quantize_,
@@ -29,14 +28,15 @@
 @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
 class TestFbgemmFp8Tensor(TestCase):
     def setUp(self):
+        self.e4m3_dtype = torch.float8_e4m3fn  # fp8 dtype shared by both configs below
         self.config = FbgemmConfig(
-            input_dtype=e4m3_dtype,
-            weight_dtype=e4m3_dtype,
+            input_dtype=self.e4m3_dtype,
+            weight_dtype=self.e4m3_dtype,
             output_dtype=torch.bfloat16,
         )
         self.bmm_config = FbgemmConfig(
-            input_dtype=e4m3_dtype,
-            weight_dtype=e4m3_dtype,
+            input_dtype=self.e4m3_dtype,
+            weight_dtype=self.e4m3_dtype,
             output_dtype=torch.bfloat16,
             transpose_input=True,
         )
@@ -146,6 +146,53 @@ def test_to_device(self):
             quantize_(linear, self.config)
             linear.to(device)
 
+    def test_cat(self):
+        """Check torch.cat on fp8-quantized weights along dim 0 and dim 1."""
+        dtype = torch.bfloat16
+        device = "cuda"
+        # both weights: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=dtype, device=device)
+        linear2 = torch.nn.Linear(128, 256, dtype=dtype, device=device)
+
+        cat_weight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        dummy1 = torch.nn.Linear(128, 512, bias=False, dtype=dtype, device=device)
+        dummy1.weight = torch.nn.Parameter(cat_weight1)
+        quantize_(dummy1, self.config)
+
+        quantize_(linear1, self.config)
+        quantize_(linear2, self.config)
+
+        # dim 0: quantize-then-cat must match cat-then-quantize (dummy1)
+        cat_qweight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        self.assertEqual(cat_qweight1.shape, (512, 128))
+        self.assertEqual(dummy1.weight.float8_data, cat_qweight1.float8_data)
+        self.assertEqual(dummy1.weight.scale, cat_qweight1.scale)
+
+        # concat with dim == 1 is not really correct and will be fixed later
+        # when we support distributed checkpointing
+        cat_qweight2 = torch.cat([linear1.weight, linear2.weight], dim=1)
+        self.assertEqual(cat_qweight2.shape, (256, 256))
+        ref_float8_data = torch.cat([linear1.weight.float8_data, linear2.weight.float8_data], dim=1)
+        ref_scale = linear1.weight.scale
+        self.assertEqual(cat_qweight2.float8_data, ref_float8_data)
+        self.assertEqual(cat_qweight2.scale, ref_scale)
+
+
+    def test_transpose(self):
+        """A transposed fp8-quantized weight still runs through a linear forward."""
+        dtype = torch.bfloat16
+        device = "cuda"
+        # weight: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=dtype, device=device)
+        quantize_(linear1, self.config)
+        linear1.weight = torch.nn.Parameter(linear1.weight.transpose(0, 1).contiguous())
+        linear1.bias = torch.nn.Parameter(torch.randn(128, dtype=dtype, device=device))
+        self.assertEqual(linear1.weight.shape, (128, 256))
+
+        x = torch.randn(32, 256, dtype=dtype, device=device)
+        res = linear1(x)
+        self.assertEqual(res.shape, (32, 128))
+
 
 if __name__ == "__main__":
     run_tests()
@@ -152,6 +152,53 @@ def test_to_device(self):
             quantize_(linear, self.config)
             linear.to(device)
 
+    def test_cat(self):
+        """Quantize-then-cat must match cat-then-quantize along dim 0 and dim 1."""
+        dtype = torch.bfloat16
+        device = "cuda"
+        # both weights: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=dtype, device=device)
+        linear2 = torch.nn.Linear(128, 256, dtype=dtype, device=device)
+
+        cat_weight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        cat_weight2 = torch.cat([linear1.weight, linear2.weight], dim=1)
+        dummy1 = torch.nn.Linear(128, 512, bias=False, dtype=dtype, device=device)
+        dummy2 = torch.nn.Linear(256, 256, bias=False, dtype=dtype, device=device)
+
+        dummy1.weight = torch.nn.Parameter(cat_weight1)
+        dummy2.weight = torch.nn.Parameter(cat_weight2)
+        quantize_(dummy1, self.config)
+        quantize_(dummy2, self.config)
+
+        quantize_(linear1, self.config)
+        quantize_(linear2, self.config)
+
+        cat_qweight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        self.assertEqual(cat_qweight1.shape, (512, 128))
+        self.assertEqual(dummy1.weight.packed_weight, cat_qweight1.packed_weight)
+        self.assertEqual(dummy1.weight.scale, cat_qweight1.scale)
+        self.assertEqual(dummy1.weight.zero_point, cat_qweight1.zero_point)
+
+        cat_qweight2 = torch.cat([linear1.weight, linear2.weight], dim=1)
+        self.assertEqual(cat_qweight2.shape, (256, 256))
+        self.assertEqual(dummy2.weight.packed_weight, cat_qweight2.packed_weight)
+        self.assertEqual(dummy2.weight.scale, cat_qweight2.scale)
+        self.assertEqual(dummy2.weight.zero_point, cat_qweight2.zero_point)
+
+    def test_transpose(self):
+        """A quantized weight transposed twice still runs through a linear forward."""
+        # weight: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
+        quantize_(linear1, self.config)
+        linear1.weight = torch.nn.Parameter(linear1.weight.transpose(0, 1).contiguous())
+        # transpose again to return to the original state
+        linear1.weight = torch.nn.Parameter(linear1.weight.transpose(0, 1).contiguous())
+        self.assertEqual(linear1.weight.shape, (256, 128))
+
+        x = torch.randn(32, 128, dtype=torch.bfloat16, device="cuda")
+        res = linear1(x)
+        self.assertEqual(res.shape, (32, 256))
+
 
 if __name__ == "__main__":
     run_tests()
@@ -5,13 +5,22 @@
 # LICENSE file in the root directory of this source tree.
 import abc
 import dataclasses
+from dataclasses import dataclass
 import enum
 import importlib
 import json
 from typing import Any, ClassVar, Dict
 
 import torch
 
+__all__ = [
+    "AOBaseConfig",
+    "VersionMismatchError",
+    "config_to_dict",
+    "config_from_dict",
+    "ALLOWED_AO_MODULES",
+]
+
 
 class AOBaseConfig(abc.ABC):
     """