
Commit 050b293

Add support for resharding for fbgemm configs
Summary: Added transpose and cat op support, and also some custom transpose/reshape/unflatten support for resharding. In the future we should probably provide examples for using distributed checkpointing for resharding.

Test Plan:
python test/dtypes/test_fbgemm_int4.py -k test_transpose
python test/dtypes/test_fbgemm_int4.py -k test_cat
python test/dtypes/test_fbgemm_fp8.py -k test_transpose
python test/dtypes/test_fbgemm_fp8.py -k test_cat
1 parent 6243040 commit 050b293
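For context on how an op like torch.cat reaches a quantized weight at all: torchao's quantized weights are tensor subclasses, and ops are routed to their quantized components through dispatch. The sketch below shows the general wrapper-subclass pattern this commit builds on. It is a toy illustration, not the code from this commit: PerRowFp8Tensor, its per-row scale layout, and the dim == 0 restriction are all assumptions made for the example.

import torch

aten = torch.ops.aten

class PerRowFp8Tensor(torch.Tensor):
    """Toy stand-in for an fp8 weight tensor: float8 payload + per-row scale.

    NOT torchao's implementation; only a sketch of routing torch.cat to the
    quantized components via __torch_dispatch__.
    """

    @staticmethod
    def __new__(cls, float8_data, scale):
        # The wrapper advertises the logical (high-precision) shape/dtype.
        return torch.Tensor._make_wrapper_subclass(
            cls, float8_data.shape, dtype=torch.bfloat16, device=float8_data.device
        )

    def __init__(self, float8_data, scale):
        self.float8_data = float8_data
        self.scale = scale

    def __repr__(self):
        return f"PerRowFp8Tensor(shape={tuple(self.shape)})"

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        if func is aten.cat.default:
            tensors = args[0]
            dim = args[1] if len(args) > 1 else kwargs.get("dim", 0)
            assert dim == 0, "sketch only handles row-wise (dim=0) cat"
            # Row-wise cat: concatenate payloads and stack per-row scales.
            return PerRowFp8Tensor(
                torch.cat([t.float8_data for t in tensors], dim=0),
                torch.cat([t.scale for t in tensors], dim=0),
            )
        raise NotImplementedError(f"{func} not handled in this sketch")

# Two (256, 128) shards cat into a (512, 128) weight without dequantizing.
a = PerRowFp8Tensor(torch.randn(256, 128).to(torch.float8_e4m3fn), torch.ones(256))
b = PerRowFp8Tensor(torch.randn(256, 128).to(torch.float8_e4m3fn), torch.ones(256))
print(torch.cat([a, b], dim=0))  # PerRowFp8Tensor(shape=(512, 128))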

File tree

7 files changed: +467 −54 lines

test/dtypes/test_fbgemm_fp8.py

Lines changed: 47 additions & 0 deletions

@@ -146,6 +146,53 @@ def test_to_device(self):
         quantize_(linear, self.config)
         linear.to(device)
 
+    def test_cat(self):
+        dtype = torch.bfloat16
+        device = "cuda"
+        # weight: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=dtype)
+        # weight: (256, 128)
+        linear2 = torch.nn.Linear(128, 256, dtype=dtype)
+
+        cat_weight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        dummy1 = torch.nn.Linear(128, 512, bias=False, dtype=dtype, device=device)
+
+        dummy1.weight = torch.nn.Parameter(cat_weight1)
+        quantize_(dummy1, self.config)
+
+        quantize_(linear1, self.config)
+        quantize_(linear2, self.config)
+
+        cat_qweight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        self.assertEqual(cat_qweight1.shape, (512, 128))
+        self.assertEqual(dummy1.weight.float8_data, cat_qweight1.float8_data)
+        self.assertEqual(dummy1.weight.scale, cat_qweight1.scale)
+
+        # concat with dim == 1 is not really correct and will be fixed later
+        # when we support distributed checkpointing
+        cat_qweight2 = torch.cat([linear1.weight, linear2.weight], dim=1)
+        self.assertEqual(cat_qweight2.shape, (256, 256))
+        ref_float8_data = torch.cat([linear1.weight.float8_data, linear2.weight.float8_data], dim=1)
+        ref_scale = linear1.weight.scale
+        self.assertEqual(cat_qweight2.float8_data, ref_float8_data)
+        self.assertEqual(cat_qweight2.scale, ref_scale)
+
+
+    def test_transpose(self):
+        dtype = torch.bfloat16
+        device = "cuda"
+        # weight: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=dtype, device=device)
+        quantize_(linear1, self.config)
+        linear1.weight = torch.nn.Parameter(linear1.weight.transpose(0, 1).contiguous())
+        # bias must be resized to match the transposed out_features
+        linear1.bias = torch.nn.Parameter(torch.randn(128, dtype=dtype, device=device))
+        self.assertEqual(linear1.weight.shape, (128, 256))
+
+        input = torch.randn(32, 256, dtype=dtype, device=device)
+        # make sure it runs
+        res = linear1(input)
+        self.assertEqual(res.shape, (32, 128))
+
 
 if __name__ == "__main__":
     run_tests()
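Why the dim == 1 cat above is flagged as "not really correct": with per-row scales there is no single scale that is right for the stitched rows, because each shard quantized its rows with its own scale, and the op keeps only linear1's. A small plain-torch illustration of the resulting error (the 448 max and the shapes are assumptions for the example, not values from this test):

import torch

def rowwise_quant(w):
    # One scale per row, sized so the row's max maps to the e4m3 max (~448).
    scale = w.abs().amax(dim=1, keepdim=True) / 448.0
    return w / scale, scale

w1 = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
w2 = w1 * 10  # second shard is 10x larger, so its row scales are 10x larger

q1, s1 = rowwise_quant(w1)
q2, s2 = rowwise_quant(w2)

# cat along dim=1 keeps only the first shard's scale, as the test does:
recon = torch.cat([q1, q2], dim=1) * s1
print(recon[:, :2])  # matches w1
print(recon[:, 2:])  # != w2: rescaled by s1 instead of s2, off by 10x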

test/dtypes/test_fbgemm_int4.py

Lines changed: 47 additions & 0 deletions

@@ -152,6 +152,53 @@ def test_to_device(self):
         quantize_(linear, self.config)
         linear.to(device)
 
+    def test_cat(self):
+        dtype = torch.bfloat16
+        device = "cuda"
+        # weight: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=dtype)
+        # weight: (256, 128)
+        linear2 = torch.nn.Linear(128, 256, dtype=dtype)
+
+        cat_weight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        cat_weight2 = torch.cat([linear1.weight, linear2.weight], dim=1)
+        dummy1 = torch.nn.Linear(128, 512, bias=False, dtype=dtype, device=device)
+        dummy2 = torch.nn.Linear(256, 256, bias=False, dtype=dtype, device=device)
+
+        dummy1.weight = torch.nn.Parameter(cat_weight1)
+        dummy2.weight = torch.nn.Parameter(cat_weight2)
+        quantize_(dummy1, self.config)
+        quantize_(dummy2, self.config)
+
+        quantize_(linear1, self.config)
+        quantize_(linear2, self.config)
+
+        cat_qweight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        self.assertEqual(cat_qweight1.shape, (512, 128))
+        self.assertEqual(dummy1.weight.packed_weight, cat_qweight1.packed_weight)
+        self.assertEqual(dummy1.weight.scale, cat_qweight1.scale)
+        self.assertEqual(dummy1.weight.zero_point, cat_qweight1.zero_point)
+
+        cat_qweight2 = torch.cat([linear1.weight, linear2.weight], dim=1)
+        self.assertEqual(cat_qweight2.shape, (256, 256))
+        self.assertEqual(dummy2.weight.packed_weight, cat_qweight2.packed_weight)
+        self.assertEqual(dummy2.weight.scale, cat_qweight2.scale)
+        self.assertEqual(dummy2.weight.zero_point, cat_qweight2.zero_point)
+
+    def test_transpose(self):
+        # weight: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
+        quantize_(linear1, self.config)
+        linear1.weight = torch.nn.Parameter(linear1.weight.transpose(0, 1).contiguous())
+        # transpose again to return to the original state
+        linear1.weight = torch.nn.Parameter(linear1.weight.transpose(0, 1).contiguous())
+        self.assertEqual(linear1.weight.shape, (256, 128))
+
+        input = torch.randn(32, 128, dtype=torch.bfloat16, device="cuda")
+        # make sure it runs
+        res = linear1(input)
+        self.assertEqual(res.shape, (32, 256))
+
 
 if __name__ == "__main__":
     run_tests()
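Unlike the fp8 test, the int4 test can compare the dim == 1 cat against a directly quantized reference (dummy2): with group-wise quantization along the input dimension, every (row, group) block keeps its own scale and zero_point, so concatenating columns just concatenates the per-group parameters. A shape-level sketch, assuming a group size of 128 (the actual group size comes from self.config and is not shown in this diff):

import torch

rows, cols, group = 256, 128, 128
n_groups = cols // group  # one group per row at this size

# Per-(row, group) quantization parameters for each shard.
scale1, zp1 = torch.rand(rows, n_groups), torch.zeros(rows, n_groups)
scale2, zp2 = torch.rand(rows, n_groups), torch.zeros(rows, n_groups)

# cat along dim=1 doubles the groups per row; every block keeps the scale
# and zero_point it was quantized with, so no value is rescaled incorrectly.
cat_scale = torch.cat([scale1, scale2], dim=1)
cat_zp = torch.cat([zp1, zp2], dim=1)
print(cat_scale.shape, cat_zp.shape)  # torch.Size([256, 2]) twice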

torchao/core/config.py

Lines changed: 36 additions & 0 deletions

@@ -5,13 +5,24 @@
 # LICENSE file in the root directory of this source tree.
 import abc
 import dataclasses
+from dataclasses import dataclass
 import enum
 import importlib
 import json
 from typing import Any, ClassVar, Dict
 
 import torch
 
+__all__ = [
+    "AOBaseConfig",
+    "VersionMismatchError",
+    "config_to_dict",
+    "config_from_dict",
+    "ALLOWED_AO_MODULES",
+    "e4m3_dtype",
+    "e5m2_dtype",
+]
+
 
 class AOBaseConfig(abc.ABC):
     """
@@ -284,3 +295,28 @@ def config_from_dict(data: Dict[str, Any]) -> AOBaseConfig:
             return cls(**processed_data)
         except Exception as e:
             raise ValueError(f"Failed to create instance of {cls.__name__}: {e}")
+
+@dataclass
+class Float8TypeConfig:
+    """
+    Configuration for selecting the preferred float8 type pair, either e4m3fn/e5m2 or e4m3fnuz/e5m2fnuz.
+
+    Currently, ROCm supports the fnuz variants on MI300 and the OCP FP8 variants on MI350/Navi4.
+    """
+
+    # The preferred e4m3 type.
+    e4m3_dtype = torch.float8_e4m3fn
+
+    # The preferred e5m2 type.
+    e5m2_dtype = torch.float8_e5m2
+
+    def __post_init__(self):
+        if torch.version.hip and torch.cuda.is_available() and is_MI300():
+            self.e4m3_dtype = torch.float8_e4m3fnuz
+            self.e5m2_dtype = torch.float8_e5m2fnuz
+
+
+# Resolved F8 dtypes for downstream use, based on the config above
+type_config = Float8TypeConfig()
+e4m3_dtype = type_config.e4m3_dtype
+e5m2_dtype = type_config.e5m2_dtype
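One thing to note about the hunk above: is_MI300() is not imported in either hunk shown, so it presumably comes from elsewhere in the module or needs an import such as `from torchao.utils import is_MI300` (an assumption; torchao ships an is_MI300 helper in torchao.utils). Downstream code is expected to consume the resolved aliases rather than branching on platform itself; a minimal sketch, assuming the module imports cleanly:

import torch
from torchao.core.config import e4m3_dtype, e5m2_dtype

# On CUDA (and on ROCm MI350/Navi4) these resolve to the OCP types,
# float8_e4m3fn / float8_e5m2; on ROCm MI300 they resolve to the fnuz
# variants. Callers never branch on the platform themselves.
x = torch.randn(8, 8)
print(x.to(e4m3_dtype).dtype)  # e.g. torch.float8_e4m3fn on CUDA
print(x.to(e5m2_dtype).dtype)  # e.g. torch.float8_e5m2 on CUDA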
