 class FbgemmFp8Tensor(TorchAOBaseTensor):
     """
     TODO: needs padding for cutlass kernels
+    Args:
+      data_to_scale_dim: the dim mapping from float8_data to scale, e.g.
+        float8_data: (batch_size, output_channel, input_channel)
+        scale: (batch_size, output_channel) (since it's per row quantization)
+        data_to_scale_dim: {0: 0, 1: 1}
     """

     tensor_data_attrs = ["float8_data", "scale", "activation_scale_ub"]
-    tensor_attributes = ["dtype"]
+    tensor_attributes = ["data_to_scale_dim", "dtype"]

-    def __new__(cls, float8_data, scale, activation_scale_ub, dtype):
+    def __new__(cls, float8_data, scale, activation_scale_ub, data_to_scale_dim, dtype):
         shape = float8_data.shape
         kwargs = {}
         kwargs["device"] = float8_data.device
         kwargs["dtype"] = dtype
         kwargs["requires_grad"] = False
         return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)  # type: ignore[attr-defined]

-    def __init__(self, float8_data, scale, activation_scale_ub, dtype):
+    def __init__(
+        self, float8_data, scale, activation_scale_ub, data_to_scale_dim, dtype
+    ):
         self.float8_data = float8_data
         self.scale = scale
+        self.data_to_scale_dim = data_to_scale_dim
         self.activation_scale_ub = activation_scale_ub

     def __tensor_flatten__(self):
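The Args note above describes data_to_scale_dim as a map from float8_data dimensions to scale dimensions. A minimal sketch of that invariant for a per-row-quantized 3D weight; the shapes and the float8 dtype below are assumptions for illustration, not taken from this change, and require a PyTorch build with float8 dtypes:

import torch

# hypothetical per-row quantized weight: (batch_size, output_channel, input_channel)
float8_data = torch.empty(8, 256, 128, dtype=torch.float8_e4m3fn)
# one scale per row: (batch_size, output_channel)
scale = torch.empty(8, 256, dtype=torch.float32)
# float8_data dim 0 -> scale dim 0, float8_data dim 1 -> scale dim 1
data_to_scale_dim = {0: 0, 1: 1}
for data_dim, scale_dim in data_to_scale_dim.items():
    assert float8_data.shape[data_dim] == scale.shape[scale_dim]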
@@ -68,12 +76,12 @@ def _apply_fn_to_data(self, fn):
     def __repr__(self):
         return (
             f"{self.__class__.__name__}(weight={self.float8_data}, scale={self.scale}, "
-            f"activation_scale_ub={self.activation_scale_ub}, "
+            f"activation_scale_ub={self.activation_scale_ub}, data_to_scale_dim={self.data_to_scale_dim}, "
             f"shape={self.shape}, device={self.device}, dtype={self.dtype}, requires_grad={self.requires_grad})"
         )

     def _quantization_type(self):
-        return f"shape={self.shape}, activation_scale_ub={self.activation_scale_ub}, device={self.device}"
+        return f"shape={self.shape}, data_to_scale_dim={self.data_to_scale_dim}, activation_scale_ub={self.activation_scale_ub}, device={self.device}"

     def to(self, *args, **kwargs):
         kwargs = self._get_to_kwargs(*args, **kwargs)
@@ -82,6 +90,53 @@ def to(self, *args, **kwargs):
             self.float8_data.to(device),
             self.scale.to(device),
             self.activation_scale_ub.to(device),
+            self.data_to_scale_dim,
+            self.dtype,
+        )
+
+    def _transpose_and_reshape(self):
+        """This is added for resharding support, since the resharding logic for the model we are
+        working with only supports 2D
+        """
+        assert len(self.shape) == 3, (
+            f"Only expected to be used when the Tensor is 3D, got {len(self.shape)}"
+        )
+        dim0, dim1, dim2 = self.shape
+        # because we first transpose the weight before quantization, we'll recover the original shape
+        # by swapping dim1 and dim2
+        original_shape = (dim0, dim2, dim1)
+        # we must save this as 2D in the state dict, since loading code expects 2D weights
+        new_shape = (-1, original_shape[-1])
+        float8_data = self.float8_data
+        float8_data = float8_data.transpose(1, 2).reshape(*new_shape).contiguous()
+        data_to_scale_dim = {0: 0, 1: 1}
+        return self.__class__(
+            float8_data,
+            self.scale,
+            self.activation_scale_ub,
+            data_to_scale_dim,
+            self.dtype,
+        )
+
+    def _unflatten(self, num_experts):
+        """This is added for resharding support, since the resharding logic for the model we are
+        working with only supports 2D
+        """
+        float8_data = self.float8_data
+        dim0, dim1 = self.shape
+        float8_data = float8_data.unflatten(0, (num_experts, -1)).squeeze(dim=0)
+        data_to_scale_dim = {0: 0}
+        dim0, dim1, dim2 = float8_data.shape
+        if dim1 == self.scale.shape[1]:
+            data_to_scale_dim[1] = 1
+        else:
+            data_to_scale_dim[2] = 1
+
+        return self.__class__(
+            float8_data,
+            self.scale,
+            self.activation_scale_ub,
+            data_to_scale_dim,
             self.dtype,
         )

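A shape-only walkthrough of the resharding round trip that _transpose_and_reshape and _unflatten are added for, using plain tensors and assumed sizes rather than the subclass itself: the 3D weight is transposed and flattened to 2D for the state dict, and unflatten later recovers the 3D layout.

import torch

num_experts, dim1, dim2 = 4, 512, 256
w3d = torch.zeros(num_experts, dim1, dim2)                 # 3D quantized payload as stored
w2d = w3d.transpose(1, 2).reshape(-1, dim1).contiguous()   # saved as 2D: (num_experts * dim2, dim1)
back = w2d.unflatten(0, (num_experts, -1))                 # recovered: (num_experts, dim2, dim1)
assert back.shape == (num_experts, dim2, dim1)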
@@ -106,14 +161,18 @@ def from_float(
         else:
             w = w.t()

-        wq, w_scale = torch.ops.triton.quantize_fp8_row(w)
-        # wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
+        data_to_scale_dim = {0: 0}
+        if w.ndim == 3:
+            data_to_scale_dim[1] = 1
+
+        wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
         dtype = w.dtype
         del w
         return FbgemmFp8Tensor(
             wq,
             w_scale,
             activation_scale_ub=activation_scale_ub,
+            data_to_scale_dim=data_to_scale_dim,
             dtype=dtype,
         )

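For reference, a small stand-alone mirror of how the updated from_float path chooses the mapping; the helper name and weight shapes below are illustrative, not part of the change. A 2D weight only maps the row dim, while a 3D (batched/expert) weight also keeps a per-batch scale dim.

import torch

def expected_data_to_scale_dim(w: torch.Tensor) -> dict:
    mapping = {0: 0}      # rowwise: data dim 0 always has a scale dim
    if w.ndim == 3:
        mapping[1] = 1    # batched / expert weights also map dim 1
    return mapping

assert expected_data_to_scale_dim(torch.zeros(256, 128)) == {0: 0}
assert expected_data_to_scale_dim(torch.zeros(4, 256, 128)) == {0: 0, 1: 1}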
@@ -169,6 +228,8 @@ def _(func, types, args, kwargs):

     a_data = xq
     b_data = weight_tensor.float8_data
+    assert b_data.is_contiguous(), "weight for bmm must be contiguous"
+
     orig_out_features = b_data.shape[-2]

     res = torch.ops.fbgemm.f8f8bf16_rowwise_batched(
@@ -269,6 +330,63 @@ def _(func, types, args, kwargs):
     )


+@implements(aten.cat.default)
+def _(func, types, args, kwargs):
+    tensors, dim = fill_defaults(args, 2, [[], 0])
+    tensor_0 = tensors[0]
+    if dim < 0:
+        dim = tensor_0.ndim + dim
+
+    for i in range(1, len(tensors)):
+        assert tensor_0.float8_data.ndim == tensors[i].float8_data.ndim
+        assert tensor_0.scale.ndim == tensors[i].scale.ndim
+        assert tensor_0.activation_scale_ub == tensors[i].activation_scale_ub
+        assert tensor_0.data_to_scale_dim == tensors[i].data_to_scale_dim
+
+    float8_data = [t.float8_data for t in tensors]
+    scale = [t.scale for t in tensors]
+
+    # with rowwise quantization, the dimensions of float8_data and the
+    # original shape will be the same, so the original dim argument applies
+    # to float8_data
+    cat_float8_data = aten.cat.default(float8_data, dim)
+
+    # if the cat dimension has a corresponding scale dimension, then we'll concat the corresponding
+    # scale dimension, otherwise, we'll just use the existing scale
+    if dim in tensor_0.data_to_scale_dim:
+        cat_scale = aten.cat.default(scale, dim=tensor_0.data_to_scale_dim[dim])
+    else:
+        cat_scale = scale[0]
+
+    new = tensor_0.__class__(
+        cat_float8_data,
+        cat_scale,
+        tensor_0.activation_scale_ub,
+        tensor_0.data_to_scale_dim,
+        tensor_0.dtype,
+    )
+    return return_and_correct_aliasing(func, args, kwargs, new)
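A plain-tensor sketch of the scale handling in the cat override above (sizes are assumed): scales are only concatenated when the cat dim has a corresponding scale dim; otherwise the first tensor's scale is reused unchanged.

import torch

data_to_scale_dim = {0: 0, 1: 1}
d_a, d_b = torch.zeros(2, 256, 128), torch.zeros(2, 256, 128)
s_a, s_b = torch.zeros(2, 256), torch.zeros(2, 256)

dim = 1                                # cat along output channels
cat_data = torch.cat([d_a, d_b], dim)  # (2, 512, 128)
if dim in data_to_scale_dim:
    cat_scale = torch.cat([s_a, s_b], dim=data_to_scale_dim[dim])  # (2, 512)
else:
    cat_scale = s_a                    # e.g. cat along input channels leaves the scale alone
assert cat_data.shape == (2, 512, 128) and cat_scale.shape == (2, 512)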
+
+
+@implements(aten.transpose.int)
+def _(func, types, args, kwargs):
+    self, dim0, dim1 = args
+    float8_data = self.float8_data.transpose(dim0, dim1).contiguous()
+    data_to_scale_dim = self.data_to_scale_dim.copy()
+
+    if dim0 in data_to_scale_dim:
+        data_to_scale_dim[dim1] = data_to_scale_dim[dim0]
+        del data_to_scale_dim[dim0]
+    elif dim1 in data_to_scale_dim:
+        data_to_scale_dim[dim0] = data_to_scale_dim[dim1]
+        del data_to_scale_dim[dim1]
+
+    new = self.__class__(
+        float8_data, self.scale, self.activation_scale_ub, data_to_scale_dim, self.dtype
+    )
+    return return_and_correct_aliasing(func, args, kwargs, new)
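The transpose override only re-keys the data-to-scale mapping; the scale tensor itself is passed through untouched. A stand-alone version of that re-keying (the function name and inputs are illustrative):

def remap_after_transpose(data_to_scale_dim: dict, dim0: int, dim1: int) -> dict:
    mapping = data_to_scale_dim.copy()
    if dim0 in mapping:
        mapping[dim1] = mapping.pop(dim0)   # tracked dim moves to its new position
    elif dim1 in mapping:
        mapping[dim0] = mapping.pop(dim1)
    return mapping

# transposing dims 1 and 2 of a (batch, out, in) layout moves the out-channel scale dim
assert remap_after_transpose({0: 0, 1: 1}, 1, 2) == {0: 0, 2: 1}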
+
+

 to_fbgemm_fp8 = FbgemmFp8Tensor.from_float
