class FbgemmFp8Tensor(TorchAOBaseTensor):
    """
+    Float8 Rowwise Quantized (weight) Tensor, with float8 rowwise dynamic quantization for activation.

    TODO: needs padding for cutlass kernels
+
+    Tensor Attributes:
+        float8_data: float8 raw data, dtype torchao.float8.config.e4m3_dtype
+        scale: the rowwise scale for the float8 Tensor
+        activation_scale_ub: upper bound for the activation scale, used during dynamic quantization of the activation
+
+    Non-Tensor Attributes:
+        dtype: Original Tensor dtype
    """

    tensor_data_attrs = ["float8_data", "scale", "activation_scale_ub"]
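A hedged sketch of what the attributes above typically hold for a 2D weight: rowwise float8 quantization keeps one scale per row plus the float8 payload. This is an illustration in plain PyTorch, not the fbgemm quantization path this tensor actually uses, and torch.float8_e4m3fn stands in for torchao.float8.config.e4m3_dtype:

import torch

def rowwise_fp8_quantize_sketch(w: torch.Tensor, float8_dtype=torch.float8_e4m3fn):
    # one scale per row, chosen so the row's max magnitude maps to the float8 max (448 for e4m3fn)
    max_val = torch.finfo(float8_dtype).max
    scale = w.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / max_val  # (N, 1)
    float8_data = (w / scale).clamp(-max_val, max_val).to(float8_dtype)    # (N, K)
    return float8_data, scale

w = torch.randn(4096, 1024)
float8_data, scale = rowwise_fp8_quantize_sketch(w)
print(float8_data.dtype, scale.shape)  # torch.float8_e4m3fn torch.Size([4096, 1])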
@@ -40,7 +49,9 @@ def __new__(cls, float8_data, scale, activation_scale_ub, dtype):
        kwargs["requires_grad"] = False
        return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)  # type: ignore[attr-defined]

-    def __init__(self, float8_data, scale, activation_scale_ub, dtype):
+    def __init__(
+        self, float8_data, scale, activation_scale_ub, dtype
+    ):
        self.float8_data = float8_data
        self.scale = scale
        self.activation_scale_ub = activation_scale_ub
@@ -85,6 +96,47 @@ def to(self, *args, **kwargs):
            self.dtype,
        )

+    def _transpose_and_reshape(self):
+        """This is added for resharding support, since the resharding logic for the model we are
+        working with only supports 2D
+        """
+        assert len(self.shape) == 3, (
+            f"Only expected to be used when the Tensor is 3D, got {len(self.shape)}"
+        )
+        dim0, dim1, dim2 = self.shape
+        # because we first transpose the weight before quantization, we'll recover the original shape
+        # by swapping dim1 and dim2
+        original_shape = (dim0, dim2, dim1)
+        # we must save this as 2D in the state dict, since the loading code expects 2D weights
+        new_shape = (-1, original_shape[-1])
+        float8_data = self.float8_data
+        float8_data = float8_data.transpose(1, 2).reshape(*new_shape).contiguous()
+        scale = self.scale.transpose(1, 2).reshape(*new_shape).contiguous()
+        return self.__class__(
+            float8_data,
+            scale,
+            self.activation_scale_ub,
+            self.dtype,
+        )
+
+    def _unflatten(self, num_experts):
+        """This is added for resharding support, since the resharding logic for the model we are
+        working with only supports 2D
+        """
+        float8_data = self.float8_data
+        scale = self.scale
+        dim0, dim1 = self.shape
+        float8_data = float8_data.unflatten(0, (num_experts, -1)).squeeze(dim=0)
+        scale = scale.unflatten(0, (num_experts, -1)).squeeze(dim=0)
+        dim0, dim1, dim2 = float8_data.shape
+
+        return self.__class__(
+            float8_data,
+            scale,
+            self.activation_scale_ub,
+            self.dtype,
+        )
+
    @classmethod
    def from_float(
        cls,
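For intuition about the two resharding helpers added above, here is a minimal shape-only sketch on plain float tensors (stand-ins for the float8 payload; the shapes and the round trip shown are illustrative assumptions, not code from this PR):

import torch

num_experts, N, K = 4, 128, 256
# 3D expert weight payload, analogous to float8_data with shape (E, N, K)
float8_data = torch.randn(num_experts, N, K)

# _transpose_and_reshape: swap the last two dims, then flatten to 2D for the state dict
flat = float8_data.transpose(1, 2).reshape(-1, N).contiguous()    # (E * K, N)

# _unflatten: recover the expert dimension from the 2D view
# (the squeeze is a no-op unless num_experts == 1, matching the helper above)
recovered = flat.unflatten(0, (num_experts, -1)).squeeze(dim=0)   # (E, K, N)

print(flat.shape, recovered.shape)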
@@ -106,8 +158,10 @@ def from_float(
        else:
            w = w.t()

-        wq, w_scale = torch.ops.triton.quantize_fp8_row(w)
-        # wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
+        wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
+        # add a last dimension for per row quantization to align the rank of
+        # w_scale and wq
+        w_scale = w_scale.unsqueeze(-1).contiguous()
        dtype = w.dtype
        del w
        return FbgemmFp8Tensor(
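To make the rank-alignment comment concrete: for a 2D weight, per-row quantization yields one scale per row (rank 1), and unsqueeze(-1) gives it a trailing singleton dim so it lines up with wq. A small sketch with made-up shapes, not the fbgemm op itself:

import torch

wq = torch.randn(4096, 1024).to(torch.float8_e4m3fn)  # stand-in for the quantized weight, one row per output channel
w_scale = torch.rand(4096)                             # one scale per row, rank 1

w_scale = w_scale.unsqueeze(-1).contiguous()           # shape (4096, 1), same rank as wq
# the trailing dim lets a rowwise dequant broadcast cleanly:
w_dequant = wq.to(torch.float32) * w_scale             # (4096, 1024)
print(w_dequant.shape)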
@@ -133,18 +187,18 @@ def _(func, types, args, kwargs):
    # not used
    num_tokens = torch.empty([input_tensor.size(0)], device=input_tensor.device)
-    xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
+    a_data, a_scale = torch.ops.fbgemm.quantize_fp8_per_row(
        input_tensor, num_tokens, weight_tensor.activation_scale_ub
    )

-    a_data = xq
    b_data = weight_tensor.float8_data
+    b_scale = weight_tensor.scale.squeeze(-1)

    res = torch.ops.fbgemm.f8f8bf16_rowwise(
        a_data,
        b_data,
-        x_scale,
-        weight_tensor.scale,
+        a_scale,
+        b_scale,
        use_fast_accum=True,
    )
    res = res.reshape(*orig_act_size[:-1], orig_out_features)
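For reference, a hedged plain-PyTorch sketch of the math that a rowwise-scaled fp8 GEMM like f8f8bf16_rowwise computes: dequantize each operand with its per-row scale, matmul, emit bf16. Names, shapes, and the (N, K) weight layout are assumptions for illustration; the real kernel fuses all of this:

import torch

def rowwise_f8f8_matmul_reference(a_data, a_scale, b_data, b_scale):
    # dequantize activation (M, K) and weight (N, K) with their per-row scales
    a = a_data.to(torch.float32) * a_scale.unsqueeze(-1)
    b = b_data.to(torch.float32) * b_scale.unsqueeze(-1)
    # matmul in fp32, return bf16 like the fused kernel
    return (a @ b.t()).to(torch.bfloat16)

a_data = torch.randn(8, 64).to(torch.float8_e4m3fn)
b_data = torch.randn(16, 64).to(torch.float8_e4m3fn)
a_scale, b_scale = torch.rand(8), torch.rand(16)
print(rowwise_f8f8_matmul_reference(a_data, a_scale, b_data, b_scale).shape)  # torch.Size([8, 16])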
@@ -163,19 +217,21 @@ def _(func, types, args, kwargs):
    orig_act_size = input_tensor.size()
    # not used
    num_tokens = torch.empty([input_tensor.size(0)], device=input_tensor.device)
-    xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
+    a_data, a_scale = torch.ops.fbgemm.quantize_fp8_per_row(
        input_tensor, num_tokens, weight_tensor.activation_scale_ub
    )

-    a_data = xq
    b_data = weight_tensor.float8_data
+    b_scale = weight_tensor.scale.squeeze(-1)
+    assert b_data.is_contiguous(), "weight for bmm must be contiguous"
+
    orig_out_features = b_data.shape[-2]

    res = torch.ops.fbgemm.f8f8bf16_rowwise_batched(
        a_data,
        b_data,
-        x_scale,
-        weight_tensor.scale,
+        a_scale,
+        b_scale,
    )
    res = res.reshape(*orig_act_size[:-1], orig_out_features)
    return res
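The bmm path follows the same pattern per batch entry; a hedged batched analogue of the sketch above, with the (B, N, K) weight layout inferred from orig_out_features = b_data.shape[-2] (illustrative only):

import torch

def rowwise_f8f8_bmm_reference(a_data, a_scale, b_data, b_scale):
    # per-batch rowwise dequant, then batched matmul: (B, M, K) x (B, N, K) -> (B, M, N)
    a = a_data.to(torch.float32) * a_scale.unsqueeze(-1)
    b = b_data.to(torch.float32) * b_scale.unsqueeze(-1)
    return torch.bmm(a, b.transpose(-2, -1)).to(torch.bfloat16)

a_data = torch.randn(2, 8, 64).to(torch.float8_e4m3fn)
b_data = torch.randn(2, 16, 64).to(torch.float8_e4m3fn)
a_scale, b_scale = torch.rand(2, 8), torch.rand(2, 16)
print(rowwise_f8f8_bmm_reference(a_data, a_scale, b_data, b_scale).shape)  # torch.Size([2, 8, 16])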
@@ -269,6 +325,52 @@ def _(func, types, args, kwargs):
    )


+@implements(aten.cat.default)
+def _(func, types, args, kwargs):
+    tensors, dim = fill_defaults(args, 2, [[], 0])
+    tensor_0 = tensors[0]
+    if dim < 0:
+        dim = tensor_0.ndim + dim
+
+    for i in range(1, len(tensors)):
+        assert tensor_0.float8_data.ndim == tensors[i].float8_data.ndim
+        assert tensor_0.scale.ndim == tensors[i].scale.ndim
+        assert tensor_0.activation_scale_ub == tensors[i].activation_scale_ub
+
+    float8_datas = [t.float8_data for t in tensors]
+    scales = [t.scale for t in tensors]
+
+    # with rowwise quantization, the dimension of float8_data and the
+    # original shape will be the same, so the original dim argument applies
+    # to float8_data
+    cat_float8_data = aten.cat.default(float8_datas, dim)
+
+    if dim != 2:
+        cat_scale = aten.cat.default(scales, dim=dim)
+    else:
+        cat_scale = scales[0]
+
+    new = tensor_0.__class__(
+        cat_float8_data,
+        cat_scale,
+        tensor_0.activation_scale_ub,
+        tensor_0.dtype,
+    )
+    return return_and_correct_aliasing(func, args, kwargs, new)
+
+
+@implements(aten.transpose.int)
+def _(func, types, args, kwargs):
+    self, dim0, dim1 = args
+    float8_data = self.float8_data.transpose(dim0, dim1).contiguous()
+    scale = self.scale.transpose(dim0, dim1).contiguous()
+
+    new = self.__class__(
+        float8_data, scale, self.activation_scale_ub, self.dtype
+    )
+    return return_and_correct_aliasing(func, args, kwargs, new)
+
+
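A short sketch of the scale handling in the cat handler above, on plain tensors with made-up (E, N, K) shapes: concatenating along dims other than the last grows the set of rows, so the scales are concatenated too; concatenating along the last dim keeps the rows fixed, so a single shared per-row scale is reused (assuming the pieces were quantized with the same rowwise scales):

import torch

d0, d1 = torch.randn(2, 4, 8), torch.randn(2, 4, 8)   # stand-ins for float8_data, shape (E, N, K)
s0, s1 = torch.rand(2, 4, 1), torch.rand(2, 4, 1)      # rowwise scales, shape (E, N, 1)

# cat along dim 0 (or 1): more rows, so the scales are concatenated as well
cat_data_dim0 = torch.cat([d0, d1], dim=0)   # (4, 4, 8)
cat_scale_dim0 = torch.cat([s0, s1], dim=0)  # (4, 4, 1)

# cat along dim 2: same rows, so one per-row scale still covers the result
cat_data_dim2 = torch.cat([d0, d1], dim=2)   # (2, 4, 16)
cat_scale_dim2 = s0                          # shared rowwise scale is reused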
to_fbgemm_fp8 = FbgemmFp8Tensor.from_float