
Commit 0bef87a

jwfromm authored and facebook-github-bot committed
Utilities for slicing preshuffled tensors (#4396)
Summary:
X-link: facebookresearch/FBGEMM#1467

Pull Request resolved: #4396

Some integrations of fbgemm kernels and OSS systems like VLLM would be made simpler by the ability to slice preshuffled tensors. Prior to this diff, there were two blockers to doing that:

- Scales were required to be contiguous. This is easily addressed by more carefully setting the stride argument.
- Shuffled tensors have a non-trivial layout. We add a Python helper function for slicing int4 shuffled tensors. Notably, it involves some data copying that I believe is unavoidable. Hopefully it only needs to be done during model setup.

Reviewed By: jiawenliu64, jianyuh

Differential Revision: D77239566

fbshipit-source-id: ad8eea5eb153f851f1b1e297a566fd36c0ac6409
1 parent f22f361 commit 0bef87a
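The commit message notes that slicing should only need to happen during model setup. Below is a minimal sketch of what that setup-time call might look like; the import path, shapes, and sharding scheme are assumptions made for illustration, and the int8 tensor is only a stand-in for a real preshuffled int4 weight.

import torch

# NOTE: the import path below is an assumption based on the file touched in
# this commit; adjust it to wherever shuffle_slice is exposed in your build.
from fbgemm_gpu.experimental.gen_ai.quantize import shuffle_slice

# Stand-in for a preshuffled int4 weight packed as [N, K // 2] int8 bytes.
N, K = 4096, 8192
wq_packed = torch.randint(-128, 127, (N, K // 2), dtype=torch.int8)

# Hypothetical tensor-parallel sharding done once at model load time: take
# this rank's rows of the output dimension instead of re-slicing per forward.
tp_rank, tp_size = 0, 4
shard_rows = N // tp_size
wq_shard = shuffle_slice(
    wq_packed, dim=0, start=tp_rank * shard_rows, length=shard_rows, dtype="fp8"
)
print(wq_shard.shape)  # torch.Size([1024, 4096])

Because the helper materializes a contiguous copy internally, each slice is a fresh allocation, which is why doing it once at setup rather than per step matters.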

File tree

3 files changed: +59 -8 lines changed


fbgemm_gpu/experimental/gen_ai/gen_ai/quantize.py

Lines changed: 45 additions & 0 deletions
@@ -181,6 +181,51 @@ def _quantize(
     return wq, scales
 
 
+def shuffle_slice(
+    x: torch.Tensor, dim: int, start: int, length: int, dtype: str = "fp8"
+) -> torch.Tensor:
+    """
+    Helper function to slice a preshuffled int4 tensor. This is needed since the shuffling
+    reorders rows based on the size of the input. Slicing a tensor shuffled for a larger input
+    is no longer valid. We must reorder the tensor to the appropriate size then slice.
+    Args:
+        x (Tensor): [N, K // 2] Preshuffled int4 tensor.
+        dim (int): Dimension to slice.
+        start (int): Start of slice.
+        length (int): Number of elements to slice in the original [N, K] dimension.
+        dtype (str): Type of corresponding activations. Must be fp8 or bf16.
+    Returns:
+        sliced (Tensor): [stop-start, K // 2] Sliced tensor.
+    """
+    # Get the size of the input tensor.
+    assert dim in [x.ndim - 2, x.ndim - 1], "Only slicing along N or K is supported."
+    assert length % 16 == 0, "Slicing must be a multiple of 16."
+    orig_shape = x.shape
+    N = x.shape[-2]
+    K = x.shape[-1]
+    # Tile shape is based on the activation dtype.
+    assert dtype in ("fp8", "bf16"), "Only fp8 and bf16 activations supported."
+    # Handle slice along M
+    if dim == x.ndim - 2:
+        tile_shape = 8 if dtype == "fp8" else 16
+        block_size = N // length
+        # View the shape in terms of shuffled tiles then permute to allow slicing.
+        x_s = x.view(-1, tile_shape, block_size, length // tile_shape, K)
+        x_s = x_s.permute(0, 2, 1, 3, 4).contiguous().view(-1, N, K)
+        out_slice = x_s.narrow(1, start, length)
+        # Reshape back to original shape.
+        return out_slice.view(*orig_shape[:-2], length, K)
+    # Handle slice along K
+    else:
+        outer_dim = x.view(-1, N, K).shape[0]
+        x_s = x.view(outer_dim, -1, length // 2)
+        row_factor = x_s.shape[1] * (length // 2) // K
+        # Take slices of rows corresponding to column slice.
+        return x_s.narrow(1, start * 2 * K // length, row_factor).view(
+            *orig_shape[:-2], N, length // 2
+        )
+
+
 def scale_nvfp4_quant(
     input: torch.Tensor, input_global_scale: torch.Tensor
 ) -> Tuple[torch.Tensor, torch.Tensor]:
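As a quick, illustrative shape check for the K branch above (sizes are made up): `length` is given in units of the original [N, K] weight, while the packed input and the returned slice both carry K // 2 byte columns, so slicing 2048 original columns yields 1024 packed columns.

import torch

# Import path is an assumption; see the note in the earlier sketch.
from fbgemm_gpu.experimental.gen_ai.quantize import shuffle_slice

# Packed stand-in weight: original K = 8192 -> K // 2 = 4096 packed columns.
N, K = 4096, 8192
wq_packed = torch.randint(-128, 127, (N, K // 2), dtype=torch.int8)

# Slice the first 2048 elements of the *original* K dimension. The result
# keeps all N rows but only length // 2 = 1024 packed columns.
k_slice = shuffle_slice(wq_packed, dim=1, start=0, length=2048)
print(k_slice.shape)  # torch.Size([4096, 1024])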

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/bf16i4bf16.cu

Lines changed: 7 additions & 2 deletions
@@ -260,8 +260,13 @@ at::Tensor bf16i4bf16_dispatch(
       "and be contiguous on GPU.");
   // Make sure group scales and zeros are in proper format.
   TORCH_CHECK(
-      w_scale_group.dim() == 2 && w_scale_group.size(1) == N,
-      "Group scales are expected to have shape [num_groups, N].");
+      w_scale_group.dim() == 2 && w_scale_group.size(1) == N &&
+          w_scale_group.is_cuda() && w_scale_group.is_contiguous(),
+      "Group scales are expected to have shape [num_groups, N] and be contiguous on GPU.");
+  TORCH_CHECK(
+      w_zero_group.dim() == 2 && w_zero_group.size(1) == N &&
+          w_zero_group.is_cuda() && w_zero_group.is_contiguous(),
+      "Group zeros are expected to have shape [num_groups, N] and be contiguous on GPU.");
 
   // Allocate output or return an empty tensor if input is empty.
   if (M == 0 || N == 0 || K == 0) {

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8i4bf16_shuffled.cu

Lines changed: 7 additions & 6 deletions
@@ -278,17 +278,18 @@ at::Tensor f8i4bf16_shuffled(
       "and be contiguous on GPU.");
   TORCH_CHECK(
       x_scale.numel() == M && x_scale.dtype() == at::kFloat &&
-          x_scale.is_cuda(),
-      "x_scale must be fp32 and have M total elements.");
+          x_scale.is_cuda() && x_scale.is_contiguous(),
+      "x_scale must be fp32 and have M total elements and be contiguous.");
   TORCH_CHECK(
       w_scale.numel() == N && w_scale.dtype() == at::kFloat &&
-          w_scale.is_cuda(),
-      "Weight row scale should have N elements and be on GPU.");
+          w_scale.is_cuda() && w_scale.is_contiguous(),
+      "Weight row scale should have N elements and be contiguous on GPU.");
   // Make sure w_scale_group is in proper format.
   TORCH_CHECK(
       w_scale_group.dtype() == at::kFloat8_e4m3fn && w_scale_group.dim() == 3 &&
-          w_scale_group.size(1) == 8 && w_scale_group.size(2) == N,
-      "Weights and group scales must be prepacked with preshuffle_i4. "
+          w_scale_group.size(1) == 8 && w_scale_group.size(2) == N &&
+          w_scale_group.is_contiguous(),
+      "Weights and group scales must be contiguous and prepacked with preshuffle_i4. "
       "Group scales are expected to be FP8 and have shape [num_groups, 8, N].");
 
   // Allocate output or return an empty tensor if input is empty.
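Both CUDA changes above tighten the argument checks to also require contiguous scale (and zero) tensors. A caller that shards group scales itself typically ends up with a non-contiguous view and must materialize it first; here is a minimal caller-side sketch of that pattern, with made-up sizes and an assumed sharding choice.

import torch

# Group scales in the layout f8i4bf16_shuffled expects: [num_groups, 8, N], FP8.
num_groups, N = 64, 4096
w_scale_group = torch.randn(num_groups, 8, N).to(torch.float8_e4m3fn)

# Slicing the N dimension yields a non-contiguous view ...
shard = w_scale_group[:, :, : N // 4]
print(shard.is_contiguous())  # False

# ... so it has to be copied into a contiguous buffer before being handed to
# the kernel, or the new TORCH_CHECK will fail.
shard = shard.contiguous()
print(shard.is_contiguous())  # True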
