
Commit 5938645

jwfromm authored and facebook-github-bot committed

Groupwise quantization kernel (#4439)

Summary:
Pull Request resolved: #4439
X-link: facebookresearch/FBGEMM#1503

When doing groupwise quantization, we previously used the blockwise kernel `quantize_fp8_block(w, block_m=1, block_k=128)`. However, this is quite inefficient because the blockwise kernel needs to use a 2D grid. We can be much faster using one thread per row and iterating over groups within that row. This diff introduces a bespoke groupwise quantization kernel that is dramatically faster than the flattened block approach.

Reviewed By: jiawenliu64

Differential Revision: D77689544

fbshipit-source-id: 8059e73d2794f70d9c1f995c908ca036f4cb1680
1 parent 8feae04 commit 5938645
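
For context, the call-site change (see the benchmark diff at the bottom) boils down to swapping the flattened blockwise call for the new groupwise entry point. A minimal sketch of the two paths, assuming a CUDA device and that both functions are importable from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm; the tensor shape is an arbitrary example, not from the commit:

import torch
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
    quantize_fp8_block,
    quantize_fp8_group,
)

w = torch.randn(4096, 7168, dtype=torch.bfloat16, device="cuda")  # example shape

# Before: groupwise scales emulated with 1 x 128 blocks (one program per block).
wq_old, ws_old = quantize_fp8_block(w, block_m=1, block_k=128)

# After: dedicated groupwise kernel (one program per row, looping over groups).
wq_new, ws_new = quantize_fp8_group(w, group_size=128)

# Both return an fp8 tensor with w's shape and a [4096, 7168 // 128] = [4096, 56]
# float32 reciprocal scale, one entry per 128-element group of each row.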

File tree: 3 files changed (+259, -4 lines)


fbgemm_gpu/experimental/gemm/test/fp8_gemm_test.py

Lines changed: 32 additions & 0 deletions
@@ -21,6 +21,7 @@
     matmul_fp8_block,
     matmul_fp8_row,
     quantize_fp8_block,
+    quantize_fp8_group,
     # packed_row unpacks the values, packed_row_raw returns just the packed tensor
     quantize_fp8_packed_row,
     quantize_fp8_packed_row_raw,
@@ -517,6 +518,37 @@ def _quantize_matmul_fp8(
             (3, 4, 5), torch.device("cuda"), use_bias=False
         )
 
+    def test_quantize_fp8_group(self) -> None:
+        def _test_quantize_fp8_group(
+            shape: Tuple[int, int],
+            group_size: int,
+            use_scale_ub: bool = False,
+        ) -> None:
+            M, K = shape
+            a = torch.randn(M, K, dtype=torch.float, device="cuda")
+
+            scale_ub = (
+                torch.tensor([1200], dtype=torch.float, device="cuda")
+                if use_scale_ub
+                else None
+            )
+
+            a_fp8, a_scale = quantize_fp8_group(a, group_size, scale_ub=scale_ub)
+
+            a_torch = a_fp8.to(torch.float)
+
+            # Undo scaling.
+            a_torch = a_torch.view(-1, K // group_size, group_size) * a_scale.unsqueeze(
+                -1
+            )
+            a_torch = a_torch.view(M, K)
+
+            self.assertTrue(torch.allclose(a, a_torch, atol=2e-1, rtol=5e-2))
+
+        _test_quantize_fp8_group((128, 128), 128)
+        _test_quantize_fp8_group((1, 256), 64)
+        _test_quantize_fp8_group((2, 384), 128, use_scale_ub=True)
+
     def test_quantize_fp8_block(self) -> None:
         def _test_quantize_fp8_block(
             shape: Tuple[int, int],
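
As a quick shape check mirroring the (2, 384) test case above (a sketch, not part of the commit): the returned reciprocal scale has one entry per 128-wide group of each row, and dequantization follows the same view/unsqueeze pattern the test uses.

import torch
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_group

M, K, group_size = 2, 384, 128          # K // group_size = 3 groups per row
a = torch.randn(M, K, dtype=torch.float, device="cuda")

a_fp8, a_scale = quantize_fp8_group(a, group_size)
assert a_fp8.shape == (M, K)                   # fp8 payload, same shape as input
assert a_scale.shape == (M, K // group_size)   # one reciprocal scale per group

# Undo the per-group scaling to approximately recover the input.
a_dq = (
    a_fp8.to(torch.float).view(M, K // group_size, group_size)
    * a_scale.unsqueeze(-1)
).view(M, K)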

fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py

Lines changed: 225 additions & 1 deletion
@@ -3111,7 +3111,7 @@ def triton_quantize_fp8_block(
     M, K = x.shape
     grid_m = triton.cdiv(M, block_m)
     grid_k = triton.cdiv(K, block_k)
-    x_scale = torch.ones((grid_m, grid_k), device=x.device, dtype=torch.float32)
+    x_scale = torch.empty((grid_m, grid_k), device=x.device, dtype=torch.float32)
     x_fp8 = torch.empty((M, K), device=x.device, dtype=pt_dtype)
 
     _kernel_quantize_fp8_block[(grid_m * grid_k,)](
@@ -3222,6 +3222,230 @@ def quantize_fp8_block(
     return x_fp8.view(x_shape), 1 / x_scale  # pyre-ignore
 
 
+@triton.autotune(
+    configs=[
+        Config({"GROUP_LOAD": 2}),
+        Config({"GROUP_LOAD": 4}),
+        Config({"GROUP_LOAD": 8}),
+        Config({"GROUP_LOAD": 16}),
+        Config({"GROUP_LOAD": 32}),
+    ],
+    key=["K"],
+)
+@triton.jit
+def _kernel_quantize_fp8_group(
+    A,
+    A_scale,
+    A_fp8,
+    scale_ub,
+    M,
+    K,
+    stride_am,
+    stride_ak,
+    stride_om,
+    stride_ok,
+    stride_a_scale_m,
+    stride_a_scale_k,
+    TL_FP8_DTYPE: tl.constexpr,
+    MAX_FP8: tl.constexpr,
+    EPS: tl.constexpr,
+    CLAMP_MAX: tl.constexpr,
+    USE_INT64: tl.constexpr,
+    GROUP_SIZE: tl.constexpr,
+    GROUP_LOAD: tl.constexpr,
+):
+    """Quantize and scale each GROUP_SIZE chunk of each row.
+
+    Scale per group i is computed as 1 / (MAX_FP8 / max(abs(A[i:i+GROUP_SIZE]))).
+
+    Each kernel thread is responsible for one row and loads and processes a tunable
+    number of groups at once.
+
+    Args:
+        A (Tensor): [M, K] higher precision input tensor.
+        A_scale (Tensor): [M, cdiv(K, GROUP_SIZE)] reciprocal scale tensor per group.
+        A_fp8 (Tensor): [M, K] fp8 scaled tensor. A_fp8 = A * a_scale.
+        scale_ub (Tensor): [1] Maximum allowed value for scale.
+        M (int): Number of rows.
+        K (int): Number of columns.
+        stride_am (int): Stride of m dimension of A.
+        stride_ak (int): Stride of k dimension of A.
+        stride_om (int): Stride of m dimension of output.
+        stride_ok (int): Stride of k dimension of output.
+        stride_a_scale_m (int): Stride of m dimension of A_scale.
+        stride_a_scale_k (int): Stride of k dimension of A_scale.
+        TL_FP8_DTYPE (tl.dtype): Target fp8 datatype.
+        MAX_FP8 (float): Maximum expressible value for FP8.
+        EPS (float): Epsilon value for numerical stability.
+        CLAMP_MAX (bool): Whether to apply scale_ub.
+        USE_INT64 (bool): Whether to index using int64, which may be needed for large tensors.
+        GROUP_SIZE (int): Group size for K dimension of A_scale and kernel.
+        GROUP_LOAD (int): Number of groups to load and process simultaneously.
+    """
+    pid = tl.program_id(0)
+    if USE_INT64:
+        pid = pid.to(tl.int64)
+    # We load GROUP_LOAD * GROUP_SIZE elements at a time.
+    row_offset = pid * stride_am
+    out_offset = pid * stride_om
+    scale_row_offset = pid * stride_a_scale_m
+    k_offset = tl.arange(0, GROUP_LOAD * GROUP_SIZE)
+    scale_k_offset = tl.arange(0, GROUP_LOAD)
+    NUM_GROUPS: tl.constexpr = K // GROUP_SIZE
+
+    for k in range(0, tl.cdiv(K, (GROUP_LOAD * GROUP_SIZE))):
+        # Load groups of the input.
+        chunk_offset = k_offset + k * GROUP_LOAD * GROUP_SIZE
+        a = tl.load(
+            A + row_offset + chunk_offset * stride_ak, mask=chunk_offset < K, other=0.0
+        )
+        # View loaded chunk as a set of groups.
+        a_grouped = tl.reshape(a, [GROUP_LOAD, GROUP_SIZE])
+        # Reduce over groups.
+        group_max = tl.max(tl.abs(a_grouped), axis=1)
+        # Apply clamping if specified.
+        if CLAMP_MAX:
+            ub = tl.load(scale_ub)
+            group_max = tl.clamp(group_max, EPS, ub)
+        else:
+            group_max = tl.maximum(group_max, EPS)
+        # Scale and quantize.
+        a_scale = MAX_FP8 / group_max
+        scale_chunk_offset = scale_k_offset + k * GROUP_LOAD
+        tl.store(
+            A_scale + scale_row_offset + scale_chunk_offset * stride_a_scale_k,
+            1.0 / a_scale,
+            mask=scale_chunk_offset < NUM_GROUPS,
+        )
+        # Apply scale to input.
+        a_fp8 = a_grouped * a_scale[:, None]
+        # Clamp to FP8 range to avoid overflow.
+        a_fp8 = tl.clamp(a_fp8, -MAX_FP8, MAX_FP8).to(TL_FP8_DTYPE)
+        # Write to output.
+        tl.store(
+            A_fp8 + out_offset + chunk_offset * stride_ok,
+            tl.ravel(a_fp8),
+            mask=chunk_offset < K,
+        )
+
+
+def triton_quantize_fp8_group(
+    x: torch.Tensor,
+    group_size: int = 128,
+    scale_ub: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize a tensor to fp8 with group-wise scalings.
+
+    Scale per group i is computed as 1 / (MAX_FP8 / max(abs(x[i:i+group_size]))).
+
+    Args:
+        x (torch.Tensor): [M, K] higher precision input tensor.
+        group_size (int): Group size for K dimension of scale.
+        scale_ub: Maximum allowed value for scale.
+
+    Returns:
+        torch.Tensor: [M, K] fp8 scaled tensor.
+        torch.Tensor: [M, cdiv(K, group_size)] reciprocal scale tensor per group.
+    """
+    assert x.device != torch.device(
+        "cpu"
+    ), "Triton groupwise quantization not supported on cpu."
+    x_shape = x.shape
+    x = x.view(-1, x.size(-1))
+    pt_dtype, tl_dtype, max_fp8, eps = get_fp8_constants()
+    M, K = x.shape
+    k_groups = triton.cdiv(K, group_size)
+    x_scale = torch.empty((M, k_groups), device=x.device, dtype=torch.float32)
+    x_fp8 = torch.empty((M, K), device=x.device, dtype=pt_dtype)
+    _kernel_quantize_fp8_group[(M,)](
+        x,
+        x_scale,
+        x_fp8,
+        scale_ub,
+        M,
+        K,
+        x.stride(0),
+        x.stride(1),
+        x_fp8.stride(0),
+        x_fp8.stride(1),
+        x_scale.stride(0),
+        x_scale.stride(1),
+        TL_FP8_DTYPE=tl_dtype,
+        MAX_FP8=max_fp8,
+        EPS=eps,
+        CLAMP_MAX=scale_ub is not None,
+        USE_INT64=x.numel() > (2**32 - 1),
+        GROUP_SIZE=group_size,
+    )
+    return x_fp8.view(x_shape), x_scale
+
+
+def quantize_fp8_group(
+    x: torch.Tensor,
+    group_size: int = 128,
+    scale_ub: Optional[torch.Tensor] = None,
+    use_triton: bool = True,
+    output_device: Optional[torch.device] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize a tensor to fp8 with group-wise scalings and optionally move to output device.
+
+    Scale per group i is computed as 1 / (MAX_FP8 / max(abs(x[i:i+group_size]))).
+
+    Args:
+        x (Tensor): [M, K] higher precision input tensor.
+        group_size (int): Group size for K dimension of scale.
+        scale_ub: Maximum allowed value for scale.
+        use_triton (bool): Whether to use triton kernel or pytorch.
+        output_device (torch.device): Device to optionally move the scaled tensors to.
+
+    Returns:
+        torch.Tensor: [M, K] fp8 scaled tensor.
+        torch.Tensor: [M, cdiv(K, group_size)] reciprocal scale tensor per group.
+    """
+    x_shape = x.shape
+    x = x.view(-1, x.size(-1))
+    if x.device == torch.device("cpu"):
+        logger.info("Triton does not support cpu, falling back to torch ops.")
+        use_triton = False
+    if use_triton:
+        xq, x_scale = triton_quantize_fp8_group(x, group_size, scale_ub)
+        return xq.view(x_shape), x_scale
+    # else use pytorch implementation.
+    if not output_device:
+        output_device = x.device
+
+    # Get constants.
+    pt_dtype, _, max_fp8, eps = get_fp8_constants()
+
+    M, K = x.shape
+    assert (
+        K % group_size == 0
+    ), "K must be divisible by group_size for cpu implementation."
+    k_groups = triton.cdiv(K, group_size)
+    # View input as a collection of groups for reduction.
+    x_grouped = x.view(M, k_groups, group_size).to(torch.float32)
+    # Reduce over groups.
+    group_max = x_grouped.abs().amax(dim=2)
+    # Apply clamping.
+    group_max = (
+        torch.clamp(group_max, min=eps, max=scale_ub.item())
+        if scale_ub
+        else torch.clamp(group_max, min=eps)
+    )
+    x_scale = torch.empty((M, k_groups), dtype=torch.float32, device=output_device)
+    x_scale = max_fp8 / group_max  # pyre-ignore
+    # pyre-ignore[16]: Undefined attribute [16]
+    x_scale[x_scale == float("inf")] = 1.0
+    # pyre-ignore[16]: Undefined attribute [16]
+    x_fp8 = x.view(-1, k_groups, group_size) * x_scale.unsqueeze(2)
+    # Cast and move data to output device (for cpu weight loading).
+    x_fp8 = x_fp8.to(device=output_device, dtype=pt_dtype)
+    x_scale = x_scale.to(output_device)  # pyre-ignore
+    return x_fp8.view(x_shape), 1 / x_scale  # pyre-ignore
+
+
 def need_split_k(SIZE_M, SIZE_N, SIZE_K):
     return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024
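
Where the speedup claimed in the summary comes from, in rough numbers: the old path launches one small program per 1 x 128 block (a grid of grid_m * grid_k programs, as in the _kernel_quantize_fp8_block launch above), while _kernel_quantize_fp8_group launches one program per row and loops over GROUP_LOAD groups per iteration. A back-of-the-envelope sketch with an illustrative shape, not from the commit:

import math

M, K, group_size, GROUP_LOAD = 8192, 7168, 128, 8  # illustrative values

# Old path: quantize_fp8_block(x, block_m=1, block_k=128) launches
# grid_m * grid_k = M * ceil(K / 128) programs, one per tiny block.
blockwise_programs = math.ceil(M / 1) * math.ceil(K / group_size)  # 458,752

# New path: _kernel_quantize_fp8_group launches M programs, one per row;
# each loops ceil(K / (GROUP_LOAD * group_size)) times over wide chunks.
groupwise_programs = M                                             # 8,192
iters_per_program = math.ceil(K / (GROUP_LOAD * group_size))       # 7

print(blockwise_programs, groupwise_programs, iters_per_program)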

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 2 additions & 3 deletions
@@ -26,6 +26,7 @@
     matmul_fp8_block,
     matmul_fp8_row,
     quantize_fp8_block,
+    quantize_fp8_group,
     quantize_fp8_row,
     scale_fp8_row,
     triton_quantize_fp8_row,
@@ -1119,9 +1120,7 @@ def preprocess(self, x, w):
         return x, wq, w_scale, out
 
     def quantize(self, x, wq, w_scale, out):
-        xq, x_scale = quantize_fp8_block(x, block_m=1, block_k=128)
-        # Pretranspose scales to deepgemm format.
-        x_scale = get_col_major_tma_aligned_tensor(x_scale)
+        xq, x_scale = quantize_fp8_group(x, group_size=128)
         return xq, wq, x_scale, w_scale, out
 
     def compute(self, xq, wq, x_scale, w_scale, out):
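
A minimal micro-benchmark sketch (not part of the commit) for comparing the before/after quantize paths shown in this diff, assuming triton.testing.do_bench is available and both functions are importable; the shape is illustrative and timings are in milliseconds:

import torch
from triton.testing import do_bench
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
    quantize_fp8_block,
    quantize_fp8_group,
)

x = torch.randn(8192, 7168, dtype=torch.bfloat16, device="cuda")

# Old path: groupwise scales emulated via 1 x 128 blockwise quantization.
t_block = do_bench(lambda: quantize_fp8_block(x, block_m=1, block_k=128))
# New path: dedicated groupwise kernel, one program per row.
t_group = do_bench(lambda: quantize_fp8_group(x, group_size=128))
print(f"blockwise {t_block:.3f} ms vs groupwise {t_group:.3f} ms")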
