Commit 070068f

Add a_1_128_w_128_128 (DeepSeek style) float8 scaling for inference
Summary:

Basic enablement of the a_1_128_w_128_128 float8 scaling recipe in torchao inference. In detail:

1. bring the 128x128 gemm triton kernel we have out of prototype and wrap it with a custom op for `torch.compile` compatibility
2. enable the new granularity in various utility functions
3. wire the new granularity through the float8 inference configs
4. add a test which checks e2e numerical correctness via SQNR comparison against a high-precision baseline

For now I added a fallback which only requires triton and is numerically correct but may not reach optimal performance. Performance optimization is left for future PRs:

1. we should map the gemm to `torch._scaled_mm` for CUDA 12.9+
2. we should enable an fbgemm_gpu_genai path, if available in the user's environment
3. we should map to a triton kernel for quantizing the weights, since `torch.compile` is currently known to be slow for 128x128 block quantization

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: db464e1
ghstack-comment-id: 3460951962
Pull-Request: #3257
1 parent 841d104 commit 070068f
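As context for the summary above, here is a minimal usage sketch of the new recipe. This is not part of the commit's diff; the config name and `granularity` argument follow the existing torchao inference API as exercised by the test below, and the toy model is an illustrative assumption.

import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerBlock,
    quantize_,
)

# hypothetical toy model; any module with nn.Linear layers whose dims are
# multiples of 128 works the same way
model = torch.nn.Sequential(torch.nn.Linear(256, 128)).to(torch.bfloat16).to("cuda")

# activations scaled per (1, 128) block, weights scaled per (128, 128) block
config = Float8DynamicActivationFloat8WeightConfig(
    granularity=(PerBlock((1, 128)), PerBlock((128, 128))),
)
quantize_(model, config)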

File tree

3 files changed, +110 -31 lines changed

test/quantization/quantize_/workflows/float8/test_float8_tensor.py

Lines changed: 24 additions & 6 deletions
@@ -18,6 +18,7 @@
 from torchao.quantization import (
     Float8DynamicActivationFloat8WeightConfig,
     Float8WeightOnlyConfig,
+    PerBlock,
     PerRow,
     PerTensor,
     quantize_,
@@ -61,20 +62,37 @@ def setUp(self):
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
     )
-    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32])
-    @common_utils.parametrize("mode", ["dynamic", "weight-only"])
-    @common_utils.parametrize("compile", [True, False])
-    @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
+    # @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32])
+    @common_utils.parametrize(
+        "dtype",
+        [
+            torch.bfloat16,
+        ],
+    )
+    # @common_utils.parametrize("mode", ["dynamic", "weight-only"])
+    @common_utils.parametrize(
+        "mode",
+        [
+            "dynamic",
+        ],
+    )
+    # @common_utils.parametrize("compile", [True, False])
+    @common_utils.parametrize("compile", [False])
+    # @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
+    @common_utils.parametrize(
+        "granularity", [(PerBlock((1, 128)), PerBlock((128, 128)))]
+    )
     @common_utils.parametrize(
         "kernel_preference",
-        [KernelPreference.AUTO, KernelPreference.TORCH, KernelPreference.FBGEMM],
+        # [KernelPreference.AUTO, KernelPreference.TORCH, KernelPreference.FBGEMM],
+        [KernelPreference.TORCH],
     )
     # Inputs are (M,..), K, N
     @common_utils.parametrize(
         "sizes",
         [
             ((128,), 256, 128),
-            ((32, 128), 64, 256),
+            # ((32, 128), 64, 256),
         ],
     )
     def test_fp8_linear_variants(
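For reference, the SQNR comparison mentioned in the summary can be thought of as follows. This is a hedged sketch rather than the actual test body; `baseline_model`, `quantized_model`, and the 25 dB threshold are illustrative assumptions.

import torch

def sqnr_db(reference: torch.Tensor, candidate: torch.Tensor) -> torch.Tensor:
    # signal-to-quantization-noise ratio in decibels
    signal = torch.linalg.vector_norm(reference.float())
    noise = torch.linalg.vector_norm(reference.float() - candidate.float())
    return 20 * torch.log10(signal / noise)

x = torch.randn(128, 256, dtype=torch.bfloat16, device="cuda")
baseline_out = baseline_model(x)    # high-precision copy of the model
quantized_out = quantized_model(x)  # same model quantized with the new granularity
assert sqnr_db(baseline_out, quantized_out) > 25.0  # threshold chosen for illustration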

torchao/float8/inference.py

Lines changed: 52 additions & 16 deletions
@@ -14,6 +14,7 @@
 from torchao.float8.float8_utils import is_row_major, pad_tensor_for_matmul
 from torchao.float8.types import FP8Granularity
 from torchao.quantization.granularity import (
+    PerBlock,
     PerRow,
     PerTensor,
 )
@@ -196,6 +197,26 @@ def _is_tensorwise_scaled(x: torch.Tensor) -> bool:
     )
 
 
+def _is_1_128_scaled(x: torch.Tensor) -> bool:
+    """Checks if a quantized tensor is scaled with a block size of 1x128
+    Args:
+        x: quantized tensor (should have `block_size` attribute)
+    """
+    assert hasattr(x, "block_size"), "Expecting input to have `block_size` attribute"
+    b = x.block_size
+    return len(b) == 2 and b[0] == 1 and b[1] == 128
+
+
+def _is_128_128_scaled(x: torch.Tensor) -> bool:
+    """Checks if a quantized tensor is scaled with a block size of 128x128
+    Args:
+        x: quantized tensor (should have `block_size` attribute)
+    """
+    assert hasattr(x, "block_size"), "Expecting input to have `block_size` attribute"
+    b = x.block_size
+    return len(b) == 2 and b[0] == 128 and b[1] == 128
+
+
 def _normalize_granularity(
     granularity: Optional[
         Union[
@@ -211,22 +232,25 @@ def _normalize_granularity(
     elif isinstance(granularity, (PerTensor, PerRow)):
         processed_granularity = (granularity, granularity)
     elif isinstance(granularity, (tuple, list)) and len(granularity) == 2:
-        if not (
-            isinstance(granularity[0], (PerTensor, PerRow))
-            and isinstance(granularity[1], (PerTensor, PerRow))
-        ):
-            raise ValueError(
-                f"Invalid granularity types: {granularity}, only PerTensor or PerRow are supported."
-            )
+        is_per_tensor = isinstance(granularity[0], PerTensor) and isinstance(
+            granularity[1], PerTensor
+        )
+        is_per_row = isinstance(granularity[0], PerRow) and isinstance(
+            granularity[1], PerRow
+        )
+        is_a_1_128_w_128_128 = granularity[0] == PerBlock((1, 128)) and granularity[
+            1
+        ] == PerBlock((128, 128))
+
+        if not (is_per_tensor or is_per_row or is_a_1_128_w_128_128):
+            raise ValueError(f"Unsupported granularity types: {granularity}.")
         if not isinstance(granularity[0], type(granularity[1])):
             raise ValueError(
-                f"Different granularities for activation and weight are not supported: {granularity}, only PerTensor or PerRow are supported."
+                f"Different granularities for activation and weight are not supported: {granularity}."
             )
         processed_granularity = tuple(granularity)
     else:
-        raise ValueError(
-            f"Invalid granularity specification: {granularity}, only PerTensor or PerRow are supported."
-        )
+        raise ValueError(f"Invalid granularity specification: {granularity}.")
     return processed_granularity
 
 
@@ -243,12 +267,24 @@ def _check_hardware_support(
         AssertionError: If hardware doesn't support the requested granularity
         ValueError: If invalid granularity type is provided
     """
-    for _granularity in granularities:
-        if not isinstance(_granularity, (PerTensor, PerRow)):
-            raise ValueError(
-                f"Invalid granularity type: {_granularity}, only PerTensor or PerRow are supported."
-            )
+    is_per_tensor = isinstance(granularities[0], PerTensor) and isinstance(
+        granularities[1], PerTensor
+    )
+    is_per_row = isinstance(granularities[0], PerRow) and isinstance(
+        granularities[1], PerRow
+    )
+    is_a_1_128_w_128_128 = granularities[0] == PerBlock((1, 128)) and granularities[
+        1
+    ] == PerBlock((128, 128))
 
+    if is_per_tensor or is_per_row:
         assert is_sm_at_least_89() or is_MI300(), (
             "Float8 dynamic quantization requires CUDA compute capability ≥8.9 or MI300+."
         )
+    elif is_a_1_128_w_128_128:
+        # TODO(future PR): look into AMD support
+        assert is_sm_at_least_89(), (
+            "Float8 1x128 activation and 128x128 weight scaling requires CUDA compute capability ≥8.9."
+        )
+    else:
+        raise ValueError(f"Invalid granularities {granularities}.")
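A short sketch of how the relaxed validation above behaves (assuming the granularity classes compare by value, as torchao's dataclass-based granularities do; `_normalize_granularity` is the private helper changed in this diff):

from torchao.float8.inference import _normalize_granularity
from torchao.quantization.granularity import PerBlock, PerRow

# a single granularity is still broadcast to (activation, weight)
assert _normalize_granularity(PerRow()) == (PerRow(), PerRow())

# the new DeepSeek-style pair now passes validation unchanged
pair = (PerBlock((1, 128)), PerBlock((128, 128)))
assert _normalize_granularity(pair) == pair

# block shapes other than (1, 128) / (128, 128) are still rejected
try:
    _normalize_granularity((PerBlock((1, 64)), PerBlock((128, 128))))
except ValueError:
    pass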

torchao/quantization/quantize_/workflows/float8/float8_tensor.py

Lines changed: 34 additions & 9 deletions
@@ -15,13 +15,18 @@
 from torchao.float8.inference import (
     Float8MMConfig,
     FP8Granularity,
+    _is_1_128_scaled,
+    _is_128_128_scaled,
     _is_rowwise_scaled,
     _is_tensorwise_scaled,
     _slice_scale_for_dimension,
     addmm_float8_unwrapped_inference,
     preprocess_data,
     preprocess_scale,
 )
+from torchao.kernel.blockwise_quantization import (
+    blockwise_fp8_gemm,
+)
 from torchao.quantization.granularity import PerRow, PerTensor
 from torchao.quantization.quant_primitives import (
     _choose_scale_float8,
@@ -337,19 +342,39 @@ def _(func, types, args, kwargs):
                 "Input tensor must be rowwise block size"
             )
             w_scale = w_scale.transpose(-1, -2)
+        elif _is_128_128_scaled(weight_tensor):
+            assert _is_1_128_scaled(input_tensor), (
+                "input_tensor must be 1x128 scaled"
+            )
+            w_scale = w_scale.transpose(-1, -2)
 
         input_scale = preprocess_scale(input_scale, input_tensor.shape)
         inpt_data, w_data = preprocess_data(inpt_data, w_data.T, scaled_mm_config)
 
-        return addmm_float8_unwrapped_inference(
-            inpt_data,
-            input_scale,
-            w_data,
-            w_scale,
-            output_dtype=input_tensor.dtype,
-            bias=bias,
-            use_fast_accum=scaled_mm_config.use_fast_accum,
-        ).reshape(out_shape)
+        if _is_128_128_scaled(weight_tensor):
+            # TODO(before land): ensure fast_accum is False for blockwise
+            # TODO(future PR): add testing for torch._scaled_mm with
+            # blockwise scaling on CUDA 12.9
+            # TODO(future PR): add fbgemm_gpu_genai path if available
+            assert _is_1_128_scaled(input_tensor), "unsupported"
+            res = blockwise_fp8_gemm(
+                inpt_data,
+                input_scale,
+                w_data.t(),
+                w_scale,
+                block_size=128,
+            )
+        else:
+            res = addmm_float8_unwrapped_inference(
+                inpt_data,
+                input_scale,
+                w_data,
+                w_scale,
+                output_dtype=input_tensor.dtype,
+                bias=bias,
+                use_fast_accum=scaled_mm_config.use_fast_accum,
+            )
+        return res.reshape(out_shape)
     else:
         assert not isinstance(input_tensor, TorchAOBaseTensor), (
             "Expecting input_tensor to be unquantized"
