Commit 26ade98

Update
[ghstack-poisoned]
2 parents: 681277a + f76e10b

4 files changed: +25, -9

.github/workflows/1xL4_tests.yml

Lines changed: 1 addition & 0 deletions

@@ -51,3 +51,4 @@ jobs:
     pytest test/dtypes/test_affine_quantized_float.py --verbose -s
     ./test/float8/test_everything_single_gpu.sh
     python test/quantization/quantize_/workflows/float8/test_float8_tensor.py
+    python test/kernel/test_blockwise_triton.py --verbose -s

torchao/float8/inference.py

Lines changed: 12 additions & 6 deletions

@@ -217,6 +217,16 @@ def _is_128_128_scaled(x: torch.Tensor) -> bool:
     return len(b) == 2 and b[0] == 128 and b[1] == 128
 
 
+def _granularity_is_a_1_128_w_128_128(
+    g: Union[
+        FP8Granularity,
+        Tuple[FP8Granularity, FP8Granularity],
+        list[FP8Granularity],
+    ],
+) -> bool:
+    return len(g) == 2 and g[0] == PerBlock((1, 128)) and g[1] == PerBlock((128, 128))
+
+
 def _normalize_granularity(
     granularity: Optional[
         Union[
@@ -238,9 +248,7 @@ def _normalize_granularity(
     is_per_row = isinstance(granularity[0], PerRow) and isinstance(
         granularity[1], PerRow
     )
-    is_a_1_128_w_128_128 = granularity[0] == PerBlock((1, 128)) and granularity[
-        1
-    ] == PerBlock((128, 128))
+    is_a_1_128_w_128_128 = _granularity_is_a_1_128_w_128_128(granularity)
 
     if not (is_per_tensor or is_per_row or is_a_1_128_w_128_128):
         raise ValueError(f"Unsupported granularity types: {granularity}.")
@@ -273,9 +281,7 @@ def _check_hardware_support(
     is_per_row = isinstance(granularities[0], PerRow) and isinstance(
         granularities[1], PerRow
     )
-    is_a_1_128_w_128_128 = granularities[0] == PerBlock((1, 128)) and granularities[
-        1
-    ] == PerBlock((128, 128))
+    is_a_1_128_w_128_128 = _granularity_is_a_1_128_w_128_128(granularities)
 
     if is_per_tensor or is_per_row:
         assert is_sm_at_least_89() or is_MI300(), (
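
For illustration, a minimal standalone sketch of the extracted predicate; the PerBlock below is a simplified stand-in for torchao's granularity class (assumed to compare by block size), not the real implementation:

# Standalone sketch of the extracted predicate. This PerBlock is a
# simplified stand-in for torchao's granularity class (assumed to
# compare by block size); it is not the real implementation.
from dataclasses import dataclass
from typing import Sequence, Tuple


@dataclass(frozen=True)
class PerBlock:
    block_size: Tuple[int, int]


def _granularity_is_a_1_128_w_128_128(g: Sequence[PerBlock]) -> bool:
    # True only for the ordered pair: activation scaled per (1, 128)
    # block, weight scaled per (128, 128) block.
    return len(g) == 2 and g[0] == PerBlock((1, 128)) and g[1] == PerBlock((128, 128))


# The pair is ordered (activation first, weight second), so the
# reversed pair does not match.
assert _granularity_is_a_1_128_w_128_128([PerBlock((1, 128)), PerBlock((128, 128))])
assert not _granularity_is_a_1_128_w_128_128([PerBlock((128, 128)), PerBlock((1, 128))])

Factoring the comparison into one helper keeps _normalize_granularity and _check_hardware_support from drifting apart, since both now share a single definition of the blockwise pair.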

torchao/quantization/quant_api.py

Lines changed: 12 additions & 2 deletions

@@ -62,6 +62,7 @@
     Float8MMConfig,
     FP8Granularity,
     _check_hardware_support,
+    _granularity_is_a_1_128_w_128_128,
     _normalize_granularity,
 )
 from torchao.quantization.linear_activation_weight_observed_tensor import (
@@ -1770,13 +1771,22 @@ def __post_init__(self):
         torch._C._log_api_usage_once(
             "torchao.quantization.Float8DynamicActivationFloat8WeightConfig"
         )
-        if self.mm_config is None:
-            self.mm_config = Float8MMConfig(use_fast_accum=True)
         activation_granularity, weight_granularity = _normalize_granularity(
             self.granularity
         )
         self.granularity = [activation_granularity, weight_granularity]
 
+        default_use_fast_accum = True
+        if _granularity_is_a_1_128_w_128_128(self.granularity):
+            assert self.activation_value_lb is None, "unimplemented"
+            assert self.activation_value_ub is None, "unimplemented"
+            assert self.kernel_preference is KernelPreference.TORCH, "unimplemented"
+            assert self.mm_config is None, "unimplemented"
+            default_use_fast_accum = False
+
+        if self.mm_config is None:
+            self.mm_config = Float8MMConfig(use_fast_accum=default_use_fast_accum)
+
 
 # for bc
 float8_dynamic_activation_float8_weight = _ConfigDeprecationWrapper(
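
For illustration, a minimal standalone sketch of the new default-selection behavior; PerBlock and Float8MMConfig are simplified stand-ins for the torchao classes, and pick_mm_config is a hypothetical helper that isolates the logic rather than part of torchao:

# Standalone sketch of the new mm_config default. PerBlock and
# Float8MMConfig are simplified stand-ins for the torchao classes, and
# pick_mm_config is a hypothetical helper, not part of torchao.
from dataclasses import dataclass
from typing import Optional, Sequence, Tuple


@dataclass(frozen=True)
class PerBlock:
    block_size: Tuple[int, int]


@dataclass
class Float8MMConfig:
    use_fast_accum: bool = False


def pick_mm_config(
    granularity: Sequence[PerBlock], mm_config: Optional[Float8MMConfig]
) -> Float8MMConfig:
    # An explicit mm_config wins (for blockwise scaling the real
    # __post_init__ asserts it is None, since that path is unimplemented).
    if mm_config is not None:
        return mm_config
    # Blockwise (1, 128) x (128, 128) scaling now defaults to
    # use_fast_accum=False; every other granularity keeps the old
    # default of True.
    is_blockwise = (
        len(granularity) == 2
        and granularity[0] == PerBlock((1, 128))
        and granularity[1] == PerBlock((128, 128))
    )
    return Float8MMConfig(use_fast_accum=not is_blockwise)


blockwise = [PerBlock((1, 128)), PerBlock((128, 128))]
assert pick_mm_config(blockwise, None).use_fast_accum is False
assert pick_mm_config([PerBlock((1, 1))], None).use_fast_accum is True

In the real __post_init__, the blockwise path also asserts that activation_value_lb and activation_value_ub are unset and that kernel_preference is KernelPreference.TORCH, since other combinations are unimplemented.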

torchao/quantization/quantize_/workflows/float8/float8_tensor.py

Lines changed: 0 additions & 1 deletion

@@ -352,7 +352,6 @@ def _(func, types, args, kwargs):
     inpt_data, w_data = preprocess_data(inpt_data, w_data.T, scaled_mm_config)
 
     if _is_128_128_scaled(weight_tensor):
-        # TODO(before land): ensure fast_accum is False for blockwise
         # TODO(future PR): add testing for torch._scaled_mm with
         # blockwise scaling on CUDA 12.9
         # TODO(future PR): add fbgemm_gpu_genai path if available
