Commit 1d4a2f7

Commit message:
Update

[ghstack-poisoned]

2 parents: be5a9bb + 6a3684b

File tree: 1 file changed (+46, -0 lines)

test/quantization/test_quant_primitives.py

Lines changed: 46 additions & 0 deletions
@@ -14,9 +14,11 @@
     MappingType,
     ZeroPointDomain,
     _choose_qparams_affine_tinygemm,
+    _choose_scale_float8,
     _fake_quantize_affine,
     _fake_quantize_affine_cachemask,
     _maybe_expand_scale_to_tensor_shape,
+    _quantize_affine_float8,
     choose_qparams_affine,
     dequantize_affine,
     quantize_affine,
@@ -55,6 +57,23 @@ def check_idempotent(self, fn, *args, **kwargs):
         return output1
 
 
+# from https://github.com/pytorch/pytorch/blob/7563f61cc8a40a5ba21a498a2d98895b4eec3f39/test/test_scaled_matmul_cuda.py#L100
+# with scale modified to be the inverse of the version in PT core
+def _tensor_to_scale_block(
+    x: torch.Tensor,
+    float8_dtype: torch.dtype,
+    block_outer: int,
+    block_inner: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    x = x.unflatten(1, (-1, block_inner)).unflatten(0, (-1, block_outer))
+    amax = x.abs().amax(dim=[1, 3], keepdim=True).float()
+    scale = amax / torch.finfo(float8_dtype).max
+    x = x.div(scale).to(float8_dtype)
+    x = x.flatten(2, 3).flatten(0, 1)
+    scale = scale.flatten(2, 3).flatten(0, 1)
+    return x, scale
+
+
 # Legacy tinygemm ops
 def _get_groupwise_affine_qparams(
     w,
@@ -798,6 +817,33 @@ def test_maybe_expand_scale_to_tensor_shape(self):
         self.assertEqual(new_scale5.shape, torch.Size([3, 2, 8]))
         self.assertEqual(new_scale5.unique(dim=-1).shape, torch.Size([3, 2, 2]))
 
+    def test_float8_blockwise_scaling(self):
+        M, K = 512, 1024
+        hp_tensor = torch.randn(M, K, dtype=torch.float)
+        # make the scales from some of the blocks obviously different
+        hp_tensor[0:128, 0:128] *= 3.0
+        hp_tensor[0:128, 128:256] *= 7.0
+        hp_tensor[128:256, 0:128] *= 2.0
+        hp_tensor[128:256, 128:256] *= 100.0
+
+        block_size = (128, 128)
+
+        scale = _choose_scale_float8(
+            hp_tensor,
+            float8_dtype=torch.float8_e4m3fn,
+            block_size=block_size,
+            hp_value_lb=None,
+            hp_value_ub=None,
+        )
+        data = _quantize_affine_float8(hp_tensor, scale, torch.float8_e4m3fn)
+
+        ref_data, ref_scale = _tensor_to_scale_block(
+            hp_tensor, torch.float8_e4m3fn, 128, 128
+        )
+
+        torch.testing.assert_close(scale, ref_scale, atol=0, rtol=0)
+        torch.testing.assert_close(data.float(), ref_data.float(), atol=0, rtol=0)
+
 
 if __name__ == "__main__":
     unittest.main()
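
For context, here is a minimal standalone sketch (not part of the commit) of the blockwise float8 scaling scheme that the reference helper _tensor_to_scale_block computes and that the new test checks against _choose_scale_float8 / _quantize_affine_float8: split the tensor into 128x128 blocks, take each block's amax, derive scale = amax / finfo(float8_e4m3fn).max, and quantize by dividing. It uses only plain torch ops; the dequantization check at the end is an added illustration, not something the commit does, and this is not the torchao implementation itself.

# Illustration only: blockwise float8 (e4m3) scaling with plain torch ops.
import torch

M, K = 512, 1024
block_outer, block_inner = 128, 128
float8_dtype = torch.float8_e4m3fn

x = torch.randn(M, K, dtype=torch.float)

# View as (M // block_outer, block_outer, K // block_inner, block_inner) so
# each 128x128 block occupies dims 1 and 3.
blocks = x.unflatten(1, (-1, block_inner)).unflatten(0, (-1, block_outer))

# One scale per block: amax over the block dims, divided by the fp8 max value.
amax = blocks.abs().amax(dim=[1, 3], keepdim=True).float()
scale = amax / torch.finfo(float8_dtype).max

# Quantize by dividing each block by its scale, then restore the (M, K) layout.
data = blocks.div(scale).to(float8_dtype).flatten(2, 3).flatten(0, 1)

# Sanity-check the round trip: broadcast each block's scale back over its
# 128x128 region and dequantize; the error should be small relative to each
# block's magnitude (fp8 e4m3 rounding).
scale_2d = scale.squeeze(3).squeeze(1)  # (M // block_outer, K // block_inner)
scale_full = scale_2d.repeat_interleave(block_outer, dim=0).repeat_interleave(
    block_inner, dim=1
)
dequant = data.float() * scale_full
print((dequant - x).abs().max())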
