
Commit 9ef2f06

mx cast: remove clamping of output tensor for torch.compile path (#1911)
1 parent 44c5476 commit 9ef2f06

File tree

2 files changed: +64 -3 lines changed


test/prototype/mx_formats/test_mx_tensor.py

Lines changed: 52 additions & 0 deletions
@@ -334,3 +334,55 @@ def test_to_mx_inductor_single_kernel():
     to_mx_c = torch.compile(MXTensor.to_mx, fullgraph=True)
     out, code = run_and_get_code(to_mx_c, x, torch.float8_e4m3fn, block_size)
     FileCheck().check("def call(").check_count(".run(", 1, exactly=True).run(code[0])
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not is_sm_at_least_89(),
+    reason="float8 in triton requires CUDA capability 8.9 or greater",
+)
+def test_cast_to_float8_e4m3fn_saturation_behavior():
+    # TODO(#1912): make the saturated cast work in eager mode and remove this
+    # test
+    max_val = torch.finfo(torch.float8_e4m3fn).max
+
+    # create example data inside the representable range
+    data_in_range_bf16 = torch.tensor(
+        [
+            max_val,
+            -1 * max_val,
+        ],
+        dtype=torch.bfloat16,
+        device="cuda",
+    )
+
+    # create example data outside the representable range
+    data_out_of_range_bf16 = torch.tensor(
+        [
+            max_val * 2,
+            -1 * (max_val * 2),
+        ],
+        dtype=torch.bfloat16,
+        device="cuda",
+    )
+
+    # verify that in eager mode PyTorch casting to float8 is unsaturated
+    data_in_range_f8 = data_in_range_bf16.to(torch.float8_e4m3fn)
+    data_out_of_range_f8 = data_out_of_range_bf16.to(torch.float8_e4m3fn)
+    assert not torch.any(torch.isnan(data_in_range_f8))
+    assert torch.all(torch.isnan(data_out_of_range_f8))
+
+    # verify that in triton, casting to float8 is saturated
+    # for simplicity, use torch.compile to generate triton code
+    def to_f8(x):
+        x = x.to(torch.float8_e4m3fn)
+        return x
+
+    to_f8_c = torch.compile(to_f8)
+    data_in_range_f8_c = to_f8_c(data_in_range_bf16)
+    data_out_of_range_f8_c = to_f8_c(data_out_of_range_bf16)
+    assert not torch.any(torch.isnan(data_in_range_f8_c))
+    assert not torch.any(torch.isnan(data_out_of_range_f8_c))
+    torch.testing.assert_close(
+        data_in_range_f8_c, data_out_of_range_f8_c, atol=0, rtol=0
+    )

torchao/prototype/mx_formats/mx_tensor.py

Lines changed: 12 additions & 3 deletions
@@ -228,9 +228,18 @@ def to_mx(
         max_pos = F4_E2M1_MAX
     else:
         raise AssertionError("unsupported")
-    data_lp = torch.clamp(
-        data_hp / scale_fp32.unsqueeze(1), min=-1 * max_pos, max=max_pos
-    )
+    data_lp = data_hp / scale_fp32.unsqueeze(1)
+    if (
+        elem_dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
+        and not torch._dynamo.is_compiling()
+    ):
+        # As of 20250317, the PyTorch eager mode cast to `torch.float8_e4m3fn`
+        # is unsaturated, while the same cast in triton is saturated. When
+        # compiling to triton, this clamp is therefore redundant, and removing
+        # it gives a speedup in compute bound cases.
+        # TODO(#1912): make the saturated cast work in eager mode and remove
+        # this workaround.
+        data_lp = torch.clamp(data_lp, min=-1 * max_pos, max=max_pos)

     # cast to target dtype
     if elem_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
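For context, a minimal sketch of exercising the two cast paths this commit distinguishes, mirroring the to_mx call pattern used in test_to_mx_inductor_single_kernel above. The import path follows the file location in this diff; the input shape and block_size value are illustrative assumptions, not part of this commit:

import torch

# assumption: MXTensor is importable from the module changed in this commit
from torchao.prototype.mx_formats.mx_tensor import MXTensor

# illustrative inputs; shape and block_size are assumed values for this sketch
x = torch.randn(128, 128, dtype=torch.bfloat16, device="cuda")
block_size = 32

# eager path: to_mx still applies the explicit clamp, because the eager
# cast to torch.float8_e4m3fn is unsaturated
mx_eager = MXTensor.to_mx(x, torch.float8_e4m3fn, block_size)

# compiled path: the triton cast emitted by inductor is saturated, so to_mx
# skips the now-redundant clamp
to_mx_c = torch.compile(MXTensor.to_mx, fullgraph=True)
mx_compiled = to_mx_c(x, torch.float8_e4m3fn, block_size)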

0 commit comments
