Update: reformat the rowwise-scaling condition in addmm_float8_unwrapped across multiple lines

lw · lw · commit 7debcd9db226 · 2024-11-22T10:02:34.000Z
[ghstack-poisoned]
diff --git a/torchao/float8/float8_python_api.py b/torchao/float8/float8_python_api.py
@@ -38,7 +38,11 @@ def addmm_float8_unwrapped(
     b_inverse_scale = b_scale.reciprocal()
 
     post_inverse_scale = None
-    if a_scale.shape == (a_data.shape[0], 1) and b_scale.shape == (1, b_data.shape[1]) and not use_fast_accum:
+    if (
+        a_scale.shape == (a_data.shape[0], 1)
+        and b_scale.shape == (1, b_data.shape[1])
+        and not use_fast_accum
+    ):
         # The rowwise CUTLASS-based kernel is so slow without fast-accum that
         # we'd rather use the tensorwise cuBLAS-based kernel and do the scaling
         # manually afterwards (hoping Inductor will be able to fuse it).