 from torchao.prototype.mx_formats.config import MXGemmKernelChoice
 from torchao.prototype.mx_formats.constants import (
+    BF16_EXP_BIAS,
     BLOCK_SIZE_DEFAULT,
     DTYPE_FP4,
     DTYPE_FP6_E2M3,
...
     F8E4M3_MAX_POW2,
     F8E5M2_MAX,
     F8E5M2_MAX_POW2,
+    F32_EXP_BIAS,
     F32_MIN_NORMAL,
     SUPPORTED_ELEM_DTYPES,
 )
...
 # TODO(later): read from somewhere else?
 SBITS, EBITS_F32, MBITS_F32 = 1, 8, 23
+EBITS_BF16, MBITS_BF16 = 8, 7
 EBITS_F4_E2M1, MBITS_F4_E2M1 = 2, 1
 EBITS_F6_E2M3, MBITS_F6_E2M3 = 2, 3
 EBITS_F6_E3M2, MBITS_F6_E3M2 = 3, 2
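For context (not part of the diff): the new `EBITS_BF16`/`MBITS_BF16` constants describe bfloat16's 1-8-7 sign/exponent/mantissa layout. bfloat16 shares float32's 8 exponent bits and bias of 127 but keeps only 7 mantissa bits, so its bit pattern can be manipulated through an int16 view the same way float32 is through int32. A minimal standalone check, with the constants redefined locally for illustration:

```python
import torch

# Standalone sanity check, not part of the diff: constants are redefined locally.
SBITS, EBITS_F32, MBITS_F32 = 1, 8, 23
EBITS_BF16, MBITS_BF16 = 8, 7
F32_EXP_BIAS = BF16_EXP_BIAS = 127  # bias = 2 ** (EBITS - 1) - 1

x = torch.tensor([6.0])  # 6.0 = 1.5 * 2**2, so the unbiased exponent is 2

# float32: reinterpret the bits as int32, shift off the 23 mantissa bits, unbias
exp_f32 = ((x.view(torch.int32) >> MBITS_F32) & 0xFF) - F32_EXP_BIAS
# bfloat16: reinterpret the bits as int16, shift off the 7 mantissa bits, unbias
exp_bf16 = ((x.to(torch.bfloat16).view(torch.int16) >> MBITS_BF16) & 0xFF) - BF16_EXP_BIAS

assert exp_f32.item() == exp_bf16.item() == 2
```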
@@ -141,28 +144,51 @@ def to_mx(
     else:
         raise AssertionError("unsupported element dtype")

+    if data_hp.dtype is torch.float32:
+        hp_int_dtype = torch.int32
+        hp_mbits = MBITS_F32
+        hp_ebits = EBITS_F32
+        hp_exp_bias = F32_EXP_BIAS
+    else:
+        assert data_hp.dtype is torch.bfloat16
+        hp_int_dtype = torch.int16
+        hp_mbits = MBITS_BF16
+        hp_ebits = EBITS_BF16
+        hp_exp_bias = BF16_EXP_BIAS
+
     # rounding before calculating the largest power of 2
     # X = 2^(floor(log2(rounding(max_abs(v)))-max_exp))
     if scaling_mode == ScaleCalculationMode.EVEN:
         nan_mask = torch.isnan(max_abs)
-        max_abs = max_abs.to(torch.float32).view(torch.int32)
-        val_to_add = 1 << (MBITS_F32 - mbits - 1)
-        mask = ((1 << (EBITS_F32 + SBITS)) - 1) << MBITS_F32
+        max_abs = max_abs.view(hp_int_dtype)
+        val_to_add = 1 << (hp_mbits - mbits - 1)
+        mask = ((1 << (hp_ebits + SBITS)) - 1) << hp_mbits
         max_abs = (max_abs + val_to_add) & mask
-        max_abs = max_abs.view(torch.float32)
-        max_abs[nan_mask] = torch.tensor(float("nan"), device=max_abs.device)
+        max_abs = max_abs.view(data_hp.dtype)
+        max_abs[nan_mask] = torch.tensor(
+            float("nan"), device=max_abs.device, dtype=max_abs.dtype
+        )

     # Calculate the scale for different modes
+    max_abs_int32 = (max_abs + eps).view(hp_int_dtype)
+    extracted_pow2 = ((max_abs_int32 >> hp_mbits) & 0b11111111) - hp_exp_bias
+    extracted_pow2 = extracted_pow2.to(data_hp.dtype)
+
     if scaling_mode in (ScaleCalculationMode.FLOOR, ScaleCalculationMode.EVEN):
-        scale_e8m0_unbiased = torch.floor(torch.log2(max_abs + eps)) - target_max_pow2
+        scale_e8m0_unbiased = extracted_pow2 - target_max_pow2
     elif scaling_mode == ScaleCalculationMode.CEIL:
-        scale_e8m0_unbiased = torch.ceil(torch.log2(max_abs + eps)) - target_max_pow2
+        # round up: add one to scale if the mantissa is larger than 0
+        # 0x7FFFFF is equal to 23 ones
+        mantissa_gt_one = (max_abs_int32 & 0x7FFFFF) > 0
+        extracted_pow2 += mantissa_gt_one
+        scale_e8m0_unbiased = extracted_pow2 - target_max_pow2
     else:
         raise AssertionError("unsupported scaling calculation mode")

     # Clamp to exponents that can be represented in e8m0
+    # add one to positive range to capture NaNs
     scale_e8m0_unbiased = torch.clamp(
-        scale_e8m0_unbiased, min=-E8M0_EXPONENT_BIAS, max=E8M0_EXPONENT_BIAS
+        scale_e8m0_unbiased, min=-E8M0_EXPONENT_BIAS, max=E8M0_EXPONENT_BIAS + 1
     )

     # Create the biased e8m0 representation and cast it to 8 bits
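The core change in this hunk: `floor(log2(max_abs))` and `ceil(log2(max_abs))` are replaced by reading the exponent field straight out of the floating-point bit pattern, with the CEIL mode rounding up whenever any mantissa bit is set. A standalone sketch of the idea for float32 inputs (local names, not the torchao API; the real code first adds `eps` to `max_abs` so an all-zero block does not hit the denormal range):

```python
import torch

# Standalone sketch: reading the exponent field out of the float32 bit pattern
# reproduces floor(log2(x)) for positive normal values, and adding one whenever
# any mantissa bit is set (x is not an exact power of two) reproduces ceil(log2(x)).
MBITS_F32, F32_EXP_BIAS = 23, 127

x = torch.tensor([0.1, 1.0, 3.0, 4.0, 1000.0])
bits = x.view(torch.int32)

extracted_pow2 = ((bits >> MBITS_F32) & 0xFF) - F32_EXP_BIAS
assert torch.equal(extracted_pow2.float(), torch.floor(torch.log2(x)))  # FLOOR / EVEN

mantissa_nonzero = (bits & 0x7FFFFF) > 0  # 0x7FFFFF masks the 23 mantissa bits
assert torch.equal(
    (extracted_pow2 + mantissa_nonzero).float(), torch.ceil(torch.log2(x))
)  # CEIL
```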
@@ -172,7 +198,7 @@ def to_mx(
     # Conversion to torch.uint8 sets NaN values to 0, fix this by
     # explicitly setting known NaN values to 255
     scale_e8m0_biased = torch.where(
-        torch.isnan(scale_e8m0_unbiased),
+        torch.isnan(max_abs),
         E8M0_EXPONENT_NAN_VAL,
         scale_e8m0_biased,
     )
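Why the NaN check moves from `scale_e8m0_unbiased` to `max_abs`: with the bit-extraction path the unbiased scale is computed from integer exponent fields, so it is finite even when `max_abs` is NaN (all eight exponent bits set gives 255 - 127 = 128, which the widened clamp keeps). A small sketch with the relevant constants defined locally, mirroring but not importing the torchao names:

```python
import torch

# Local sketch: a NaN max_abs yields a finite unbiased scale under the new path,
# so the old torch.isnan(scale_e8m0_unbiased) check would never fire.
MBITS_F32, F32_EXP_BIAS = 23, 127
E8M0_EXPONENT_BIAS, E8M0_EXPONENT_NAN_VAL = 127, 255

max_abs = torch.tensor([float("nan")])
bits = max_abs.view(torch.int32)

# NaN has all exponent bits set, so extraction yields 255 - 127 = 128, not NaN
scale_e8m0_unbiased = ((bits >> MBITS_F32) & 0xFF).float() - F32_EXP_BIAS
assert not torch.isnan(scale_e8m0_unbiased).any()

# the clamp upper bound of E8M0_EXPONENT_BIAS + 1 keeps 128 representable, and the
# explicit check on max_abs routes it to the e8m0 NaN encoding of 255
scale_e8m0_unbiased = torch.clamp(
    scale_e8m0_unbiased, min=-E8M0_EXPONENT_BIAS, max=E8M0_EXPONENT_BIAS + 1
)
scale_e8m0_biased = (scale_e8m0_unbiased + E8M0_EXPONENT_BIAS).to(torch.uint8)
scale_e8m0_biased = torch.where(
    torch.isnan(max_abs), E8M0_EXPONENT_NAN_VAL, scale_e8m0_biased
)
assert scale_e8m0_biased.item() == E8M0_EXPONENT_NAN_VAL
```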