Add support for FP8 GGUF creation and re-quantization (WIP) #454

Closed · wants to merge 3 commits

9 changes: 6 additions & 3 deletions convert_hf_to_gguf.py
@@ -195,7 +195,7 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int |
return False
return name == (key_name + suffix)

def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias", ".weight_scale_inv")) -> str:
new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
if new_name is None:
raise ValueError(f"Can not map tensor {name!r}")
@@ -347,6 +347,8 @@ def prepare_tensors(self):
# data_qtype = gguf.GGMLQuantizationType.Q6_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
data_qtype = gguf.GGMLQuantizationType.Q8_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_FP8_E4M3:
data_qtype = gguf.GGMLQuantizationType.FP8_E4M3
else:
raise ValueError(f"Unknown file type: {self.ftype.name}")

@@ -4113,8 +4115,8 @@ def parse_args() -> argparse.Namespace:
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
"--outtype", type=str, choices=["f32", "f16", "bf16", "fp8", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, fp8 for float8_e4m3fn, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
)
parser.add_argument(
"--bigendian", action="store_true",
@@ -4206,6 +4208,7 @@ def main() -> None:
"q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
# "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
"fp8": gguf.LlamaFileType.MOSTLY_FP8_E4M3,
"auto": gguf.LlamaFileType.GUESSED,
}

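For illustration, with this patch applied the converter can be invoked as python convert_hf_to_gguf.py <model_dir> --outtype fp8, which selects gguf.LlamaFileType.MOSTLY_FP8_E4M3 and makes prepare_tensors() store eligible tensors as gguf.GGMLQuantizationType.FP8_E4M3; the added ".weight_scale_inv" suffix lets map_tensor_name() resolve the scale tensors that accompany FP8 weights in some HF checkpoints. A minimal sketch of that flow, using only names visible in the diff above:

import gguf

# Sketch only: mirrors the ftype map entry and the prepare_tensors() branch
# added in this diff; requires the enum values this PR adds to gguf-py.
ftype = gguf.LlamaFileType.MOSTLY_FP8_E4M3        # selected by --outtype fp8
data_qtype = gguf.GGMLQuantizationType.FP8_E4M3   # per-tensor storage type
print(ftype.name, int(data_qtype))                # MOSTLY_FP8_E4M3 999
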
3 changes: 3 additions & 0 deletions gguf-py/gguf/constants.py
@@ -1317,6 +1317,7 @@ class GGMLQuantizationType(IntEnum):
IQ4_KS_R4 = 344
Q8_KV_R8 = 398
Q8_K_R8 = 399
FP8_E4M3 = 999


class ExpertGatingFuncType(IntEnum):
@@ -1395,6 +1396,7 @@ class LlamaFileType(IntEnum):
MOSTLY_IQ4_KS_R4 = 337 #except 1d tensors
MOSTLY_Q8_KV_R8 = 398 #except 1d tensors
MOSTLY_Q8_K_R8 = 399 #except 1d tensors
MOSTLY_FP8_E4M3 = 999 #except 1d tensors


GUESSED = 1024 # not specified in the model file
@@ -1522,6 +1524,7 @@ def get_type(val: Any) -> GGUFValueType:
GGMLQuantizationType.IQ4_KS_R4 : ( 256, 136),
GGMLQuantizationType.Q8_KV_R8 : ( 32, 32),
GGMLQuantizationType.Q8_K_R8 : ( 256, 258),
GGMLQuantizationType.FP8_E4M3 : ( 1, 1),
}


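The (1, 1) entry registered for FP8_E4M3 above (in what upstream gguf-py calls GGML_QUANT_SIZES) declares a block of one element stored in one byte: plain byte-per-value storage with no per-block scales. A small sketch of what that implies for tensor sizes, assuming the table keeps its upstream name and import path:

from gguf.constants import GGMLQuantizationType, GGML_QUANT_SIZES

# Assumption: GGML_QUANT_SIZES is the size table extended in this diff.
block_size, type_size = GGML_QUANT_SIZES[GGMLQuantizationType.FP8_E4M3]
n_elements = 4096 * 11008                        # e.g. one FFN projection matrix
n_bytes = n_elements // block_size * type_size   # exactly one byte per element
print(block_size, type_size, n_bytes)            # 1 1 45088768
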
104 changes: 104 additions & 0 deletions gguf-py/gguf/quants.py
@@ -217,6 +217,110 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)


class FP8_E4M3(__Quant, qtype=GGMLQuantizationType.FP8_E4M3):
FP8_EXP_BIAS = 7
FP8_MAX_EXP = 14
FP8_MANT_BITS = 3
FP32_EXP_BIAS = 127

@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
f32 = blocks.view(np.float32)
u32 = f32.view(np.uint32)
sign = (u32 >> 31).astype(np.uint32)
exp = (u32 >> 23) & 0xFF
mant = u32 & 0x7FFFFF

#special cases
is_nan = (exp == 0xFF) & (mant != 0)
is_inf = (exp == 0xFF) & (mant == 0)
is_zero = (exp == 0) & (mant == 0)

#normalize FP32 subnormals
is_subnormal_fp32 = (exp == 0) & (mant != 0)
leading_zeros = 22 - np.log2(np.maximum(mant, 1)).astype(int)
mant = np.where(is_subnormal_fp32, mant << leading_zeros, mant)
exp = np.where(is_subnormal_fp32, 1 - leading_zeros, exp)

#calculate unclipped exponent
fp8_exp_raw = exp.astype(np.int32) - (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS)
underflow = fp8_exp_raw < 0
fp8_exp = np.clip(fp8_exp_raw, 0, cls.FP8_MAX_EXP)

# calculate subnormal shift
shift = np.where(underflow, 1 - fp8_exp_raw, 0)

# align and round mantissa (RNE)
mant_plus_implicit = np.where(exp > 0, mant | 0x800000, mant)
total_shift = 20 + shift
mant_shifted = np.right_shift(mant_plus_implicit, total_shift)
round_bit = np.right_shift(mant_plus_implicit, total_shift - 1) & 1
sticky_mask = (1 << (total_shift - 1)) - 1
sticky = (mant_plus_implicit & sticky_mask) != 0
rounded = mant_shifted + ((round_bit & (sticky | (mant_shifted & 1))) != 0)

# handle mantissa overflow
mant_overflow = rounded >= 16 # 1 << (3+1)
fp8_exp = np.where(mant_overflow, fp8_exp + 1, fp8_exp)
rounded = np.where(mant_overflow, 8, rounded) # Reset to 1.000

# handle exponent overflow
overflow = fp8_exp > cls.FP8_MAX_EXP
fp8_exp = np.where(overflow, 0xF, fp8_exp)
rounded = np.where(overflow, 0, rounded)

# make the FP8
fp8 = (
(sign << 7) |
((fp8_exp << 3) & 0x78) |
(rounded & 0x7)
)
fp8 = np.where(is_nan, (sign << 7) | 0x7D, fp8) # NaN
fp8 = np.where(is_inf, (sign << 7) | 0x78, fp8) # Inf
fp8 = np.where(is_zero, sign << 7, fp8) # Zero

return fp8.astype(np.uint8)

@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
fp8 = blocks.astype(np.uint32)
sign = (fp8 >> 7) & 1
exp = (fp8 >> 3) & 0xF
mant = fp8 & 0x7

#special cases
is_nan = (exp == 0xF) & (mant != 0)
is_inf = (exp == 0xF) & (mant == 0)
is_zero = (exp == 0) & (mant == 0)
is_subnormal = (exp == 0) & (mant != 0)

fp32_exp = np.where(
exp > 0,
exp + (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS),
(1 - cls.FP8_EXP_BIAS) + cls.FP32_EXP_BIAS # -6 + 127 = 121
)

mant_scale = np.where(
is_subnormal,
mant.astype(np.float32) * 0.125, # 1/8
1.0 + mant.astype(np.float32) * 0.125
)

result = np.where(
is_nan,
np.nan,
np.where(
is_inf,
np.copysign(np.inf, (-1.0)**sign),
np.where(
is_zero,
np.copysign(0.0, (-1.0)**sign),
np.ldexp(mant_scale * (-1.0)**sign, fp32_exp - cls.FP32_EXP_BIAS)
)
)
)
return result.astype(np.float32)

class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
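Taken together, quantize_blocks() packs each float32 value into a single E4M3 byte (1 sign bit, 4 exponent bits with bias 7, 3 mantissa bits, round-to-nearest-even) and dequantize_blocks() reverses it with np.ldexp. A hedged round-trip check, assuming the module-level quantize()/dequantize() helpers in gguf.quants dispatch to the new class the same way they do for the other registered types:

import numpy as np
from gguf import GGMLQuantizationType
from gguf.quants import quantize, dequantize

x = np.array([[1.0, -0.3, 3.14159, 1e-4]], dtype=np.float32)
q = quantize(x, GGMLQuantizationType.FP8_E4M3)    # expected: one uint8 per element
y = dequantize(q, GGMLQuantizationType.FP8_E4M3)

# With 3 mantissa bits and RNE rounding, -0.3 should encode as byte 0xAA
# (sign=1, exp=5, mant=2) and decode to -0.3125, 3.14159 should come back
# as 3.25, and 1e-4 lies far below the E4M3 subnormal range so it flushes
# to +0.0.
print(q.tobytes().hex(), y)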