diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 01c9e34ff..1735bdaf4 100644
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -195,7 +195,7 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int |
             return False
         return name == (key_name + suffix)
 
-    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias", ".weight_scale_inv")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
         if new_name is None:
             raise ValueError(f"Can not map tensor {name!r}")
@@ -347,6 +347,8 @@ def prepare_tensors(self):
                     # data_qtype = gguf.GGMLQuantizationType.Q6_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_FP8_E4M3:
+                        data_qtype = gguf.GGMLQuantizationType.FP8_E4M3
                     else:
                         raise ValueError(f"Unknown file type: {self.ftype.name}")
 
@@ -4113,8 +4115,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "fp8", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, fp8 for float8_e4m3fn, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4206,6 +4208,7 @@ def main() -> None:
         "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
         # "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "fp8": gguf.LlamaFileType.MOSTLY_FP8_E4M3,
         "auto": gguf.LlamaFileType.GUESSED,
     }
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 6819979fd..b3a071fed 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -1317,6 +1317,7 @@ class GGMLQuantizationType(IntEnum):
     IQ4_KS_R4 = 344
     Q8_KV_R8  = 398
     Q8_K_R8   = 399
+    FP8_E4M3  = 999
 
 
 class ExpertGatingFuncType(IntEnum):
@@ -1395,6 +1396,7 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ4_KS_R4 = 337  # except 1d tensors
     MOSTLY_Q8_KV_R8  = 398  # except 1d tensors
     MOSTLY_Q8_K_R8   = 399  # except 1d tensors
+    MOSTLY_FP8_E4M3  = 999  # except 1d tensors
 
     GUESSED = 1024  # not specified in the model file
 
@@ -1522,6 +1524,7 @@ def get_type(val: Any) -> GGUFValueType:
     GGMLQuantizationType.IQ4_KS_R4 : ( 256, 136),
     GGMLQuantizationType.Q8_KV_R8  : (  32,  32),
     GGMLQuantizationType.Q8_K_R8   : ( 256, 258),
+    GGMLQuantizationType.FP8_E4M3  : (   1,   1),
 }
 
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index ff589b852..eed78d93c 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -217,6 +217,110 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
         return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
 
 
+class FP8_E4M3(__Quant, qtype=GGMLQuantizationType.FP8_E4M3):
+    FP8_EXP_BIAS  = 7
+    FP8_MAX_EXP   = 14
+    FP8_MANT_BITS = 3
+    FP32_EXP_BIAS = 127
+
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        f32 = blocks.view(np.float32)
+        u32 = f32.view(np.uint32)
+        sign = (u32 >> 31).astype(np.uint32)
+        exp = (u32 >> 23) & 0xFF
+        mant = u32 & 0x7FFFFF
+
+        # special cases
+        is_nan = (exp == 0xFF) & (mant != 0)
+        is_inf = (exp == 0xFF) & (mant == 0)
+        is_zero = (exp == 0) & (mant == 0)
+
+        # normalize FP32 subnormals
+        is_subnormal_fp32 = (exp == 0) & (mant != 0)
+        leading_zeros = 22 - np.log2(np.maximum(mant, 1)).astype(int)
+        mant = np.where(is_subnormal_fp32, mant << leading_zeros, mant)
+        exp = np.where(is_subnormal_fp32, 1 - leading_zeros, exp)
+
+        # unclipped FP8 exponent; biased exponent 0 is the subnormal range
+        fp8_exp_raw = exp.astype(np.int32) - (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS)
+        underflow = fp8_exp_raw < 1
+        # clamp only from below: values above FP8_MAX_EXP must reach the
+        # overflow check further down instead of being silently clipped
+        fp8_exp = np.maximum(fp8_exp_raw, 0)
+
+        # extra right-shift needed to encode subnormals
+        shift = np.where(underflow, 1 - fp8_exp_raw, 0)
+
+        # align and round the mantissa (round-to-nearest-even); cap the shift
+        # so deep underflow flushes to zero instead of shifting past the
+        # integer width
+        mant_plus_implicit = np.where(exp > 0, mant | 0x800000, mant)
+        total_shift = np.minimum(20 + shift, 30)
+        mant_shifted = np.right_shift(mant_plus_implicit, total_shift)
+        round_bit = np.right_shift(mant_plus_implicit, total_shift - 1) & 1
+        sticky_mask = (1 << (total_shift - 1)) - 1
+        sticky = (mant_plus_implicit & sticky_mask) != 0
+        rounded = mant_shifted + ((round_bit & (sticky | (mant_shifted & 1))) != 0)
+
+        # handle mantissa overflow (rounding carried into the exponent)
+        mant_overflow = rounded >= 16  # 1 << (FP8_MANT_BITS + 1)
+        fp8_exp = np.where(mant_overflow, fp8_exp + 1, fp8_exp)
+        rounded = np.where(mant_overflow, 8, rounded)  # reset to 1.000
+
+        # handle a subnormal that rounded up to the smallest normal (0.111 -> 1.000)
+        sub_overflow = underflow & (rounded >= 8)
+        fp8_exp = np.where(sub_overflow, 1, fp8_exp)
+        rounded = np.where(sub_overflow, 8, rounded)
+
+        # handle exponent overflow
+        overflow = fp8_exp > cls.FP8_MAX_EXP
+        fp8_exp = np.where(overflow, 0xF, fp8_exp)
+        rounded = np.where(overflow, 0, rounded)
+
+        # assemble the FP8 byte
+        fp8 = (
+            (sign << 7) |
+            ((fp8_exp << 3) & 0x78) |
+            (rounded & 0x7)
+        )
+        fp8 = np.where(is_nan, (sign << 7) | 0x7D, fp8)  # NaN
+        fp8 = np.where(is_inf, (sign << 7) | 0x78, fp8)  # Inf
+        fp8 = np.where(is_zero, sign << 7, fp8)          # zero
+
+        return fp8.astype(np.uint8)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        fp8 = blocks.astype(np.uint32)
+        sign = (fp8 >> 7) & 1
+        exp = (fp8 >> 3) & 0xF
+        mant = fp8 & 0x7
+
+        # special cases
+        is_nan = (exp == 0xF) & (mant != 0)
+        is_inf = (exp == 0xF) & (mant == 0)
+        is_zero = (exp == 0) & (mant == 0)
+        is_subnormal = (exp == 0) & (mant != 0)
+
+        fp32_exp = np.where(
+            exp > 0,
+            exp + (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS),
+            (1 - cls.FP8_EXP_BIAS) + cls.FP32_EXP_BIAS  # -6 + 127 = 121
+        )
+
+        mant_scale = np.where(
+            is_subnormal,
+            mant.astype(np.float32) * 0.125,        # 0.mmm
+            1.0 + mant.astype(np.float32) * 0.125   # 1.mmm
+        )
+
+        result = np.where(
+            is_nan,
+            np.nan,
+            np.where(
+                is_inf,
+                np.copysign(np.inf, (-1.0)**sign),
+                np.where(
+                    is_zero,
+                    np.copysign(0.0, (-1.0)**sign),
+                    np.ldexp(mant_scale * (-1.0)**sign, fp32_exp - cls.FP32_EXP_BIAS)
+                )
+            )
+        )
+        return result.astype(np.float32)
+
+
 class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
     @classmethod
     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
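A quick way to smoke-test the new path end to end is a round trip through the module-level `quantize`/`dequantize` helpers in `gguf.quants`, which dispatch on the registered qtype. A minimal sketch, assuming the patched `gguf-py` from this diff is on `PYTHONPATH`:

```python
import numpy as np

from gguf.constants import GGMLQuantizationType
from gguf.quants import dequantize, quantize

rng = np.random.default_rng(0)
data = rng.standard_normal((4, 8), dtype=np.float32)

# FP8_E4M3 has block_size == type_size == 1, so this packs one byte per element
packed = quantize(data, GGMLQuantizationType.FP8_E4M3)
restored = dequantize(packed, GGMLQuantizationType.FP8_E4M3)

# 3 mantissa bits give a worst-case relative error of 2**-4 = 6.25%
# for values in the normal range (|x| >= 2**-6)
rel_err = np.abs(restored - data) / np.maximum(np.abs(data), np.float32(2**-6))
assert rel_err.max() <= 0.0625
```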
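And a worked single-value check of the round-to-nearest-even logic, calling the block methods directly (0.15 is an arbitrary value chosen for illustration):

```python
import numpy as np

from gguf.quants import FP8_E4M3

# 0.15 ~ 1.2 * 2**-3: fp32 biased exponent 124 maps to fp8 exponent 124 - 120 = 4,
# and the mantissa fraction 0.2 * 8 = 1.6 rounds up to 2 under RNE,
# so the byte is 0|0100|010 = 0x22
x = np.array([0.15], dtype=np.float32)
q = FP8_E4M3.quantize_blocks(x)
assert q[0] == 0x22

# decoding 0x22: (1 + 2/8) * 2**(4 - 7) = 1.25 * 0.125 = 0.15625
assert FP8_E4M3.dequantize_blocks(q)[0] == np.float32(0.15625)
```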