Add support for FP8 GGUF creation and re-quantization (WIP) #454

Closed · wants to merge 3 commits

9 changes: 6 additions & 3 deletions convert_hf_to_gguf.py
@@ -195,7 +195,7 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int |
return False
return name == (key_name + suffix)

def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias", ".weight_scale_inv")) -> str:
new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
if new_name is None:
raise ValueError(f"Can not map tensor {name!r}")
@@ -347,6 +347,8 @@ def prepare_tensors(self):
# data_qtype = gguf.GGMLQuantizationType.Q6_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
data_qtype = gguf.GGMLQuantizationType.Q8_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_FP8_E4M3:
data_qtype = gguf.GGMLQuantizationType.FP8_E4M3
else:
raise ValueError(f"Unknown file type: {self.ftype.name}")

@@ -4113,8 +4115,8 @@ def parse_args() -> argparse.Namespace:
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
"--outtype", type=str, choices=["f32", "f16", "bf16", "fp8", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, fp8 for float8_e4m3fn, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
)
parser.add_argument(
"--bigendian", action="store_true",
@@ -4206,6 +4208,7 @@ def main() -> None:
"q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
# "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
"fp8": gguf.LlamaFileType.MOSTLY_FP8_E4M3,
"auto": gguf.LlamaFileType.GUESSED,
}

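For illustration, with this patch applied the converter can be invoked as python convert_hf_to_gguf.py <model_dir> --outtype fp8, which selects gguf.LlamaFileType.MOSTLY_FP8_E4M3 and makes prepare_tensors() store eligible tensors as gguf.GGMLQuantizationType.FP8_E4M3; the added ".weight_scale_inv" suffix lets map_tensor_name() resolve the scale tensors that accompany FP8 weights in some HF checkpoints. A minimal sketch of that flow, using only names visible in the diff above:

import gguf

# Sketch only: mirrors the ftype map entry and the prepare_tensors() branch
# added in this diff; requires the enum values this PR adds to gguf-py.
ftype = gguf.LlamaFileType.MOSTLY_FP8_E4M3        # selected by --outtype fp8
data_qtype = gguf.GGMLQuantizationType.FP8_E4M3   # per-tensor storage type
print(ftype.name, int(data_qtype))                # MOSTLY_FP8_E4M3 999
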
3 changes: 3 additions & 0 deletions gguf-py/gguf/constants.py
@@ -1317,6 +1317,7 @@ class GGMLQuantizationType(IntEnum):
IQ4_KS_R4 = 344
Q8_KV_R8 = 398
Q8_K_R8 = 399
FP8_E4M3 = 999


class ExpertGatingFuncType(IntEnum):
@@ -1395,6 +1396,7 @@ class LlamaFileType(IntEnum):
MOSTLY_IQ4_KS_R4 = 337 #except 1d tensors
MOSTLY_Q8_KV_R8 = 398 #except 1d tensors
MOSTLY_Q8_K_R8 = 399 #except 1d tensors
MOSTLY_FP8_E4M3 = 999 #except 1d tensors


GUESSED = 1024 # not specified in the model file
@@ -1522,6 +1524,7 @@ def get_type(val: Any) -> GGUFValueType:
GGMLQuantizationType.IQ4_KS_R4 : ( 256, 136),
GGMLQuantizationType.Q8_KV_R8 : ( 32, 32),
GGMLQuantizationType.Q8_K_R8 : ( 256, 258),
GGMLQuantizationType.FP8_E4M3 : ( 1, 1),
}


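The (1, 1) entry registered for FP8_E4M3 above (in what upstream gguf-py calls GGML_QUANT_SIZES) declares a block of one element stored in one byte: plain byte-per-value storage with no per-block scales. A small sketch of what that implies for tensor sizes, assuming the table keeps its upstream name and import path:

from gguf.constants import GGMLQuantizationType, GGML_QUANT_SIZES

# Assumption: GGML_QUANT_SIZES is the size table extended in this diff.
block_size, type_size = GGML_QUANT_SIZES[GGMLQuantizationType.FP8_E4M3]
n_elements = 4096 * 11008                        # e.g. one FFN projection matrix
n_bytes = n_elements // block_size * type_size   # exactly one byte per element
print(block_size, type_size, n_bytes)            # 1 1 45088768
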
104 changes: 104 additions & 0 deletions gguf-py/gguf/quants.py
@@ -217,6 +217,110 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)


class FP8_E4M3(__Quant, qtype=GGMLQuantizationType.FP8_E4M3):
FP8_EXP_BIAS = 7
FP8_MAX_EXP = 14
FP8_MANT_BITS = 3
FP32_EXP_BIAS = 127

@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
f32 = blocks.view(np.float32)
u32 = f32.view(np.uint32)
sign = (u32 >> 31).astype(np.uint32)
exp = (u32 >> 23) & 0xFF
mant = u32 & 0x7FFFFF

#special cases
is_nan = (exp == 0xFF) & (mant != 0)
is_inf = (exp == 0xFF) & (mant == 0)
is_zero = (exp == 0) & (mant == 0)

#normalize FP32 subnormals
is_subnormal_fp32 = (exp == 0) & (mant != 0)
leading_zeros = 22 - np.log2(np.maximum(mant, 1)).astype(int)
mant = np.where(is_subnormal_fp32, mant << leading_zeros, mant)
exp = np.where(is_subnormal_fp32, 1 - leading_zeros, exp)

#calculate unclipped exponent
fp8_exp_raw = exp.astype(np.int32) - (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS)
underflow = fp8_exp_raw < 0
fp8_exp = np.clip(fp8_exp_raw, 0, cls.FP8_MAX_EXP)

# calculate subnormal shift
shift = np.where(underflow, 1 - fp8_exp_raw, 0)

# align and round mantissa (RNE)
mant_plus_implicit = np.where(exp > 0, mant | 0x800000, mant)
total_shift = 20 + shift
mant_shifted = np.right_shift(mant_plus_implicit, total_shift)
round_bit = np.right_shift(mant_plus_implicit, total_shift - 1) & 1
sticky_mask = (1 << (total_shift - 1)) - 1
sticky = (mant_plus_implicit & sticky_mask) != 0
rounded = mant_shifted + ((round_bit & (sticky | (mant_shifted & 1))) != 0)

# handle mantissa overflow
mant_overflow = rounded >= 16 # 1 << (3+1)
fp8_exp = np.where(mant_overflow, fp8_exp + 1, fp8_exp)
rounded = np.where(mant_overflow, 8, rounded) # Reset to 1.000

# handle exponent overflow
overflow = fp8_exp > cls.FP8_MAX_EXP
fp8_exp = np.where(overflow, 0xF, fp8_exp)
rounded = np.where(overflow, 0, rounded)

# make the FP8
fp8 = (
(sign << 7) |
((fp8_exp << 3) & 0x78) |
(rounded & 0x7)
)
fp8 = np.where(is_nan, (sign << 7) | 0x7D, fp8) # NaN
fp8 = np.where(is_inf, (sign << 7) | 0x78, fp8) # Inf
fp8 = np.where(is_zero, sign << 7, fp8) # Zero

return fp8.astype(np.uint8)

@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
fp8 = blocks.astype(np.uint32)
sign = (fp8 >> 7) & 1
exp = (fp8 >> 3) & 0xF
mant = fp8 & 0x7

#special cases
is_nan = (exp == 0xF) & (mant != 0)
is_inf = (exp == 0xF) & (mant == 0)
is_zero = (exp == 0) & (mant == 0)
is_subnormal = (exp == 0) & (mant != 0)

fp32_exp = np.where(
exp > 0,
exp + (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS),
(1 - cls.FP8_EXP_BIAS) + cls.FP32_EXP_BIAS # -6 + 127 = 121
)

mant_scale = np.where(
is_subnormal,
mant.astype(np.float32) * 0.125, # 1/8
1.0 + mant.astype(np.float32) * 0.125
)

result = np.where(
is_nan,
np.nan,
np.where(
is_inf,
np.copysign(np.inf, (-1.0)**sign),
np.where(
is_zero,
np.copysign(0.0, (-1.0)**sign),
np.ldexp(mant_scale * (-1.0)**sign, fp32_exp - cls.FP32_EXP_BIAS)
)
)
)
return result.astype(np.float32)

class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
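Taken together, quantize_blocks() packs each float32 value into a single E4M3 byte (1 sign bit, 4 exponent bits with bias 7, 3 mantissa bits, round-to-nearest-even) and dequantize_blocks() reverses it with np.ldexp. A hedged round-trip check, assuming the module-level quantize()/dequantize() helpers in gguf.quants dispatch to the new class the same way they do for the other registered types:

import numpy as np
from gguf import GGMLQuantizationType
from gguf.quants import quantize, dequantize

x = np.array([[1.0, -0.3, 3.14159, 1e-4]], dtype=np.float32)
q = quantize(x, GGMLQuantizationType.FP8_E4M3)    # expected: one uint8 per element
y = dequantize(q, GGMLQuantizationType.FP8_E4M3)

# With 3 mantissa bits and RNE rounding, -0.3 should encode as byte 0xAA
# (sign=1, exp=5, mant=2) and decode to -0.3125, 3.14159 should come back
# as 3.25, and 1e-4 lies far below the E4M3 subnormal range so it flushes
# to +0.0.
print(q.tobytes().hex(), y)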