
Commit 56a5ff9

[DataType] Rename FP8 dtypes (#3155)
Following the recent renaming in TVM, this commit renames all FP8 dtypes:

* `e4m3_float8` is renamed to `float8_e4m3fn`,
* `e5m2_float8` is renamed to `float8_e5m2`.

This aligns the dtype names with those used in PyTorch and ml_dtypes. Published HuggingFace FP8 model repos may need to be updated as well to match the rename.
1 parent 8d59826 commit 56a5ff9
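
For reference, the old and new names map one to one, and the new names match the attribute names used by PyTorch (`torch.float8_e4m3fn`, `torch.float8_e5m2`) and ml_dtypes. A small illustration of the mapping (the helper below is documentation only, not repo code):

# Old MLC/TVM dtype strings and their renamed equivalents.
FP8_DTYPE_RENAME = {
    "e4m3_float8": "float8_e4m3fn",
    "e5m2_float8": "float8_e5m2",
}

def rename_fp8_dtype(dtype: str) -> str:
    # Dtypes unaffected by the rename pass through unchanged.
    return FP8_DTYPE_RENAME.get(dtype, dtype)

assert rename_fp8_dtype("e4m3_float8") == "float8_e4m3fn"
assert rename_fp8_dtype("float16") == "float16"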

File tree

7 files changed: 30 additions, 28 deletions


docs/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 --find-links https://mlc.ai/wheels
 fastapi
+ml_dtypes>=0.5.1
 mlc-ai-nightly
 openai
 prompt_toolkit
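
The new `ml_dtypes>=0.5.1` requirement provides NumPy-compatible definitions of both FP8 formats under exactly these names. A minimal usage sketch (illustrative, not from the repo):

# Cast a few values into the renamed FP8 formats via ml_dtypes.
import numpy as np
import ml_dtypes

x = np.array([0.5, 1.5, 240.0], dtype=ml_dtypes.float8_e4m3fn)
y = np.array([0.5, 1.5, 240.0], dtype=ml_dtypes.float8_e5m2)
print(x.astype(np.float32))  # e4m3fn keeps 3 mantissa bits
print(y.astype(np.float32))  # e5m2 keeps only 2, so rounding is coarser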

python/mlc_llm/op/cutlass.py

Lines changed: 6 additions & 6 deletions
@@ -50,11 +50,11 @@ def group_gemm(
     out_dtype = out_dtype if out_dtype else x.dtype
     weight_dtype = weight_dtype if weight_dtype else weight.dtype

-    if x.dtype == "e5m2_float8" and weight_dtype == "e5m2_float8" and out_dtype == "float16":
+    if x.dtype == "float8_e5m2" and weight_dtype == "float8_e5m2" and out_dtype == "float16":
         func_name = "cutlass.group_gemm_e5m2_e5m2_fp16"
-    elif x.dtype == "e4m3_float8" and weight_dtype == "e5m2_float8" and out_dtype == "float16":
+    elif x.dtype == "float8_e4m3fn" and weight_dtype == "float8_e5m2" and out_dtype == "float16":
         func_name = "cutlass.group_gemm_e4m3_e5m2_fp16"
-    elif x.dtype == "e4m3_float8" and weight_dtype == "e4m3_float8" and out_dtype == "float16":
+    elif x.dtype == "float8_e4m3fn" and weight_dtype == "float8_e4m3fn" and out_dtype == "float16":
         func_name = "cutlass.group_gemm_e4m3_e4m3_fp16"
     elif x.dtype == "float16" and weight_dtype == "float16" and out_dtype == "float16":
         func_name = "cutlass.group_gemm_fp16_sm90"

@@ -113,11 +113,11 @@ def fp8_gemm(
     out_dtype = out_dtype if out_dtype else x.dtype
     weight_dtype = weight_dtype if weight_dtype else weight.dtype

-    if x.dtype == "e5m2_float8" and weight_dtype == "e5m2_float8" and out_dtype == "float16":
+    if x.dtype == "float8_e5m2" and weight_dtype == "float8_e5m2" and out_dtype == "float16":
         func_name = "cutlass.gemm_e5m2_e5m2_fp16"
-    elif x.dtype == "e4m3_float8" and weight_dtype == "e5m2_float8" and out_dtype == "float16":
+    elif x.dtype == "float8_e4m3fn" and weight_dtype == "float8_e5m2" and out_dtype == "float16":
         func_name = "cutlass.gemm_e5m2_e4m3_fp16"
-    elif x.dtype == "e4m3_float8" and weight_dtype == "e4m3_float8" and out_dtype == "float16":
+    elif x.dtype == "float8_e4m3fn" and weight_dtype == "float8_e4m3fn" and out_dtype == "float16":
         func_name = "cutlass.gemm_e4m3_e4m3_fp16"
     else:
         raise NotImplementedError(
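
Only the dtype strings change here; the CUTLASS kernel names keep their short e4m3/e5m2 suffixes. A standalone sketch of the same dispatch pattern (`pick_fp8_gemm_kernel` is a hypothetical helper, with kernel names taken from the diff above):

# Illustrative dispatch table mirroring the fp8_gemm branches above.
def pick_fp8_gemm_kernel(x_dtype: str, weight_dtype: str, out_dtype: str) -> str:
    table = {
        ("float8_e5m2", "float8_e5m2", "float16"): "cutlass.gemm_e5m2_e5m2_fp16",
        ("float8_e4m3fn", "float8_e5m2", "float16"): "cutlass.gemm_e5m2_e4m3_fp16",
        ("float8_e4m3fn", "float8_e4m3fn", "float16"): "cutlass.gemm_e4m3_e4m3_fp16",
    }
    key = (x_dtype, weight_dtype, out_dtype)
    if key not in table:
        raise NotImplementedError(f"Unsupported FP8 GEMM dtype combination: {key}")
    return table[key]

assert pick_fp8_gemm_kernel("float8_e4m3fn", "float8_e4m3fn", "float16") == "cutlass.gemm_e4m3_e4m3_fp16"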

python/mlc_llm/op/moe_matmul.py

Lines changed: 3 additions & 3 deletions
@@ -182,7 +182,7 @@ def dequantize_float8_gemv(
     w: Tensor,
     scale: Optional[Tensor],
     indptr: Tensor,
-    quantize_dtype: Literal["e5m2_float8", "e4m3_float8"],
+    quantize_dtype: Literal["float8_e5m2", "float8_e4m3fn"],
 ) -> Tensor:
     """GEMV for project-in (e1-e3) or project-out (e2) in MLP but the weight is quantized in
     fp8 e5m2 or e4m3. It needs to be dequantized before the GEMV computation.

@@ -204,8 +204,8 @@ def dequantize_float8_gemv(
         The index pointer tensor of shape (1, experts_per_tok), where `experts_per_tok` is the
         number of activated experts per token.

-    quantize_dtype : Literal["e5m2_float8", "e4m3_float8"]
-        The quantize dtype of the weight tensor, which is either e5m2_float8 or e4m3_float8.
+    quantize_dtype : Literal["float8_e5m2", "float8_e4m3fn"]
+        The quantize dtype of the weight tensor, which is either float8_e5m2 or float8_e4m3fn.
     """
     (x_leading_dim, in_features), model_dtype = x.shape, x.dtype
     (local_experts, out_features, _), storage_dtype = w.shape, w.dtype
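
`Literal` annotations are not enforced at runtime, so callers that build the `quantize_dtype` string dynamically may want an explicit check against the renamed values. A small sketch (the helper is hypothetical, not part of the repo):

# Validate a dynamically constructed quantize_dtype against the new names.
from typing import Literal, get_args

QuantizeDtype = Literal["float8_e5m2", "float8_e4m3fn"]

def check_quantize_dtype(dtype: str) -> str:
    if dtype not in get_args(QuantizeDtype):
        raise ValueError(f"Expected one of {get_args(QuantizeDtype)}, got {dtype!r}")
    return dtype

check_quantize_dtype("float8_e4m3fn")  # passes; "e4m3_float8" would now raise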

python/mlc_llm/quantization/per_tensor_quantization.py

Lines changed: 9 additions & 9 deletions
@@ -32,9 +32,9 @@ class PerTensorQuantize:  # pylint: disable=too-many-instance-attributes

     name: str
     kind: str
-    activation_dtype: Literal["e4m3_float8", "e5m2_float8"]
-    weight_dtype: Literal["e4m3_float8", "e5m2_float8"]
-    storage_dtype: Literal["uint32", "e4m3_float8", "e5m2_float8"]
+    activation_dtype: Literal["float8_e4m3fn", "float8_e5m2"]
+    weight_dtype: Literal["float8_e4m3fn", "float8_e5m2"]
+    storage_dtype: Literal["uint32", "float8_e4m3fn", "float8_e5m2"]
     model_dtype: Literal["float16"]
     quantize_embedding: bool = True
     quantize_final_fc: bool = True

@@ -184,8 +184,8 @@ def quantize_weight(self, weight) -> List[NDArray]:

         def _create_quantize_func() -> IRModule:
             if DataType(self.weight_dtype).type_code in [
-                DataTypeCode.E4M3Float,
-                DataTypeCode.E5M2Float,
+                DataTypeCode.Float8E4M3FN,
+                DataTypeCode.Float8E5M2,
             ]:
                 quantize_func = functools.partial(
                     self.quantize_float8,

@@ -288,8 +288,8 @@ def _dequantize(
         if self.use_scale:
             assert scale is not None
         if DataType(self.weight_dtype).type_code in [
-            DataTypeCode.E4M3Float,
-            DataTypeCode.E5M2Float,
+            DataTypeCode.Float8E4M3FN,
+            DataTypeCode.Float8E5M2,
         ]:
             return self.dequantize_float8(q_weight, scale, self.weight_dtype, out_shape)
         raise NotImplementedError()

@@ -655,8 +655,8 @@ def from_mixtral_experts(
             The per-tensor quantized MixtralExperts layer
         """
         if DataType(config.weight_dtype).type_code in [
-            DataTypeCode.E4M3Float,
-            DataTypeCode.E5M2Float,
+            DataTypeCode.Float8E4M3FN,
+            DataTypeCode.Float8E5M2,
         ]:
             return PerTensorQuantizeMixtralExperts._IMPL["fp8"].from_mixtral_experts(
                 src, config, name
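
Besides the dtype strings, the TVM `DataTypeCode` enum members change too (`E4M3Float`/`E5M2Float` become `Float8E4M3FN`/`Float8E5M2`). A minimal check, assuming a TVM build that already carries the rename (import paths may vary across TVM versions):

# Inspect the renamed FP8 dtype codes; assumes a recent TVM with the rename.
from tvm import DataType, DataTypeCode  # import path assumed; adjust if needed

for name in ("float8_e4m3fn", "float8_e5m2"):
    dtype = DataType(name)
    is_fp8 = dtype.type_code in (DataTypeCode.Float8E4M3FN, DataTypeCode.Float8E5M2)
    print(name, dtype.bits, is_fp8)  # both are 8-bit FP8 types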

python/mlc_llm/quantization/quantization.py

Lines changed: 9 additions & 9 deletions
@@ -122,9 +122,9 @@ def quantize_weight(self, weight: tvm.runtime.NDArray) -> List[tvm.runtime.NDArr
     "e5m2_e5m2_f16": PerTensorQuantize(
         name="e5m2_e5m2_f16",
         kind="per-tensor-quant",
-        activation_dtype="e5m2_float8",
-        weight_dtype="e5m2_float8",
-        storage_dtype="e5m2_float8",
+        activation_dtype="float8_e5m2",
+        weight_dtype="float8_e5m2",
+        storage_dtype="float8_e5m2",
         model_dtype="float16",
         quantize_final_fc=False,
         quantize_embedding=False,

@@ -134,9 +134,9 @@ def quantize_weight(self, weight: tvm.runtime.NDArray) -> List[tvm.runtime.NDArr
     "e4m3_e4m3_f16": PerTensorQuantize(
         name="e4m3_e4m3_f16",
         kind="per-tensor-quant",
-        activation_dtype="e4m3_float8",
-        weight_dtype="e4m3_float8",
-        storage_dtype="e4m3_float8",
+        activation_dtype="float8_e4m3fn",
+        weight_dtype="float8_e4m3fn",
+        storage_dtype="float8_e4m3fn",
         model_dtype="float16",
         quantize_final_fc=False,
         quantize_embedding=False,

@@ -147,9 +147,9 @@ def quantize_weight(self, weight: tvm.runtime.NDArray) -> List[tvm.runtime.NDArr
     "e4m3_e4m3_f16_max_calibrate": PerTensorQuantize(
         name="e4m3_e4m3_f16_max_calibrate",
         kind="per-tensor-quant",
-        activation_dtype="e4m3_float8",
-        weight_dtype="e4m3_float8",
-        storage_dtype="e4m3_float8",
+        activation_dtype="float8_e4m3fn",
+        weight_dtype="float8_e4m3fn",
+        storage_dtype="float8_e4m3fn",
         model_dtype="float16",
         quantize_final_fc=False,
         quantize_embedding=False,
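
The preset names themselves (`e5m2_e5m2_f16`, `e4m3_e4m3_f16`, `e4m3_e4m3_f16_max_calibrate`) are unchanged; only their dtype fields move to the new strings. A hedged look-up sketch (the `QUANTIZATION` registry name is assumed; verify against your mlc_llm version):

# Fetch a per-tensor FP8 preset and confirm its renamed dtype fields.
from mlc_llm.quantization import QUANTIZATION  # registry name assumed

cfg = QUANTIZATION["e4m3_e4m3_f16"]
assert cfg.activation_dtype == "float8_e4m3fn"
assert cfg.weight_dtype == "float8_e4m3fn"
assert cfg.storage_dtype == "float8_e4m3fn"
print(cfg.name, cfg.model_dtype)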

python/mlc_llm/quantization/utils.py

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ def convert_uint_packed_fp8_to_float(  # pylint: disable=too-many-arguments
     out_shape: Optional[Sequence[tir.PrimExpr]] = None,
 ) -> te.Tensor:
     """Unpack a fp8 value from the storage dtype and convert to float."""
-    assert quant_dtype in ["e4m3_float8", "e5m2_float8"]
+    assert quant_dtype in ["float8_e4m3fn", "float8_e5m2"]
     assert DataType(storage_dtype).type_code == DataTypeCode.UINT
     bits = DataType(quant_dtype).bits
     elem_storage_dtype = DataType(f"uint{bits}")
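
The unpack helper derives its per-element storage type from the dtype's bit width, which the rename does not affect. A short sketch of that bookkeeping, assuming a TVM build that parses the new FP8 names:

# Derive per-element storage and packing factor for an FP8 dtype string,
# mirroring convert_uint_packed_fp8_to_float; assumes a recent TVM.
from tvm import DataType

quant_dtype = "float8_e4m3fn"
bits = DataType(quant_dtype).bits   # 8 bits per FP8 element
elem_storage_dtype = f"uint{bits}"  # each element is reinterpreted as uint8
packed_per_word = 32 // bits        # 4 FP8 values fit in one uint32 word
print(elem_storage_dtype, packed_per_word)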

python/setup.py

Lines changed: 1 addition & 0 deletions
@@ -111,6 +111,7 @@ def main():
         "transformers",
         "pandas",
         "datasets",
+        "ml_dtypes>=0.5.1",
         "flashinfer-python==0.2.2",
     ],
     distclass=BinaryDistribution,
