
Commit d128e59

[DataType] BF16 Support (#3158)
Allows BF16 as a model data type in group quantization and adds BF16 quantization presets. Corresponding TVM PR: apache/tvm#17670
1 parent bbd5e9c commit d128e59
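As a usage sketch (not part of this commit), the new presets become selectable by name from the quantization registry. This assumes the preset table is exported as QUANTIZATION from mlc_llm.quantization, matching the dict edited in quantization.py below:

    # Usage sketch: look up the BF16 presets added by this commit.
    from mlc_llm.quantization import QUANTIZATION

    no_quant = QUANTIZATION["q0bf16"]       # unquantized bfloat16 weights
    group_quant = QUANTIZATION["q4bf16_1"]  # int4 group quantization, bfloat16 model dtype
    print(no_quant.model_dtype, group_quant.model_dtype)  # bfloat16 bfloat16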

3 files changed: +30 −2 lines

ci/task/test_unittest.sh

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ if [[ -n ${MLC_CI_SETUP_DEPS:-} ]]; then
     echo "MLC_CI_SETUP_DEPS=1 start setup deps.."
     # Install dependency
     pip install --force-reinstall wheels/*.whl
+    pip install "ml_dtypes>=0.5.1" --no-binary ml_dtypes
     pip install --quiet pytest
     pip install --pre -U --no-index -f https://mlc.ai/wheels mlc-ai-nightly-cu123
     export LD_LIBRARY_PATH=/usr/local/cuda/compat/:$LD_LIBRARY_PATH
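For context on the new dependency: ml_dtypes supplies a NumPy-compatible bfloat16 scalar type, which host-side tests need in order to build and compare BF16 arrays. A minimal illustrative sketch, not taken from this commit:

    # ml_dtypes registers a bfloat16 dtype usable directly from NumPy.
    import numpy as np
    import ml_dtypes

    x = np.array([1.0, 2.5, -3.75], dtype=ml_dtypes.bfloat16)
    print(x.dtype)               # bfloat16
    print(x.astype(np.float32))  # [ 1.    2.5  -3.75]; these values are exactly representable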

python/mlc_llm/quantization/group_quantization.py

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,7 @@ class GroupQuantize: # pylint: disable=too-many-instance-attributes
     group_size: int
     quantize_dtype: Literal["int3", "int4", "int8"]
     storage_dtype: Literal["uint32"]
-    model_dtype: Literal["float16", "float32"]
+    model_dtype: Literal["float16", "float32", "bfloat16"]
     linear_weight_layout: Literal["KN", "NK"]
     quantize_embedding: bool = True
     quantize_final_fc: bool = True
@@ -50,7 +50,7 @@ def __post_init__(self):
         model_dtype = DataType(self.model_dtype)
         assert quantize_dtype.type_code == DataTypeCode.INT
         assert storage_dtype.type_code == DataTypeCode.UINT
-        assert model_dtype.type_code == DataTypeCode.FLOAT
+        assert model_dtype.type_code in (DataTypeCode.FLOAT, DataTypeCode.BFLOAT)
         if storage_dtype.bits < quantize_dtype.bits:
             raise ValueError("Storage unit should be greater or equal to quantized element")
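The relaxed assertion is needed because TVM reports bfloat16 under its own type code (DataTypeCode.BFLOAT) rather than DataTypeCode.FLOAT. A minimal sketch of the check, assuming tvm exposes DataType and DataTypeCode as this module imports them:

    from tvm import DataType, DataTypeCode

    for name in ("float16", "float32", "bfloat16"):
        dtype = DataType(name)
        # bfloat16 carries DataTypeCode.BFLOAT, so the old FLOAT-only
        # assert rejected it; the new membership check accepts both codes.
        assert dtype.type_code in (DataTypeCode.FLOAT, DataTypeCode.BFLOAT)
        print(name, dtype.bits)  # float16 and bfloat16 are both 16 bits wide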

python/mlc_llm/quantization/quantization.py

Lines changed: 27 additions & 0 deletions
@@ -33,6 +33,11 @@ def quantize_weight(self, weight: tvm.runtime.NDArray) -> List[tvm.runtime.NDArray]
         kind="no-quant",
         model_dtype="float16",
     ),
+    "q0bf16": NoQuantize(
+        name="q0bf16",
+        kind="no-quant",
+        model_dtype="bfloat16",
+    ),
     "q0f32": NoQuantize(
         name="q0f32",
         kind="no-quant",
@@ -82,6 +87,28 @@ def quantize_weight(self, weight: tvm.runtime.NDArray) -> List[tvm.runtime.NDArray]
         quantize_embedding=True,
         quantize_final_fc=True,
     ),
+    "q4bf16_0": GroupQuantize(
+        name="q4bf16_0",
+        kind="group-quant",
+        group_size=32,
+        quantize_dtype="int4",
+        storage_dtype="uint32",
+        model_dtype="bfloat16",
+        linear_weight_layout="KN",
+        quantize_embedding=True,
+        quantize_final_fc=True,
+    ),
+    "q4bf16_1": GroupQuantize(
+        name="q4bf16_1",
+        kind="group-quant",
+        group_size=32,
+        quantize_dtype="int4",
+        storage_dtype="uint32",
+        model_dtype="bfloat16",
+        linear_weight_layout="NK",
+        quantize_embedding=True,
+        quantize_final_fc=True,
+    ),
     "q4f32_1": GroupQuantize(
         name="q4f32_1",
         kind="group-quant",
