
Commit fd8e84a

[Model] Qwen3 FP8 support (#3219)
This PR enables FP8 block-scale quantization support for the Qwen3 model family.
1 parent f870395 · commit fd8e84a

7 files changed: +194 −43 lines changed

python/mlc_llm/model/model.py

Lines changed: 2 additions & 0 deletions
@@ -340,6 +340,7 @@ class Model:
             "no-quant": qwen3_quantization.no_quant,
             "group-quant": qwen3_quantization.group_quant,
             "ft-quant": qwen3_quantization.ft_quant,
+            "block-scale-quant": qwen3_quantization.block_scale_quant,
         },
     ),
     "qwen3_moe": Model(
@@ -354,6 +355,7 @@ class Model:
             "no-quant": qwen3_moe_quantization.no_quant,
             "group-quant": qwen3_moe_quantization.group_quant,
             "ft-quant": qwen3_moe_quantization.ft_quant,
+            "block-scale-quant": qwen3_moe_quantization.block_scale_quant,
         },
     ),
     "deepseek_v2": Model(

python/mlc_llm/model/qwen3/qwen3_loader.py

Lines changed: 62 additions & 16 deletions
@@ -4,11 +4,12 @@
 """
 
 import functools
+from typing import Callable, List
 
 import numpy as np
 
-from mlc_llm.loader import ExternMapping
-from mlc_llm.quantization import Quantization
+from mlc_llm.loader import ExternMapping, QuantizeMapping
+from mlc_llm.quantization import BlockScaleQuantize, Quantization
 
 from .qwen3_model import Qwen3Config, Qwen3LMHeadModel
 
@@ -33,6 +34,15 @@ def huggingface(model_config: Qwen3Config, quantization: Quantization) -> Extern
     model = Qwen3LMHeadModel(model_config)
     if quantization is not None:
         model.to(quantization.model_dtype)
+        if isinstance(quantization, BlockScaleQuantize):
+            # Convert the model to block-scale quantized model before loading parameters
+            model = quantization.quantize_model(model, QuantizeMapping({}, {}), "")
+            if model_config.weight_block_size is None:
+                raise ValueError(
+                    "The input Qwen3 model is not fp8 block quantized. "
+                    "Thus BlockScaleQuantize is not supported."
+                )
+
     _, _named_params, _ = model.export_tvm(  # type: ignore[misc]
         spec=model.get_default_spec(),
         allow_extern=True,
@@ -41,19 +51,60 @@ def huggingface(model_config: Qwen3Config, quantization: Quantization) -> Extern
 
     mapping = ExternMapping()
 
+    if (
+        not isinstance(quantization, BlockScaleQuantize)
+        and model_config.weight_block_size is not None
+    ):
+        raise ValueError(
+            "The input Qwen3 model is fp8 block quantized. "
+            "Please use BlockScaleQuantize for the model."
+        )
+
+    # Helper function to add both weight and scale mappings
+    def add_weight_and_scale_mapping(
+        weight_mlc_name: str,
+        weight_hf_names: List[str],
+        weight_transform_func: Callable,
+    ):
+        mlc_param = named_parameters[weight_mlc_name]
+        mapping.add_mapping(
+            weight_mlc_name,
+            weight_hf_names,
+            functools.partial(weight_transform_func, dtype=mlc_param.dtype),
+        )
+
+        if isinstance(quantization, BlockScaleQuantize):
+            scale_mlc_name = f"{weight_mlc_name}_scale_inv"
+            if scale_mlc_name in named_parameters:
+                scale_hf_names = [f"{name}_scale_inv" for name in weight_hf_names]
+                scale_param = named_parameters[scale_mlc_name]
+                mapping.add_mapping(
+                    scale_mlc_name,
+                    scale_hf_names,
+                    functools.partial(weight_transform_func, dtype=scale_param.dtype),
+                )
+
     for i in range(model_config.num_hidden_layers):
         # map attention weight
         attn = f"model.layers.{i}.self_attn"
-        weight_names = ["weight", "bias"] if model_config.attention_bias else ["weight"]
-        for weight_type in weight_names:
-            mlc_name = f"{attn}.c_attn.{weight_type}"
+        add_weight_and_scale_mapping(
+            f"{attn}.c_attn.weight",
+            [
+                f"{attn}.q_proj.weight",
+                f"{attn}.k_proj.weight",
+                f"{attn}.v_proj.weight",
+            ],
+            lambda q, k, v, dtype: np.concatenate([q, k, v], axis=0).astype(dtype),
+        )
+        if model_config.attention_bias:
+            mlc_name = f"{attn}.c_attn.bias"
             mlc_param = named_parameters[mlc_name]
             mapping.add_mapping(
                 mlc_name,
                 [
-                    f"{attn}.q_proj.{weight_type}",
-                    f"{attn}.k_proj.{weight_type}",
-                    f"{attn}.v_proj.{weight_type}",
+                    f"{attn}.q_proj.bias",
+                    f"{attn}.k_proj.bias",
+                    f"{attn}.v_proj.bias",
                 ],
                 functools.partial(
                     lambda q, k, v, dtype: np.concatenate([q, k, v], axis=0).astype(dtype),
@@ -62,18 +113,13 @@ def huggingface(model_config: Qwen3Config, quantization: Quantization) -> Extern
             )
         # map mlp weight
         mlp = f"model.layers.{i}.mlp"
-        mlc_name = f"{mlp}.gate_up_proj.weight"
-        mlc_param = named_parameters[mlc_name]
-        mapping.add_mapping(
-            mlc_name,
+        add_weight_and_scale_mapping(
+            f"{mlp}.gate_up_proj.weight",
             [
                 f"{mlp}.gate_proj.weight",
                 f"{mlp}.up_proj.weight",
            ],
-            functools.partial(
-                lambda gate, up, dtype: np.concatenate([gate, up], axis=0).astype(dtype),
-                dtype=mlc_param.dtype,
-            ),
+            lambda gate, up, dtype: np.concatenate([gate, up], axis=0).astype(dtype),
         )
 
     for mlc_name, mlc_param in named_parameters.items():
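When BlockScaleQuantize is active, add_weight_and_scale_mapping registers two external mappings per fused parameter: one for the FP8 weight and one for its `_scale_inv` block-scale tensor, both reusing the same transform. A self-contained NumPy sketch of the c_attn case (shapes are illustrative; float16 stands in for the FP8 storage dtype, and 128x128 blocks are assumed):

import numpy as np

def fuse_qkv(q, k, v, dtype):
    # Same transform the loader registers: concatenate q/k/v along the output dimension.
    return np.concatenate([q, k, v], axis=0).astype(dtype)

hidden, q_out, kv_out, block = 1024, 1024, 256, 128
q_w = np.zeros((q_out, hidden), dtype="float16")   # q_proj.weight (FP8 in the real checkpoint)
k_w = np.zeros((kv_out, hidden), dtype="float16")  # k_proj.weight
v_w = np.zeros((kv_out, hidden), dtype="float16")  # v_proj.weight
# Matching block-scale tensors: one float32 scale per 128x128 weight block.
q_s = np.ones((q_out // block, hidden // block), dtype="float32")
k_s = np.ones((kv_out // block, hidden // block), dtype="float32")
v_s = np.ones((kv_out // block, hidden // block), dtype="float32")

c_attn_weight = fuse_qkv(q_w, k_w, v_w, dtype="float16")       # (1536, 1024)
c_attn_scale_inv = fuse_qkv(q_s, k_s, v_s, dtype="float32")    # (12, 8)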

python/mlc_llm/model/qwen3/qwen3_model.py

Lines changed: 27 additions & 1 deletion
@@ -4,7 +4,7 @@
 
 import dataclasses
 from functools import partial
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Tuple
 
 from tvm import te, tir
 from tvm.relax.frontend import nn
@@ -41,9 +41,34 @@ class Qwen3Config(ConfigBase):  # pylint: disable=too-many-instance-attributes
     head_dim: int = 0
     dtype: str = "float32"
     max_batch_size: int = 1
+    weight_block_size: Optional[Tuple[int, int]] = None
     kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)
 
     def __post_init__(self):
+        if "quantization_config" in self.kwargs:
+            quantization_config = self.kwargs.get("quantization_config")
+            if (
+                isinstance(quantization_config, dict)
+                and quantization_config.get("activation_scheme", "") == "dynamic"
+                and quantization_config.get("fmt", "") == "e4m3"
+                and quantization_config.get("quant_method", "") == "fp8"
+                and "weight_block_size" in quantization_config
+            ):
+                self.weight_block_size = quantization_config.get("weight_block_size")
+                if (
+                    not isinstance(self.weight_block_size, (tuple, list))
+                    or len(self.weight_block_size) != 2
+                ):
+                    raise ValueError(
+                        "Invalid DeepSeek model quantization config: "
+                        "weight_block_size must be a tuple of two integers, "
+                        f"got {self.weight_block_size} of type {type(self.weight_block_size)}"
+                    )
+            else:
+                raise ValueError(
+                    "Invalid DeepSeek model quantization config: unrecognized quantization config: "
+                    f"{quantization_config}"
+                )
         if self.context_window_size == 0:
             for name in ["max_position_embeddings", "max_sequence_length"]:
                 if name in self.kwargs:
@@ -247,6 +272,7 @@ def __init__(self, config: Qwen3Config):
         self.vocab_size = config.vocab_size
         self.tensor_parallel_shards = config.tensor_parallel_shards
         self.head_dim = config.head_dim
+        self.weight_block_size = config.weight_block_size
 
     def to(self, dtype: Optional[str] = None):
         super().to(dtype=dtype)
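The new __post_init__ logic (evidently adapted from the DeepSeek config handling, hence the error-message wording) recognizes exactly one quantization_config shape in the HuggingFace config.json and copies its weight_block_size onto the config; any other quantization_config raises a ValueError. An illustrative fragment that would be accepted (field names come from the check above; the values here are examples, not taken from the PR):

# config.json excerpt for an FP8 block-quantized Qwen3 checkpoint (illustrative values)
quantization_config = {
    "activation_scheme": "dynamic",
    "fmt": "e4m3",
    "quant_method": "fp8",
    "weight_block_size": [128, 128],  # must be a 2-element list/tuple, one entry per weight axis
}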

python/mlc_llm/model/qwen3/qwen3_quantization.py

Lines changed: 18 additions & 1 deletion
@@ -6,7 +6,12 @@
 from tvm.relax.frontend import nn
 
 from mlc_llm.loader import QuantizeMapping
-from mlc_llm.quantization import FTQuantize, GroupQuantize, NoQuantize
+from mlc_llm.quantization import (
+    BlockScaleQuantize,
+    FTQuantize,
+    GroupQuantize,
+    NoQuantize,
+)
 
 from .qwen3_model import Qwen3Config, Qwen3LMHeadModel
 
@@ -53,3 +58,15 @@ def no_quant(
     model.to(quantization.model_dtype)
     quant_map = QuantizeMapping({}, {})
     return model, quant_map
+
+
+def block_scale_quant(
+    model_config: Qwen3Config,
+    quantization: BlockScaleQuantize,
+) -> Tuple[nn.Module, QuantizeMapping]:
+    """Quantize a Qwen3 model using block-scale quantization."""
+    model: nn.Module = Qwen3LMHeadModel(model_config)
+    model.to(quantization.model_dtype)
+    quant_map = QuantizeMapping({}, {})
+    model = quantization.quantize_model(model, quant_map, "")
+    return model, quant_map
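For reference, block-scale FP8 quantization keeps the weights in e4m3 and stores one float32 scale per weight block; these are the `_scale_inv` tensors the loaders above map. A minimal NumPy sketch of the corresponding dequantization, assuming [128, 128] blocks and applying the stored scale multiplicatively as in common FP8 block-quant checkpoints (float16 stands in for e4m3, since NumPy has no FP8 dtype):

import numpy as np

def block_scale_dequantize(w_q, scale_inv, block=(128, 128)):
    # Broadcast each per-block scale over its 128x128 tile, then rescale the weight.
    rows = np.repeat(scale_inv, block[0], axis=0)[: w_q.shape[0]]
    tiles = np.repeat(rows, block[1], axis=1)[:, : w_q.shape[1]]
    return w_q.astype("float32") * tiles

w_q = np.random.randn(256, 384).astype("float16")    # stand-in for the e4m3 weight
scale_inv = np.random.rand(2, 3).astype("float32")   # one scale per 128x128 block
w = block_scale_dequantize(w_q, scale_inv)            # (256, 384) float32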

python/mlc_llm/model/qwen3_moe/qwen3_moe_loader.py

Lines changed: 66 additions & 24 deletions
@@ -4,11 +4,12 @@
 """
 
 import functools
+from typing import Callable, List
 
 import numpy as np
 
-from mlc_llm.loader import ExternMapping
-from mlc_llm.quantization import Quantization
+from mlc_llm.loader import ExternMapping, QuantizeMapping
+from mlc_llm.quantization import BlockScaleQuantize, Quantization
 
 from .qwen3_moe_model import Qwen3MoeConfig, Qwen3MoeForCausalLM
 
@@ -33,6 +34,15 @@ def huggingface(model_config: Qwen3MoeConfig, quantization: Quantization) -> Ext
     model = Qwen3MoeForCausalLM(model_config)
     if quantization is not None:
         model.to(quantization.model_dtype)
+        if isinstance(quantization, BlockScaleQuantize):
+            # Convert the model to block-scale quantized model before loading parameters
+            model = quantization.quantize_model(model, QuantizeMapping({}, {}), "")
+            if model_config.weight_block_size is None:
+                raise ValueError(
+                    "The input Qwen3 model is not fp8 block quantized. "
+                    "Thus BlockScaleQuantize is not supported."
+                )
+
     _, _named_params, _ = model.export_tvm(  # type: ignore[misc]
         spec=model.get_default_spec(),
         allow_extern=True,
@@ -41,19 +51,60 @@ def huggingface(model_config: Qwen3MoeConfig, quantization: Quantization) -> Ext
 
     mapping = ExternMapping()
 
+    if (
+        not isinstance(quantization, BlockScaleQuantize)
+        and model_config.weight_block_size is not None
+    ):
+        raise ValueError(
+            "The input Qwen3 model is fp8 block quantized. "
+            "Please use BlockScaleQuantize for the model."
+        )
+
+    # Helper function to add both weight and scale mappings
+    def add_weight_and_scale_mapping(
+        weight_mlc_name: str,
+        weight_hf_names: List[str],
+        weight_transform_func: Callable,
+    ):
+        mlc_param = named_parameters[weight_mlc_name]
+        mapping.add_mapping(
+            weight_mlc_name,
+            weight_hf_names,
+            functools.partial(weight_transform_func, dtype=mlc_param.dtype),
+        )
+
+        if isinstance(quantization, BlockScaleQuantize):
+            scale_mlc_name = f"{weight_mlc_name}_scale_inv"
+            if scale_mlc_name in named_parameters:
+                scale_hf_names = [f"{name}_scale_inv" for name in weight_hf_names]
+                scale_param = named_parameters[scale_mlc_name]
+                mapping.add_mapping(
+                    scale_mlc_name,
+                    scale_hf_names,
+                    functools.partial(weight_transform_func, dtype=scale_param.dtype),
+                )
+
     for i in range(model_config.num_hidden_layers):
         # map attention weight
         attn = f"model.layers.{i}.self_attn"
-        weight_names = ["weight", "bias"] if model_config.attention_bias else ["weight"]
-        for weight_type in weight_names:
-            mlc_name = f"{attn}.c_attn.{weight_type}"
+        add_weight_and_scale_mapping(
+            f"{attn}.c_attn.weight",
+            [
+                f"{attn}.q_proj.weight",
+                f"{attn}.k_proj.weight",
+                f"{attn}.v_proj.weight",
+            ],
+            lambda q, k, v, dtype: np.concatenate([q, k, v], axis=0).astype(dtype),
+        )
+        if model_config.attention_bias:
+            mlc_name = f"{attn}.c_attn.bias"
             mlc_param = named_parameters[mlc_name]
             mapping.add_mapping(
                 mlc_name,
                 [
-                    f"{attn}.q_proj.{weight_type}",
-                    f"{attn}.k_proj.{weight_type}",
-                    f"{attn}.v_proj.{weight_type}",
+                    f"{attn}.q_proj.bias",
+                    f"{attn}.k_proj.bias",
+                    f"{attn}.v_proj.bias",
                 ],
                 functools.partial(
                     lambda q, k, v, dtype: np.concatenate([q, k, v], axis=0).astype(dtype),
@@ -62,16 +113,15 @@ def huggingface(model_config: Qwen3MoeConfig, quantization: Quantization) -> Ext
             )
         # map mlp moe gate and up weight
         mlp = f"model.layers.{i}.mlp"
-        mlc_name = f"{mlp}.moe_gate_up_proj.weight"
 
         def combine_expert_gate_up(*hf_params, dtype):
            stack = []
            for i in range(0, len(hf_params), 2):
                stack.append(np.concatenate([hf_params[i], hf_params[i + 1]], axis=0))
            return np.stack(stack, axis=0).astype(dtype)
 
-        mapping.add_mapping(
-            mlc_name,
+        add_weight_and_scale_mapping(
+            f"{mlp}.moe_gate_up_proj.weight",
             functools.reduce(
                 lambda a, b: a + b,
                 [
@@ -82,25 +132,17 @@ def combine_expert_gate_up(*hf_params, dtype):
                     for expert_id in range(model_config.num_experts)
                 ],
             ),
-            functools.partial(
-                combine_expert_gate_up,
-                dtype=mlc_param.dtype,
-            ),
+            combine_expert_gate_up,
         )
 
-        # map mlp moe gate and up weight
-        mlc_name = f"{mlp}.moe_down_proj.weight"
-        mlc_param = named_parameters[mlc_name]
-        mapping.add_mapping(
-            mlc_name,
+        # map mlp moe down projection weight
+        add_weight_and_scale_mapping(
+            f"{mlp}.moe_down_proj.weight",
             [
                 f"{mlp}.experts.{expert_id}.down_proj.weight"
                 for expert_id in range(model_config.num_experts)
            ],
-            functools.partial(
-                lambda *hf_params, dtype: np.stack(hf_params, axis=0).astype(dtype),
-                dtype=mlc_param.dtype,
-            ),
+            lambda *hf_params, dtype: np.stack(hf_params, axis=0).astype(dtype),
        )
 
     for mlc_name, mlc_param in named_parameters.items():
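combine_expert_gate_up receives the per-expert gate and up projection weights interleaved (gate_0, up_0, gate_1, up_1, ...), fuses each pair along the output dimension, and stacks the results into one [num_experts, 2 * ffn, hidden] tensor; with BlockScaleQuantize the same transform is reused for the `_scale_inv` tensors. A quick standalone check with illustrative shapes:

import numpy as np

def combine_expert_gate_up(*hf_params, dtype):
    # Pairs arrive as (gate_0, up_0, gate_1, up_1, ...): fuse each pair, then stack experts.
    stack = []
    for i in range(0, len(hf_params), 2):
        stack.append(np.concatenate([hf_params[i], hf_params[i + 1]], axis=0))
    return np.stack(stack, axis=0).astype(dtype)

num_experts, ffn, hidden = 4, 256, 128
params = []
for _ in range(num_experts):
    params.append(np.zeros((ffn, hidden), dtype="float16"))  # gate_proj.weight
    params.append(np.zeros((ffn, hidden), dtype="float16"))  # up_proj.weight

fused = combine_expert_gate_up(*params, dtype="float16")
print(fused.shape)  # (4, 512, 128) -> [num_experts, 2 * ffn, hidden]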

python/mlc_llm/model/qwen3_moe/qwen3_moe_model.py

Lines changed: 1 addition & 0 deletions
@@ -218,6 +218,7 @@ def __init__(self, config: Qwen3MoeConfig):
         self.vocab_size = config.vocab_size
         self.tensor_parallel_shards = config.tensor_parallel_shards
         self.head_dim = config.head_dim
+        self.weight_block_size = config.weight_block_size
 
     def to(self, dtype: Optional[str] = None):
         super().to(dtype=dtype)
