Commit cdc2c80

[Model] Update Gemma3 to support 1b variant (#3178)
This PR updates the Gemma3 weight loader implementation to support the 1b variant.
1 parent a1fa8be commit cdc2c80

File tree

4 files changed: +62 additions, -46 deletions


python/mlc_llm/compiler_pass/dispatch_kv_cache_creation.py

Lines changed: 1 addition & 1 deletion
@@ -179,7 +179,7 @@ def create_flashinfer_paged_kv_cache(
         if (  # pylint: disable=too-many-boolean-expressions
             not self.flashinfer
             or self.target.kind.name != "cuda"
-            or str(kwargs["dtype"]) not in ["float16"]
+            or str(kwargs["dtype"]) not in ["float16", "bfloat16"]
             or (
                 kwargs["rope_mode"] == RopeMode.INLINE
                 and (
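
Note (not part of the diff): the guard above decides whether the FlashInfer paged KV cache is dispatched; with this change, bfloat16 joins float16 as an accepted KV-cache dtype on CUDA targets. A minimal standalone sketch of the same check, with hypothetical argument names standing in for the pass's kwargs:

# Sketch only: mirrors the updated condition, not the actual compiler-pass API.
SUPPORTED_FLASHINFER_KV_DTYPES = ("float16", "bfloat16")  # "bfloat16" is newly allowed

def can_dispatch_flashinfer(flashinfer_enabled: bool, target_kind: str, dtype: str) -> bool:
    # FlashInfer is only used on CUDA targets with a supported half-precision dtype.
    return (
        flashinfer_enabled
        and target_kind == "cuda"
        and dtype in SUPPORTED_FLASHINFER_KV_DTYPES
    )

assert can_dispatch_flashinfer(True, "cuda", "bfloat16")
assert not can_dispatch_flashinfer(True, "cuda", "float32")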

python/mlc_llm/model/gemma3/gemma3_loader.py

Lines changed: 42 additions & 40 deletions
@@ -41,16 +41,18 @@ def huggingface(model_config: Gemma3Config, quantization: Quantization) -> Exter
 
     mapping = ExternMapping()
 
+    mlc_prefix = "language_model."
+    hf_prefix = "language_model." if not model_config.is_text_model else ""
     for i in range(model_config.text_config.num_hidden_layers):
         # Add gates in MLP
-        mlp = f"language_model.model.layers.{i}.mlp"
-        mlc_name = f"{mlp}.gate_up_proj.weight"
+        mlp = f"model.layers.{i}.mlp"
+        mlc_name = f"{mlc_prefix + mlp}.gate_up_proj.weight"
         mlc_param = named_parameters[mlc_name]
         mapping.add_mapping(
             mlc_name,
             [
-                f"{mlp}.gate_proj.weight",
-                f"{mlp}.up_proj.weight",
+                f"{hf_prefix + mlp}.gate_proj.weight",
+                f"{hf_prefix + mlp}.up_proj.weight",
             ],
             functools.partial(
                 lambda gate, up, dtype: np.concatenate([gate, up], axis=0).astype(dtype),
@@ -59,88 +61,88 @@ def huggingface(model_config: Gemma3Config, quantization: Quantization) -> Exter
         )
         # Modify RMS layernorm weights, since Gemma model adds 1 to the weights
        # We add 1 to the weights here for efficiency purpose
-        mlc_name = f"language_model.model.layers.{i}.input_layernorm.weight"
-        mlc_param = named_parameters[mlc_name]
+        mlc_name = f"model.layers.{i}.input_layernorm.weight"
+        mlc_param = named_parameters[mlc_prefix + mlc_name]
         mapping.add_mapping(
-            mlc_name,
-            [mlc_name],
+            mlc_prefix + mlc_name,
+            [hf_prefix + mlc_name],
             functools.partial(
                 lambda x, dtype: (x + 1).astype(dtype),
-                dtype=named_parameters[mlc_name].dtype,
+                dtype=named_parameters[mlc_prefix + mlc_name].dtype,
             ),
         )
 
-        mlc_name = f"language_model.model.layers.{i}.post_attention_layernorm.weight"
-        mlc_param = named_parameters[mlc_name]
+        mlc_name = f"model.layers.{i}.post_attention_layernorm.weight"
+        mlc_param = named_parameters[mlc_prefix + mlc_name]
         mapping.add_mapping(
-            mlc_name,
-            [mlc_name],
+            mlc_prefix + mlc_name,
+            [hf_prefix + mlc_name],
             functools.partial(
                 lambda x, dtype: (x + 1).astype(dtype),
-                dtype=named_parameters[mlc_name].dtype,
+                dtype=named_parameters[mlc_prefix + mlc_name].dtype,
             ),
         )
 
-        mlc_name = f"language_model.model.layers.{i}.pre_feedforward_layernorm.weight"
-        mlc_param = named_parameters[mlc_name]
+        mlc_name = f"model.layers.{i}.pre_feedforward_layernorm.weight"
+        mlc_param = named_parameters[mlc_prefix + mlc_name]
         mapping.add_mapping(
-            mlc_name,
-            [mlc_name],
+            mlc_prefix + mlc_name,
+            [hf_prefix + mlc_name],
             functools.partial(
                 lambda x, dtype: (x + 1).astype(dtype),
-                dtype=named_parameters[mlc_name].dtype,
+                dtype=named_parameters[mlc_prefix + mlc_name].dtype,
             ),
         )
 
-        mlc_name = f"language_model.model.layers.{i}.post_feedforward_layernorm.weight"
-        mlc_param = named_parameters[mlc_name]
+        mlc_name = f"model.layers.{i}.post_feedforward_layernorm.weight"
+        mlc_param = named_parameters[mlc_prefix + mlc_name]
         mapping.add_mapping(
-            mlc_name,
-            [mlc_name],
+            mlc_prefix + mlc_name,
+            [hf_prefix + mlc_name],
             functools.partial(
                 lambda x, dtype: (x + 1).astype(dtype),
-                dtype=named_parameters[mlc_name].dtype,
+                dtype=named_parameters[mlc_prefix + mlc_name].dtype,
             ),
         )
 
-        mlc_name = f"language_model.model.layers.{i}.self_attn.k_norm.weight"
-        mlc_param = named_parameters[mlc_name]
+        mlc_name = f"model.layers.{i}.self_attn.k_norm.weight"
+        mlc_param = named_parameters[mlc_prefix + mlc_name]
         mapping.add_mapping(
-            mlc_name,
-            [mlc_name],
+            mlc_prefix + mlc_name,
+            [hf_prefix + mlc_name],
             functools.partial(
                 lambda x, dtype: (x + 1).astype(dtype),
-                dtype=named_parameters[mlc_name].dtype,
+                dtype=named_parameters[mlc_prefix + mlc_name].dtype,
            ),
        )
 
-        mlc_name = f"language_model.model.layers.{i}.self_attn.q_norm.weight"
-        mlc_param = named_parameters[mlc_name]
+        mlc_name = f"model.layers.{i}.self_attn.q_norm.weight"
+        mlc_param = named_parameters[mlc_prefix + mlc_name]
         mapping.add_mapping(
-            mlc_name,
-            [mlc_name],
+            mlc_prefix + mlc_name,
+            [hf_prefix + mlc_name],
             functools.partial(
                 lambda x, dtype: (x + 1).astype(dtype),
-                dtype=named_parameters[mlc_name].dtype,
+                dtype=named_parameters[mlc_prefix + mlc_name].dtype,
             ),
         )
 
-    mlc_name = "language_model.model.norm.weight"
-    mlc_param = named_parameters[mlc_name]
+    mlc_name = "model.norm.weight"
+    mlc_param = named_parameters[mlc_prefix + mlc_name]
     mapping.add_mapping(
-        mlc_name,
-        [mlc_name],
+        mlc_prefix + mlc_name,
+        [hf_prefix + mlc_name],
         functools.partial(
             lambda x, dtype: (x + 1).astype(dtype),
-            dtype=named_parameters[mlc_name].dtype,
+            dtype=named_parameters[mlc_prefix + mlc_name].dtype,
         ),
     )
 
     for mlc_name, mlc_param in named_parameters.items():
         if mlc_name not in mapping.param_map:
             mapping.add_mapping(
                 mlc_name,
-                [mlc_name],
+                [hf_prefix + mlc_name[len(mlc_prefix) :]],
                 functools.partial(
                     lambda x, dtype: x.astype(dtype),
                     dtype=mlc_param.dtype,
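
Note (not part of the diff): the loader now distinguishes the MLC-side parameter prefix from the HuggingFace-side prefix, so the multimodal checkpoints (which nest the decoder under language_model.) and the text-only 1b checkpoint (which does not) both map correctly. A small self-contained sketch of that name mapping, using only the prefix convention introduced above:

# Illustrative sketch of the prefix handling; not the loader's actual API.
MLC_PREFIX = "language_model."

def hf_source_name(mlc_name: str, is_text_model: bool) -> str:
    # MLC parameter names always carry the "language_model." prefix; the
    # HuggingFace name keeps it only for the multimodal (vision + text) variant.
    hf_prefix = "" if is_text_model else "language_model."
    return hf_prefix + mlc_name[len(MLC_PREFIX):]

print(hf_source_name("language_model.model.norm.weight", is_text_model=True))
# -> model.norm.weight                 (text-only 1b checkpoint)
print(hf_source_name("language_model.model.norm.weight", is_text_model=False))
# -> language_model.model.norm.weight  (multimodal checkpoint)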

python/mlc_llm/model/gemma3/gemma3_model.py

Lines changed: 6 additions & 5 deletions
@@ -95,16 +95,21 @@ def __post_init__(self):
 class Gemma3Config(ConfigBase):  # pylint: disable=too-many-instance-attributes
     """Configuration of the Gemma3 model"""
 
-    text_config: Gemma3TextConfig
+    text_config: Gemma3TextConfig = None
     vocab_size: int = 262_208
     tensor_parallel_shards: int = 1
     max_batch_size: int = 1
     context_window_size: int = -1
     sliding_window_size: int = -1
     prefill_chunk_size: int = -1
+    is_text_model: bool = False
     kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)
 
     def __post_init__(self):
+        if self.text_config is None:
+            self.is_text_model = True
+            self.text_config = Gemma3TextConfig.from_dict(self.kwargs)
+
         text_config_dict: Dict[str, Any]
         if isinstance(self.text_config, Gemma3TextConfig):
             text_config_dict = dataclasses.asdict(self.text_config)
@@ -121,10 +126,6 @@ def __post_init__(self):
             if hasattr(self.text_config, k):
                 setattr(self, k, getattr(self.text_config, k))
 
-        # if getattr(self, "sliding_window_size") <= 0:
-        #     if hasattr(self.text_config, "sliding_window"):
-        #         setattr(self, "sliding_window_size", getattr(self.text_config, "sliding_window"))
-
 
 # pylint: disable=invalid-name,missing-docstring
 
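
Note (not part of the diff): with the new default, a flat text-only config (as shipped with the 1b checkpoint) no longer needs a nested text_config block; __post_init__ rebuilds one from the remaining kwargs and flags the model as text-only. A simplified, self-contained sketch of that fallback using stand-in dataclasses:

# Simplified stand-ins; the real classes are Gemma3Config / Gemma3TextConfig.
import dataclasses
from typing import Any, Dict, Optional

@dataclasses.dataclass
class TextCfg:
    num_hidden_layers: int = 26

@dataclasses.dataclass
class Cfg:
    text_config: Optional[TextCfg] = None
    is_text_model: bool = False
    kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)

    def __post_init__(self):
        # Text-only checkpoints have no nested text_config, so build it from kwargs.
        if self.text_config is None:
            self.is_text_model = True
            self.text_config = TextCfg(**self.kwargs)

print(Cfg(kwargs={"num_hidden_layers": 26}).is_text_model)  # True  (flat, 1b-style config)
print(Cfg(text_config=TextCfg()).is_text_model)             # False (nested, multimodal-style config)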

python/mlc_llm/model/model.py

Lines changed: 13 additions & 0 deletions
@@ -158,6 +158,19 @@ class Model:
             "group-quant": gemma3_quantization.group_quant,
         },
     ),
+    "gemma3_text": Model(
+        name="gemma3_text",
+        model=gemma3_model.Gemma3ForCausalLM,
+        config=gemma3_model.Gemma3Config,
+        source={
+            "huggingface-torch": gemma3_loader.huggingface,
+            "huggingface-safetensor": gemma3_loader.huggingface,
+        },
+        quantize={
+            "no-quant": gemma3_quantization.no_quant,
+            "group-quant": gemma3_quantization.group_quant,
+        },
+    ),
     "gpt2": Model(
         name="gpt2",
         model=gpt2_model.GPT2LMHeadModel,
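
Note (not part of the diff): registering "gemma3_text" means a checkpoint whose config reports that model type resolves to the same Gemma3ForCausalLM implementation and Gemma3Config used by the existing entry. A minimal sketch of this kind of registry dispatch (the registry and values below are illustrative stand-ins, not the real model table):

# Toy registry standing in for the model table this diff extends.
REGISTRY = {
    "gemma3": "Gemma3ForCausalLM (multimodal-style config)",
    "gemma3_text": "Gemma3ForCausalLM (text-only config)",  # entry added by this commit
}

def resolve_model(model_type: str) -> str:
    # Unknown model types are rejected up front instead of failing later in loading.
    if model_type not in REGISTRY:
        raise ValueError(f"Unsupported model_type: {model_type}")
    return REGISTRY[model_type]

print(resolve_model("gemma3_text"))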
