
Commit 46f9bc4

[Model] Support Phi-4 (#3268)
This PR adds support for Phi-4 using the existing Phi3 architecture. It updates the conversation template and model preset for Phi-4-mini-instruct, adds support for tie_word_embeddings and partial_rotary_factor, and fixes the rotary_dim usage in position_embedding.py, sizing the ext_factors buffer as rotary_dim // 2 instead of head_dim // 2.
1 parent 2ecc5f5 commit 46f9bc4

File tree

6 files changed: +196 −16 lines

python/mlc_llm/conversation_template/phi.py

Lines changed: 17 additions & 0 deletions
@@ -51,3 +51,20 @@
         stop_token_ids=[2, 32000, 32001, 32007],
     )
 )
+
+# Phi-4
+ConvTemplateRegistry.register_conv_template(
+    Conversation(
+        name="phi-4",
+        system_template=f"<|system|>\n{MessagePlaceholders.SYSTEM.value}",
+        system_message="You are a helpful digital assistant. Please provide safe, "
+        "ethical and accurate information to the user.",
+        roles={"user": "<|user|>", "assistant": "<|assistant|>"},
+        seps=["<|end|>\n"],
+        role_content_sep="\n",
+        role_empty_sep="\n",
+        system_prefix_token_ids=[200022],  # <|system|>
+        stop_str=["<|endoftext|>", "<|end|>"],
+        stop_token_ids=[199999, 200020],  # <|endoftext|>, <|end|>
+    )
+)

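For orientation, the fields registered above determine the final prompt layout. The sketch below is a hand-rolled illustration of roughly how they combine for a single user turn; it is not mlc_llm's actual rendering logic (that lives in the Conversation class), and the helper name render_phi4_prompt is hypothetical.

# Rough illustration only: approximate Phi-4 prompt shape implied by the template fields.
SYSTEM_MESSAGE = (
    "You are a helpful digital assistant. Please provide safe, "
    "ethical and accurate information to the user."
)
SEP = "<|end|>\n"  # seps[0]


def render_phi4_prompt(user_message: str) -> str:
    """Assemble a one-turn prompt: system block, user turn, empty assistant header."""
    prompt = f"<|system|>\n{SYSTEM_MESSAGE}{SEP}"  # system_template + system_message
    prompt += f"<|user|>\n{user_message}{SEP}"     # roles["user"] + role_content_sep
    prompt += "<|assistant|>\n"                    # roles["assistant"] + role_empty_sep
    return prompt


print(render_phi4_prompt("What is MLC LLM?"))

Generation then stops on <|end|> or <|endoftext|> (token ids 200020 and 199999), matching the stop_str and stop_token_ids fields above.
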
python/mlc_llm/interface/gen_config.py

Lines changed: 1 addition & 0 deletions
@@ -299,6 +299,7 @@ def gen_config(  # pylint: disable=too-many-locals,too-many-arguments,too-many-b
     "phi-2",
     "phi-3",
     "phi-3-vision",
+    "phi-4",
     "stablelm-2",
     "gemma_instruction",
     "gemma3_instruction",

python/mlc_llm/model/model_preset.py

Lines changed: 142 additions & 0 deletions
@@ -757,6 +757,148 @@
         "vocab_size": 32064,
         "_attn_implementation": "flash_attention_2",
     },
+    "phi-4": {
+        "_name_or_path": "Phi-4-mini-instruct",
+        "architectures": ["Phi3ForCausalLM"],
+        "attention_bias": False,
+        "attention_dropout": 0.0,
+        "auto_map": {
+            "AutoConfig": "configuration_phi3.Phi3Config",
+            "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM",
+            "AutoTokenizer": "Xenova/gpt-4o",
+        },
+        "bos_token_id": 199999,
+        "embd_pdrop": 0.0,
+        "eos_token_id": 199999,
+        "full_attn_mod": 1,
+        "hidden_act": "silu",
+        "hidden_size": 3072,
+        "initializer_range": 0.02,
+        "intermediate_size": 8192,
+        "interpolate_factor": 1,
+        "lm_head_bias": False,
+        "max_position_embeddings": 131072,
+        "mlp_bias": False,
+        "model_type": "phi3",
+        "num_attention_heads": 24,
+        "num_hidden_layers": 32,
+        "num_key_value_heads": 8,
+        "original_max_position_embeddings": 4096,
+        "pad_token_id": 199999,
+        "partial_rotary_factor": 0.75,
+        "resid_pdrop": 0.0,
+        "rms_norm_eps": 1e-05,
+        "rope_scaling": {
+            "long_factor": [
+                1,
+                1.118320672,
+                1.250641126,
+                1.398617824,
+                1.564103225,
+                1.74916897,
+                1.956131817,
+                2.187582649,
+                2.446418898,
+                2.735880826,
+                3.059592084,
+                3.421605075,
+                3.826451687,
+                4.279200023,
+                4.785517845,
+                5.351743533,
+                5.984965424,
+                6.693110555,
+                7.485043894,
+                8.370679318,
+                9.36110372,
+                10.4687158,
+                11.70738129,
+                13.09260651,
+                14.64173252,
+                16.37415215,
+                18.31155283,
+                20.47818807,
+                22.90118105,
+                25.61086418,
+                28.64115884,
+                32.03,
+                32.1,
+                32.13,
+                32.23,
+                32.6,
+                32.61,
+                32.64,
+                32.66,
+                32.7,
+                32.71,
+                32.93,
+                32.97,
+                33.28,
+                33.49,
+                33.5,
+                44.16,
+                47.77,
+            ],
+            "short_factor": [
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+            ],
+            "type": "longrope",
+        },
+        "rope_theta": 10000.0,
+        "sliding_window": 262144,
+        "tie_word_embeddings": True,
+        "torch_dtype": "bfloat16",
+        "transformers_version": "4.45.0",
+        "use_cache": True,
+        "vocab_size": 200064,
+    },
     "qwen": {
         "architectures": ["QWenLMHeadModel"],
         "auto_map": {

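A quick consistency check on the numbers in this preset (plain Python over the values shown above, not mlc_llm code): head_dim follows from hidden_size and num_attention_heads, rotary_dim from partial_rotary_factor, and each longrope factor list needs one entry per rotary frequency.

# Derived quantities for the "phi-4" preset above.
hidden_size = 3072
num_attention_heads = 24
partial_rotary_factor = 0.75

head_dim = hidden_size // num_attention_heads       # 128
rotary_dim = int(head_dim * partial_rotary_factor)  # 96, passed to the paged KV cache

# longrope scales one factor per rotary frequency, i.e. rotary_dim // 2 of them,
# which matches the 48 entries in long_factor and short_factor.
assert head_dim == 128 and rotary_dim == 96 and rotary_dim // 2 == 48
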
python/mlc_llm/model/phi3/phi3_loader.py

Lines changed: 3 additions & 1 deletion
@@ -48,7 +48,9 @@ def _add(mlc_name, hf_name):
             ),
         )
 
-    _add("lm_head.weight", "lm_head.weight")
+    # Skip lm_head.weight if tie_word_embeddings is enabled
+    if not getattr(model_config, "tie_word_embeddings", False):
+        _add("lm_head.weight", "lm_head.weight")
     _add("transformer.norm.weight", "model.norm.weight")
     _add("transformer.embd.weight", "model.embed_tokens.weight")

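The conditional exists because checkpoints with tie_word_embeddings enabled (such as Phi-4-mini-instruct) typically ship without a separate lm_head.weight tensor; the embedding weight stands in for it. The snippet below is a standalone sketch of that idea with hypothetical names, not the real ExternMapping code:

# Hypothetical sketch: why the lm_head mapping must be skipped for tied checkpoints.
def build_mapping(tie_word_embeddings: bool):
    mapping = []
    if not tie_word_embeddings:
        mapping.append(("lm_head.weight", "lm_head.weight"))
    mapping.append(("transformer.norm.weight", "model.norm.weight"))
    mapping.append(("transformer.embd.weight", "model.embed_tokens.weight"))
    return mapping


# A tied checkpoint exposes the embedding and norm weights but no lm_head tensor.
tied_checkpoint_keys = {"model.embed_tokens.weight", "model.norm.weight"}
for _, hf_name in build_mapping(tie_word_embeddings=True):
    assert hf_name in tied_checkpoint_keys  # would fail if lm_head.weight were requested
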
python/mlc_llm/model/phi3/phi3_model.py

Lines changed: 32 additions & 14 deletions
@@ -40,6 +40,8 @@ class Phi3Config(ConfigBase):  # pylint: disable=too-many-instance-attributes
     head_dim: int = 0
     tensor_parallel_shards: int = 1
     max_batch_size: int = 1
+    tie_word_embeddings: bool = False
+    partial_rotary_factor: float = 1.0
     kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)
 
     def __post_init__(self):
@@ -94,6 +96,17 @@ def __post_init__(self):
 # pylint: disable=invalid-name,missing-docstring
 
 
+class Phi3Embedding(nn.Embedding):
+    """The embedding module that can be shared with the final lm_head."""
+
+    def lm_head_forward(self, x: nn.Tensor):
+        """The lm_head forwarding, which transposes the weight and multiplies
+        with the input tensor.
+        """
+        weight = nn.op.permute_dims(self.weight)
+        return nn.op.matmul(x, weight, out_dtype="float32")
+
+
 class Phi3MLP(nn.Module):
     def __init__(self, config: Phi3Config):
         super().__init__()
@@ -195,7 +208,7 @@ def _apply_parallel_residual(self, mlp_out, residual):
 class Phi3Model(nn.Module):
     def __init__(self, config: Phi3Config) -> None:
         super().__init__()
-        self.embd = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.embd = Phi3Embedding(config.vocab_size, config.hidden_size)
         self.h = nn.ModuleList([Phi3ParallelBlock(config) for _ in range(config.num_hidden_layers)])
         self.norm = nn.RMSNorm(config.hidden_size, -1, config.rms_norm_eps, bias=False)
 
@@ -213,7 +226,9 @@ def __init__(self, config: Phi3Config) -> None:
         super().__init__()
 
         self.transformer = Phi3Model(config)
-        self.lm_head = nn.Linear(config.hidden_size, "vocab_size", bias=False)
+        self.tie_word_embeddings = config.tie_word_embeddings
+        if not config.tie_word_embeddings:
+            self.lm_head = nn.Linear(config.hidden_size, "vocab_size", bias=False)
         self.num_hidden_layers = config.num_hidden_layers
         self.num_attention_heads = config.num_attention_heads
         self.num_key_value_heads = config.num_key_value_heads
@@ -226,13 +241,24 @@ def __init__(self, config: Phi3Config) -> None:
             config.rope_scaling["long_factor"] if config.rope_scaling is not None else None
         )
         self.tensor_parallel_shards = config.tensor_parallel_shards
+        self.partial_rotary_factor = config.partial_rotary_factor
         self.dtype = "float32"
 
     def to(self, dtype: Optional[str] = None):
         super().to(dtype=dtype)
         if dtype is not None:
             self.dtype = dtype
 
+    def get_logits(self, hidden_states: Tensor):
+        op_ext.configure()
+        if self.tie_word_embeddings:
+            logits = self.transformer.embd.lm_head_forward(hidden_states)
+        else:
+            logits = self.lm_head(hidden_states)
+        if logits.dtype != "float32":
+            logits = logits.astype("float32")
+        return logits
+
     def batch_forward(
         self,
         input_embeds: Tensor,
@@ -244,10 +270,7 @@ def batch_forward(
         hidden_states = self.transformer(input_embeds, paged_kv_cache)
         if logit_positions is not None:
             hidden_states = op.take(hidden_states, logit_positions, axis=1)
-        lm_logits = self.lm_head(hidden_states)
-        if lm_logits.dtype != "float32":
-            lm_logits = lm_logits.astype("float32")
-        return lm_logits
+        return self.get_logits(hidden_states)
 
     def prefill(self, input_embed: Tensor, paged_kv_cache: PagedKVCache):
         op_ext.configure()
@@ -258,20 +281,14 @@ def _index(x: te.Tensor):
 
         hidden_states = self.transformer(input_embed, paged_kv_cache)
         hidden_states = op.tensor_expr_op(_index, name_hint="index", args=[hidden_states])
-        logits = self.lm_head(hidden_states)
-
-        if logits.dtype != "float32":
-            logits = logits.astype("float32")
-
+        logits = self.get_logits(hidden_states)
         return logits, paged_kv_cache
 
     def decode(self, input_embed: Tensor, paged_kv_cache: PagedKVCache):
         op_ext.configure()
 
         hidden_states = self.transformer(input_embed, paged_kv_cache)
-        logits = self.lm_head(hidden_states)
-        if logits.dtype != "float32":
-            logits = logits.astype("float32")
+        logits = self.get_logits(hidden_states)
         return logits, paged_kv_cache
 
     def batch_prefill(
@@ -321,6 +338,7 @@ def create_paged_kv_cache(  # pylint: disable=too-many-arguments
             rope_scale=1,
            rope_theta=self.rope_theta,
             rope_ext_factors=self.rope_ext_factors,
+            rotary_dim=int(self.head_dim * self.partial_rotary_factor),
             dtype=self.dtype,
         )

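For intuition on the tied-embedding path: Phi3Embedding.lm_head_forward transposes the (vocab_size, hidden_size) embedding table and multiplies the hidden states against it, producing float32 logits. Below is a standalone NumPy sketch of the same arithmetic (toy shapes, illustrative only; the model code goes through TVM's nn.op):

import numpy as np

# NumPy equivalent of the tied lm_head: reuse the embedding table as the output projection.
# Toy shapes here; Phi-4-mini-instruct uses vocab_size=200064, hidden_size=3072.
vocab_size, hidden_size, seq_len = 16, 8, 4
embedding_weight = np.random.randn(vocab_size, hidden_size).astype(np.float16)
hidden_states = np.random.randn(1, seq_len, hidden_size).astype(np.float16)

# permute_dims(weight) -> (hidden_size, vocab_size); accumulate in float32,
# mirroring out_dtype="float32" and the astype("float32") guard in get_logits.
logits = hidden_states.astype(np.float32) @ embedding_weight.T.astype(np.float32)
assert logits.shape == (1, seq_len, vocab_size)
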
python/mlc_llm/op/position_embedding.py

Lines changed: 1 addition & 1 deletion
@@ -468,7 +468,7 @@ def fused_rope_longrope_scaling(  # pylint: disable=too-many-locals
         var_q: T.handle,
         var_k: T.handle,
         var_v: T.handle,
-        ext_factors: T.Buffer((head_dim // 2,), "float32"),  # type: ignore
+        ext_factors: T.Buffer((rotary_dim // 2,), "float32"),  # type: ignore
     ):
         T.func_attr(
             {

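The buffer size matters because, with partial_rotary_factor < 1, only rotary_dim of the head_dim channels receive rotary embeddings, so there are rotary_dim // 2 frequencies to scale and ext_factors must have exactly that many entries. Below is a schematic NumPy sketch of longrope-style frequency scaling under the common formulation where each factor divides one inverse frequency; it is an illustration, not the TIR kernel above.

import numpy as np

# Schematic longrope frequency scaling (illustrative, not the TIR implementation).
head_dim, partial_rotary_factor, rope_theta = 128, 0.75, 10000.0
rotary_dim = int(head_dim * partial_rotary_factor)        # 96 for Phi-4-mini

ext_factors = np.ones(rotary_dim // 2, dtype=np.float32)  # 48 entries, e.g. long_factor
base_inv_freq = rope_theta ** (-np.arange(0, rotary_dim, 2, dtype=np.float32) / rotary_dim)
scaled_inv_freq = base_inv_freq / ext_factors             # one scale factor per frequency

# Only the first rotary_dim channels of each head are rotated; the rest pass through.
assert scaled_inv_freq.shape == (rotary_dim // 2,)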