
Commit 7b86bf1

Add Qwen3 0.6B, 1.7B, and 4B (#10539)
Add ExecuTorch support for Qwen3 0.6B, 1.7B, and 4B.

### Qwen3 0.6B

Export with xnnpack + 8da4w quantization:
```
python -m examples.models.llama.export_llama --model qwen3-0_6b --params examples/models/qwen3/0_6b_config.json -kv --use_sdpa_with_kv_cache -X --xnnpack-extended-ops -d fp32 --output_name="qwen3-0_6b_x_8da4w.pte" --verbose -qmode 8da4w
```

Run with pybindings:
```
python -m examples.models.llama.runner.native --model qwen3-0_6b --pte qwen3-0_6b_x_8da4w.pte --tokenizer ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json --tokenizer_config ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer_config.json --prompt "Who is the president of the US?" --params examples/models/qwen3/0_6b_config.json --max_len 128 -kv --temperature 0.6

>> Okay, let's see. The user is asking about the president of the US, but they wrote "And who is the president of the US?" and "And who is the president of the US?" So maybe they are using the same question but in a different way. They might be referring to the same president. Let me check.
...

# Some rough stats
Prefill time: 0.24 s
Generation tok/s: 17.15
Memory: 826.68 MB
```

### Qwen3 1.7B

Export with xnnpack + 8da4w quantization:
```
python -m examples.models.llama.export_llama --model qwen3-1_7b --params examples/models/qwen3/1_7b_config.json -kv --use_sdpa_with_kv_cache -X --xnnpack-extended-ops -d fp32 --output_name="qwen3-1_7b_x_8da4w.pte" --verbose -qmode 8da4w
```

Run with pybindings:
```
python -m examples.models.llama.runner.native --model qwen3-1_7b --pte qwen3-1_7b_x_8da4w.pte --tokenizer ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json --tokenizer_config ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer_config.json --prompt "Who is the president of the US?" --params examples/models/qwen3/1_7b_config.json --max_len 128 -kv --temperature 0.6

# Some rough stats
Prefill time: 0.25 s
Generation tok/s: 16.87
Memory: 1.02 GB
```

### Qwen3 4B

Export with xnnpack + 8da4w quantization:
```
python -m examples.models.llama.export_llama --model qwen3-4b --params examples/models/qwen3/4b_config.json -kv --use_sdpa_with_kv_cache -X --xnnpack-extended-ops -d fp32 --output_name="qwen3-4b_x_8da4w.pte" --verbose -qmode 8da4w
```

Run with pybindings:
```
python -m examples.models.llama.runner.native --model qwen3-4b --pte qwen3-4b_x_8da4w.pte --tokenizer ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json --tokenizer_config ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer_config.json --prompt "Who is the president of the US?" --params examples/models/qwen3/4b_config.json --max_len 128 -kv --temperature 0.6

# Some rough stats
Prefill time: 0.44 s
Generation tok/s: 12.12
Memory: 2.5 GB
```

bypass-github-export-checks
1 parent 32dffbc commit 7b86bf1

10 files changed: +285 −2 lines

README.md (+1 −1)

@@ -51,7 +51,7 @@ To get started you can:
 
 - Visit the [Step by Step Tutorial](https://pytorch.org/executorch/stable/getting-started.html) to get things running locally and deploy a model to a device
 - Use this [Colab Notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) to start playing around right away
-- Jump straight into LLM use cases by following specific instructions for [Llama](examples/models/llama/README.md) and [Llava](examples/models/llava/README.md)
+- Jump straight into LLM use cases by following specific instructions for popular open-source models such as [Llama](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), and [Llava](examples/models/llava/README.md)
 
 ## Feedback and Engagement

examples/models/llama/attention.py (+6 −1)

@@ -178,6 +178,7 @@ def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
         self.dim = args.dim
         self.attention_qkv_bias = args.attention_qkv_bias
         self.use_qk_norm = args.use_qk_norm
+        self.qk_norm_before_rope = args.qk_norm_before_rope
 
         if self.use_qk_norm:
             q_norm_dim = self.head_dim
@@ -243,14 +244,18 @@ def forward(
         k = k.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
         v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
 
+        if self.use_qk_norm and self.qk_norm_before_rope:
+            q = self.q_norm_fn(q)
+            k = self.k_norm_fn(k)
+
         # RoPE relative positional embeddings
         q, k = self.rope.forward(q, k, freqs_cos, freqs_sin)
 
         q = q.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)
 
-        if self.use_qk_norm:
+        if self.use_qk_norm and not self.qk_norm_before_rope:
             q = self.q_norm_fn(q)
             k = self.k_norm_fn(k)
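The change above makes the position of the query/key norm configurable: Qwen 3 normalizes q and k per head before the rotary embedding, whereas the previous behavior normalized them after. A minimal standalone sketch of that ordering is shown below; it is illustrative only (the real module lives in examples/models/llama/attention.py with its own norm and RoPE implementations), and it uses torch.nn.RMSNorm (PyTorch 2.4+) as a stand-in for the example's norm function.

```python
import torch.nn as nn


class QKNormOrderingSketch(nn.Module):
    """Toy module showing where the q/k RMSNorm sits relative to RoPE."""

    def __init__(self, head_dim: int, use_qk_norm: bool, qk_norm_before_rope: bool):
        super().__init__()
        self.use_qk_norm = use_qk_norm
        self.qk_norm_before_rope = qk_norm_before_rope
        # Per-head RMSNorm over the last (head_dim) dimension.
        self.q_norm_fn = nn.RMSNorm(head_dim, eps=1e-6)
        self.k_norm_fn = nn.RMSNorm(head_dim, eps=1e-6)

    def forward(self, q, k, apply_rope):
        # q, k: (bsz, seqlen, n_heads, head_dim); apply_rope is any callable
        # that applies rotary position embeddings to (q, k).
        if self.use_qk_norm and self.qk_norm_before_rope:
            q, k = self.q_norm_fn(q), self.k_norm_fn(k)  # Qwen 3 path: norm before RoPE
        q, k = apply_rope(q, k)
        if self.use_qk_norm and not self.qk_norm_before_rope:
            q, k = self.q_norm_fn(q), self.k_norm_fn(k)  # previous path: norm after RoPE
        return q, k
```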

examples/models/llama/export_llama_lib.py (+10)

@@ -100,6 +100,9 @@
     "llama3_2",
     "static_llama",
     "qwen2_5",
+    "qwen3-0_6b",
+    "qwen3-1_7b",
+    "qwen3-4b",
     "phi_4_mini",
     "smollm2",
 ]
@@ -108,6 +111,9 @@
     "qwen2_5": "Qwen/Qwen2.5-1.5B",
     "phi_4_mini": "microsoft/Phi-4-mini-instruct",
     "smollm2": "HuggingFaceTB/SmolLM-135M",
+    "qwen3-0_6b": "Qwen/Qwen3-0.6B",
+    "qwen3-1_7b": "Qwen/Qwen3-1.7B",
+    "qwen3-4b": "Qwen/Qwen3-4B",
 }
 
 
@@ -544,6 +550,10 @@ def export_llama(args) -> str:
            from executorch.examples.models.qwen2_5 import (  # pyre-ignore[21]
                convert_weights,
            )
+        elif args.model.startswith("qwen3"):
+            from executorch.examples.models.qwen3 import (  # pyre-ignore[21]
+                convert_weights,
+            )
         elif args.model == "phi_4_mini":
            from executorch.examples.models.phi_4_mini import (  # pyre-ignore[21]
                convert_weights,

examples/models/llama/model_args.py (+1)

@@ -38,6 +38,7 @@ class ModelArgs:
     apply_embedding: bool = True  # Use embedding inside the transformer
     apply_output: bool = True  # Use output layer (unembedding) inside the transformer
     use_qk_norm: bool = False  # apply normalization to q and k in the attention
+    qk_norm_before_rope: bool = False  # when True, apply the qk norm before RoPE instead of after
     use_hf_rope: bool = False  # Use HuggingFace's RoPE implementation
     partial_rotary_factor: float = 1.0
     rope_theta: Optional[float] = (

examples/models/qwen3/0_6b_config.json (new file, +17)

{
  "dim": 1024,
  "ffn_dim_multiplier": 1,
  "hidden_dim": 3072,
  "n_heads": 16,
  "head_dim": 128,
  "n_kv_heads": 8,
  "n_layers": 28,
  "norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "use_scaled_rope": false,
  "vocab_size": 151936,
  "use_hf_rope": true,
  "attention_qkv_bias": false,
  "use_qk_norm": true,
  "qk_norm_before_rope": true
}

examples/models/qwen3/1_7b_config.json (new file, +17)

{
  "dim": 2048,
  "ffn_dim_multiplier": 1,
  "hidden_dim": 6144,
  "n_heads": 16,
  "head_dim": 128,
  "n_kv_heads": 8,
  "n_layers": 28,
  "norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "use_scaled_rope": false,
  "vocab_size": 151936,
  "use_hf_rope": true,
  "attention_qkv_bias": false,
  "use_qk_norm": true,
  "qk_norm_before_rope": true
}

examples/models/qwen3/4b_config.json (new file, +17)

{
  "dim": 2560,
  "ffn_dim_multiplier": 1,
  "hidden_dim": 9728,
  "n_heads": 32,
  "head_dim": 128,
  "n_kv_heads": 8,
  "n_layers": 36,
  "norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "use_scaled_rope": false,
  "vocab_size": 151936,
  "use_hf_rope": true,
  "attention_qkv_bias": false,
  "use_qk_norm": true,
  "qk_norm_before_rope": true
}
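For orientation, the params files above broadly mirror the fields of each model's Hugging Face config.json. The mapping below is an assumption added for illustration (it is not part of this commit); verify against the actual config.json of the model you are exporting.

```python
# Assumed correspondence between Hugging Face config.json fields (keys) and the
# params JSON fields consumed by export_llama (values).
HF_TO_EXPORT_PARAMS = {
    "hidden_size": "dim",
    "intermediate_size": "hidden_dim",
    "num_attention_heads": "n_heads",
    "head_dim": "head_dim",
    "num_key_value_heads": "n_kv_heads",
    "num_hidden_layers": "n_layers",
    "rms_norm_eps": "norm_eps",
    "rope_theta": "rope_theta",
    "vocab_size": "vocab_size",
}
```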

examples/models/qwen3/README.md (new file, +85)

## Summary
Qwen 3 is the latest iteration of the Qwen series of large language models (LLMs) developed by Alibaba. The edge-sized Qwen 3 variants (0.6B, 1.7B, and 4B) are currently supported.

## Instructions

Qwen 3 uses the same example code as our optimized Llama model, while the checkpoint, model params, and tokenizer are different. Please see the [Llama README page](../llama/README.md) for details.

All commands for exporting and running Llama on various backends should also be applicable to Qwen 3 by swapping out the following args:
```
--model [qwen3-0_6b,qwen3-1_7b,qwen3-4b]
--params [examples/models/qwen3/0_6b_config.json,examples/models/qwen3/1_7b_config.json,examples/models/qwen3/4b_config.json]
```

### Example export
Here is a basic example for exporting Qwen 3; please refer to the Llama README's [Step 2: Prepare model](../llama/README.md#step-2-prepare-model) for more advanced usage.

Export 0.6B to XNNPack, quantized with 8da4w:
```
python -m examples.models.llama.export_llama \
  --model qwen3-0_6b \
  --params examples/models/qwen3/0_6b_config.json \
  -kv \
  --use_sdpa_with_kv_cache \
  -d fp32 \
  -X \
  --xnnpack-extended-ops \
  -qmode 8da4w \
  --output_name="qwen3-0_6b.pte" \
  --verbose
```

Export 1.7B to XNNPack, quantized with 8da4w:
```
python -m examples.models.llama.export_llama \
  --model qwen3-1_7b \
  --params examples/models/qwen3/1_7b_config.json \
  -kv \
  --use_sdpa_with_kv_cache \
  -d fp32 \
  -X \
  --xnnpack-extended-ops \
  -qmode 8da4w \
  --output_name="qwen3-1_7b.pte" \
  --verbose
```

Export 4B to XNNPack, quantized with 8da4w:
```
python -m examples.models.llama.export_llama \
  --model qwen3-4b \
  --params examples/models/qwen3/4b_config.json \
  -kv \
  --use_sdpa_with_kv_cache \
  -d fp32 \
  -X \
  --xnnpack-extended-ops \
  -qmode 8da4w \
  --output_name="qwen3-4b.pte" \
  --verbose
```

### Example run
With ExecuTorch pybindings:
```
python -m examples.models.llama.runner.native \
  --model qwen3-0_6b \
  --pte qwen3-0_6b.pte \
  --tokenizer ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json \
  --tokenizer_config ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer_config.json \
  --prompt "Who is the president of the US?" \
  --params examples/models/qwen3/0_6b_config.json \
  --max_len 128 \
  -kv \
  --temperature 0.6
```

With ExecuTorch's sample C++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner):
```
cmake-out/examples/models/llama/llama_main \
  --model_path qwen3-0_6b.pte \
  --tokenizer_path ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json \
  --prompt="Who is the president of the US?"
```

To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section.
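The run commands above point at tokenizer files inside the Hugging Face cache, using one specific snapshot hash. One way (an assumption added here, not part of this commit) to materialize those files locally and obtain the path for your own machine is huggingface_hub's snapshot_download:

```python
# Download only the tokenizer files for Qwen3-0.6B and print the local snapshot
# directory; pass <path>/tokenizer.json and <path>/tokenizer_config.json to the runner.
from huggingface_hub import snapshot_download

path = snapshot_download(
    "Qwen/Qwen3-0.6B",
    allow_patterns=["tokenizer.json", "tokenizer_config.json"],
)
print(path)
```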

examples/models/qwen3/__init__.py (new file, +16)

# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from executorch.examples.models.llama.model import Llama2Model
from executorch.examples.models.qwen3.convert_weights import convert_weights


class Qwen3Model(Llama2Model):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)


__all__ = [
    "Qwen3Model",
    "convert_weights",
]

examples/models/qwen3/convert_weights.py (new file, +115)

import argparse

import json
import os
from typing import Dict

import torch
from safetensors.torch import load_file

from torchtune.models.convert_weights import get_mapped_key

# Standard _FROM_META weight mapping of Meta weights to TorchTune + additional bias weight mappings.
_QWEN_3_FROM_META = {
    "tok_embeddings.weight": "model.embed_tokens.weight",
    "norm.weight": "model.norm.weight",
    "layers.{}.attention.wk.weight": "model.layers.{}.self_attn.k_proj.weight",
    "layers.{}.attention.k_norm_fn.weight": "model.layers.{}.self_attn.k_norm.weight",
    "layers.{}.attention.wq.weight": "model.layers.{}.self_attn.q_proj.weight",
    "layers.{}.attention.q_norm_fn.weight": "model.layers.{}.self_attn.q_norm.weight",
    "layers.{}.attention.wv.weight": "model.layers.{}.self_attn.v_proj.weight",
    "layers.{}.attention.wo.weight": "model.layers.{}.self_attn.o_proj.weight",
    "layers.{}.attention_norm.weight": "model.layers.{}.input_layernorm.weight",
    "layers.{}.ffn_norm.weight": "model.layers.{}.post_attention_layernorm.weight",
    # Note: gate_proj and up_proj are reversed, usually w1 is the up_proj,
    # w2 is the gate_proj, and activation is applied on the up_proj, but since
    # Qwen3 applies activation on the gate_proj, we just swap the gate_proj
    # and up_proj in the checkpoint itself as a hack.
    "layers.{}.feed_forward.w1.weight": "model.layers.{}.mlp.gate_proj.weight",
    "layers.{}.feed_forward.w2.weight": "model.layers.{}.mlp.down_proj.weight",
    "layers.{}.feed_forward.w3.weight": "model.layers.{}.mlp.up_proj.weight",
}


def qwen_3_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    """
    Convert a state dict from torchtune's format to Meta's format. This function
    doesn't handle any sharding or splitting of state dicts. It follows the
    state_dict IN -> state_dict OUT pattern.

    Args:
        state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format.

    Returns:
        Dict[str, torch.Tensor]: State dict in Meta's format.
    """
    converted_state_dict = {}
    inverted_mapping_dict = {v: k for k, v in _QWEN_3_FROM_META.items()}

    for key, value in state_dict.items():
        # Tied embeddings for 0.6b and 4b models.
        if key == "lm_head.weight":
            continue
        new_key = get_mapped_key(key, inverted_mapping_dict)
        converted_state_dict[new_key] = value

    converted_state_dict["output.weight"] = converted_state_dict[
        "tok_embeddings.weight"
    ]

    return converted_state_dict


def load_checkpoint(input_dir: str) -> Dict:
    index_path = os.path.join(input_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        # Sharded checkpoint.
        with open(index_path, "r") as f:
            index = json.load(f)
        weight_map = index["weight_map"]
        checkpoint_shards = sorted(set(weight_map.values()))

        # Load all the shards into memory
        shard_to_weights = {}
        for shard in checkpoint_shards:
            shard_to_weights[shard] = load_file(os.path.join(input_dir, shard))

        # Merge tensors into consolidated state dict.
        merged_state_dict = {}
        for weight_name, shard in weight_map.items():
            tensor = shard_to_weights[shard][weight_name]
            merged_state_dict[weight_name] = tensor
        return merged_state_dict
    else:
        # Single checkpoint.
        state_dict = load_file(os.path.join(input_dir, "model.safetensors"))
        return state_dict


def convert_weights(input_dir: str, output_file: str) -> None:
    print("Loading checkpoint...")
    sd = load_checkpoint(input_dir)
    print("Converting checkpoint...")
    sd = qwen_3_tune_to_meta(sd)
    print("Saving checkpoint...")
    torch.save(sd, output_file)
    print("Done.")


def main():
    parser = argparse.ArgumentParser(
        description="Convert Qwen3 weights to Meta format."
    )
    parser.add_argument(
        "input_dir",
        type=str,
        help="Path to directory containing checkpoint files",
    )
    parser.add_argument("output", type=str, help="Path to the output checkpoint")

    args = parser.parse_args()
    convert_weights(args.input_dir, args.output)


if __name__ == "__main__":
    main()
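As a usage sketch (an assumed workflow, not part of the diff): given a locally downloaded Qwen3 checkpoint directory, the converter can be called directly from Python, and the resulting Meta-format .pth file is what export_llama then consumes as the model checkpoint. The paths below are placeholders.

```python
# Hypothetical example: convert a downloaded Qwen3-0.6B checkpoint directory
# (containing model.safetensors or a sharded index) into a Meta-format .pth file.
import os

from executorch.examples.models.qwen3 import convert_weights

input_dir = os.path.expanduser(
    "~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/<snapshot>"  # placeholder path
)
convert_weights(input_dir, "qwen3_0_6b_meta.pth")
```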
