1.7B and 4B

jackzhxng · jackzhxng · commit 507dbc05cda4 · 2025-04-28T22:17:46.000-07:00
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
@@ -100,7 +100,9 @@
     "llama3_2",
     "static_llama",
     "qwen2_5",
-    "qwen3",
+    "qwen3-0_6b",
+    "qwen3-1_7b",
+    "qwen3-4b",
     "phi_4_mini",
     "smollm2",
 ]
@@ -109,7 +111,9 @@
     "qwen2_5": "Qwen/Qwen2.5-1.5B",
     "phi_4_mini": "microsoft/Phi-4-mini-instruct",
     "smollm2": "HuggingFaceTB/SmolLM-135M",
-    "qwen3": "Qwen/Qwen3-0.6B",
+    "qwen3-0_6b": "Qwen/Qwen3-0.6B",
+    "qwen3-1_7b": "Qwen/Qwen3-1.7B",
+    "qwen3-4b": "Qwen/Qwen3-4B",
 }
 
 
@@ -546,7 +550,7 @@ def export_llama(args) -> str:
             from executorch.examples.models.qwen2_5 import (  # pyre-ignore[21]
                 convert_weights,
             )
-        elif args.model == "qwen3":
+        elif args.model.startswith("qwen3"):
             from executorch.examples.models.qwen3 import (  # pyre-ignore[21]
                 convert_weights,
             )
diff --git a/examples/models/qwen3/convert_weights.py b/examples/models/qwen3/convert_weights.py
@@ -1,9 +1,11 @@
 import argparse
-from typing import Dict
 
+import json
 import os
-from safetensors import safe_open
+from typing import Dict
+
 import torch
+from safetensors.torch import load_file
 
 from torchtune.models.convert_weights import get_mapped_key
 
@@ -58,13 +60,35 @@ def qwen_3_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.
     return converted_state_dict
 
 
+def load_checkpoint(input_dir: str) -> Dict:
+    index_path = os.path.join(input_dir, "model.safetensors.index.json")
+    if os.path.exists(index_path):
+        # Sharded checkpoint: the index file's "weight_map" maps each weight name to the shard file holding it.
+        with open(index_path, "r") as f:
+            index = json.load(f)
+        weight_map = index["weight_map"]
+        checkpoint_shards = sorted(set(weight_map.values()))  # De-duplicated so each shard file is loaded once.
+
+        # Load every shard up front, keyed by shard file name (all shards are held in memory at once).
+        shard_to_weights = {}
+        for shard in checkpoint_shards:
+            shard_to_weights[shard] = load_file(os.path.join(input_dir, shard))
+
+        # Merge into one flat state dict by looking each weight up in the shard the index assigns it to.
+        merged_state_dict = {}
+        for weight_name, shard in weight_map.items():
+            tensor = shard_to_weights[shard][weight_name]
+            merged_state_dict[weight_name] = tensor
+        return merged_state_dict
+    else:
+        # No index file present: load the single "model.safetensors" file from input_dir.
+        state_dict = load_file(os.path.join(input_dir, "model.safetensors"))
+        return state_dict
+
+
 def convert_weights(input_dir: str, output_file: str) -> None:
     print("Loading checkpoint...")
-    sd = {}
-    with safe_open(os.path.join(input_dir, "model.safetensors"), framework="pt", device="cpu") as f:
-        for key in f.keys():
-            sd[key] = f.get_tensor(key)
-
+    sd = load_checkpoint(input_dir)
     print("Converting checkpoint...")
     sd = qwen_3_tune_to_meta(sd)
     print("Saving checkpoint...")