Skip to content

Commit 9a9ea25

Browse files
authored
Add support for Llama 3.2 3B in eval/bench pipelines (#1271)
* feat: adds support for Llama 3.2 3B in benchmarks
* chore: add model to prepare.sh
1 parent 39f16f4 commit 9a9ea25

File tree

3 files changed

+11
-1
lines changed

3 files changed

+11
-1
lines changed

scripts/convert_hf_checkpoint.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@ def permute(w, n_head):
8585
else:
8686
state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
8787
merged_result.update(state_dict)
88+
89+
if config.tie_word_embeddings:
90+
merged_result["lm_head.weight"] = merged_result["model.embed_tokens.weight"].clone()
91+
8892
final_result = {}
8993
for key, value in merged_result.items():
9094
if "layers" in key:
@@ -112,7 +116,7 @@ def permute(w, n_head):
112116
del final_result[key.replace("wq", "wv")]
113117
print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
114118
torch.save(final_result, checkpoint_dir / "model.pth")
115-
if 'llama-3-' in model_name.lower() or 'llama-3.1-' in model_name.lower():
119+
if any([x in model_name.lower() for x in ["llama-3-", "llama-3.1-", "llama-3.2-"]]):
116120
if 'llama-3.1-405b' in model_name.lower():
117121
original_dir = checkpoint_dir / "original" / "mp16"
118122
else:

scripts/prepare.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf
22
python scripts/download.py --repo_id meta-llama/Meta-Llama-3-8B
33
python scripts/download.py --repo_id meta-llama/Meta-Llama-3.1-8B
4+
python scripts/download.py --repo_id meta-llama/Llama-3.2-3B
45
python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-chat-hf
56
python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Meta-Llama-3-8B
67
python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Meta-Llama-3.1-8B
8+
python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-3.2-3B

torchao/_models/llama/model.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class ModelArgs:
3535
rope_base: float = 10000
3636
norm_eps: float = 1e-5
3737
use_scaled_rope: bool = False
38+
tie_word_embeddings: bool = False
3839

3940
def __post_init__(self):
4041
if self.n_local_heads == -1:
@@ -79,6 +80,9 @@ def from_name(cls, name: str):
7980
"Llama-3.1-405B": dict(block_size=131072, n_layer=126, n_head=128, n_local_heads=8, dim=16384, intermediate_size=53248, vocab_size=128256, rope_base=500000,
8081
use_scaled_rope=True
8182
),
83+
"Llama-3.2-3B": dict(block_size=131072, n_layer=28, n_head=24, n_local_heads=8, dim=3072, intermediate_size=8192, vocab_size=128256, rope_base=500000,
84+
use_scaled_rope=True, tie_word_embeddings=True
85+
),
8286
}
8387

8488
# this is a model specific variable that controls whether index_put is used for the kv_cache update,

0 commit comments

Comments (0)