
Commit ba3fb3b

Cap Mistral's context length at 2k (#495)
Temporary fix to prevent multiple TB of memory allocated just to attention masks
1 parent 19b3bc8 commit ba3fb3b
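For a sense of scale, here is a rough back-of-envelope sketch (an illustration, not TransformerLens's exact allocation): if a dense float32 attention mask/score tensor of shape [n_heads, n_ctx, n_ctx] were materialized for each of Mistral-7B's 32 layers and 32 heads, memory would grow quadratically in n_ctx.

# Rough estimate of dense attention mask/score memory.
# The assumed shape [n_layers, n_heads, n_ctx, n_ctx] and float32 dtype are
# illustrative only; the exact tensors TransformerLens allocates may differ.
def dense_attention_bytes(n_ctx: int, n_heads: int = 32, n_layers: int = 32,
                          bytes_per_element: int = 4) -> int:
    return n_layers * n_heads * n_ctx * n_ctx * bytes_per_element

for n_ctx in (2048, 32768):
    print(f"n_ctx={n_ctx:>6}: ~{dense_attention_bytes(n_ctx) / 2**30:,.0f} GiB")
# n_ctx=  2048: ~16 GiB
# n_ctx= 32768: ~4,096 GiB (about 4 TiB), consistent with the "multiple TB" above

Under these assumptions, a 32,768-token context lands in the terabyte range, while capping n_ctx at 2,048 keeps it to tens of gigabytes; hence the temporary cap below.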

File tree

1 file changed (+1, −1)

transformer_lens/loading_from_pretrained.py

Lines changed: 1 addition & 1 deletion
@@ -815,7 +815,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
 "n_heads": 32,
 "d_mlp": 14336,
 "n_layers": 32,
-"n_ctx": 32768,
+"n_ctx": 2048, # Capped due to memory issues
 "d_vocab": 32000,
 "act_fn": "silu",
 "normalization_type": "RMS",
