
Commit 5a8961a

convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the class name used by the official implementation, though they don't use it (yet) in the "architectures" field of config.json.
1 parent: 494d075
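For context, the converter chooses a model handler from the architecture name it reads out of the checkpoint's config.json. A minimal sketch of that lookup, with an assumed file layout and a hypothetical helper name (detect_architecture is not part of this commit):

import json
from pathlib import Path

def detect_architecture(model_dir: str) -> str:
    # Read the Hugging Face config and return the first entry of its
    # "architectures" list, i.e. the model class name.
    config = json.loads((Path(model_dir) / "config.json").read_text())
    # Official Mamba checkpoints don't (yet) set "architectures", so the
    # fallback to the implementation's class name is an assumption made
    # here to mirror the intent of this commit.
    return config.get("architectures", ["MambaLMHeadModel"])[0]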

3 files changed: +2 −5 lines

convert-hf-to-gguf.py

Lines changed: 2 additions & 2 deletions
@@ -218,7 +218,7 @@ def from_model_architecture(model_architecture):
             return BertModel
         if model_architecture == "NomicBertModel":
             return NomicBertModel
-        if model_architecture == "MambaForCausalLM":
+        if model_architecture in ("MambaForCausalLM", "MambaLMHeadModel"):
             return MambaModel
         return Model

@@ -279,7 +279,7 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
             return gguf.MODEL_ARCH.BERT
         if arch == "NomicBertModel":
             return gguf.MODEL_ARCH.NOMIC_BERT
-        if arch == "MambaForCausalLM":
+        if arch in ("MambaForCausalLM", "MambaLMHeadModel"):
             return gguf.MODEL_ARCH.MAMBA

         raise NotImplementedError(f'Architecture "{arch}" not supported!')
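A quick sanity check of the patched dispatch, assuming it runs inside convert-hf-to-gguf.py where Model and MambaModel are defined:

# Both spellings of the Mamba class name should now resolve to the same
# handler (names taken from the diff above).
for arch in ("MambaForCausalLM", "MambaLMHeadModel"):
    assert Model.from_model_architecture(arch) is MambaModel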

gguf-py/gguf/constants.py

Lines changed: 0 additions & 1 deletion
@@ -205,7 +205,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.FFN_DOWN_EXP:   "blk.{bid}.ffn_down.{xid}",
     MODEL_TENSOR.FFN_UP_EXP:     "blk.{bid}.ffn_up.{xid}",
     MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
-    # FIXME: NAMES FOR MAMBA ARE NOT FINAL
     MODEL_TENSOR.SSM_IN:         "blk.{bid}.ssm_in",
     MODEL_TENSOR.SSM_CONV1D:     "blk.{bid}.ssm_conv1d",
     MODEL_TENSOR.SSM_X:          "blk.{bid}.ssm_x",
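These entries are string templates keyed by tensor type; the converter expands each one with the block index when naming tensors. A small illustration (template copied from the diff; the loop bound is arbitrary):

# Expand the SSM input-projection template for the first two blocks.
for bid in range(2):
    print("blk.{bid}.ssm_in".format(bid=bid))
# -> blk.0.ssm_in
# -> blk.1.ssm_in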

llama.cpp

Lines changed: 0 additions & 2 deletions
@@ -393,8 +393,6 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
-    // TODO: maybe use longer names?
-    // TODO: can the in_proj and/or the out_proj instead re-use some of the above types?
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
