Skip to content

Commit f5e96b3

Browse files
authored
model : support LiquidAI LFM2 hybrid family (#14620)
**Important** LFM2 was merged into transformers ([huggingface/transformers#39340](https://github.com/huggingface/transformers/pull/39340)), but has not yet been released. To convert into gguf, install transformers from source: ```shell pip install "transformers @ git+https://github.com/huggingface/transformers.git@main" ```
1 parent 756aa10 commit f5e96b3

14 files changed

+373
-3
lines changed

convert_hf_to_gguf.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,7 @@ def prepare_tensors(self):
300300
gguf.MODEL_TENSOR.POS_EMBD,
301301
gguf.MODEL_TENSOR.TOKEN_TYPES,
302302
gguf.MODEL_TENSOR.SSM_CONV1D,
303+
gguf.MODEL_TENSOR.SHORTCONV_CONV,
303304
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
304305
gguf.MODEL_TENSOR.TIME_MIX_W1,
305306
gguf.MODEL_TENSOR.TIME_MIX_W2,
@@ -836,6 +837,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
836837
if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
837838
# ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
838839
res = "midm-2.0"
840+
if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
841+
# ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
842+
res = "lfm2"
839843

840844
if res is None:
841845
logger.warning("\n")
@@ -7073,6 +7077,50 @@ def set_vocab(self):
70737077
chat_template = tokenizer.chat_template.replace("[:]", "")
70747078
self.gguf_writer.add_chat_template(chat_template)
70757079

7080+
7081+
@ModelBase.register("Lfm2ForCausalLM")
@ModelBase.register("LFM2Model")
class LFM2Model(TextModel):
    """HF -> GGUF converter for the LiquidAI LFM2 hybrid family.

    LFM2 interleaves short-convolution layers with full-attention layers;
    the per-layer type is read from ``hparams["layer_types"]``.
    """
    model_arch = gguf.MODEL_ARCH.LFM2

    def _add_feed_forward_length(self):
        """Compute and write the FFN width, mirroring LFM2's block_ff_dim
        auto-adjustment logic from the reference implementation."""
        # NOTE: previous version read "block_ff_dim" twice; a single read suffices.
        ff_dim = self.hparams["block_ff_dim"]

        auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
        ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
        multiple_of = self.hparams["block_multiple_of"]

        if auto_adjust_ff_dim:
            ff_dim = int(2 * ff_dim / 3)
            # custom dim factor multiplier
            if ffn_dim_multiplier is not None:
                ff_dim = int(ffn_dim_multiplier * ff_dim)
            # round up to the nearest multiple_of
            ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)

        self.gguf_writer.add_feed_forward_length(ff_dim)

    def set_gguf_parameters(self):
        # set num_key_value_heads only for attention layers;
        # shortconv layers get 0 so the runtime can tell the layer kinds apart
        self.hparams["num_key_value_heads"] = [
            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
            for layer_type in self.hparams["layer_types"]
        ]

        super().set_gguf_parameters()
        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
        self._add_feed_forward_length()

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # the ssm_conv op requires a 2d tensor, so drop the singleton middle dim
        if 'conv.conv' in name:
            data_torch = data_torch.squeeze(1)

        return [(self.map_tensor_name(name), data_torch)]
7122+
7123+
70767124
###### CONVERSION LOGIC ######
70777125

70787126

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ class TOKENIZER_TYPE(IntEnum):
130130
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
131131
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
132132
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
133+
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
133134
]
134135

135136
# some models are known to be broken upstream, so we will skip them as exceptions

ggml/src/ggml-cuda/ssm-conv.cu

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,17 +107,25 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
107107
if (nc == 4) {
108108
ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
109109
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
110+
} else if (nc == 3) {
111+
ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
112+
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
110113
} else {
111-
GGML_ABORT("Only support kernel size = 4 now.");
114+
GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
112115
}
113116
} else {
114117
if (nc == 4) {
115118
const int64_t split_n_t = 32;
116119
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
117120
ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
118121
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
122+
} else if (nc == 3) {
123+
const int64_t split_n_t = 32;
124+
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
125+
ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
126+
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
119127
} else {
120-
GGML_ABORT("Only support kernel size = 4 right now.");
128+
GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
121129
}
122130
}
123131
}

gguf-py/gguf/constants.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,9 @@ class ConvNext:
187187
class Classifier:
188188
OUTPUT_LABELS = "{arch}.classifier.output_labels"
189189

190+
class ShortConv:
191+
L_CACHE = "{arch}.shortconv.l_cache"
192+
190193
class Tokenizer:
191194
MODEL = "tokenizer.ggml.model"
192195
PRE = "tokenizer.ggml.pre"
@@ -362,6 +365,7 @@ class MODEL_ARCH(IntEnum):
362365
ERNIE4_5 = auto()
363366
HUNYUAN_MOE = auto()
364367
SMOLLM3 = auto()
368+
LFM2 = auto()
365369

366370

367371
class VISION_PROJECTOR_TYPE(IntEnum):
@@ -533,6 +537,9 @@ class MODEL_TENSOR(IntEnum):
533537
POSNET_ATTN_K = auto()
534538
POSNET_ATTN_V = auto()
535539
POSNET_ATTN_OUT = auto()
540+
SHORTCONV_CONV = auto()
541+
SHORTCONV_INPROJ = auto()
542+
SHORTCONV_OUTPROJ = auto()
536543
# vision
537544
V_MMPROJ = auto()
538545
V_MMPROJ_FC = auto()
@@ -673,6 +680,7 @@ class MODEL_TENSOR(IntEnum):
673680
MODEL_ARCH.FALCON_H1: "falcon-h1",
674681
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
675682
MODEL_ARCH.SMOLLM3: "smollm3",
683+
MODEL_ARCH.LFM2: "lfm2",
676684
}
677685

678686
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -844,6 +852,9 @@ class MODEL_TENSOR(IntEnum):
844852
MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
845853
MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
846854
MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
855+
MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv",
856+
MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj",
857+
MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj",
847858
# vision
848859
MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
849860
MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
@@ -2356,6 +2367,24 @@ class MODEL_TENSOR(IntEnum):
23562367
MODEL_TENSOR.FFN_DOWN,
23572368
MODEL_TENSOR.FFN_UP,
23582369
],
2370+
MODEL_ARCH.LFM2: [
2371+
MODEL_TENSOR.TOKEN_EMBD,
2372+
MODEL_TENSOR.TOKEN_EMBD_NORM,
2373+
MODEL_TENSOR.SHORTCONV_CONV,
2374+
MODEL_TENSOR.SHORTCONV_INPROJ,
2375+
MODEL_TENSOR.SHORTCONV_OUTPROJ,
2376+
MODEL_TENSOR.FFN_GATE,
2377+
MODEL_TENSOR.FFN_DOWN,
2378+
MODEL_TENSOR.FFN_UP,
2379+
MODEL_TENSOR.FFN_NORM,
2380+
MODEL_TENSOR.ATTN_NORM, # operator_norm
2381+
MODEL_TENSOR.ATTN_Q_NORM,
2382+
MODEL_TENSOR.ATTN_K_NORM,
2383+
MODEL_TENSOR.ATTN_Q,
2384+
MODEL_TENSOR.ATTN_K,
2385+
MODEL_TENSOR.ATTN_V,
2386+
MODEL_TENSOR.ATTN_OUT,
2387+
],
23592388
# TODO
23602389
}
23612390

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,9 @@ def add_convnext_embedding_length(self, length: int) -> None:
648648
def add_convnext_block_count(self, length: int) -> None:
649649
self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)
650650

651+
def add_shortconv_l_cache(self, length: int) -> None:
652+
self.add_uint32(Keys.ShortConv.L_CACHE.format(arch=self.arch), length)
653+
651654
def add_block_count(self, length: int) -> None:
652655
self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
653656

gguf-py/gguf/tensor_mapping.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class TensorNameMap:
5050
"model.pre_ln", # rwkv7
5151
"model.layers.0.pre_norm", # rwkv7
5252
"backbone.norm", # wavtokenizer
53+
"model.embedding_norm", # lfm2
5354
),
5455

5556
# Position embeddings
@@ -136,6 +137,7 @@ class TensorNameMap:
136137
"model.layers.{bid}.ln1", # rwkv7
137138
"model.layers.{bid}.input_layernorm", # llama4
138139
"transformer_encoder.{bid}.attention_norm", # neobert
140+
"model.layers.{bid}.operator_norm", # lfm2
139141
),
140142

141143
# Attention norm 2
@@ -220,6 +222,7 @@ class TensorNameMap:
220222
"transformer.h.{bid}.self_attention.dense", # falcon
221223
"h.{bid}.self_attention.dense", # bloom
222224
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe
225+
"model.layers.{bid}.self_attn.out_proj", # lfm2
223226
"model.layers.{bid}.self_attn.linear_attn", # deci
224227
"layers.{bid}.attention.wo", # llama-pth
225228
"encoder.layer.{bid}.attention.output.dense", # bert
@@ -1015,6 +1018,18 @@ class TensorNameMap:
10151018
"backbone.posnet.{bid}.proj_out", # wavtokenizer
10161019
),
10171020

1021+
MODEL_TENSOR.SHORTCONV_CONV: (
1022+
"model.layers.{bid}.conv.conv",
1023+
),
1024+
1025+
MODEL_TENSOR.SHORTCONV_INPROJ: (
1026+
"model.layers.{bid}.conv.in_proj",
1027+
),
1028+
1029+
MODEL_TENSOR.SHORTCONV_OUTPROJ: (
1030+
"model.layers.{bid}.conv.out_proj",
1031+
),
1032+
10181033
#############################################################################
10191034
## Vision encoder
10201035

src/llama-arch.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
8383
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
8484
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
8585
{ LLM_ARCH_SMOLLM3, "smollm3" },
86+
{ LLM_ARCH_LFM2, "lfm2" },
8687
{ LLM_ARCH_UNKNOWN, "(unknown)" },
8788
};
8889

@@ -188,6 +189,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
188189

189190
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
190191

192+
{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
193+
191194
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
192195
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
193196
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -1830,6 +1833,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
18301833
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
18311834
},
18321835
},
1836+
{
1837+
LLM_ARCH_LFM2,
1838+
{
1839+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1840+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1841+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1842+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1843+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1844+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
1845+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
1846+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1847+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1848+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1849+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1850+
{ LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" },
1851+
{ LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
1852+
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
1853+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1854+
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
1855+
}
1856+
},
18331857
{
18341858
LLM_ARCH_UNKNOWN,
18351859
{
@@ -1997,6 +2021,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
19972021
{LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
19982022
{LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
19992023
{LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
2024+
{LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
2025+
{LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
2026+
{LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
20002027
};
20012028

20022029
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -2068,6 +2095,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
20682095
case LLM_ARCH_JAMBA:
20692096
case LLM_ARCH_FALCON_H1:
20702097
case LLM_ARCH_GRANITE_HYBRID:
2098+
case LLM_ARCH_LFM2:
20712099
return true;
20722100
default:
20732101
return false;

src/llama-arch.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ enum llm_arch {
8787
LLM_ARCH_ERNIE4_5,
8888
LLM_ARCH_HUNYUAN_MOE,
8989
LLM_ARCH_SMOLLM3,
90+
LLM_ARCH_LFM2,
9091
LLM_ARCH_UNKNOWN,
9192
};
9293

@@ -227,6 +228,8 @@ enum llm_kv {
227228

228229
LLM_KV_CLASSIFIER_OUTPUT_LABELS,
229230

231+
LLM_KV_SHORTCONV_L_CACHE,
232+
230233
// deprecated:
231234
LLM_KV_TOKENIZER_PREFIX_ID,
232235
LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -396,6 +399,9 @@ enum llm_tensor {
396399
LLM_TENSOR_POS_NET_ATTN_K,
397400
LLM_TENSOR_POS_NET_ATTN_V,
398401
LLM_TENSOR_POS_NET_ATTN_OUT,
402+
LLM_TENSOR_SHORTCONV_CONV,
403+
LLM_TENSOR_SHORTCONV_INPROJ,
404+
LLM_TENSOR_SHORTCONV_OUTPROJ,
399405
};
400406

401407
enum llm_tensor_layer {

src/llama-hparams.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ uint32_t llama_hparams::n_embd_r() const {
7171
return token_shift_count * n_embd;
7272
}
7373

74+
if (n_shortconv_l_cache != 0) {
75+
// for LFM2 models
76+
return n_embd * (n_shortconv_l_cache - 1);
77+
}
78+
7479
// TODO: maybe support other convolution strides than 1
7580
// NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
7681
// Corresponds to Mamba's conv_states size

src/llama-hparams.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ struct llama_hparams {
5555
struct llama_hparams_posnet posnet;
5656
struct llama_hparams_convnext convnext;
5757

58+
uint32_t n_shortconv_l_cache = 0;
59+
5860
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
5961
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
6062
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

0 commit comments

Comments
 (0)