@@ -326,6 +326,7 @@ def prepare_tensors(self):
                                 gguf.MODEL_TENSOR.TIME_MIX_W2,
                                 gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
                                 gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+                                gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
                                 gguf.MODEL_TENSOR.POSNET_NORM1,
                                 gguf.MODEL_TENSOR.POSNET_NORM2,
                             )
@@ -2562,6 +2563,63 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
             yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))


+@Model.register("PhiMoEForCausalLM")
+class PhiMoeModel(Phi3MiniModel):
+    model_arch = gguf.MODEL_ARCH.PHIMOE
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+        self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
     model_arch = gguf.MODEL_ARCH.PLAMO
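The expert-merging loop above takes the `n_experts` separate 2D weight matrices that the HF checkpoint stores per layer and stacks them into a single 3D tensor per projection, which is the layout the converter writes out for MoE weights. A minimal standalone sketch of that `torch.stack` step, using made-up sizes (the real dimensions come from the model's hparams):

```python
import torch

# hypothetical sizes for illustration only
n_experts, n_ff, n_embd = 16, 6400, 4096

# one 2D weight per expert, as stored in the HF checkpoint
w1_per_expert = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

# stacking along a new leading dim yields the single 3D tensor
# emitted for all experts of this layer
w1_merged = torch.stack(w1_per_expert, dim=0)
assert w1_merged.shape == (n_experts, n_ff, n_embd)
```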
@@ -3259,6 +3317,8 @@ def set_gguf_parameters(self):
         # required by llama.cpp, unused
         self.gguf_writer.add_head_count(0)

+    lerp_weights: dict[int, dict[str, Tensor]] = {}
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)

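The `lerp_weights` dict added above buffers the five per-block `time_mix_lerp_{w,k,v,r,g}` tensors until all of them have been seen; the next hunk then stacks them into one `time_mix_lerp_fused` tensor. A standalone sketch of that fusing step for a single block, with a made-up embedding size:

```python
import torch

n_embd = 2048  # hypothetical embedding size
parts = ["w", "k", "v", "r", "g"]

# stand-ins for the five squeezed 1D lerp tensors of block 0
lerp_weights = {f"blk.0.time_mix_lerp_{i}.weight": torch.randn(n_embd) for i in parts}

# once all five are present, stack them into one fused tensor;
# the unsqueezes reproduce the (5, 1, 1, n_embd) layout built below
fused = torch.stack(
    [lerp_weights[f"blk.0.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in parts],
    dim=0,
).unsqueeze(1)
assert fused.shape == (5, 1, 1, n_embd)
```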
@@ -3274,14 +3334,84 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
             data_torch = data_torch.squeeze()

-        rescale_every_n_layers = self.hparams["rescale_every"]
-        if rescale_every_n_layers > 0:
-            if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
-                data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+        try:
+            rescale_every_n_layers = self.hparams["rescale_every"]
+            if rescale_every_n_layers > 0:
+                if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                    data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+        except KeyError:
+            pass
+
+        # concat time_mix_lerp weights to reduce some cpu overhead
+        # also reduces the number of tensors in the model
+        if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
+            try:
+                self.lerp_weights[bid][new_name] = data_torch
+            except KeyError:
+                self.lerp_weights[bid] = {new_name: data_torch}
+            if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
+                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
+                yield (new_name, data)
+            return

         yield (new_name, data_torch)


+@Model.register("RWKV6Qwen2ForCausalLM")
+class RWKV6Qwen2Model(Rwkv6Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        num_key_value_heads = self.hparams["num_key_value_heads"]
+        hidden_size = self.hparams["hidden_size"]
+        head_size = hidden_size // num_attention_heads
+        rms_norm_eps = self.hparams["rms_norm_eps"]
+        intermediate_size = self.hparams["intermediate_size"]
+        time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # special parameters for time_mixing in RWKV6QWEN2
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_token_shift_count(1)
+        # RWKV6QWEN2 uses grouped key/value heads like GQA
+        self.gguf_writer.add_head_count_kv(num_key_value_heads)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        for new_name, data in super().modify_tensors(data_torch, name, bid):
+            if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
+                data = data.view(5, -1, data.shape[-1])
+                # rwkv6qwen2 uses a different order (rkvwg) instead of the original wkvrg;
+                # permute the projections here to avoid code changes elsewhere
+                data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
+                if "w2" in new_name:
+                    data = data.view(5, -1, data.shape[-1])
+                yield (new_name, data)
+                continue
+            yield (new_name, data)
+
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
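The permutation in `RWKV6Qwen2Model.modify_tensors` reorders the five stacked low-rank projections from the checkpoint's r,k,v,w,g order to the w,k,v,r,g order the existing RWKV6 code path expects: indices 0 and 3 swap, the rest stay put. A standalone sketch with hypothetical dimensions:

```python
import torch

n_embd, extra_dim = 2048, 32  # hypothetical sizes

# time_mix_w1 packs five projections along dim 0,
# in r,k,v,w,g order in the RWKV6Qwen2 checkpoint
w1 = torch.randn(5 * extra_dim, n_embd)

data = w1.view(5, -1, w1.shape[-1])  # (5, extra_dim, n_embd)
# r,k,v,w,g -> w,k,v,r,g: index 3 (w) moves first, index 0 (r) moves fourth
data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0)
data = data.view(-1, w1.shape[-1])   # flatten back to the stored 2D shape
assert data.shape == w1.shape
```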