@@ -556,11 +556,8 @@ def set_gguf_parameters(self):
             logger.info(f"gguf: experts used count = {n_experts_used}")

         if (head_dim := self.hparams.get("head_dim")) is not None:
-            # Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
-            # https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
-            if self.hparams.get("model_type") != "deepseek_v3":
-                self.gguf_writer.add_key_length(head_dim)
-                self.gguf_writer.add_value_length(head_dim)
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)

         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
@@ -1901,9 +1898,7 @@ def set_gguf_parameters(self):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -1985,7 +1980,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
@@ -2321,9 +2317,7 @@ def set_gguf_parameters(self):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -2363,7 +2357,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
@@ -3681,9 +3676,7 @@ def set_gguf_parameters(self):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -5098,9 +5091,7 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

         self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -5990,7 +5981,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
@@ -6102,7 +6094,8 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

         self.gguf_writer.add_rope_dimension_count(rope_dim)
         rope_scaling = self.hparams.get("rope_scaling") or {}
@@ -6134,7 +6127,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
         n_embd = self.hparams["hidden_size"]
-        head_dim = self.hparams.get("head_dim") or n_embd // n_head
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = n_embd // n_head

         output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
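
Outside the diff itself, here is a minimal sketch of the head_dim fallback pattern that every hunk above converges on. The resolve_head_dim helper and the hparams values are hypothetical, chosen only to illustrate the walrus-operator form; they do not come from any real model config.

    # Sketch only: resolve_head_dim and the hparams dicts below are hypothetical.
    def resolve_head_dim(hparams: dict) -> int:
        # Prefer an explicit head_dim; otherwise derive it from the hidden size
        # and the number of attention heads.
        if (head_dim := hparams.get("head_dim")) is None:
            head_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        return head_dim

    assert resolve_head_dim({"head_dim": 96, "hidden_size": 4096, "num_attention_heads": 32}) == 96
    assert resolve_head_dim({"hidden_size": 4096, "num_attention_heads": 32}) == 128

Unlike the removed one-liners built on "or", the "is None" check falls back only when head_dim is absent or null, so an explicit falsy value such as 0 would be kept rather than silently replaced.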