
Commit 13a6f9c

Authored and committed by CISC and qnixsynapse
convert : fix null head_dim AutoConfig regression (ggml-org#14248)
1 parent b8ebb3f commit 13a6f9c

1 file changed (+16, -22 lines)


convert_hf_to_gguf.py

Lines changed: 16 additions & 22 deletions
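All of the hunks below address the same failure mode: a head_dim entry that is present in the hyperparameters but set to null (None), which the previous lookups passed through as if it were a real value. As a rough sketch of the new pattern (the hparams dict and its numbers are invented for illustration, not taken from the commit):

    hparams = {"head_dim": None, "hidden_size": 4096, "num_attention_heads": 32}

    # Assignment expression: rope_dim is bound to the config value when one is
    # given, and recomputed from hidden_size when the entry is missing or null.
    if (rope_dim := hparams.get("head_dim")) is None:
        rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

    print(rope_dim)  # 128 (the replaced lookups would have produced None for this input)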
@@ -556,11 +556,8 @@ def set_gguf_parameters(self):
         logger.info(f"gguf: experts used count = {n_experts_used}")

         if (head_dim := self.hparams.get("head_dim")) is not None:
-            # Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
-            # https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
-            if self.hparams.get("model_type") != "deepseek_v3":
-                self.gguf_writer.add_key_length(head_dim)
-                self.gguf_writer.add_value_length(head_dim)
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)

         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
@@ -1901,9 +1898,7 @@ def set_gguf_parameters(self):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -1985,7 +1980,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
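The line removed in the hunk above relied on dict.get with a computed default, but dict.get only falls back when the key is absent, not when it maps to None. A quick illustration with hypothetical values:

    hparams = {"head_dim": None, "hidden_size": 4096, "num_attention_heads": 32}

    # The key exists, so the computed default is ignored and None comes back,
    # which the later torch.arange(0, dim, 2, ...) call cannot consume.
    dim = hparams.get("head_dim", hparams["hidden_size"] // hparams["num_attention_heads"])
    print(dim)  # None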
@@ -2321,9 +2317,7 @@ def set_gguf_parameters(self):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -2363,7 +2357,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
@@ -3681,9 +3676,7 @@ def set_gguf_parameters(self):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -5098,9 +5091,7 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

         self.gguf_writer.add_rope_dimension_count(rope_dim)
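Likewise, the "head_dim" in hparams test used by the set_gguf_parameters hunks is true for a key that is present with a null value, so the old branch stored None as rope_dim. A tiny sketch with invented values:

    hparams = {"head_dim": None, "hidden_size": 2048, "num_attention_heads": 16}

    # Membership only checks that the key exists, not that its value is usable.
    if "head_dim" in hparams:
        rope_dim = hparams["head_dim"]   # None slips through here
    else:
        rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

    print(rope_dim)  # None; the walrus form used in this commit takes the fallback instead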
@@ -5990,7 +5981,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
@@ -6102,7 +6094,8 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

         self.gguf_writer.add_rope_dimension_count(rope_dim)
         rope_scaling = self.hparams.get("rope_scaling") or {}
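The "get(...) or ..." fallbacks replaced here and in the modify_tensors hunk below already coped with None, so that part of the change looks like a consistency cleanup; the semantic difference is that "or" discards every falsy value, while the "is None" check only reacts to a missing or null entry. A small comparison (head_dim = 0 is not a realistic config value, it only makes the difference visible):

    hparams = {"head_dim": 0, "hidden_size": 4096, "num_attention_heads": 32}

    # "or" falls back on any falsy value, including an explicit 0 ...
    print(hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"])  # 128

    # ... while the "is None" form keeps whatever non-null value the config holds.
    if (head_dim := hparams.get("head_dim")) is None:
        head_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
    print(head_dim)  # 0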
@@ -6134,7 +6127,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
         n_embd = self.hparams["hidden_size"]
-        head_dim = self.hparams.get("head_dim") or n_embd // n_head
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = n_embd // n_head

         output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
