Skip to content

Commit c53cb65

Browse files
authored
Support for dots.llm1 models (ikawrakow#573)
* Add llama.cpp changes for dots1 support
* Add python changes for dots1 support
* Fix to make it convert
* Remove V reshaping, remove BOS by default for dots1, and fix warmup to handle models without BOS
* Minor fix
* Remove commented lines
1 parent 283753c commit c53cb65

File tree

4 files changed

+326
-3
lines changed

4 files changed

+326
-3
lines changed

convert_hf_to_gguf.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3864,6 +3864,34 @@ def prepare_tensors(self):
38643864
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
38653865

38663866

3867+
@Model.register("Dots1ForCausalLM")
class Dots1Model(Qwen2MoeModel):
    """HF-to-GGUF conversion for dots.llm1 checkpoints.

    Builds on the Qwen2-MoE converter; only the dots1-specific hparam
    aliases, MoE gating metadata, and tensor-name tweaks are added here.
    """

    model_arch = gguf.MODEL_ARCH.DOTS1

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The Qwen2MoE base reads "num_experts"; dots1 configs name the
        # same value "n_routed_experts", so mirror it before base setup runs.
        self.hparams["num_experts"] = self.hparams["n_routed_experts"]

    def set_gguf_parameters(self):
        """Emit dots1 MoE metadata on top of the Qwen2-MoE parameters."""
        super().set_gguf_parameters()

        hp = self.hparams
        writer = self.gguf_writer
        writer.add_leading_dense_block_count(hp["first_k_dense_replace"])
        writer.add_expert_shared_count(hp["n_shared_experts"])
        writer.add_expert_weights_scale(hp["routed_scaling_factor"])
        writer.add_expert_weights_norm(hp["norm_topk_prob"])

        # dots1 releases only use sigmoid gating; fail loudly on anything else
        # rather than silently writing wrong gating metadata.
        if hp["scoring_func"] != "sigmoid":
            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
        writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
        """Rename dots1-specific tensors before the generic MoE handling.

        - "e_score_correction_bias" is renamed so it matches the
          FFN_EXP_PROBS_B mapping shared with deepseek-v3.
        - shared-expert tensors bypass the base class (which would try to
          stack them with the routed experts) and map directly.
        """
        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
        if "shared_experts" in name:
            return [(self.map_tensor_name(name), data_torch)]
        return super().modify_tensors(data_torch, name, bid)
3893+
3894+
38673895
@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
38683896
class ChatGLMModel(Model):
38693897
model_arch = gguf.MODEL_ARCH.CHATGLM

gguf-py/gguf/constants.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ class MODEL_ARCH(IntEnum):
226226
T5 = auto()
227227
T5ENCODER = auto()
228228
JAIS = auto()
229+
DOTS1 = auto()
229230

230231

231232
class MODEL_TENSOR(IntEnum):
@@ -362,6 +363,7 @@ class MODEL_TENSOR(IntEnum):
362363
MODEL_ARCH.T5: "t5",
363364
MODEL_ARCH.T5ENCODER: "t5encoder",
364365
MODEL_ARCH.JAIS: "jais",
366+
MODEL_ARCH.DOTS1: "dots1",
365367
}
366368

367369
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -1164,6 +1166,30 @@ class MODEL_TENSOR(IntEnum):
11641166
MODEL_TENSOR.FFN_GATE,
11651167
MODEL_TENSOR.FFN_UP,
11661168
],
1169+
MODEL_ARCH.DOTS1: [
1170+
MODEL_TENSOR.TOKEN_EMBD,
1171+
MODEL_TENSOR.OUTPUT_NORM,
1172+
MODEL_TENSOR.OUTPUT,
1173+
MODEL_TENSOR.ATTN_NORM,
1174+
MODEL_TENSOR.ATTN_Q,
1175+
MODEL_TENSOR.ATTN_Q_NORM,
1176+
MODEL_TENSOR.ATTN_K,
1177+
MODEL_TENSOR.ATTN_K_NORM,
1178+
MODEL_TENSOR.ATTN_V,
1179+
MODEL_TENSOR.ATTN_OUT,
1180+
MODEL_TENSOR.FFN_EXP_PROBS_B,
1181+
MODEL_TENSOR.FFN_NORM,
1182+
MODEL_TENSOR.FFN_GATE,
1183+
MODEL_TENSOR.FFN_GATE_EXP,
1184+
MODEL_TENSOR.FFN_GATE_INP,
1185+
MODEL_TENSOR.FFN_GATE_SHEXP,
1186+
MODEL_TENSOR.FFN_DOWN,
1187+
MODEL_TENSOR.FFN_DOWN_EXP,
1188+
MODEL_TENSOR.FFN_DOWN_SHEXP,
1189+
MODEL_TENSOR.FFN_UP,
1190+
MODEL_TENSOR.FFN_UP_EXP,
1191+
MODEL_TENSOR.FFN_UP_SHEXP,
1192+
],
11671193
# TODO
11681194
}
11691195

gguf-py/gguf/tensor_mapping.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ class TensorNameMap:
257257
),
258258

259259
MODEL_TENSOR.FFN_EXP_PROBS_B: (
260-
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
260+
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
261261
),
262262

263263
# Feed-forward up

0 commit comments

Comments (0)