Skip to content

Commit 9567566

Browse files
committed
Direct conversion with _S and _XS FTYPES
for q4_1, q5_0, q5_1, q6_0, and q8_0. With _s, attn_q is quantized one level lower; with _xs, ffn_up and ffn_gate are quantized one level lower.
1 parent 51b6f81 commit 9567566

File tree

2 files changed

+89
-1
lines changed

2 files changed

+89
-1
lines changed

convert_hf_to_gguf.py

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,54 @@ def prepare_tensors(self):
343343
# TODO: use Q4_K and Q6_K
344344
data_qtype = gguf.GGMLQuantizationType.Q6_0
345345

346+
if data_qtype is False and any(
347+
self.match_model_tensor_name(new_name, key, bid)
348+
for key in (
349+
gguf.MODEL_TENSOR.ATTN_Q,
350+
)
351+
):
352+
if self.ftype in (
353+
gguf.LlamaFileType.MOSTLY_Q4_1_S,
354+
gguf.LlamaFileType.MOSTLY_Q5_0_S,
355+
gguf.LlamaFileType.MOSTLY_Q4_1_XS,
356+
gguf.LlamaFileType.MOSTLY_Q5_0_XS,
357+
):
358+
data_qtype = gguf.GGMLQuantizationType.Q4_0
359+
elif self.ftype in (
360+
gguf.LlamaFileType.MOSTLY_Q5_1_S,
361+
gguf.LlamaFileType.MOSTLY_Q6_0_S,
362+
gguf.LlamaFileType.MOSTLY_Q5_1_XS,
363+
gguf.LlamaFileType.MOSTLY_Q6_0_XS,
364+
):
365+
data_qtype = gguf.GGMLQuantizationType.Q5_0
366+
elif self.ftype in (
367+
gguf.LlamaFileType.MOSTLY_Q8_0_S,
368+
gguf.LlamaFileType.MOSTLY_Q8_0_XS,
369+
):
370+
data_qtype = gguf.GGMLQuantizationType.Q6_0
371+
372+
if data_qtype is False and any(
373+
self.match_model_tensor_name(new_name, key, bid)
374+
for key in (
375+
gguf.MODEL_TENSOR.FFN_UP,
376+
gguf.MODEL_TENSOR.FFN_GATE,
377+
)
378+
):
379+
if self.ftype in (
380+
gguf.LlamaFileType.MOSTLY_Q4_1_XS,
381+
gguf.LlamaFileType.MOSTLY_Q5_0_XS,
382+
):
383+
data_qtype = gguf.GGMLQuantizationType.Q4_0
384+
elif self.ftype in (
385+
gguf.LlamaFileType.MOSTLY_Q5_1_XS,
386+
gguf.LlamaFileType.MOSTLY_Q6_0_XS,
387+
):
388+
data_qtype = gguf.GGMLQuantizationType.Q5_0
389+
elif self.ftype in (
390+
gguf.LlamaFileType.MOSTLY_Q8_0_XS,
391+
):
392+
data_qtype = gguf.GGMLQuantizationType.Q6_0
393+
346394
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
347395
if isinstance(data_qtype, bool):
348396
if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -363,6 +411,26 @@ def prepare_tensors(self):
363411
data_qtype = gguf.GGMLQuantizationType.Q6_0
364412
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
365413
data_qtype = gguf.GGMLQuantizationType.Q8_0
414+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1_S:
415+
data_qtype = gguf.GGMLQuantizationType.Q4_1
416+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0_S:
417+
data_qtype = gguf.GGMLQuantizationType.Q5_0
418+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1_S:
419+
data_qtype = gguf.GGMLQuantizationType.Q5_1
420+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0_S:
421+
data_qtype = gguf.GGMLQuantizationType.Q6_0
422+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0_S:
423+
data_qtype = gguf.GGMLQuantizationType.Q8_0
424+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1_XS:
425+
data_qtype = gguf.GGMLQuantizationType.Q4_1
426+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0_XS:
427+
data_qtype = gguf.GGMLQuantizationType.Q5_0
428+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1_XS:
429+
data_qtype = gguf.GGMLQuantizationType.Q5_1
430+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0_XS:
431+
data_qtype = gguf.GGMLQuantizationType.Q6_0
432+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0_XS:
433+
data_qtype = gguf.GGMLQuantizationType.Q8_0
366434
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
367435
data_qtype = gguf.GGMLQuantizationType.TQ1_0
368436
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
@@ -6373,7 +6441,7 @@ def parse_args() -> argparse.Namespace:
63736441
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
63746442
)
63756443
parser.add_argument(
6376-
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q6_0", "tq1_0", "tq2_0", "auto"], default="f16",
6444+
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q6_0", "q4_1_s", "q5_0_s", "q5_1_s", "q6_0_s", "q8_0_s", "q4_1_xs", "q5_0_xs", "q5_1_xs", "q6_0_xs", "q8_0_xs", "tq1_0", "tq2_0", "auto"], default="f16",
63776445
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, q4_0, q4_1, q5_0, q5_1, q6_0 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
63786446
)
63796447
parser.add_argument(
@@ -6509,6 +6577,16 @@ def main() -> None:
65096577
"q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
65106578
"q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
65116579
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
6580+
"q4_1_s": gguf.LlamaFileType.MOSTLY_Q4_1_S,
6581+
"q5_0_s": gguf.LlamaFileType.MOSTLY_Q5_0_S,
6582+
"q5_1_s": gguf.LlamaFileType.MOSTLY_Q5_1_S,
6583+
"q6_0_s": gguf.LlamaFileType.MOSTLY_Q6_0_S,
6584+
"q8_0_s": gguf.LlamaFileType.MOSTLY_Q8_0_S,
6585+
"q4_1_xs": gguf.LlamaFileType.MOSTLY_Q4_1_XS,
6586+
"q5_0_xs": gguf.LlamaFileType.MOSTLY_Q5_0_XS,
6587+
"q5_1_xs": gguf.LlamaFileType.MOSTLY_Q5_1_XS,
6588+
"q6_0_xs": gguf.LlamaFileType.MOSTLY_Q6_0_XS,
6589+
"q8_0_xs": gguf.LlamaFileType.MOSTLY_Q8_0_XS,
65126590
"tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
65136591
"tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
65146592
"auto": gguf.LlamaFileType.GUESSED,

gguf-py/gguf/constants.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2338,6 +2338,16 @@ class LlamaFileType(IntEnum):
23382338
MOSTLY_IQ5_KS_R4 = 341 #except 1d tensors
23392339
MOSTLY_Q8_KV_R8 = 398 #except 1d tensors
23402340
MOSTLY_Q8_K_R8 = 399 #except 1d tensors
2341+
MOSTLY_Q4_1_S = 490 # except 1d tensors
2342+
MOSTLY_Q5_0_S = 491 # except 1d tensors
2343+
MOSTLY_Q5_1_S = 492 # except 1d tensors
2344+
MOSTLY_Q6_0_S = 493 # except 1d tensors
2345+
MOSTLY_Q8_0_S = 494 # except 1d tensors
2346+
MOSTLY_Q4_1_XS = 495 # except 1d tensors
2347+
MOSTLY_Q5_0_XS = 496 # except 1d tensors
2348+
MOSTLY_Q5_1_XS = 497 # except 1d tensors
2349+
MOSTLY_Q6_0_XS = 498 # except 1d tensors
2350+
MOSTLY_Q8_0_XS = 499 # except 1d tensors
23412351

23422352
GUESSED = 1024 # not specified in the model file
23432353

0 commit comments

Comments (0)