Skip to content

Commit 9567566

Browse files
committed
Direct conversion with _S and _XS FTYPES
for q4_1, q5_0, q5_1, q6_0, and q8_0. With _s, attn_q is quantized one level lower; with _xs, ffn_up and ffn_gate are quantized one level lower.
1 parent 51b6f81 commit 9567566

File tree

2 files changed

+89
-1
lines changed

2 files changed

+89
-1
lines changed

convert_hf_to_gguf.py

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,54 @@ def prepare_tensors(self):
343343
# TODO: use Q4_K and Q6_K
344344
data_qtype = gguf.GGMLQuantizationType.Q6_0
345345

346+
if data_qtype is False and any(
347+
self.match_model_tensor_name(new_name, key, bid)
348+
for key in (
349+
gguf.MODEL_TENSOR.ATTN_Q,
350+
)
351+
):
352+
if self.ftype in (
353+
gguf.LlamaFileType.MOSTLY_Q4_1_S,
354+
gguf.LlamaFileType.MOSTLY_Q5_0_S,
355+
gguf.LlamaFileType.MOSTLY_Q4_1_XS,
356+
gguf.LlamaFileType.MOSTLY_Q5_0_XS,
357+
):
358+
data_qtype = gguf.GGMLQuantizationType.Q4_0
359+
elif self.ftype in (
360+
gguf.LlamaFileType.MOSTLY_Q5_1_S,
361+
gguf.LlamaFileType.MOSTLY_Q6_0_S,
362+
gguf.LlamaFileType.MOSTLY_Q5_1_XS,
363+
gguf.LlamaFileType.MOSTLY_Q6_0_XS,
364+
):
365+
data_qtype = gguf.GGMLQuantizationType.Q5_0
366+
elif self.ftype in (
367+
gguf.LlamaFileType.MOSTLY_Q8_0_S,
368+
gguf.LlamaFileType.MOSTLY_Q8_0_XS,
369+
):
370+
data_qtype = gguf.GGMLQuantizationType.Q6_0
371+
372+
if data_qtype is False and any(
373+
self.match_model_tensor_name(new_name, key, bid)
374+
for key in (
375+
gguf.MODEL_TENSOR.FFN_UP,
376+
gguf.MODEL_TENSOR.FFN_GATE,
377+
)
378+
):
379+
if self.ftype in (
380+
gguf.LlamaFileType.MOSTLY_Q4_1_XS,
381+
gguf.LlamaFileType.MOSTLY_Q5_0_XS,
382+
):
383+
data_qtype = gguf.GGMLQuantizationType.Q4_0
384+
elif self.ftype in (
385+
gguf.LlamaFileType.MOSTLY_Q5_1_XS,
386+
gguf.LlamaFileType.MOSTLY_Q6_0_XS,
387+
):
388+
data_qtype = gguf.GGMLQuantizationType.Q5_0
389+
elif self.ftype in (
390+
gguf.LlamaFileType.MOSTLY_Q8_0_XS,
391+
):
392+
data_qtype = gguf.GGMLQuantizationType.Q6_0
393+
346394
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
347395
if isinstance(data_qtype, bool):
348396
if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -363,6 +411,26 @@ def prepare_tensors(self):
363411
data_qtype = gguf.GGMLQuantizationType.Q6_0
364412
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
365413
data_qtype = gguf.GGMLQuantizationType.Q8_0
414+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1_S:
415+
data_qtype = gguf.GGMLQuantizationType.Q4_1
416+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0_S:
417+
data_qtype = gguf.GGMLQuantizationType.Q5_0
418+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1_S:
419+
data_qtype = gguf.GGMLQuantizationType.Q5_1
420+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0_S:
421+
data_qtype = gguf.GGMLQuantizationType.Q6_0
422+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0_S:
423+
data_qtype = gguf.GGMLQuantizationType.Q8_0
424+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1_XS:
425+
data_qtype = gguf.GGMLQuantizationType.Q4_1
426+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0_XS:
427+
data_qtype = gguf.GGMLQuantizationType.Q5_0
428+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1_XS:
429+
data_qtype = gguf.GGMLQuantizationType.Q5_1
430+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0_XS:
431+
data_qtype = gguf.GGMLQuantizationType.Q6_0
432+
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0_XS:
433+
data_qtype = gguf.GGMLQuantizationType.Q8_0
366434
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
367435
data_qtype = gguf.GGMLQuantizationType.TQ1_0
368436
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
@@ -6373,7 +6441,7 @@ def parse_args() -> argparse.Namespace:
63736441
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
63746442
)
63756443
parser.add_argument(
6376-
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q6_0", "tq1_0", "tq2_0", "auto"], default="f16",
6444+
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q6_0", "q4_1_s", "q5_0_s", "q5_1_s", "q6_0_s", "q8_0_s", "q4_1_xs", "q5_0_xs", "q5_1_xs", "q6_0_xs", "q8_0_xs", "tq1_0", "tq2_0", "auto"], default="f16",
63776445
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, q4_0, q4_1, q5_0, q5_1, q6_0 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
63786446
)
63796447
parser.add_argument(
@@ -6509,6 +6577,16 @@ def main() -> None:
65096577
"q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
65106578
"q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
65116579
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
6580+
"q4_1_s": gguf.LlamaFileType.MOSTLY_Q4_1_S,
6581+
"q5_0_s": gguf.LlamaFileType.MOSTLY_Q5_0_S,
6582+
"q5_1_s": gguf.LlamaFileType.MOSTLY_Q5_1_S,
6583+
"q6_0_s": gguf.LlamaFileType.MOSTLY_Q6_0_S,
6584+
"q8_0_s": gguf.LlamaFileType.MOSTLY_Q8_0_S,
6585+
"q4_1_xs": gguf.LlamaFileType.MOSTLY_Q4_1_XS,
6586+
"q5_0_xs": gguf.LlamaFileType.MOSTLY_Q5_0_XS,
6587+
"q5_1_xs": gguf.LlamaFileType.MOSTLY_Q5_1_XS,
6588+
"q6_0_xs": gguf.LlamaFileType.MOSTLY_Q6_0_XS,
6589+
"q8_0_xs": gguf.LlamaFileType.MOSTLY_Q8_0_XS,
65126590
"tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
65136591
"tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
65146592
"auto": gguf.LlamaFileType.GUESSED,

gguf-py/gguf/constants.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2338,6 +2338,16 @@ class LlamaFileType(IntEnum):
23382338
MOSTLY_IQ5_KS_R4 = 341 #except 1d tensors
23392339
MOSTLY_Q8_KV_R8 = 398 #except 1d tensors
23402340
MOSTLY_Q8_K_R8 = 399 #except 1d tensors
2341+
MOSTLY_Q4_1_S = 490 # except 1d tensors
2342+
MOSTLY_Q5_0_S = 491 # except 1d tensors
2343+
MOSTLY_Q5_1_S = 492 # except 1d tensors
2344+
MOSTLY_Q6_0_S = 493 # except 1d tensors
2345+
MOSTLY_Q8_0_S = 494 # except 1d tensors
2346+
MOSTLY_Q4_1_XS = 495 # except 1d tensors
2347+
MOSTLY_Q5_0_XS = 496 # except 1d tensors
2348+
MOSTLY_Q5_1_XS = 497 # except 1d tensors
2349+
MOSTLY_Q6_0_XS = 498 # except 1d tensors
2350+
MOSTLY_Q8_0_XS = 499 # except 1d tensors
23412351

23422352
GUESSED = 1024 # not specified in the model file
23432353

0 commit comments

Comments (0)