Commit 51b6f81

Direct conversion from fp16 to Q6_0

1 parent 75bb7f4, commit 51b6f81

4 files changed (+56 -11 lines):

convert_hf_to_gguf.py
gguf-py/gguf/constants.py
gguf-py/gguf/quants.py
gguf-py/tests/test_quants.py

convert_hf_to_gguf.py

Lines changed: 9 additions & 9 deletions
@@ -323,6 +323,7 @@ def prepare_tensors(self):
                     gguf.MODEL_TENSOR.OUTPUT,
                     gguf.MODEL_TENSOR.ATTN_V,
                     gguf.MODEL_TENSOR.ATTN_K,
+                    gguf.MODEL_TENSOR.ATTN_QKV,
                 )
             ):
                 if self.ftype in (
@@ -333,15 +334,14 @@ def prepare_tensors(self):
                 elif self.ftype in (
                     gguf.LlamaFileType.MOSTLY_Q5_0,
                     gguf.LlamaFileType.MOSTLY_Q5_1,
-                    # gguf.LlamaFileType.MOSTLY_Q6_0,
                 ):
-                    data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    data_qtype = gguf.GGMLQuantizationType.Q6_0
                 elif self.ftype in (
                     gguf.LlamaFileType.MOSTLY_TQ1_0,
                     gguf.LlamaFileType.MOSTLY_TQ2_0,
                 ):
                     # TODO: use Q4_K and Q6_K
-                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data_qtype = gguf.GGMLQuantizationType.Q6_0

         # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
         if isinstance(data_qtype, bool):
@@ -359,8 +359,8 @@ def prepare_tensors(self):
                 data_qtype = gguf.GGMLQuantizationType.Q5_0
             elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
                 data_qtype = gguf.GGMLQuantizationType.Q5_1
-            # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0: // To be implemented?
-            #     data_qtype = gguf.GGMLQuantizationType.Q6_0
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0:
+                data_qtype = gguf.GGMLQuantizationType.Q6_0
             elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                 data_qtype = gguf.GGMLQuantizationType.Q8_0
             elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
@@ -545,7 +545,7 @@ def set_gguf_parameters(self):
         logger.info("****************************************************************************************")
         logger.info("** quantizing to `Q4_0`,`Q4_1`,`Q5_0`, or `Q5_1`is not equiv to using `llama-quantize`")
         logger.info("** `Q4_0`,`Q4_1` are here using embeddings, output, attn_k and attn_v in q5_0")
-        logger.info("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q8_0")
+        logger.info("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q6_0")
         logger.info("** This, in order to generate a small but reliable conversion to create an iMatrix file.")
         logger.info("****************************************************************************************")

@@ -6373,8 +6373,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "tq1_0", "tq2_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q6_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, q4_0, q4_1, q5_0, q5_1, q6_0 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -6507,7 +6507,7 @@ def main() -> None:
         "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
         "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
         "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
-        # "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
+        "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
         "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,

gguf-py/gguf/constants.py

Lines changed: 2 additions & 1 deletion
@@ -2401,6 +2401,7 @@ class VisionProjectorType:
     GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
     GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
     GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
+    GGMLQuantizationType.Q6_0: (32, 2 + 8 + 16),
     GGMLQuantizationType.Q8_0: (32, 2 + 32),
     GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
     GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
@@ -2433,7 +2434,7 @@ class VisionProjectorType:
     GGMLQuantizationType.Q8_0_X4 : ( 32, 34),
     GGMLQuantizationType.Q8_1_X4 : ( 32, 36),
     GGMLQuantizationType.Q8_2_X4 : ( 32, 36),
-    GGMLQuantizationType.Q6_0   : ( 32, 26),
+    # GGMLQuantizationType.Q6_0 : ( 32, 26),
     GGMLQuantizationType.IQ1_BN : ( 64, 13),
     GGMLQuantizationType.IQ2_BN : ( 64, 16),
     GGMLQuantizationType.Q8_K64 : ( 64, 68),
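The new entry encodes the Q6_0 block layout: 32 weights per block, stored as a 2-byte fp16 scale, 8 bytes of high bits (2 bits x 32 weights) and 16 bytes of low nibbles (4 bits x 32 weights), i.e. 26 bytes per block, or 6.5 bits per weight. A quick sanity check of that arithmetic (standalone snippet, not part of the commit):

    block_size, type_size = 32, 2 + 8 + 16   # weights per block, bytes per block
    assert type_size == 26
    assert 32 * 2 // 8 == 8                  # high 2 bits of 32 weights -> 8 bytes
    assert 32 * 4 // 8 == 16                 # low 4 bits of 32 weights -> 16 bytes
    print(8 * type_size / block_size)        # 6.5 bits per weight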

gguf-py/gguf/quants.py

Lines changed: 44 additions & 0 deletions
@@ -377,6 +377,50 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
         return (d * qs) + m


+class Q6_0(__Quant, qtype=GGMLQuantizationType.Q6_0):
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        imax = abs(blocks).argmax(axis=-1, keepdims=True)
+        max = np.take_along_axis(blocks, imax, axis=-1)
+
+        d = max / -32
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        # Adapted from Q5_0
+        q = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(32.5), dtype=np.float32).astype(np.uint8).clip(0, 63)
+
+        qs = q.reshape((n_blocks, 2, cls.block_size // 2))
+        qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
+
+        qh = np.zeros((n_blocks, cls.block_size // 4), dtype=np.uint8)
+        for j in range(cls.block_size // 2):
+            h = ((q[:, j] >> 4) | ((q[:, j + cls.block_size // 2] >> 4) << 2)).astype(np.uint8)
+            qh[:, j % (cls.block_size // 4)] |= (h << 4 * (j // (cls.block_size // 4)))
+
+        d = d.astype(np.float16).view(np.uint8)
+
+        return np.concatenate([d, qh, qs], axis=-1)
+
+    # @classmethod
+    # def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+    #     n_blocks = blocks.shape[0]
+
+    #     d, rest = np.hsplit(blocks, [2])                 # (n_blocks, 2)
+    #     qh, qs = np.hsplit(rest, [cls.block_size // 4])  # (n_blocks, block_size//4), (n_blocks, block_size//2*2)
+
+    #     d = d.view(np.float16).astype(np.float32)
+
+    #     qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32))
+    #     ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+    #     qh = (qh & np.uint32(0x03)).astype(np.uint8)
+    #     ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1))
+
+    #     qs = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(32)
+
+    #     return (d * qs.astype(np.float32))
+
+
 class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0):
     @classmethod
     # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
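quantize_blocks scales each block by d = max / -32 (so the largest-magnitude weight maps to an end of the 6-bit range), packs the low nibble of each 6-bit value into qs (weight j and weight j+16 share a byte), and scatters the top 2 bits of each value into the 8-byte qh array. dequantize_blocks is left commented out, and as written it still carries Q5_0's single-uint32 qh unpacking, which does not match the 8-byte qh that quantize_blocks writes. A minimal NumPy sketch of the inverse, assuming exactly the layout produced above (q6_0_dequantize_blocks is a hypothetical standalone helper, not part of the commit):

    import numpy as np

    def q6_0_dequantize_blocks(blocks: np.ndarray, block_size: int = 32) -> np.ndarray:
        # blocks: (n_blocks, 26) uint8, laid out as [d: 2B fp16][qh: 8B][qs: 16B]
        n_blocks = blocks.shape[0]
        d, rest = np.hsplit(blocks, [2])
        qh, qs = np.hsplit(rest, [block_size // 4])

        d = d.view(np.float16).astype(np.float32)  # (n_blocks, 1)

        # Low 4 bits: byte j of qs holds weight j in its low nibble, weight j+16 in its high nibble.
        ql = np.concatenate([qs & np.uint8(0x0F), qs >> np.uint8(4)], axis=-1)

        # High 2 bits: byte k of qh packs weight k at bit 0, weight k+16 at bit 2,
        # weight k+8 at bit 4, weight k+24 at bit 6 -- the inverse of the pack loop above.
        shifts = np.array([0, 4, 2, 6], dtype=np.uint8).reshape(1, 4, 1)
        qhi = (qh.reshape(n_blocks, 1, block_size // 4) >> shifts) & np.uint8(0x03)
        qhi = qhi.reshape(n_blocks, block_size)

        q = (ql | (qhi << np.uint8(4))).astype(np.int8) - np.int8(32)
        return d * q.astype(np.float32)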

gguf-py/tests/test_quants.py

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ def __init__(self, libggml: Path):
         self.libggml.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,)

         for t in (
-            "q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
+            "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "q6_0",
             "q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
             "tq1_0", "tq2_0",
             "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
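Even without the C reference library the new class can be smoke-tested from pure Python; a hedged round-trip sketch, assuming gguf-py is on PYTHONPATH (the dequantizer sketched in the previous section would close the loop):

    import numpy as np
    from gguf import GGMLQuantizationType
    from gguf.quants import quantize

    # One row of 64 fp32 weights -> two Q6_0 blocks of 26 bytes each.
    data = np.random.uniform(-1.0, 1.0, (1, 64)).astype(np.float32)
    packed = quantize(data, GGMLQuantizationType.Q6_0)
    print(packed.shape, packed.dtype)  # expected: (1, 52) uint8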
