
Commit 895004f

convert : allow direct conversion to TQ1_0 and TQ2_0
The token embeddings and output tensors are kept in F16 to allow quantizing them to Q4_K and Q6_K with llama-quantize.

* llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0

Q4_0 is not completely symmetric (so not lossless for ternary models), but it should be good enough.
1 parent 3a0bf17 commit 895004f
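In practice this enables a two-step workflow: convert directly to a ternary type (token embeddings and output stay in F16), then re-quantize those remaining F16 tensors with llama-quantize. A rough sketch of that workflow, driven from Python; the model path and file names are placeholders:

```python
# Rough sketch of the intended two-step workflow; paths and file names are placeholders.
import subprocess

# 1) Direct conversion to TQ2_0 (tq1_0 works the same way);
#    token_embd and output are kept in F16 by the converter.
subprocess.run([
    "python", "convert_hf_to_gguf.py", "path/to/ternary-model",
    "--outtype", "tq2_0", "--outfile", "model-tq2_0-f16.gguf",
], check=True)

# 2) Re-quantize: most already-ternary tensors are copied as-is, while the F16
#    token embeddings / output tensor can now become Q4_K / Q6_K (per the commit message).
subprocess.run([
    "./llama-quantize", "model-tq2_0-f16.gguf", "model-tq2_0.gguf", "TQ2_0",
], check=True)
```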

File tree

2 files changed: +24 -2 lines


convert_hf_to_gguf.py

Lines changed: 22 additions & 2 deletions
@@ -301,6 +301,20 @@ def prepare_tensors(self):
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_TQ1_0,
+                        gguf.LlamaFileType.MOSTLY_TQ2_0,
+                    ):
+                        # TODO: use Q4_K and Q6_K
+                        data_qtype = gguf.GGMLQuantizationType.F16
+
                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
                     if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -311,6 +325,10 @@ def prepare_tensors(self):
                         data_qtype = gguf.GGMLQuantizationType.BF16
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
                     else:
                         raise ValueError(f"Unknown file type: {self.ftype.name}")
 
@@ -3814,8 +3832,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -3902,6 +3920,8 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
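For context, the data_qtype chosen above is what the converter later hands to gguf-py's quantization helper when writing the tensor. A minimal sketch of that call; the toy tensor and its shape are made up for illustration:

```python
# Minimal sketch: quantizing a toy ternary tensor the way the converter applies
# data_qtype (the tensor contents and shape here are made up for illustration).
import numpy as np
import gguf

# A toy {-1, 0, +1} weight matrix; TQ2_0 packs each row in blocks of 256 values.
ternary = np.random.choice([-1.0, 0.0, 1.0], size=(64, 256)).astype(np.float32)

packed = gguf.quants.quantize(ternary, gguf.GGMLQuantizationType.TQ2_0)
print(packed.dtype, packed.shape)  # a uint8 payload, far smaller than the F32 input
```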

src/llama.cpp

Lines changed: 2 additions & 0 deletions
@@ -15717,6 +15717,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     if (convert_incompatible_tensor) {
         switch (new_type) {
+            case GGML_TYPE_TQ1_0:
+            case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_IQ2_S:
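The commit message notes that this Q4_0 fallback is not lossless for ternary models because Q4_0 is not completely symmetric: its levels are (q - 8) * d for q in 0..15, so the grid reaches -8*d but only +7*d. A minimal Python sketch of the reference rounding (assuming ggml's scheme of d = max / -8 with half-up rounding; the helper name is made up) shows the effect on a ternary block:

```python
# Minimal sketch of Q4_0 reference rounding (assumption: scale d = max / -8 and
# levels (q - 8) * d for q in 0..15, as in ggml's reference quantizer).
def q4_0_roundtrip(block: list[float]) -> list[float]:
    max_val = max(block, key=abs)          # signed value with the largest magnitude
    if max_val == 0.0:
        return [0.0] * len(block)
    d = max_val / -8
    out = []
    for x in block:
        q = min(15, int(x / d + 8.5))      # 4-bit code in 0..15
        out.append((q - 8) * d)            # dequantized value
    return out

ternary = [1.0, 0.0, -1.0, 1.0] * 8        # a 32-value ternary block
print(q4_0_roundtrip(ternary)[:4])         # [1.0, 0.0, -0.875, 1.0]: -1 comes back as -0.875
```

Whichever sign the block's largest value takes, values of the opposite sign land on the 7*|d| level instead of 8*|d|, which is why the TODO above suggests a symmetric type instead.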
