@@ -323,6 +323,7 @@ def prepare_tensors(self):
323
323
gguf .MODEL_TENSOR .OUTPUT ,
324
324
gguf .MODEL_TENSOR .ATTN_V ,
325
325
gguf .MODEL_TENSOR .ATTN_K ,
326
+ gguf .MODEL_TENSOR .ATTN_QKV ,
326
327
)
327
328
):
328
329
if self .ftype in (
@@ -333,15 +334,14 @@ def prepare_tensors(self):
333
334
elif self .ftype in (
334
335
gguf .LlamaFileType .MOSTLY_Q5_0 ,
335
336
gguf .LlamaFileType .MOSTLY_Q5_1 ,
336
- # gguf.LlamaFileType.MOSTLY_Q6_0,
337
337
):
338
- data_qtype = gguf .GGMLQuantizationType .Q8_0
338
+ data_qtype = gguf .GGMLQuantizationType .Q6_0
339
339
elif self .ftype in (
340
340
gguf .LlamaFileType .MOSTLY_TQ1_0 ,
341
341
gguf .LlamaFileType .MOSTLY_TQ2_0 ,
342
342
):
343
343
# TODO: use Q4_K and Q6_K
344
- data_qtype = gguf .GGMLQuantizationType .F16
344
+ data_qtype = gguf .GGMLQuantizationType .Q6_0
345
345
346
346
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
347
347
if isinstance (data_qtype , bool ):
@@ -359,8 +359,8 @@ def prepare_tensors(self):
359
359
data_qtype = gguf .GGMLQuantizationType .Q5_0
360
360
elif self .ftype == gguf .LlamaFileType .MOSTLY_Q5_1 :
361
361
data_qtype = gguf .GGMLQuantizationType .Q5_1
362
- # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0: // To be implemented?
363
- # data_qtype = gguf.GGMLQuantizationType.Q6_0
362
+ elif self .ftype == gguf .LlamaFileType .MOSTLY_Q6_0 :
363
+ data_qtype = gguf .GGMLQuantizationType .Q6_0
364
364
elif self .ftype == gguf .LlamaFileType .MOSTLY_Q8_0 :
365
365
data_qtype = gguf .GGMLQuantizationType .Q8_0
366
366
elif self .ftype == gguf .LlamaFileType .MOSTLY_TQ1_0 :
@@ -545,7 +545,7 @@ def set_gguf_parameters(self):
545
545
logger .info ("****************************************************************************************" )
546
546
logger .info ("** quantizing to `Q4_0`,`Q4_1`,`Q5_0`, or `Q5_1`is not equiv to using `llama-quantize`" )
547
547
logger .info ("** `Q4_0`,`Q4_1` are here using embeddings, output, attn_k and attn_v in q5_0" )
548
- logger .info ("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q8_0 " )
548
+ logger .info ("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q6_0 " )
549
549
logger .info ("** This, in order to generate a small but reliable conversion to create an iMatrix file." )
550
550
logger .info ("****************************************************************************************" )
551
551
@@ -6373,8 +6373,8 @@ def parse_args() -> argparse.Namespace:
6373
6373
help = "path to write to; default: based on input. {ftype} will be replaced by the outtype." ,
6374
6374
)
6375
6375
parser .add_argument (
6376
- "--outtype" , type = str , choices = ["f32" , "f16" , "bf16" , "q8_0" , "q4_0" , "q4_1" , "q5_0" , "q5_1" , "tq1_0" , "tq2_0" , "auto" ], default = "f16" ,
6377
- help = "output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type" ,
6376
+ "--outtype" , type = str , choices = ["f32" , "f16" , "bf16" , "q8_0" , "q4_0" , "q4_1" , "q5_0" , "q5_1" , "q6_0" , "tq1_0" , "tq2_0" , "auto" ], default = "f16" ,
6377
+ help = "output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, q4_0, q4_1, q5_0, q5_1, q6_0 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type" ,
6378
6378
)
6379
6379
parser .add_argument (
6380
6380
"--bigendian" , action = "store_true" ,
@@ -6507,7 +6507,7 @@ def main() -> None:
6507
6507
"q4_1" : gguf .LlamaFileType .MOSTLY_Q4_1 ,
6508
6508
"q5_0" : gguf .LlamaFileType .MOSTLY_Q5_0 ,
6509
6509
"q5_1" : gguf .LlamaFileType .MOSTLY_Q5_1 ,
6510
- # "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
6510
+ "q6_0" : gguf .LlamaFileType .MOSTLY_Q6_0 ,
6511
6511
"q8_0" : gguf .LlamaFileType .MOSTLY_Q8_0 ,
6512
6512
"tq1_0" : gguf .LlamaFileType .MOSTLY_TQ1_0 ,
6513
6513
"tq2_0" : gguf .LlamaFileType .MOSTLY_TQ2_0 ,
0 commit comments