From 7ff30834e10b707e7895f37f448bd6b4035094da Mon Sep 17 00:00:00 2001
From: Robert Sinclair <0wwafa@gmail.com>
Date: Fri, 26 Jul 2024 23:42:35 +0300
Subject: [PATCH 1/3] Add option to keep output and embed tensors at f16

Add option to keep output and embed tensors at f16
---
 convert_hf_to_gguf.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 4087187c19834..2c5822f3ae24d 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -63,6 +63,7 @@ class Model:
     model_name: str | None
     metadata_override: Path | None
     dir_model_card: Path
+    z: bool
 
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
@@ -70,7 +71,7 @@ class Model:
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, z: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
@@ -92,6 +93,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+        self.z = z
 
         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -319,7 +321,7 @@ def prepare_tensors(self):
                         assert data.dtype == np.int16
                         data_qtype = gguf.GGMLQuantizationType.BF16
 
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data) and (self.z and new_name not in (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))):
                         data = gguf.quantize_q8_0(data)
                         assert data.dtype == np.uint8
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
@@ -3598,6 +3600,10 @@ def parse_args() -> argparse.Namespace:
         "--metadata", type=Path,
         help="Specify the path for an authorship metadata override file"
     )
+    parser.add_argument(
+        "--z", action="store_true",
+        help="Keep output and ambed tensors at F16"
+    )
 
     return parser.parse_args()
 
@@ -3672,7 +3678,7 @@ def main() -> None:
                                      metadata_override=args.metadata, model_name=args.model_name,
                                      split_max_tensors=args.split_max_tensors,
                                      split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
-                                     small_first_shard=args.no_tensor_first_split)
+                                     small_first_shard=args.no_tensor_first_split,z=args.z)
 
         if args.vocab_only:
             logger.info("Exporting model vocab...")
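
A note on the prepare_tensors() hunk above: the commit message describes keeping only the output and token-embedding tensors at F16 when the new flag is set, while quantizing everything else as usual. A minimal standalone sketch of that selection rule, using illustrative stand-in names rather than the converter's real API:

    # Hypothetical helper; `keep_f16` stands in for self.z, and the *_name
    # arguments stand in for the format_tensor_name(...) results.
    def should_quantize_q8_0(keep_f16: bool, new_name: str,
                             output_name: str, embd_name: str) -> bool:
        # Quantize to Q8_0 unless the flag is set and this is one of the
        # two tensors being kept at F16.
        return not (keep_f16 and new_name in (output_name, embd_name))

Note that the condition the hunk actually adds, `self.z and new_name not in (...)`, makes Q8_0 quantization require --z for every tensor; the predicate above reflects the behaviour the commit message describes. The two follow-up patches below adjust only formatting and the help string.
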
From 9758c5a92b352ed92cd92df3f90c990ec593ca74 Mon Sep 17 00:00:00 2001
From: Robert Sinclair <0wwafa@gmail.com>
Date: Sun, 28 Jul 2024 02:03:28 +0300
Subject: [PATCH 2/3] Update convert_hf_to_gguf.py

Co-authored-by: compilade
---
 convert_hf_to_gguf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 2c5822f3ae24d..aca66072516b6 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3678,7 +3678,8 @@ def main() -> None:
                                      metadata_override=args.metadata, model_name=args.model_name,
                                      split_max_tensors=args.split_max_tensors,
                                      split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
-                                     small_first_shard=args.no_tensor_first_split,z=args.z)
+                                     small_first_shard=args.no_tensor_first_split,
+                                     z=args.z)
 
         if args.vocab_only:
             logger.info("Exporting model vocab...")

From 40d169874d09edfa13f3724872eb65887534d8a4 Mon Sep 17 00:00:00 2001
From: Robert Sinclair <0wwafa@gmail.com>
Date: Sun, 28 Jul 2024 02:03:40 +0300
Subject: [PATCH 3/3] Update convert_hf_to_gguf.py

Co-authored-by: compilade
---
 convert_hf_to_gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index aca66072516b6..060827e98f405 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3602,7 +3602,7 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--z", action="store_true",
-        help="Keep output and ambed tensors at F16"
+        help="Keep output and embed tensors at F16"
     )
 
     return parser.parse_args()
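
With the series applied, a Q8_0 conversion that keeps the output and token-embedding tensors at F16 would be invoked roughly as follows (the model path is a placeholder):

    python convert_hf_to_gguf.py /path/to/hf-model --outtype q8_0 --z

The flag is only consulted on the Q8_0 quantization path, so plain F32/F16/BF16 conversions are unaffected.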