From be8306d7958766908c45bdd691d5ab9db8b99f85 Mon Sep 17 00:00:00 2001 From: brian khuu Date: Thu, 18 Jul 2024 21:02:10 +1000 Subject: [PATCH 1/4] convert-*.py: autogen uuid --- convert_hf_to_gguf.py | 6 ++++++ gguf-py/gguf/gguf_writer.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 769d49a8b6f0a..cc3dc6a2ea719 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -381,6 +381,12 @@ def prepare_metadata(self, vocab_only: bool): # output in the same directory as the model by default self.fname_out = self.dir_model / f"{fname_default}.gguf" + # Upon missing model uuid, generate uuid based on tensor content + if not vocab_only and self.metadata.uuid is None: + self.metadata.uuid = self.gguf_writer.generate_tensors_uuid() + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + logger.info(f"{f'%-{max_name_len}s' % f'generating general.uuid'} {self.metadata.uuid}") + self.set_type() logger.info("Set meta model") diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ba6f53cda25a1..8bef811029e28 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -2,6 +2,8 @@ import logging import os +import uuid +import hashlib import shutil import struct import tempfile @@ -417,6 +419,19 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: self.state = WriterState.WEIGHTS + def generate_tensors_uuid(self) -> str: + uuidv5_sha1 = hashlib.sha1() + uuidv5_sha1.update(uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5').bytes) + + for tensors in self.tensors: + # relying on the fact that Python dicts preserve insertion order (since 3.7) + for name, ti in tensors.items(): + assert ti.tensor is not None + assert ti.tensor.nbytes == ti.nbytes + uuidv5_sha1.update(ti.tensor.tobytes('C')) + + return str(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5)) + def write_tensors_to_file(self, *, progress: bool = 
False) -> None: self.write_ti_data_to_file() From 0c491520a87ee74146513abb1af609b28f1997f7 Mon Sep 17 00:00:00 2001 From: brian khuu Date: Sat, 27 Jul 2024 02:25:39 +1000 Subject: [PATCH 2/4] convert-*.py: Add source uuid generation --- convert_hf_to_gguf.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cc3dc6a2ea719..dc56621a5f1d3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -10,6 +10,8 @@ import os import re import sys +import uuid +import hashlib from enum import IntEnum from pathlib import Path from hashlib import sha256 @@ -255,6 +257,19 @@ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: i return False + def generate_source_tensors_uuid(self) -> str: + uuidv5_sha1 = hashlib.sha1() + uuidv5_sha1.update(uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5').bytes) + + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + data: np.ndarray = data_torch.to(torch.float64).squeeze().numpy() + uuidv5_sha1.update(data.tobytes('C')) + + return str(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5)) + def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") @@ -381,11 +396,15 @@ def prepare_metadata(self, vocab_only: bool): # output in the same directory as the model by default self.fname_out = self.dir_model / f"{fname_default}.gguf" + # Upon missing source model uuid, generate uuid based on source tensor content + if not vocab_only and self.metadata.source_uuid is None: + self.metadata.source_uuid = self.generate_source_tensors_uuid() + logger.info(f"generating general.source_uuid: {self.metadata.source_uuid}") + # Upon missing model uuid, generate uuid based on tensor content if not vocab_only and self.metadata.uuid is None: self.metadata.uuid = 
self.gguf_writer.generate_tensors_uuid() - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") - logger.info(f"{f'%-{max_name_len}s' % f'generating general.uuid'} {self.metadata.uuid}") + logger.info(f"generating general.uuid: {self.metadata.uuid}") self.set_type() @@ -3468,6 +3487,7 @@ class LazyTorchTensor(gguf.LazyBase): _dtype_map: dict[torch.dtype, type] = { torch.float16: np.float16, torch.float32: np.float32, + torch.float64: np.float64, } # used for safetensors slices From 3fb690e91b155680fcbe87d8121ab3f290f96eff Mon Sep 17 00:00:00 2001 From: brian khuu Date: Sat, 27 Jul 2024 13:03:13 +1000 Subject: [PATCH 3/4] convert*.py: inline source uuid generation approach --- convert_hf_to_gguf.py | 36 ++++++++++++++++-------------------- gguf-py/gguf/gguf_writer.py | 15 --------------- 2 files changed, 16 insertions(+), 35 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index dc56621a5f1d3..13019ab854c82 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -64,6 +64,7 @@ class Model: gguf_writer: gguf.GGUFWriter model_name: str | None metadata_override: Path | None + generated_source_uuid: str | None # subclasses should define this! 
model_arch: gguf.MODEL_ARCH @@ -257,23 +258,18 @@ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: i return False - def generate_source_tensors_uuid(self) -> str: + def prepare_tensors(self): + uuidv5_sha1 = hashlib.sha1() uuidv5_sha1.update(uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5').bytes) - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): - continue - data: np.ndarray = data_torch.to(torch.float64).squeeze().numpy() - uuidv5_sha1.update(data.tobytes('C')) - - return str(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5)) - - def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") for name, data_torch in self.get_tensors(): + + uuidv5_data_buffer: np.ndarray = data_torch.to(torch.float64).numpy() + uuidv5_sha1.update(uuidv5_data_buffer.tobytes('C')) + # we don't need these if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): continue @@ -353,6 +349,9 @@ def prepare_tensors(self): self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + # Upon missing source model uuid, generate uuid based on source tensor content + self.generated_source_uuid = str(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5)) + def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.MODEL) @@ -396,15 +395,12 @@ def prepare_metadata(self, vocab_only: bool): # output in the same directory as the model by default self.fname_out = self.dir_model / f"{fname_default}.gguf" - # Upon missing source model uuid, generate uuid based on source tensor content - if not vocab_only and self.metadata.source_uuid is None: - self.metadata.source_uuid = self.generate_source_tensors_uuid() - logger.info(f"generating general.source_uuid: {self.metadata.source_uuid}") - - # Upon missing model uuid, generate uuid based on tensor content - if not 
vocab_only and self.metadata.uuid is None: - self.metadata.uuid = self.gguf_writer.generate_tensors_uuid() - logger.info(f"generating general.uuid: {self.metadata.uuid}") + if not vocab_only: + if self.metadata.source_uuid is not None: + logger.info(f"Source UUID present: {self.metadata.source_uuid}") + elif self.generated_source_uuid is not None: + logger.info(f"Source UUID missing. Using generated source uuid: {self.generated_source_uuid}") + self.metadata.source_uuid = self.generated_source_uuid self.set_type() diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 8bef811029e28..ba6f53cda25a1 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -2,8 +2,6 @@ import logging import os -import uuid -import hashlib import shutil import struct import tempfile @@ -419,19 +417,6 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: self.state = WriterState.WEIGHTS - def generate_tensors_uuid(self) -> str: - uuidv5_sha1 = hashlib.sha1() - uuidv5_sha1.update(uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5').bytes) - - for tensors in self.tensors: - # relying on the fact that Python dicts preserve insertion order (since 3.7) - for name, ti in tensors.items(): - assert ti.tensor is not None - assert ti.tensor.nbytes == ti.nbytes - uuidv5_sha1.update(ti.tensor.tobytes('C')) - - return str(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5)) - def write_tensors_to_file(self, *, progress: bool = False) -> None: self.write_ti_data_to_file() From 6db4f52d1c3b74f635fc3b1c4389bf4598052955 Mon Sep 17 00:00:00 2001 From: brian khuu Date: Sun, 28 Jul 2024 02:29:37 +1000 Subject: [PATCH 4/4] convert-*.py: hash pytorch array as numpy without type conversion (except for bf16 which is typecasted upward) --- convert_hf_to_gguf.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8dd1eaffc9836..1cc6d60e8414a 100755 --- a/convert_hf_to_gguf.py +++ 
b/convert_hf_to_gguf.py @@ -273,8 +273,8 @@ def prepare_tensors(self): for name, data_torch in self.get_tensors(): - uuidv5_data_buffer: np.ndarray = data_torch.to(torch.float64).numpy() - uuidv5_sha1.update(uuidv5_data_buffer.tobytes('C')) + uuidv5_data_buffer: np.ndarray = data_torch.numpy() + uuidv5_sha1.update(uuidv5_data_buffer.data.tobytes('C')) # we don't need these if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): @@ -3506,6 +3506,9 @@ class LazyTorchTensor(gguf.LazyBase): torch.float16: np.float16, torch.float32: np.float32, torch.float64: np.float64, + + # No direct mapping available. Cast upwards to avoid loss of precision + torch.bfloat16: np.float32, } # used for safetensors slices