Skip to content

Commit ae3b805

Browse files
author
katsu560
committed
gguf : embed files to gguf
1 parent 5ca49cb commit ae3b805

File tree

4 files changed

+341
-1
lines changed

4 files changed

+341
-1
lines changed

gguf-py/gguf/constants.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from enum import Enum, IntEnum, auto
44
from typing import Any
5+
from dataclasses import dataclass
56

67
#
78
# constants
@@ -12,6 +13,15 @@
1213
GGUF_DEFAULT_ALIGNMENT = 32
1314
GGML_QUANT_VERSION = 2 # GGML_QNT_VERSION from ggml.h
1415

16+
#
17+
# datatype
18+
#
19+
20+
@dataclass
class NamedObject:
    """A named binary payload that can be embedded in a GGUF file."""

    # Name stored alongside the payload.
    name: str
    # Payload bytes. `bytes` is not a generic type, so it must not be
    # subscripted (`bytes[Any]` fails when the annotation is evaluated).
    obj: bytes
24+
1525
#
1626
# metadata keys
1727
#
@@ -31,6 +41,8 @@ class General:
3141
SOURCE_URL = "general.source.url"
3242
SOURCE_HF_REPO = "general.source.huggingface.repository"
3343
FILE_TYPE = "general.file_type"
44+
NAMEDOBJECT = "general.namedobject"
45+
CONNECT = "."
3446

3547
class LLM:
3648
VOCAB_SIZE = "{arch}.vocab_size"
@@ -901,11 +913,14 @@ class GGUFValueType(IntEnum):
901913
UINT64 = 10
902914
INT64 = 11
903915
FLOAT64 = 12
916+
NAMEDOBJECT = 13
904917

905918
@staticmethod
906919
def get_type(val: Any) -> GGUFValueType:
907920
if isinstance(val, (str, bytes, bytearray)):
908921
return GGUFValueType.STRING
922+
elif isinstance(val, (str, bytes, bytearray)):
923+
return GGUFValueType.NAMEDOBJECT
909924
elif isinstance(val, list):
910925
return GGUFValueType.ARRAY
911926
elif isinstance(val, float):

gguf-py/gguf/gguf_reader.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.
154154
slen = self._get(offset, np.uint64)
155155
return slen, self._get(offset + 8, np.uint8, slen[0])
156156

157+
def _get_obj(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]:
158+
olen = self._get(offset, np.uint64)
159+
return olen, self._get(offset + 8, np.uint8, olen[0])
160+
157161
def _get_field_parts(
158162
self, orig_offs: int, raw_type: int,
159163
) -> tuple[int, list[npt.NDArray[Any]], list[int], list[GGUFValueType]]:
@@ -166,6 +170,15 @@ def _get_field_parts(
166170
sparts: list[npt.NDArray[Any]] = list(self._get_str(offs))
167171
size = sum(int(part.nbytes) for part in sparts)
168172
return size, sparts, [1], types
173+
# Handle namedobjects.
174+
if gtype == GGUFValueType.NAMEDOBJECT:
175+
nparts: list[npt.NDArray[Any]] = list(self._get_str(offs))
176+
nsize = sum(int(part.nbytes) for part in nparts)
177+
oparts: list[npt.NDArray[Any]] = list(self._get_obj(offs + nsize))
178+
osize = sum(int(part.nbytes) for part in oparts)
179+
nosize = nsize + osize
180+
noparts: list[npt.NDArray[Any]] = list((nparts[0],nparts[1],oparts[0],oparts[1]))
181+
return nosize, noparts, [4], types
169182
# Check if it's a simple scalar type.
170183
nptype = self.gguf_scalar_to_np.get(gtype)
171184
if nptype is not None:

gguf-py/gguf/gguf_writer.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ def __init__(
7575
"Big" if self.endianess == GGUFEndian.BIG else "Little",
7676
))
7777
self.state = WriterState.EMPTY
78+
# namedobject
79+
self.namedobject_count = 0
7880

7981
self.add_architecture()
8082

@@ -165,7 +167,33 @@ def add_array(self, key: str, val: Sequence[Any]) -> None:
165167
self.add_key(key)
166168
self.add_val(val, GGUFValueType.ARRAY)
167169

168-
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None:
170+
def add_namedobject(self, key: str, val: bytes, name: str, array: Sequence[NamedObject] | None = None) -> None:
    """Add one named object, or an array of named objects, to the KV data.

    With ``array`` left as ``None`` a single object is written under an
    auto-numbered key built from ``Keys.General.NAMEDOBJECT``,
    ``Keys.General.CONNECT`` and this writer's running object counter.
    With ``array`` given, all of its items are written as one ARRAY value
    under the key ``Keys.General.NAMEDOBJECT``.

    Args:
        key: Kept for interface compatibility only; the key actually
            written is always derived from ``Keys.General.NAMEDOBJECT``.
        val: Object payload. Used only when ``array`` is ``None``.
        name: Object name. Used only when ``array`` is ``None``.
        array: Items exposing ``.name`` and ``.obj``. When given, ``val``
            and ``name`` are ignored and are not validated.

    Raises:
        ValueError: If ``array`` is ``None`` and ``name`` or ``val`` is empty.
    """
    if array is None:
        # Only the single-object form consumes name/val, so only this
        # form validates them; array callers need not pass dummy values.
        if not name:
            raise ValueError("Need name for namedobject")
        if not val:
            raise ValueError("Need val for namedobject")

        self.namedobject_count += 1
        numbered_key = Keys.General.NAMEDOBJECT + Keys.General.CONNECT + str(self.namedobject_count)
        self.add_key(numbered_key)
        self.add_val(val, GGUFValueType.NAMEDOBJECT, name=name)
        return

    self.add_key(Keys.General.NAMEDOBJECT)
    # The array header is written by hand: value type tag, element type
    # tag, then the element count. The KV counter is bumped here because
    # the per-item add_val calls below pass add_vtype=False
    # (presumably add_val only counts a KV when add_vtype is true -- confirm).
    self.kv_data += self._pack("I", GGUFValueType.ARRAY)
    self.kv_data_count += 1
    self.kv_data += self._pack("I", GGUFValueType.NAMEDOBJECT)
    self.kv_data += self._pack("Q", len(array))
    for item in array:
        self.add_val(item.obj, GGUFValueType.NAMEDOBJECT, add_vtype=False, name=item.name)
194+
195+
196+
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True, name: str | None = None) -> None:
169197
if vtype is None:
170198
vtype = GGUFValueType.get_type(val)
171199

@@ -180,6 +208,12 @@ def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool
180208
encoded_val = val.encode("utf-8") if isinstance(val, str) else val
181209
self.kv_data += self._pack("Q", len(encoded_val))
182210
self.kv_data += encoded_val
211+
elif vtype == GGUFValueType.NAMEDOBJECT:
212+
encoded_name = name.encode("utf8") if isinstance(name, str) else name
213+
self.kv_data += self._pack("Q", len(encoded_name))
214+
self.kv_data += encoded_name
215+
self.kv_data += self._pack("Q", len(val))
216+
self.kv_data += val
183217
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
184218
ltype = GGUFValueType.get_type(val[0])
185219
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):

0 commit comments

Comments
 (0)