Skip to content

Commit 484bba3

Browse files
author
katsu560
committed
add file data to kv as STRING
1 parent ae3b805 commit 484bba3

File tree

4 files changed

+43
-108
lines changed

4 files changed

+43
-108
lines changed

gguf-py/gguf/constants.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
from enum import Enum, IntEnum, auto
44
from typing import Any
5-
from dataclasses import dataclass
65

76
#
87
# constants
@@ -13,15 +12,6 @@
1312
GGUF_DEFAULT_ALIGNMENT = 32
1413
GGML_QUANT_VERSION = 2 # GGML_QNT_VERSION from ggml.h
1514

16-
#
17-
# datatype
18-
#
19-
20-
@dataclass
21-
class NamedObject:
22-
name: str
23-
obj: bytes[Any]
24-
2515
#
2616
# metadata keys
2717
#
@@ -41,8 +31,7 @@ class General:
4131
SOURCE_URL = "general.source.url"
4232
SOURCE_HF_REPO = "general.source.huggingface.repository"
4333
FILE_TYPE = "general.file_type"
44-
NAMEDOBJECT = "general.namedobject"
45-
CONNECT = "."
34+
FILE_MARK = "/"
4635

4736
class LLM:
4837
VOCAB_SIZE = "{arch}.vocab_size"
@@ -913,14 +902,11 @@ class GGUFValueType(IntEnum):
913902
UINT64 = 10
914903
INT64 = 11
915904
FLOAT64 = 12
916-
NAMEDOBJECT = 13
917905

918906
@staticmethod
919907
def get_type(val: Any) -> GGUFValueType:
920908
if isinstance(val, (str, bytes, bytearray)):
921909
return GGUFValueType.STRING
922-
elif isinstance(val, (str, bytes, bytearray)):
923-
return GGUFValueType.NAMEDOBJECT
924910
elif isinstance(val, list):
925911
return GGUFValueType.ARRAY
926912
elif isinstance(val, float):

gguf-py/gguf/gguf_reader.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -170,15 +170,6 @@ def _get_field_parts(
170170
sparts: list[npt.NDArray[Any]] = list(self._get_str(offs))
171171
size = sum(int(part.nbytes) for part in sparts)
172172
return size, sparts, [1], types
173-
# Handle namedobjects.
174-
if gtype == GGUFValueType.NAMEDOBJECT:
175-
nparts: list[npt.NDArray[Any]] = list(self._get_str(offs))
176-
nsize = sum(int(part.nbytes) for part in nparts)
177-
oparts: list[npt.NDArray[Any]] = list(self._get_obj(offs + nsize))
178-
osize = sum(int(part.nbytes) for part in oparts)
179-
nosize = nsize + osize
180-
noparts: list[npt.NDArray[Any]] = list((nparts[0],nparts[1],oparts[0],oparts[1]))
181-
return nosize, noparts, [4], types
182173
# Check if it's a simple scalar type.
183174
nptype = self.gguf_scalar_to_np.get(gtype)
184175
if nptype is not None:

gguf-py/gguf/gguf_writer.py

Lines changed: 9 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,6 @@ def __init__(
7575
"Big" if self.endianess == GGUFEndian.BIG else "Little",
7676
))
7777
self.state = WriterState.EMPTY
78-
# namedobject
79-
self.namedobject_count = 0
8078

8179
self.add_architecture()
8280

@@ -167,33 +165,19 @@ def add_array(self, key: str, val: Sequence[Any]) -> None:
167165
self.add_key(key)
168166
self.add_val(val, GGUFValueType.ARRAY)
169167

170-
def add_namedobject(self, key: str, val: bytes[Any], name: str, array: NamedObject[Any] | None = None) -> None:
171-
# array: False: add as each namedobject, True: add as each element of array of namedobject
172-
if not name:
173-
raise ValueError("Need name for namedobject")
168+
def add_object(self, key: str, val: bytes[Any]) -> None:
174169
if not val:
175-
raise ValueError("Need val for namedobject")
170+
raise ValueError("Need val for object")
176171

177-
if array is None:
178-
self.namedobject_count += 1
179-
key = Keys.General.NAMEDOBJECT + Keys.General.CONNECT + str(self.namedobject_count)
180-
self.add_key(key)
181-
self.add_val(val, GGUFValueType.NAMEDOBJECT, name=name)
182-
else:
183-
# if array, val and name are dummy
184-
key = Keys.General.NAMEDOBJECT
185-
self.add_key(key)
186-
vtype = GGUFValueType.ARRAY
187-
self.kv_data += self._pack("I", vtype)
188-
self.kv_data_count += 1
189-
ltype = GGUFValueType.NAMEDOBJECT
190-
self.kv_data += self._pack("I", ltype)
191-
self.kv_data += self._pack("Q", len(array))
192-
for item in array:
193-
self.add_val(item.obj, GGUFValueType.NAMEDOBJECT, add_vtype=False, name=item.name)
172+
# store object as STRING
173+
self.add_key(key)
174+
self.kv_data += self._pack("I", GGUFValueType.STRING)
175+
self.kv_data_count += 1
176+
self.kv_data += self._pack("Q", len(val))
177+
self.kv_data += val
194178

195179

196-
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True, name: str | None = None) -> None:
180+
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None:
197181
if vtype is None:
198182
vtype = GGUFValueType.get_type(val)
199183

@@ -208,12 +192,6 @@ def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool
208192
encoded_val = val.encode("utf-8") if isinstance(val, str) else val
209193
self.kv_data += self._pack("Q", len(encoded_val))
210194
self.kv_data += encoded_val
211-
elif vtype == GGUFValueType.NAMEDOBJECT:
212-
encoded_name = name.encode("utf8") if isinstance(name, str) else name
213-
self.kv_data += self._pack("Q", len(encoded_name))
214-
self.kv_data += encoded_name
215-
self.kv_data += self._pack("Q", len(val))
216-
self.kv_data += val
217195
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
218196
ltype = GGUFValueType.get_type(val[0])
219197
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):

gguf-py/scripts/gguf-addfile.py

Lines changed: 33 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
#print("add path", str(Path(__file__).parent.parent))
1818
sys.path.insert(0, str(Path(__file__).parent.parent))
1919

20-
from gguf import GGUFReader, GGUFWriter, ReaderField, GGUFEndian, GGUFValueType, Keys, NamedObject # noqa: E402
20+
from gguf import GGUFReader, GGUFWriter, ReaderField, GGUFEndian, GGUFValueType, Keys # noqa: E402
2121

2222
logger = logging.getLogger("gguf-addfile")
2323

@@ -49,10 +49,10 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
4949
if len(field.types) == 1:
5050
curr_type = field.types[0]
5151
if curr_type == GGUFValueType.STRING:
52-
print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '')
53-
elif curr_type == GGUFValueType.NAMEDOBJECT:
54-
print(' = {0}'.format(repr(str(bytes(field.parts[4]), encoding='utf8')[:60])), end = '')
55-
print(', {0}'.format(int(field.parts[5]))[:20], end = '')
52+
if not field.name[0] == Keys.General.FILE_MARK:
53+
print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '')
54+
else:
55+
print(' = binary data', end = '')
5656
elif field.types[0] in reader.gguf_scalar_to_np:
5757
print(' = {0}'.format(field.parts[-1][0]), end = '')
5858
print()
@@ -88,16 +88,17 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
8888
continue
8989
itype = field.types[-1]
9090
if itype == GGUFValueType.STRING:
91-
curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data]
92-
elif itype == GGUFValueType.NAMEDOBJECT:
93-
curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data]
91+
if not field.name[0] == Keys.General.FILE_MARK:
92+
curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data]
93+
else:
94+
curr["value"] = [bytes(field.parts[idx]) for idx in field.data]
9495
else:
9596
curr["value"] = [pv for idx in field.data for pv in field.parts[idx].tolist()]
9697
elif field.types[0] == GGUFValueType.STRING:
97-
curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
98-
elif field.types[0] == GGUFValueType.NAMEDOBJECT:
99-
curr["value"] = str(bytes(field.parts[4]), encoding="utf-8")
100-
curr["value"] = int(field.parts[5])
98+
if not field.name[0] == Keys.General.FILE_MARK:
99+
curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
100+
else:
101+
curr["value"] = bytes(field.parts[-1])
101102
else:
102103
curr["value"] = field.parts[-1].tolist()[0]
103104
if not args.no_tensors:
@@ -135,15 +136,17 @@ def decode_field(field: ReaderField) -> Any:
135136
sub_type = field.types[-1]
136137

137138
if sub_type == GGUFValueType.STRING:
138-
return [str(bytes(field.parts[idx]), encoding='utf8') for idx in field.data]
139-
elif sub_type == GGUFValueType.NAMEDOBJECT:
140-
return [str(bytes(field.parts[idx]), encoding='utf8') for idx in field.data]
139+
if not field.name[0] == Keys.General.FILE_MARK:
140+
return [str(bytes(field.parts[idx]), encoding='utf8') for idx in field.data]
141+
else:
142+
return [bytes(field.parts[idx]) for idx in field.data]
141143
else:
142144
return [pv for idx in field.data for pv in field.parts[idx].tolist()]
143145
if main_type == GGUFValueType.STRING:
144-
return str(bytes(field.parts[-1]), encoding='utf8')
145-
elif main_type == GGUFValueType.NAMEDOBJECT:
146-
return str(bytes(field.parts[4]), encoding='utf8')
146+
if not field.name[0] == Keys.General.FILE_MARK:
147+
return str(bytes(field.parts[-1]), encoding='utf8')
148+
else:
149+
return bytes(field.parts[-1])
147150
else:
148151
return field.parts[-1][0]
149152

@@ -156,7 +159,7 @@ def get_field_data(reader: GGUFReader, key: str) -> Any:
156159
return decode_field(field)
157160

158161

159-
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: Mapping[str, str], array: NamedObject[Any] | None = None) -> None:
162+
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: Mapping[str, str]) -> None:
160163
for field in reader.fields.values():
161164
# Suppress virtual fields and fields written by GGUFWriter
162165
if field.name == Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
@@ -186,18 +189,11 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
186189
writer.add_chat_template(new_metadata[Keys.Tokenizer.CHAT_TEMPLATE])
187190
del new_metadata[Keys.Tokenizer.CHAT_TEMPLATE]
188191

189-
if array is None:
190-
for key, name in new_metadata.items():
191-
logger.debug(f'Adding {key}: {name}')
192-
# named object
193-
with open(name, "rb") as f:
194-
val = f.read()
195-
writer.add_namedobject(key, val, name)
196-
else:
197-
for key, name in new_metadata.items():
198-
logger.debug(f'Adding array {key}: {name}')
199-
# named object
200-
writer.add_namedobject(key, 'val', name, array=array)
192+
for key, name in new_metadata.items():
193+
logger.debug(f'Adding {key}: {name}')
194+
with open(name, "rb") as f:
195+
val = f.read()
196+
writer.add_object(key, val)
201197

202198
for tensor in reader.tensors:
203199
# Dimensions are written in reverse order, so flip them first
@@ -219,7 +215,6 @@ def main() -> None:
219215
parser.add_argument("input", type=str, help="GGUF format model input filename")
220216
parser.add_argument("output", type=str, help="GGUF format model output filename")
221217
parser.add_argument("addfiles", type=str, nargs='+', help="add filenames ...")
222-
parser.add_argument("--array", action="store_true", help="add files to namedobject array")
223218
parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
224219
parser.add_argument("--json", action="store_true", help="Produce JSON output")
225220
parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
@@ -242,27 +237,12 @@ def main() -> None:
242237

243238
logger.info(f'* Adding: {args.addfiles}')
244239
new_metadata = {}
245-
count = 0
246-
if args.array is False:
247-
for path in args.addfiles:
248-
count += 1
249-
key = Keys.General.NAMEDOBJECT + Keys.General.CONNECT + str(count)
250-
new_metadata[key] = path
251-
logger.info(f'* Adding: {key} = {path}')
252-
copy_with_new_metadata(reader, writer, new_metadata)
253-
else:
254-
key = Keys.General.NAMEDOBJECT
255-
# array is dummy
256-
new_metadata[key] = 'array'
257-
files = []
258-
for path in args.addfiles:
259-
with open(path, "rb") as f:
260-
val = f.read()
261-
#print(f'files[{count}] = {path}')
262-
files.append(NamedObject(path, val))
263-
logger.info(f'* Adding: {key}[{count}] = {path}')
264-
count += 1
265-
copy_with_new_metadata(reader, writer, new_metadata, array=files)
240+
for path in args.addfiles:
241+
# add FILE_MARK to key
242+
key = Keys.General.FILE_MARK + path
243+
new_metadata[key] = path
244+
logger.info(f'* Adding: {key} = {path}')
245+
copy_with_new_metadata(reader, writer, new_metadata)
266246

267247
if args.json:
268248
dump_metadata_json(reader, args)

0 commit comments

Comments
 (0)