GGUF : write tensor (#2426)

* WIP: Write tensor

* GGUF : Support writing tensors in Python

* refactor : rm unused import and upd todos

* fix : fix errors upd writing example

* rm example.gguf

* gitignore *.gguf

* undo formatting
Author: M. Yusuf Sarıgöz, 2023-07-28 11:34:16 +03:00 (committed by GitHub)
Commit: 11ef380c2a (parent: d2bb3ac10b)
3 changed files with 72 additions and 35 deletions

.gitignore

@@ -1,6 +1,7 @@
 *.o
 *.a
 *.so
+*.gguf
 .DS_Store
 .build/
 .cache/

constants.py

@@ -1,5 +1,6 @@
 GGUF_MAGIC = 0x47475546
 GGUF_VERSION = 1
+GGUF_DEFAULT_ALIGNMENT = 32

 # general
 KEY_GENERAL_ARCHITECTURE = "general.architecture"
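
For reference, GGUF_MAGIC is the ASCII string "GGUF" read as a big-endian 32-bit integer, and the new GGUF_DEFAULT_ALIGNMENT feeds the ggml_pad() helper added to gguf.py below. A quick standalone sanity check, not part of the patch:

    # Illustration only -- sanity check of the new constants.
    GGUF_MAGIC = 0x47475546
    GGUF_DEFAULT_ALIGNMENT = 32

    # The magic is "GGUF" viewed as a big-endian 32-bit integer.
    assert GGUF_MAGIC.to_bytes(4, "big") == b"GGUF"

    # ggml_pad() (introduced in gguf.py below) rounds a size up to the alignment.
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n

    assert ggml_pad(1, GGUF_DEFAULT_ALIGNMENT) == 32
    assert ggml_pad(32, GGUF_DEFAULT_ALIGNMENT) == 32
    assert ggml_pad(33, GGUF_DEFAULT_ALIGNMENT) == 64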

gguf.py

@@ -1,14 +1,16 @@
 """TODOs
-1. Implement writing tensor data with alignment.
-2. Implement writers for known architectures, LLaMA in particular.
-3. Add docstrings from the format specs.
-4. After development is done, Convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org.
+1. Implement writers for known architectures, LLaMA in particular.
+2. Add docstrings from the format specs.
+3. After development is done, Convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org.
 """
+
 import struct
 import constants
 from enum import IntEnum
-from typing import List, Any
+from typing import Any, IO, List
+
+import numpy as np


 class GGMLQuantizationType(IntEnum):
     F32 = 0
@@ -54,15 +56,18 @@ class GGUFValueType(IntEnum):
         else:
             return GGUFValueType.INT32


 class GGUFWriter:
-    def __init__(self, buffered_writer):
-        self.buffered_writer = buffered_writer
+    def __init__(self, fout: IO):
+        self.fout = fout
+        self.offset_tensor = 0
+        self.tensors: List[np.ndarray] = []

     def write_header(self, tensor_count: int, metadata_kv_count: int):
-        self.buffered_writer.write(struct.pack("<I", constants.GGUF_MAGIC))
-        self.buffered_writer.write(struct.pack("<I", constants.GGUF_VERSION))
-        self.buffered_writer.write(struct.pack("<I", tensor_count))
-        self.buffered_writer.write(struct.pack("<I", metadata_kv_count))
+        self.fout.write(struct.pack("<I", constants.GGUF_MAGIC))
+        self.fout.write(struct.pack("<I", constants.GGUF_VERSION))
+        self.fout.write(struct.pack("<I", tensor_count))
+        self.fout.write(struct.pack("<I", metadata_kv_count))

     @classmethod
     def open(cls, path: str) -> "GGUFWriter":
@@ -119,40 +124,69 @@ class GGUFWriter:
         if vtype is None:
             vtype = GGUFValueType.get_type(val)

-        self.buffered_writer.write(struct.pack("<I", vtype))
+        self.fout.write(struct.pack("<I", vtype))

         if vtype == GGUFValueType.UINT8:
-            self.buffered_writer.write(struct.pack("<B", val))
+            self.fout.write(struct.pack("<B", val))
         elif vtype == GGUFValueType.INT8:
-            self.buffered_writer.write(struct.pack("<b", val))
+            self.fout.write(struct.pack("<b", val))
         elif vtype == GGUFValueType.UINT16:
-            self.buffered_writer.write(struct.pack("<H", val))
+            self.fout.write(struct.pack("<H", val))
         elif vtype == GGUFValueType.INT16:
-            self.buffered_writer.write(struct.pack("<h", val))
+            self.fout.write(struct.pack("<h", val))
         elif vtype == GGUFValueType.UINT32:
-            self.buffered_writer.write(struct.pack("<I", val))
+            self.fout.write(struct.pack("<I", val))
         elif vtype == GGUFValueType.INT32:
-            self.buffered_writer.write(struct.pack("<i", val))
+            self.fout.write(struct.pack("<i", val))
         elif vtype == GGUFValueType.FLOAT32:
-            self.buffered_writer.write(struct.pack("<f", val))
+            self.fout.write(struct.pack("<f", val))
         elif vtype == GGUFValueType.BOOL:
-            self.buffered_writer.write(struct.pack("?", val))
+            self.fout.write(struct.pack("?", val))
         elif vtype == GGUFValueType.STRING:
             encoded_val = val.encode("utf8")
-            self.buffered_writer.write(struct.pack("<I", len(encoded_val)))
-            self.buffered_writer.write(encoded_val)
+            self.fout.write(struct.pack("<I", len(encoded_val)))
+            self.fout.write(encoded_val)
         elif vtype == GGUFValueType.ARRAY:
-            self.buffered_writer.write(struct.pack("<I", len(val)))
+            self.fout.write(struct.pack("<I", len(val)))
             for item in val:
                 self.write_val(item)
         else:
             raise ValueError("Invalid GGUF metadata value type")

+    @staticmethod
+    def ggml_pad(x: int, n: int) -> int:
+        return ((x + n - 1) // n) * n
+
+    def write_tensor_info(self, name: str, tensor: np.ndarray):
+        self.write_val(name, GGUFValueType.STRING)
+        n_dims = len(tensor.shape)
+        self.write_val(n_dims, GGUFValueType.INT32)
+        for i in range(n_dims):
+            self.write_val(tensor.shape[n_dims - 1 - i], GGUFValueType.INT32)
+
+        assert tensor.dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
+        dtype = GGMLQuantizationType.F32 if tensor.dtype == np.float32 else GGMLQuantizationType.F16
+        self.write_val(dtype, GGUFValueType.INT32)
+        self.fout.write(struct.pack("<Q", self.offset_tensor))
+        self.offset_tensor += GGUFWriter.ggml_pad(tensor.nbytes, constants.GGUF_DEFAULT_ALIGNMENT)
+
+        offset_data = GGUFWriter.ggml_pad(self.fout.tell(), constants.GGUF_DEFAULT_ALIGNMENT)
+        pad = offset_data - self.fout.tell()
+        self.fout.write(bytes([0] * pad))
+
+        self.tensors.append(tensor)
+
+    def write_tensors(self):
+        for tensor in self.tensors:
+            tensor.tofile(self.fout)
+            pad = GGUFWriter.ggml_pad(tensor.nbytes, constants.GGUF_DEFAULT_ALIGNMENT) - tensor.nbytes
+            self.fout.write(bytes([0] * pad))
+
     def flush(self):
-        self.buffered_writer.flush()
+        self.fout.flush()

     def close(self):
-        self.buffered_writer.close()
+        self.fout.close()

     def write_architecture(self, architecture: str):
         self.write_string(constants.KEY_GENERAL_ARCHITECTURE,
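
To make the offset bookkeeping in write_tensor_info() concrete, here is the arithmetic for the two tensors used in the example at the bottom of the file (a worked sketch, not part of the patch):

    # tensor1: 7 x 10 float32 -> 280 bytes; ggml_pad(280, 32) = 288, so
    # tensor2 starts at offset 288 in the data section and write_tensors()
    # appends 8 zero bytes of padding after tensor1.
    assert ((280 + 31) // 32) * 32 == 288
    # tensor2: 16 x 12 float16 -> 384 bytes, already a multiple of 32,
    # so no padding follows it.
    assert ((384 + 31) // 32) * 32 == 384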
@@ -235,14 +269,15 @@ class GGUFWriter:
 if __name__ == "__main__":
     # Example usage with a file
     gguf_writer = GGUFWriter.open("example.gguf")
-    gguf_writer.write_header(0, 3)
+    gguf_writer.write_header(2, 3)

     gguf_writer.write_architecture("llama")
     gguf_writer.write_uint32("answer", 42)  # Write a 32-bit integer
     gguf_writer.write_float32("answer_in_float", 42.0)  # Write a 32-bit float
-    # Write an array of integers
-    #gguf_writer.write_array("simple_array", [1, 2, 3, 4])
-    # Write a nested array
-    #gguf_writer.write_array("nested", [1, "nested", [2, 3]])
+    tensor1 = np.random.random(size=(7, 10)).astype(np.float32)
+    tensor2 = np.random.random(size=(16, 12)).astype(np.float16)
+    gguf_writer.write_tensor_info("tensor1", tensor1)
+    gguf_writer.write_tensor_info("tensor2", tensor2)
+    gguf_writer.write_tensors()

     gguf_writer.close()
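
A minimal read-back sketch, assuming example.gguf was produced by running the __main__ block above (not part of the patch; the field order mirrors write_header()):

    import struct

    # The v1 header is four little-endian uint32 fields: magic, version,
    # tensor count, metadata KV count -- 16 bytes in total.
    with open("example.gguf", "rb") as f:
        magic, version, n_tensors, n_kv = struct.unpack("<IIII", f.read(16))

    assert magic == 0x47475546
    assert version == 1
    print(n_tensors, n_kv)  # -> 2 3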