wip : implement GGUF (#2397)

* Add LLAMA_DEFAULT_RMS_EPS so we can change the default (#2384)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>

* WIP: Python class to write GGUF, incomplete C API for reading

---------

Co-authored-by: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
M. Yusuf Sarıgöz 2023-07-26 11:17:05 +03:00 committed by Georgi Gerganov
parent 4d698495ea
commit bae6b125f6
3 changed files with 481 additions and 0 deletions

constants.py (new file)

@@ -0,0 +1,32 @@
GGUF_MAGIC = 0x47475546
GGUF_VERSION = 1
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_FILE_TYPE = "general.file_type"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.huggingface.repository"
# LLM
KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length"
KEY_LLM_LAYER_COUNT = "{llm}.layer_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout"
# attention
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv"
# RoPE
KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
KEY_ROPE_SCALE = "{llm}.rope.scale"
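
The {llm} placeholder in the keys above is filled in with the architecture name at write time; the GGUFWriter in gguf.py below does this with str.format. A quick illustration, using a purely hypothetical architecture name:

import constants

# Hypothetical example: expand one of the templated keys for a "llama" model
arch = "llama"
key = constants.KEY_LLM_CONTEXT_LENGTH.format(llm=arch)
print(key)  # prints: llama.context_length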

gguf.c (new file)

@@ -0,0 +1,192 @@
// TODO: convert to a proper gguf.h / gguf.c structure; for now the goal is to move as fast as possible,
// so everything lives in this one file for quick debugging.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
enum ggml_type {
GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
// GGML_TYPE_Q4_2 = 4, support has been removed
// GGML_TYPE_Q4_3 (5) support has been removed
GGML_TYPE_Q5_0 = 6,
GGML_TYPE_Q5_1 = 7,
GGML_TYPE_Q8_0 = 8,
GGML_TYPE_Q8_1 = 9,
// k-quantizations
GGML_TYPE_Q2_K = 10,
GGML_TYPE_Q3_K = 11,
GGML_TYPE_Q4_K = 12,
GGML_TYPE_Q5_K = 13,
GGML_TYPE_Q6_K = 14,
GGML_TYPE_Q8_K = 15,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_COUNT,
};
enum gguf_metadata_value_type {
GGUF_METADATA_VALUE_TYPE_UINT8 = 0,
GGUF_METADATA_VALUE_TYPE_INT8 = 1,
GGUF_METADATA_VALUE_TYPE_UINT16 = 2,
GGUF_METADATA_VALUE_TYPE_INT16 = 3,
GGUF_METADATA_VALUE_TYPE_UINT32 = 4,
GGUF_METADATA_VALUE_TYPE_INT32 = 5,
GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6,
GGUF_METADATA_VALUE_TYPE_BOOL = 7,
GGUF_METADATA_VALUE_TYPE_STRING = 8,
GGUF_METADATA_VALUE_TYPE_ARRAY = 9,
};
struct gguf_string_t {
uint32_t len;
char * string;
};
union gguf_metadata_value_t;
// Union definition for gguf_metadata_value_t
union gguf_metadata_value_t {
uint8_t uint8;
int8_t int8;
uint16_t uint16;
int16_t int16;
uint32_t uint32;
int32_t int32;
float float32;
bool bool_;
struct gguf_string_t string;
struct {
uint32_t len;
enum gguf_metadata_value_type type;
union gguf_metadata_value_t * array;
} array;
};
struct gguf_metadata_kv_t {
struct gguf_string_t key;
uint32_t value_len;
enum gguf_metadata_value_type value_type;
union gguf_metadata_value_t* value;
};
struct gguf_header_t {
uint32_t magic;
uint32_t version;
uint32_t tensor_count;
uint32_t metadata_kv_count;
struct gguf_metadata_kv_t * metadata_kv;
};
struct gguf_tensor_info_t {
struct gguf_string_t name;
uint32_t n_dimensions;
uint32_t dimensions[];
};
struct gguf_file_t {
struct gguf_header_t header;
uint8_t tensor_data[];
};
void read_gguf_file(const char * file_path, struct gguf_file_t * gguf_file) {
FILE* file = fopen(file_path, "rb");
if (file == NULL) {
printf("Error opening the file.\n");
return;
}
fread(&gguf_file->header.magic, sizeof(uint32_t), 1, file);
// Verify magic and version
if (gguf_file->header.magic != 0x47475546) {
printf("Invalid magic number. Not a valid GGUF file.\n");
fclose(file);
return;
}
fread(&gguf_file->header.version, sizeof(uint32_t), 1, file);
if (gguf_file->header.version != 1) {
printf("Unsupported version. Expected version 1.\n");
fclose(file);
return;
}
fread(&gguf_file->header.tensor_count, sizeof(uint32_t), 1, file);
fread(&gguf_file->header.metadata_kv_count, sizeof(uint32_t), 1, file);
printf("Magic: %x\n", gguf_file->header.magic);
printf("Version: %d\n", gguf_file->header.version);
printf("Tensor Count: %d\n", gguf_file->header.tensor_count);
printf("Metadata Key-Value Count: %d\n", gguf_file->header.metadata_kv_count);
    gguf_file->header.metadata_kv = calloc(gguf_file->header.metadata_kv_count, sizeof(struct gguf_metadata_kv_t)); // zero-initialized so gguf_free stays safe after an early return
for (int i = 0; i < gguf_file->header.metadata_kv_count; i++) {
struct gguf_metadata_kv_t* kv = &gguf_file->header.metadata_kv[i];
fread(&kv->key.len, sizeof(uint32_t), 1, file);
        kv->key.string = malloc(kv->key.len + 1); // Allocate memory for the key string (+1 for the terminating NUL)
        fread(kv->key.string, sizeof(char), kv->key.len, file);
        kv->key.string[kv->key.len] = '\0'; // Null-terminate the key string so it can be printed with %s
fread(&kv->value_type, sizeof(uint32_t), 1, file);
printf("Metadata Value Type: %d\n", kv->value_type);
printf("Metadata Key: %s\n", kv->key.string);
        // Read the metadata value according to its type
        switch (kv->value_type) {
            case GGUF_METADATA_VALUE_TYPE_UINT32:
                kv->value = malloc(sizeof(union gguf_metadata_value_t));
                fread(&kv->value->uint32, sizeof(uint32_t), 1, file);
                printf("value: %u\n", kv->value->uint32);
                break;
            case GGUF_METADATA_VALUE_TYPE_FLOAT32:
                kv->value = malloc(sizeof(union gguf_metadata_value_t));
                fread(&kv->value->float32, sizeof(float), 1, file);
                printf("value: %f\n", kv->value->float32);
                break;
            case GGUF_METADATA_VALUE_TYPE_STRING:
                fread(&kv->value_len, sizeof(uint32_t), 1, file);
                printf("value len: %u\n", kv->value_len);
                kv->value = malloc(kv->value_len + 1); // Raw string bytes plus a terminating NUL
                fread(kv->value, sizeof(char), kv->value_len, file);
                ((char *) kv->value)[kv->value_len] = '\0';
                printf("value: %s\n", (char *) kv->value);
                break;
// ... (handle other types in a similar manner)
default:
printf("Unsupported metadata value type.\n");
fclose(file);
return;
}
}
// TODO: handle reading tensor data
fclose(file);
}
void gguf_free(struct gguf_file_t * gguf_file) {
    // Free allocated memory for key strings and values
for (int i = 0; i < gguf_file->header.metadata_kv_count; i++) {
free(gguf_file->header.metadata_kv[i].key.string);
free(gguf_file->header.metadata_kv[i].value);
}
free(gguf_file->header.metadata_kv);
}
int main() {
const char* file_path = "example.gguf";
    struct gguf_file_t gguf_file = {0}; // zero-initialize so gguf_free is safe even if reading fails early
read_gguf_file(file_path, &gguf_file);
gguf_free(&gguf_file);
return 0;
}
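
For cross-checking the C reader above against files produced by the gguf.py writer below, here is a minimal read-back sketch in Python. It assumes the same v1 layout the reader expects (16-byte header, then per key-value pair: a length-prefixed key, a uint32 value type, and a type-dependent value) and, like the C code, only handles UINT32, FLOAT32 and STRING; dump_gguf is a hypothetical helper, not part of this commit.

import struct

# Minimal sketch: parse the GGUF v1 header and simple metadata values.
# Only UINT32 (4), FLOAT32 (6) and STRING (8) are handled, mirroring gguf.c.
def dump_gguf(path: str):
    with open(path, "rb") as f:
        magic, version, tensor_count, kv_count = struct.unpack("<IIII", f.read(16))
        print(f"magic={magic:#x} version={version} tensors={tensor_count} kvs={kv_count}")
        for _ in range(kv_count):
            key_len = struct.unpack("<I", f.read(4))[0]
            key = f.read(key_len).decode("utf-8")
            value_type = struct.unpack("<I", f.read(4))[0]
            if value_type == 4:    # UINT32
                value = struct.unpack("<I", f.read(4))[0]
            elif value_type == 6:  # FLOAT32
                value = struct.unpack("<f", f.read(4))[0]
            elif value_type == 8:  # STRING
                str_len = struct.unpack("<I", f.read(4))[0]
                value = f.read(str_len).decode("utf-8")
            else:
                raise NotImplementedError(f"value type {value_type} not handled in this sketch")
            print(f"{key} = {value!r}")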

gguf.py (new file)

@@ -0,0 +1,257 @@
"""TODOs
1. Implement writing tensor data with alignment.
2. Implement writers for known architectures, LLaMA in particular.
3. Add docstrings from the format specs.
4. After development is done, convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org.
"""
import struct
from enum import IntEnum
from typing import Any
import constants
class GGMLQuantizationType(IntEnum):
F32 = 0
F16 = 1
    Q4_0 = 2
Q4_1 = 3
# Q4_2 = 4 # support has been removed
# Q4_3 = 5 # support has been removed
Q5_0 = 6
Q5_1 = 7
Q8_0 = 8
Q8_1 = 9
Q2_K = 10
Q3_K = 11
Q4_K = 12
Q5_K = 13
Q6_K = 14
Q8_K = 15
class GGUFValueType(IntEnum):
UINT8 = 0
INT8 = 1
UINT16 = 2
INT16 = 3
UINT32 = 4
INT32 = 5
FLOAT32 = 6
BOOL = 7
STRING = 8
ARRAY = 9
@staticmethod
def get_type(value):
if isinstance(value, str):
return GGUFValueType.STRING
elif isinstance(value, list):
return GGUFValueType.ARRAY
elif isinstance(value, float):
return GGUFValueType.FLOAT32
elif isinstance(value, bool):
return GGUFValueType.BOOL
else:
return GGUFValueType.INT32
class GGUFWriter:
def __init__(self, buffered_writer):
self.buffered_writer = buffered_writer
def write_header(self, tensor_count: int, metadata_kv_count: int):
self.buffered_writer.write(struct.pack("<I", constants.GGUF_MAGIC))
self.buffered_writer.write(struct.pack("<I", constants.GGUF_VERSION))
self.buffered_writer.write(struct.pack("<I", tensor_count))
self.buffered_writer.write(struct.pack("<I", metadata_kv_count))
@classmethod
def open(cls, path: str) -> "GGUFWriter":
f = open(path, "wb")
return cls(f)
def write_key(self, key: str, value_type: GGUFValueType):
encoded_key = key.encode("utf8")
self.buffered_writer.write(struct.pack("<I", len(encoded_key)))
self.buffered_writer.write(encoded_key)
self.buffered_writer.write(struct.pack("<I", value_type))
def write_uint8(self, key: str, value: int):
self.write_key(key, GGUFValueType.UINT8)
self.buffered_writer.write(struct.pack("<B", value))
def write_int8(self, key: str, value: int):
self.write_key(key, GGUFValueType.INT8)
self.buffered_writer.write(struct.pack("<b", value))
def write_uint16(self, key: str, value: int):
self.write_key(key, GGUFValueType.UINT16)
self.buffered_writer.write(struct.pack("<H", value))
def write_int16(self, key: str, value: int):
self.write_key(key, GGUFValueType.INT16)
self.buffered_writer.write(struct.pack("<h", value))
def write_uint32(self, key: str, value: int):
self.write_key(key, GGUFValueType.UINT32)
self.buffered_writer.write(struct.pack("<I", value))
def write_int32(self, key: str, value: int):
self.write_key(key, GGUFValueType.INT32)
self.buffered_writer.write(struct.pack("<i", value))
def write_float32(self, key: str, value: float):
self.write_key(key, GGUFValueType.FLOAT32)
self.buffered_writer.write(struct.pack("<f", value))
def write_bool(self, key: str, value: bool):
self.write_key(key, GGUFValueType.BOOL)
self.buffered_writer.write(struct.pack("<?", value))
def write_string(self, key: str, value: str):
self.write_key(key, GGUFValueType.STRING)
encoded_string = value.encode('utf-8')
self.buffered_writer.write(struct.pack("<I", len(encoded_string)))
self.buffered_writer.write(encoded_string)
def write_array(self, key: str, value: list):
if not isinstance(value, list):
raise ValueError("Value must be a list for array type")
self.write_key(key, GGUFValueType.ARRAY)
self.buffered_writer.write(struct.pack("<I", len(value)))
for item in value:
self.write_value(item)
    def write_value(self, value: Any):
value_type = GGUFValueType.get_type(value)
self.buffered_writer.write(struct.pack("<I", value_type))
if value_type == GGUFValueType.UINT8:
self.buffered_writer.write(struct.pack("<B", value))
elif value_type == GGUFValueType.INT8:
self.buffered_writer.write(struct.pack("<b", value))
elif value_type == GGUFValueType.UINT16:
self.buffered_writer.write(struct.pack("<H", value))
elif value_type == GGUFValueType.INT16:
self.buffered_writer.write(struct.pack("<h", value))
elif value_type == GGUFValueType.UINT32:
self.buffered_writer.write(struct.pack("<I", value))
elif value_type == GGUFValueType.INT32:
self.buffered_writer.write(struct.pack("<i", value))
elif value_type == GGUFValueType.FLOAT32:
self.buffered_writer.write(struct.pack("<f", value))
elif value_type == GGUFValueType.BOOL:
self.buffered_writer.write(struct.pack("?", value))
elif value_type == GGUFValueType.STRING:
encoded_value = value.encode("utf8")
self.buffered_writer.write(struct.pack("<I", len(encoded_value)))
self.buffered_writer.write(encoded_value)
elif value_type == GGUFValueType.ARRAY:
self.buffered_writer.write(struct.pack("<I", len(value)))
for item in value:
self.write_value(item)
else:
raise ValueError("Invalid GGUF metadata value type")
def flush(self):
self.buffered_writer.flush()
def close(self):
self.buffered_writer.close()
def write_architecture(self, architecture: str):
self.write_string(constants.KEY_GENERAL_ARCHITECTURE,
architecture)
def write_author(self, author: str):
self.write_string(constants.KEY_GENERAL_AUTHOR, author)
def write_url(self, url: str):
self.write_string(constants.KEY_GENERAL_URL, url)
def write_description(self, description: str):
self.write_string(constants.KEY_GENERAL_DESCRIPTION, description)
def write_file_type(self, file_type: str):
self.write_string(constants.KEY_GENERAL_FILE_TYPE, file_type)
def write_source_url(self, url: str):
self.write_string(constants.KEY_GENERAL_SOURCE_URL, url)
def write_source_hf_repo(self, repo: str):
self.write_string(constants.KEY_GENERAL_SOURCE_HF_REPO, repo)
def write_name(self, name: str):
self.write_string(constants.KEY_GENERAL_NAME, name)
def write_quantization_version(self, quantization_version: GGMLQuantizationType):
self.write_uint32(
constants.KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
def write_context_length(self, llm: str, length: int):
self.write_uint32(
constants.KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length)
def write_embedding_length(self, llm: str, length: int):
self.write_uint32(
constants.KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length)
def write_layer_count(self, llm: str, length: int):
self.write_uint32(
constants.KEY_LLM_LAYER_COUNT.format(llm=llm), length)
def write_feed_forward_length(self, llm: str, length: int):
self.write_uint32(
constants.KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length)
def write_parallel_residual(self, llm: str, use: bool):
self.write_bool(
constants.KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use)
def write_tensor_data_layout(self, llm: str, layout: str):
self.write_string(
constants.KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout)
def write_head_count(self, llm: str, count: int):
self.write_uint32(
constants.KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count)
def write_head_count_kv(self, llm: str, count: int):
self.write_uint32(
constants.KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count)
def write_max_alibi_bias(self, llm: str, bias: float):
self.write_float32(
constants.KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias)
def write_clamp_kqv(self, llm: str, value: float):
self.write_float32(
constants.KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value)
def write_rope_dimension_count(self, llm: str, count: int):
self.write_uint32(
constants.KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count)
def write_rope_scale(self, llm: str, value: float):
self.write_float32(constants.KEY_ROPE_SCALE.format(llm=llm), value)
# Example usage:
if __name__ == "__main__":
    # Write an example file with a header and a few metadata key-value pairs
gguf_writer = GGUFWriter.open("example.gguf")
gguf_writer.write_header(0, 3)
gguf_writer.write_architecture("llama")
gguf_writer.write_uint32("answer", 42) # Write a 32-bit integer
gguf_writer.write_float32("answer_in_float", 42.0) # Write a 32-bit float
# Write an array of integers
#gguf_writer.write_array("simple_array", [1, 2, 3, 4])
# Write a nested array
#gguf_writer.write_array("nested", [1, "nested", [2, 3]])
gguf_writer.close()
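
Regarding TODO 1 at the top of gguf.py (writing tensor data with alignment), one plausible approach is to pad the stream with zero bytes up to a fixed boundary before each tensor's data. A rough sketch under assumptions this commit does not settle: both the 32-byte alignment value and the write_padded_tensor_data helper are hypothetical.

ALIGNMENT = 32  # assumed alignment; the format may settle on a different value

def write_padded_tensor_data(writer: GGUFWriter, data: bytes):
    # Pad with zero bytes until the current file offset is a multiple of ALIGNMENT,
    # then write the raw tensor bytes. Assumes the underlying writer supports tell().
    offset = writer.buffered_writer.tell()
    pad = (ALIGNMENT - offset % ALIGNMENT) % ALIGNMENT
    writer.buffered_writer.write(b"\x00" * pad)
    writer.buffered_writer.write(data)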