From fa7c39540cff0ce6f6d245baeb15ac1ebe6cdd69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Sat, 12 Aug 2023 15:55:58 +0300 Subject: [PATCH] gguf : start implementing quantization (WIP) --- gguf-llama.cpp | 15 +++++++++++---- gguf-util.h | 12 ++++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/gguf-llama.cpp b/gguf-llama.cpp index f1755fef5..eecefc0f6 100644 --- a/gguf-llama.cpp +++ b/gguf-llama.cpp @@ -525,6 +525,11 @@ struct ggml_context * ctx_data = NULL; // TODO make keysconstants in header // TODO: read all hparams from file + int q_ver_idx = gguf_find_key (gguf_ctx, "general.quantization_version"); + if (q_ver_idx != -1) { + hparams.ftype = gguf_get_val_u32(gguf_ctx, q_ver_idx); + } + hparams.n_vocab = read_n_vocab(); hparams.n_ctx = read_u32("llama.context_length"); hparams.n_embd = read_u32("llama.embedding_length"); @@ -738,27 +743,29 @@ struct gguf_file_saver { info_offset = file.tell(); size_t count = gguf_get_data_offset(fl->gguf_ctx) - info_offset; file.write_zeros(count); - printf("info_offset = %zu\n", info_offset); file.seek(info_offset, SEEK_SET); GGML_ASSERT(info_offset == file.tell()); } - size_t write_tensor_info(llama_load_tensor & tensor) { + size_t write_tensor_info(llama_load_tensor & tensor, enum ggml_type type) { size_t total_written = 0; file.seek(info_offset, SEEK_SET); GGML_ASSERT(info_offset == file.tell()); total_written += file.write_str(tensor.name); +printf("total_written = %zu, name = %s\n", total_written, tensor.name.c_str()); int32_t n_dims = tensor.ne.size(); total_written += file.write_i32(n_dims); for (int32_t i = 0; i < n_dims; ++i) { - total_written += file.write_i32(i); + total_written += file.write_i32(tensor.ne[i]); } + total_written += file.write_i32(type); total_written += file.write_u64(tensor_offset); info_offset += total_written; file.seek(0, SEEK_END); + printf("total_written = %zu\n", total_written); return total_written; } @@ -781,7 +788,7 @@ struct gguf_file_saver { default: GGML_ASSERT(false); } - write_tensor_info(tensor); + write_tensor_info(tensor, new_type); file.write_raw(new_data, new_size); size_t padded_size = GGML_PAD(new_size, GGUF_DEFAULT_ALIGNMENT); // TODO: handle custom alignment size_t pad = padded_size - new_size; diff --git a/gguf-util.h b/gguf-util.h index 17f9dc968..ed7d53f69 100644 --- a/gguf-util.h +++ b/gguf-util.h @@ -109,18 +109,22 @@ struct gguf_file { size_t write_str(const std::string & val) { size_t total_written = 0; const int32_t n = val.size(); - total_written += fwrite((const char *) &n, sizeof(n), 1, fp); - total_written += fwrite(val.c_str(), n, 1, fp); + fwrite((const char *) &n, sizeof(n), 1, fp); + total_written += sizeof(n); + fwrite(val.c_str(), n, 1, fp); + total_written += n; return total_written; } size_t write_i32(int32_t val) { - return fwrite((const char *) &val, sizeof(val), 1, fp); + fwrite((const char *) &val, sizeof(val), 1, fp); + return sizeof(val); } size_t write_u64(size_t val) { - return fwrite((const char *) &val, sizeof(val), 1, fp); + fwrite((const char *) &val, sizeof(val), 1, fp); + return sizeof(val); } void write_raw(const void * data, size_t size) {