mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 11:24:35 +00:00
llama : refactor model loading code (#2620)
* llama : style formatting + remove helper methods * llama : fix quantization using gguf tool * llama : simplify gguf_file_saver * llama : fix method names * llama : simplify write_header() * llama : no need to pass full file loader to the file saver just gguf_ctx * llama : gguf_file_saver write I32 * llama : refactor tensor names (#2622) * gguf: update tensor names searched in quantization * gguf : define tensor names as constants * gguf : initial write API (not tested yet) * gguf : write to file API (not tested) * gguf : initial write API ready + example * gguf : fix header write * gguf : fixes + simplify example + add ggml_nbytes_pad() * gguf : minor * llama : replace gguf_file_saver with new gguf write API * gguf : streaming support when writing files * gguf : remove oboslete write methods * gguf : remove obosolete gguf_get_arr_xxx API * llama : simplify gguf_file_loader * llama : move hparams and vocab from gguf_file_loader to llama_model_loader * llama : merge gguf-util.h in llama.cpp * llama : reorder definitions in .cpp to match .h * llama : minor simplifications * llama : refactor llama_model_loader (WIP) wip : remove ggml_ctx from llama_model_loader wip : merge gguf_file_loader in llama_model_loader * llama : fix shape prints * llama : fix Windows build + fix norm_rms_eps key * llama : throw error on missing KV paris in model meta data * llama : improve printing + log meta data * llama : switch print order of meta data --------- Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
This commit is contained in:
parent
ea5615a03a
commit
758ff1bbb5
2
Makefile
2
Makefile
@ -332,7 +332,7 @@ OBJS += ggml-alloc.o
|
||||
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h gguf-util.h
|
||||
gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common.o: examples/common.cpp examples/common.h
|
||||
|
@ -74,7 +74,9 @@ int main(int argc, char ** argv) {
|
||||
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
|
||||
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
|
||||
|
||||
while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
|
||||
const int n_gen = std::min(32, max_context_size);
|
||||
|
||||
while (llama_get_kv_cache_token_count(ctx) < n_gen) {
|
||||
// evaluate the transformer
|
||||
|
||||
if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
|
||||
@ -114,7 +116,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// push this new token for next evaluation
|
||||
tokens_list.push_back(new_token_id);
|
||||
|
||||
}
|
||||
|
||||
llama_free(ctx);
|
||||
@ -122,5 +123,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,5 +1,4 @@
|
||||
#include "ggml.h"
|
||||
#include "gguf-util.h"
|
||||
#include "gguf-llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
@ -21,133 +20,22 @@ static std::string to_string(const T & val) {
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
void gguf_ex_write_str(std::ofstream & fout, const std::string & val) {
|
||||
const int32_t n = val.size();
|
||||
fout.write((const char *) &n, sizeof(n));
|
||||
fout.write(val.c_str(), n);
|
||||
}
|
||||
|
||||
void gguf_ex_write_i32(std::ofstream & fout, int32_t val) {
|
||||
fout.write((const char *) &val, sizeof(val));
|
||||
}
|
||||
|
||||
void gguf_ex_write_u64(std::ofstream & fout, size_t val) {
|
||||
fout.write((const char *) &val, sizeof(val));
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void gguf_ex_write_val(std::ofstream & fout, const std::string & key, enum gguf_type type, const T & val) {
|
||||
gguf_ex_write_str(fout, key);
|
||||
fout.write((const char *) &type, sizeof(type));
|
||||
fout.write((const char *) &val, sizeof(val));
|
||||
|
||||
fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), to_string(val).c_str());
|
||||
}
|
||||
|
||||
template<>
|
||||
void gguf_ex_write_val<std::string>(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::string & val) {
|
||||
gguf_ex_write_str(fout, key);
|
||||
fout.write((const char *) &type, sizeof(type));
|
||||
|
||||
const int32_t n = val.size();
|
||||
fout.write((const char *) &n, sizeof(n));
|
||||
fout.write(val.c_str(), n);
|
||||
|
||||
fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), val.c_str());
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void gguf_ex_write_arr(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector<T> & val) {
|
||||
gguf_ex_write_str(fout, key);
|
||||
{
|
||||
const enum gguf_type tarr = GGUF_TYPE_ARRAY;
|
||||
fout.write((const char *) &tarr, sizeof(tarr));
|
||||
}
|
||||
|
||||
const int32_t n = val.size();
|
||||
fout.write((const char *) &type, sizeof(type));
|
||||
fout.write((const char *) &n, sizeof(n));
|
||||
fout.write((const char *) val.data(), n * sizeof(T));
|
||||
|
||||
fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str());
|
||||
for (int i = 0; i < n; ++i) {
|
||||
fprintf(stdout, "%s", to_string(val[i]).c_str());
|
||||
if (i < n - 1) {
|
||||
fprintf(stdout, ", ");
|
||||
}
|
||||
}
|
||||
fprintf(stdout, "]\n");
|
||||
}
|
||||
|
||||
template<>
|
||||
void gguf_ex_write_arr<std::string>(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector<std::string> & val) {
|
||||
gguf_ex_write_str(fout, key);
|
||||
{
|
||||
const enum gguf_type tarr = GGUF_TYPE_ARRAY;
|
||||
fout.write((const char *) &tarr, sizeof(tarr));
|
||||
}
|
||||
|
||||
const int32_t n = val.size();
|
||||
fout.write((const char *) &type, sizeof(type));
|
||||
fout.write((const char *) &n, sizeof(n));
|
||||
for (int i = 0; i < n; ++i) {
|
||||
const int32_t nstr = val[i].size();
|
||||
fout.write((const char *) &nstr, sizeof(nstr));
|
||||
fout.write(val[i].c_str(), nstr);
|
||||
}
|
||||
|
||||
fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str());
|
||||
for (int i = 0; i < n; ++i) {
|
||||
fprintf(stdout, "%s", val[i].c_str());
|
||||
if (i < n - 1) {
|
||||
fprintf(stdout, ", ");
|
||||
}
|
||||
}
|
||||
fprintf(stdout, "]\n");
|
||||
}
|
||||
|
||||
bool gguf_ex_write(const std::string & fname) {
|
||||
std::ofstream fout(fname.c_str(), std::ios::binary);
|
||||
struct gguf_context * ctx = gguf_init_empty();
|
||||
|
||||
{
|
||||
const int32_t magic = GGUF_MAGIC;
|
||||
fout.write((const char *) &magic, sizeof(magic));
|
||||
}
|
||||
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
|
||||
gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
|
||||
gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
|
||||
gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
|
||||
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
|
||||
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
|
||||
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
|
||||
gguf_set_val_bool(ctx, "some.parameter.bool", true);
|
||||
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
|
||||
|
||||
{
|
||||
const int32_t version = GGUF_VERSION;
|
||||
fout.write((const char *) &version, sizeof(version));
|
||||
}
|
||||
|
||||
// NOTE: these have to match the output below!
|
||||
const int n_tensors = 10;
|
||||
const int n_kv = 12;
|
||||
|
||||
fout.write((const char*) &n_tensors, sizeof(n_tensors));
|
||||
fout.write((const char*) &n_kv, sizeof(n_kv));
|
||||
|
||||
fprintf(stdout, "%s: write header\n", __func__);
|
||||
|
||||
// kv data
|
||||
{
|
||||
gguf_ex_write_val< uint8_t>(fout, "some.parameter.uint8", GGUF_TYPE_UINT8, 0x12);
|
||||
gguf_ex_write_val< int8_t>(fout, "some.parameter.int8", GGUF_TYPE_INT8, -0x13);
|
||||
gguf_ex_write_val<uint16_t>(fout, "some.parameter.uint16", GGUF_TYPE_UINT16, 0x1234);
|
||||
gguf_ex_write_val< int16_t>(fout, "some.parameter.int16", GGUF_TYPE_INT16, -0x1235);
|
||||
gguf_ex_write_val<uint32_t>(fout, "some.parameter.uint32", GGUF_TYPE_UINT32, 0x12345678);
|
||||
gguf_ex_write_val< int32_t>(fout, "some.parameter.int32", GGUF_TYPE_INT32, -0x12345679);
|
||||
|
||||
gguf_ex_write_val<float> (fout, "some.parameter.float32", GGUF_TYPE_FLOAT32, 0.123456789f);
|
||||
gguf_ex_write_val<bool> (fout, "some.parameter.bool", GGUF_TYPE_BOOL, true);
|
||||
|
||||
gguf_ex_write_val<std::string>(fout, "some.parameter.string", GGUF_TYPE_STRING, "hello world");
|
||||
|
||||
gguf_ex_write_arr<int16_t> (fout, "some.parameter.arr.i16", GGUF_TYPE_INT16, { 1, 2, 3, 4, });
|
||||
gguf_ex_write_arr<float> (fout, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, { 3.145f, 2.718f, 1.414f, });
|
||||
gguf_ex_write_arr<std::string>(fout, "some.parameter.arr.str", GGUF_TYPE_STRING, { "hello", "world", "!" });
|
||||
}
|
||||
|
||||
uint64_t offset_tensor = 0;
|
||||
gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
|
||||
gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
|
||||
gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ 128ull*1024ull*1024ull,
|
||||
@ -157,6 +45,8 @@ bool gguf_ex_write(const std::string & fname) {
|
||||
|
||||
struct ggml_context * ctx_data = ggml_init(params);
|
||||
|
||||
const int n_tensors = 10;
|
||||
|
||||
// tensor infos
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const std::string name = "tensor_" + to_string(i);
|
||||
@ -178,58 +68,15 @@ bool gguf_ex_write(const std::string & fname) {
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stdout, "%s: tensor: %s, %d dims, ne = [", __func__, name.c_str(), n_dims);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
fprintf(stdout, "%s%3d", j == 0 ? "" : ", ", (int) cur->ne[j]);
|
||||
}
|
||||
fprintf(stdout, "], offset_tensor = %6" PRIu64 "\n", offset_tensor);
|
||||
|
||||
gguf_ex_write_str(fout, name);
|
||||
gguf_ex_write_i32(fout, n_dims);
|
||||
for (int j = 0; j < n_dims; ++j) {
|
||||
gguf_ex_write_i32(fout, cur->ne[j]);
|
||||
}
|
||||
gguf_ex_write_i32(fout, cur->type);
|
||||
gguf_ex_write_u64(fout, offset_tensor);
|
||||
|
||||
offset_tensor += GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT);
|
||||
gguf_add_tensor(ctx, cur);
|
||||
}
|
||||
|
||||
const uint64_t offset_data = GGML_PAD((uint64_t) fout.tellp(), GGUF_DEFAULT_ALIGNMENT);
|
||||
|
||||
fprintf(stdout, "%s: data offset = %" PRIu64 "\n", __func__, offset_data);
|
||||
|
||||
{
|
||||
const size_t pad = offset_data - fout.tellp();
|
||||
|
||||
for (size_t j = 0; j < pad; ++j) {
|
||||
fout.put(0);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
fprintf(stdout, "%s: writing tensor %d data\n", __func__, i);
|
||||
|
||||
const std::string name = "tensor_" + to_string(i);
|
||||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());
|
||||
|
||||
fout.write((const char *) cur->data, ggml_nbytes(cur));
|
||||
|
||||
{
|
||||
const size_t pad = GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT) - ggml_nbytes(cur);
|
||||
|
||||
for (size_t j = 0; j < pad; ++j) {
|
||||
fout.put(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fout.close();
|
||||
gguf_write_to_file(ctx, fname.c_str(), false);
|
||||
|
||||
fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
|
||||
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -345,8 +192,16 @@ bool gguf_ex_read_1(const std::string & fname) {
|
||||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
||||
|
||||
fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n",
|
||||
__func__, i, cur->n_dims, cur->name, cur->data);
|
||||
fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
|
||||
|
||||
// print first 10 elements
|
||||
const float * data = (const float *) cur->data;
|
||||
|
||||
printf("%s data[:10] : ", name);
|
||||
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
|
||||
printf("%f ", data[j]);
|
||||
}
|
||||
printf("\n\n");
|
||||
|
||||
// check data
|
||||
{
|
||||
@ -369,48 +224,6 @@ bool gguf_ex_read_1(const std::string & fname) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// read just the tensor info and mmap the data in user code
|
||||
bool gguf_ex_read_2(const std::string & fname) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx_data,
|
||||
};
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
|
||||
|
||||
struct gguf_file file(fname.c_str(), "rb");
|
||||
gguf_mmap data_mmap(&file, 0, false);
|
||||
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
||||
|
||||
cur->data = static_cast<char *>(data_mmap.addr) + offset;
|
||||
|
||||
// print first 10 elements
|
||||
const float * data = (const float *) cur->data;
|
||||
|
||||
printf("%s data[:10] : ", name);
|
||||
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
|
||||
printf("%f ", data[j]);
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
|
||||
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 3) {
|
||||
fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
|
||||
@ -427,7 +240,6 @@ int main(int argc, char ** argv) {
|
||||
} else if (mode == "r") {
|
||||
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
|
||||
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
|
||||
GGML_ASSERT(gguf_ex_read_2(fname) && "failed to read gguf file");
|
||||
} else if (mode == "q") {
|
||||
llama_model_quantize_params params = llama_model_quantize_default_params();
|
||||
llama_model_quantize(fname.c_str(), "quant.gguf", ¶ms);
|
||||
|
595
ggml.c
595
ggml.c
@ -213,10 +213,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
||||
error_desc = "insufficient memory";
|
||||
break;
|
||||
}
|
||||
GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
|
||||
__func__, error_desc, size/(1024.0*1024.0));
|
||||
GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return aligned_memory;
|
||||
}
|
||||
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
|
||||
@ -4109,7 +4109,11 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
||||
//
|
||||
// is enough, but just in case, adding the second part
|
||||
|
||||
return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
|
||||
return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
|
||||
}
|
||||
|
||||
size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
|
||||
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
||||
}
|
||||
|
||||
size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
|
||||
@ -16899,7 +16903,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
||||
// compute size of intermediate results
|
||||
// TODO: does not take into account scratch buffers !!!!
|
||||
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
||||
size_eval += ggml_nbytes(cgraph->nodes[i]);
|
||||
size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
|
||||
}
|
||||
|
||||
// print
|
||||
@ -18579,6 +18583,19 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
|
||||
};
|
||||
static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
|
||||
|
||||
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
|
||||
[GGUF_TYPE_UINT8] = "uint8",
|
||||
[GGUF_TYPE_INT8] = "int8",
|
||||
[GGUF_TYPE_UINT16] = "uint16",
|
||||
[GGUF_TYPE_INT16] = "int16",
|
||||
[GGUF_TYPE_UINT32] = "uint32",
|
||||
[GGUF_TYPE_INT32] = "int32",
|
||||
[GGUF_TYPE_FLOAT32] = "float32",
|
||||
[GGUF_TYPE_BOOL] = "bool",
|
||||
[GGUF_TYPE_STRING] = "string",
|
||||
[GGUF_TYPE_ARRAY] = "array",
|
||||
};
|
||||
|
||||
union gguf_value {
|
||||
uint8_t uint8;
|
||||
int8_t int8;
|
||||
@ -18613,8 +18630,6 @@ struct gguf_header {
|
||||
uint32_t version;
|
||||
uint32_t n_tensors;
|
||||
uint32_t n_kv;
|
||||
|
||||
struct gguf_kv * kv;
|
||||
};
|
||||
|
||||
struct gguf_tensor_info {
|
||||
@ -18622,44 +18637,69 @@ struct gguf_tensor_info {
|
||||
|
||||
uint32_t n_dims;
|
||||
uint32_t ne[GGML_MAX_DIMS];
|
||||
uint32_t n_elms; // TODO: is this needed?
|
||||
|
||||
enum ggml_type type;
|
||||
|
||||
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
|
||||
|
||||
// for writing API
|
||||
const void * data;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct gguf_context {
|
||||
struct gguf_header header;
|
||||
|
||||
struct gguf_kv * kv;
|
||||
struct gguf_tensor_info * infos;
|
||||
|
||||
size_t alignment;
|
||||
size_t offset; // offset of `data` from beginning of file
|
||||
size_t size_data; // size of `data` in bytes
|
||||
size_t size; // size of `data` in bytes
|
||||
|
||||
//uint8_t * padding;
|
||||
uint8_t * data;
|
||||
void * data;
|
||||
};
|
||||
|
||||
static bool gguf_fread_el(void * dst, size_t size, FILE * file, size_t * offset) {
|
||||
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
|
||||
const size_t n = fread(dst, 1, size, file);
|
||||
*offset += n;
|
||||
return n == size;
|
||||
}
|
||||
|
||||
static bool gguf_fread_str(struct gguf_str * p, FILE * file, size_t * offset) {
|
||||
static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||
p->n = 0;
|
||||
p->data = NULL;
|
||||
|
||||
bool ok = true;
|
||||
|
||||
// TODO: how to avoid mallocs for strings?
|
||||
ok = ok && gguf_fread_el(&p->n, sizeof(p->n), file, offset); p->data = calloc(p->n + 1, 1);
|
||||
ok = ok && gguf_fread_el( p->data, p->n, file, offset);
|
||||
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
|
||||
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
||||
|
||||
return ok;
|
||||
}
|
||||
|
||||
struct gguf_context * gguf_init_empty(void) {
|
||||
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
||||
|
||||
ctx->header.magic = GGUF_MAGIC;
|
||||
ctx->header.version = GGUF_VERSION;
|
||||
ctx->header.n_tensors = 0;
|
||||
ctx->header.n_kv = 0;
|
||||
|
||||
ctx->kv = NULL;
|
||||
ctx->infos = NULL;
|
||||
|
||||
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
|
||||
ctx->offset = 0;
|
||||
ctx->size = 0;
|
||||
|
||||
ctx->data = NULL;
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
||||
FILE * file = fopen(fname, "rb");
|
||||
if (!file) {
|
||||
@ -18673,7 +18713,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
// check the magic before making allocations
|
||||
{
|
||||
gguf_fread_el(&magic, sizeof(magic), file, &offset);
|
||||
gguf_fread_el(file, &magic, sizeof(magic), &offset);
|
||||
|
||||
if (magic != GGUF_MAGIC) {
|
||||
fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
|
||||
@ -18689,14 +18729,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
// read the header
|
||||
{
|
||||
ctx->header.magic = magic;
|
||||
ctx->header.kv = NULL;
|
||||
|
||||
ctx->kv = NULL;
|
||||
ctx->infos = NULL;
|
||||
ctx->data = NULL;
|
||||
|
||||
ok = ok && gguf_fread_el(&ctx->header.version, sizeof(ctx->header.version), file, &offset);
|
||||
ok = ok && gguf_fread_el(&ctx->header.n_tensors, sizeof(ctx->header.n_tensors), file, &offset);
|
||||
ok = ok && gguf_fread_el(&ctx->header.n_kv, sizeof(ctx->header.n_kv), file, &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read header\n", __func__);
|
||||
@ -18708,33 +18748,33 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
// read the kv pairs
|
||||
{
|
||||
ctx->header.kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
|
||||
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||
struct gguf_kv * kv = &ctx->header.kv[i];
|
||||
struct gguf_kv * kv = &ctx->kv[i];
|
||||
|
||||
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
||||
|
||||
ok = ok && gguf_fread_str(&kv->key, file, &offset);
|
||||
//ok = ok && gguf_fread_el (&kv->n_bytes, sizeof(kv->n_bytes), file, &offset);
|
||||
ok = ok && gguf_fread_el (&kv->type, sizeof(kv->type), file, &offset);
|
||||
ok = ok && gguf_fread_str(file, &kv->key, &offset);
|
||||
//ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
|
||||
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
|
||||
|
||||
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
|
||||
|
||||
switch (kv->type) {
|
||||
case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (&kv->value.uint8, sizeof(kv->value.uint8), file, &offset); break;
|
||||
case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (&kv->value.int8, sizeof(kv->value.int8), file, &offset); break;
|
||||
case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (&kv->value.uint16, sizeof(kv->value.uint16), file, &offset); break;
|
||||
case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (&kv->value.int16, sizeof(kv->value.int16), file, &offset); break;
|
||||
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (&kv->value.uint32, sizeof(kv->value.uint32), file, &offset); break;
|
||||
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (&kv->value.int32, sizeof(kv->value.int32), file, &offset); break;
|
||||
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (&kv->value.float32, sizeof(kv->value.float32), file, &offset); break;
|
||||
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (&kv->value.bool_, sizeof(kv->value.bool_), file, &offset); break;
|
||||
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(&kv->value.str, file, &offset); break;
|
||||
case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
|
||||
case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
|
||||
case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
|
||||
case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
|
||||
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
|
||||
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
|
||||
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
|
||||
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
|
||||
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
ok = ok && gguf_fread_el(&kv->value.arr.type, sizeof(kv->value.arr.type), file, &offset);
|
||||
ok = ok && gguf_fread_el(&kv->value.arr.n, sizeof(kv->value.arr.n), file, &offset);
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
||||
|
||||
switch (kv->value.arr.type) {
|
||||
case GGUF_TYPE_UINT8:
|
||||
@ -18747,17 +18787,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
case GGUF_TYPE_BOOL:
|
||||
{
|
||||
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||
ok = ok && gguf_fread_el(kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], file, &offset);
|
||||
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
|
||||
} break;
|
||||
case GGUF_TYPE_STRING:
|
||||
{
|
||||
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
|
||||
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
||||
ok = ok && gguf_fread_str(&((struct gguf_str *) kv->value.arr.data)[j], file, &offset);
|
||||
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
};
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
||||
@ -18787,14 +18827,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
info->ne[j] = 1;
|
||||
}
|
||||
|
||||
ok = ok && gguf_fread_str(&info->name, file, &offset);
|
||||
ok = ok && gguf_fread_el (&info->n_dims, sizeof(info->n_dims), file, &offset);
|
||||
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
||||
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
||||
ok = ok && gguf_fread_el(&info->ne[j], sizeof(info->ne[j]), file, &offset);
|
||||
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||
}
|
||||
//ok = ok && gguf_fread_el (&info->n_elms, sizeof(info->n_elms), file, &offset);
|
||||
ok = ok && gguf_fread_el (&info->type, sizeof(info->type), file, &offset);
|
||||
ok = ok && gguf_fread_el (&info->offset, sizeof(info->offset), file, &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
||||
@ -18827,8 +18866,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
// compute the total size of the data section, taking into account the alignment
|
||||
{
|
||||
|
||||
ctx->size_data = 0;
|
||||
ctx->size = 0;
|
||||
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||
|
||||
@ -18848,7 +18886,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
|
||||
|
||||
ctx->size_data += GGML_PAD(size_cur, ctx->alignment);
|
||||
ctx->size += GGML_PAD(size_cur, ctx->alignment);
|
||||
}
|
||||
}
|
||||
|
||||
@ -18862,7 +18900,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
const size_t mem_size =
|
||||
params.no_alloc ?
|
||||
(ctx->header.n_tensors )*ggml_tensor_overhead() :
|
||||
(ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size_data;
|
||||
(ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
|
||||
|
||||
struct ggml_init_params pdata = {
|
||||
.mem_size = mem_size,
|
||||
@ -18877,12 +18915,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
struct ggml_tensor * data = NULL;
|
||||
|
||||
if (params.no_alloc == false) {
|
||||
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size_data);
|
||||
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
|
||||
|
||||
ok = ok && data != NULL;
|
||||
|
||||
// read the binary blob with the tensor data
|
||||
ok = ok && gguf_fread_el(data->data, ctx->size_data, file, &offset);
|
||||
ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
|
||||
@ -18944,10 +18982,10 @@ void gguf_free(struct gguf_context * ctx) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (ctx->header.kv) {
|
||||
if (ctx->kv) {
|
||||
// free string memory - not great..
|
||||
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||
struct gguf_kv * kv = &ctx->header.kv[i];
|
||||
struct gguf_kv * kv = &ctx->kv[i];
|
||||
|
||||
if (kv->key.data) {
|
||||
free(kv->key.data);
|
||||
@ -18974,7 +19012,7 @@ void gguf_free(struct gguf_context * ctx) {
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ALIGNED_FREE(ctx->header.kv);
|
||||
GGML_ALIGNED_FREE(ctx->kv);
|
||||
}
|
||||
|
||||
if (ctx->infos) {
|
||||
@ -18992,6 +19030,10 @@ void gguf_free(struct gguf_context * ctx) {
|
||||
GGML_ALIGNED_FREE(ctx);
|
||||
}
|
||||
|
||||
const char * gguf_type_name(enum gguf_type type) {
|
||||
return GGUF_TYPE_NAME[type];
|
||||
}
|
||||
|
||||
int gguf_get_version(struct gguf_context * ctx) {
|
||||
return ctx->header.version;
|
||||
}
|
||||
@ -19014,9 +19056,10 @@ int gguf_get_n_kv(struct gguf_context * ctx) {
|
||||
|
||||
int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
||||
// return -1 if key not found
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
int keyfound = -1;
|
||||
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
|
||||
keyfound = i;
|
||||
@ -19028,71 +19071,87 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
||||
}
|
||||
|
||||
const char * gguf_get_key(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].key.data;
|
||||
return ctx->kv[i].key.data;
|
||||
}
|
||||
|
||||
enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].type;
|
||||
return ctx->kv[i].type;
|
||||
}
|
||||
|
||||
enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].value.arr.type;
|
||||
return ctx->kv[i].value.arr.type;
|
||||
}
|
||||
|
||||
const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.arr.data;
|
||||
}
|
||||
|
||||
const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
|
||||
struct gguf_kv * kv = &ctx->header.kv[key_id];
|
||||
struct gguf_kv * kv = &ctx->kv[key_id];
|
||||
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
||||
return str->data;
|
||||
}
|
||||
|
||||
float gguf_get_arr_f32(struct gguf_context * ctx, int key_id, int i) {
|
||||
return ((float *) ctx->header.kv[key_id].value.arr.data)[i];
|
||||
}
|
||||
|
||||
int gguf_get_arr_n(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].value.arr.n;
|
||||
return ctx->kv[i].value.arr.n;
|
||||
}
|
||||
|
||||
uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].value.uint8;
|
||||
return ctx->kv[i].value.uint8;
|
||||
}
|
||||
|
||||
int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].value.int8;
|
||||
return ctx->kv[i].value.int8;
|
||||
}
|
||||
|
||||
uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].value.uint16;
|
||||
return ctx->kv[i].value.uint16;
|
||||
}
|
||||
|
||||
int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].value.int16;
|
||||
return ctx->kv[i].value.int16;
|
||||
}
|
||||
|
||||
uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].value.uint32;
|
||||
return ctx->kv[i].value.uint32;
|
||||
}
|
||||
|
||||
int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].value.int32;
|
||||
return ctx->kv[i].value.int32;
|
||||
}
|
||||
|
||||
float gguf_get_val_f32(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].value.float32;
|
||||
return ctx->kv[i].value.float32;
|
||||
}
|
||||
|
||||
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].value.bool_;
|
||||
return ctx->kv[i].value.bool_;
|
||||
}
|
||||
|
||||
const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
|
||||
return ctx->header.kv[i].value.str.data;
|
||||
return ctx->kv[i].value.str.data;
|
||||
}
|
||||
|
||||
int gguf_get_n_tensors(struct gguf_context * ctx) {
|
||||
return ctx->header.n_tensors;
|
||||
}
|
||||
|
||||
int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
|
||||
// return -1 if tensor not found
|
||||
int tensorfound = -1;
|
||||
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
|
||||
tensorfound = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return tensorfound;
|
||||
}
|
||||
|
||||
size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
|
||||
return ctx->infos[i].offset;
|
||||
}
|
||||
@ -19101,6 +19160,400 @@ char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
|
||||
return ctx->infos[i].name.data;
|
||||
}
|
||||
|
||||
// returns the index
|
||||
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
|
||||
const int idx = gguf_find_key(ctx, key);
|
||||
if (idx >= 0) {
|
||||
return idx;
|
||||
}
|
||||
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
|
||||
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
|
||||
ctx->kv[n_kv].key.n = strlen(key) + 1;
|
||||
ctx->kv[n_kv].key.data = strdup(key);
|
||||
ctx->header.n_kv++;
|
||||
|
||||
return n_kv;
|
||||
}
|
||||
|
||||
void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_UINT8;
|
||||
ctx->kv[idx].value.uint8 = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_INT8;
|
||||
ctx->kv[idx].value.int8 = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_UINT16;
|
||||
ctx->kv[idx].value.uint16 = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_INT16;
|
||||
ctx->kv[idx].value.int16 = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_UINT32;
|
||||
ctx->kv[idx].value.uint32 = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_INT32;
|
||||
ctx->kv[idx].value.int32 = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
|
||||
ctx->kv[idx].value.float32 = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_BOOL;
|
||||
ctx->kv[idx].value.bool_ = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_STRING;
|
||||
ctx->kv[idx].value.str.n = strlen(val) + 1;
|
||||
ctx->kv[idx].value.str.data = strdup(val);
|
||||
}
|
||||
|
||||
void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||
ctx->kv[idx].value.arr.type = type;
|
||||
ctx->kv[idx].value.arr.n = n;
|
||||
ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
|
||||
memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
|
||||
}
|
||||
|
||||
void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
|
||||
ctx->kv[idx].value.arr.n = n;
|
||||
ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
|
||||
for (int i = 0; i < n; i++) {
|
||||
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
|
||||
str->n = strlen(data[i]) + 1;
|
||||
str->data = strdup(data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// set or add KV pairs from another context
|
||||
void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
||||
for (uint32_t i = 0; i < src->header.n_kv; i++) {
|
||||
switch (src->kv[i].type) {
|
||||
case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
|
||||
case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
|
||||
case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
|
||||
case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
|
||||
case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
|
||||
case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
|
||||
case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
|
||||
case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
|
||||
case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
|
||||
const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
|
||||
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
|
||||
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
|
||||
}
|
||||
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
|
||||
free(data);
|
||||
} if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
|
||||
GGML_ASSERT(false && "nested arrays not supported");
|
||||
} else {
|
||||
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void gguf_add_tensor(
|
||||
struct gguf_context * ctx,
|
||||
const struct ggml_tensor * tensor) {
|
||||
const int idx = ctx->header.n_tensors;
|
||||
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
|
||||
|
||||
ctx->infos[idx].name.n = strlen(tensor->name) + 1;
|
||||
ctx->infos[idx].name.data = strdup(tensor->name);
|
||||
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
ctx->infos[idx].ne[i] = 1;
|
||||
}
|
||||
|
||||
ctx->infos[idx].n_dims = tensor->n_dims;
|
||||
for (int i = 0; i < tensor->n_dims; i++) {
|
||||
ctx->infos[idx].ne[i] = tensor->ne[i];
|
||||
}
|
||||
|
||||
ctx->infos[idx].type = tensor->type;
|
||||
ctx->infos[idx].offset = 0;
|
||||
ctx->infos[idx].data = tensor->data;
|
||||
ctx->infos[idx].size = ggml_nbytes(tensor);
|
||||
|
||||
if (ctx->header.n_tensors > 0) {
|
||||
ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
|
||||
}
|
||||
|
||||
ctx->header.n_tensors++;
|
||||
}
|
||||
|
||||
void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
|
||||
const int idx = gguf_find_tensor(ctx, name);
|
||||
if (idx < 0) {
|
||||
GGML_ASSERT(false && "tensor not found");
|
||||
}
|
||||
|
||||
ctx->infos[idx].type = type;
|
||||
}
|
||||
|
||||
void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
|
||||
const int idx = gguf_find_tensor(ctx, name);
|
||||
if (idx < 0) {
|
||||
GGML_ASSERT(false && "tensor not found");
|
||||
}
|
||||
|
||||
ctx->infos[idx].data = data;
|
||||
ctx->infos[idx].size = size;
|
||||
|
||||
// update offsets
|
||||
for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
|
||||
ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
|
||||
}
|
||||
}
|
||||
|
||||
//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
|
||||
// fwrite(&val->n, sizeof(val->n), 1, file);
|
||||
// fwrite(val->data, sizeof(char), val->n, file);
|
||||
//}
|
||||
//
|
||||
//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
|
||||
// fwrite(val, sizeof(char), size, file);
|
||||
//}
|
||||
|
||||
struct gguf_buf {
|
||||
void * data;
|
||||
size_t size;
|
||||
size_t offset;
|
||||
};
|
||||
|
||||
static struct gguf_buf gguf_buf_init(size_t size) {
|
||||
struct gguf_buf buf = {
|
||||
/*buf.data =*/ size == 0 ? NULL : malloc(size),
|
||||
/*buf.size =*/ size,
|
||||
/*buf.offset =*/ 0,
|
||||
};
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void gguf_buf_free(struct gguf_buf buf) {
|
||||
if (buf.data) {
|
||||
free(buf.data);
|
||||
}
|
||||
}
|
||||
|
||||
static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
|
||||
if (buf->offset + size > buf->size) {
|
||||
buf->size = 1.5*(buf->offset + size);
|
||||
if (buf->data) {
|
||||
buf->data = realloc(buf->data, buf->size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
|
||||
gguf_buf_grow(buf, sizeof(val->n) + val->n);
|
||||
|
||||
buf->data && memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
|
||||
buf->offset += sizeof(val->n);
|
||||
|
||||
buf->data && memcpy((char *) buf->data + buf->offset, val->data, val->n);
|
||||
buf->offset += val->n;
|
||||
}
|
||||
|
||||
static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
|
||||
gguf_buf_grow(buf, el_size);
|
||||
|
||||
buf->data && memcpy((char *) buf->data + buf->offset, val, el_size);
|
||||
buf->offset += el_size;
|
||||
}
|
||||
|
||||
static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
||||
// write header
|
||||
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
|
||||
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
|
||||
gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
|
||||
gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
|
||||
|
||||
// write key-value pairs
|
||||
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||
struct gguf_kv * kv = &ctx->kv[i];
|
||||
|
||||
gguf_bwrite_str(buf, &kv->key);
|
||||
gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
|
||||
|
||||
switch (kv->type) {
|
||||
case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
|
||||
case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
|
||||
case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
|
||||
case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
|
||||
case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
|
||||
case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
|
||||
case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
|
||||
case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
|
||||
case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
|
||||
gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
|
||||
|
||||
switch (kv->value.arr.type) {
|
||||
case GGUF_TYPE_UINT8:
|
||||
case GGUF_TYPE_INT8:
|
||||
case GGUF_TYPE_UINT16:
|
||||
case GGUF_TYPE_INT16:
|
||||
case GGUF_TYPE_UINT32:
|
||||
case GGUF_TYPE_INT32:
|
||||
case GGUF_TYPE_FLOAT32:
|
||||
case GGUF_TYPE_BOOL:
|
||||
{
|
||||
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||
} break;
|
||||
case GGUF_TYPE_STRING:
|
||||
{
|
||||
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
||||
gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
};
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
||||
};
|
||||
}
|
||||
|
||||
// write tensor infos
|
||||
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||
|
||||
gguf_bwrite_str(buf, &info->name);
|
||||
gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
|
||||
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
||||
gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
|
||||
}
|
||||
gguf_bwrite_el(buf, &info->type, sizeof(info->type));
|
||||
gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
|
||||
}
|
||||
|
||||
// we require the data section to be aligned, so take into account any padding
|
||||
{
|
||||
const size_t offset = buf->offset;
|
||||
const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
|
||||
|
||||
if (offset_pad != offset) {
|
||||
uint8_t pad = 0;
|
||||
for (size_t i = 0; i < offset_pad - offset; ++i) {
|
||||
gguf_bwrite_el(buf, &pad, sizeof(pad));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (only_meta) {
|
||||
return;
|
||||
}
|
||||
|
||||
size_t offset = 0;
|
||||
|
||||
// write tensor data
|
||||
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||
|
||||
const size_t size = info->size;
|
||||
const size_t size_pad = GGML_PAD(size, ctx->alignment);
|
||||
|
||||
gguf_bwrite_el(buf, info->data, size);
|
||||
|
||||
if (size_pad != size) {
|
||||
uint8_t pad = 0;
|
||||
for (size_t j = 0; j < size_pad - size; ++j) {
|
||||
gguf_bwrite_el(buf, &pad, sizeof(pad));
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(offset == info->offset);
|
||||
|
||||
offset += size_pad;
|
||||
}
|
||||
}
|
||||
|
||||
void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
|
||||
FILE * file = fopen(fname, "wb");
|
||||
if (!file) {
|
||||
GGML_ASSERT(false && "failed to open file for writing");
|
||||
}
|
||||
|
||||
struct gguf_buf buf = gguf_buf_init(16*1024);
|
||||
|
||||
gguf_write_to_buf(ctx, &buf, only_meta);
|
||||
|
||||
fwrite(buf.data, 1, buf.offset, file);
|
||||
|
||||
gguf_buf_free(buf);
|
||||
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
size_t gguf_get_meta_size(struct gguf_context * ctx) {
|
||||
// no allocs - only compute size
|
||||
struct gguf_buf buf = gguf_buf_init(0);
|
||||
|
||||
gguf_write_to_buf(ctx, &buf, true);
|
||||
|
||||
return buf.offset;
|
||||
}
|
||||
|
||||
void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
|
||||
struct gguf_buf buf = gguf_buf_init(16*1024);
|
||||
|
||||
gguf_write_to_buf(ctx, &buf, true);
|
||||
|
||||
memcpy(data, buf.data, buf.offset);
|
||||
|
||||
gguf_buf_free(buf);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int ggml_cpu_has_avx(void) {
|
||||
|
63
ggml.h
63
ggml.h
@ -566,6 +566,7 @@ extern "C" {
|
||||
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
||||
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
||||
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
||||
|
||||
GGML_API int ggml_blck_size (enum ggml_type type);
|
||||
@ -1498,7 +1499,6 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||
@ -1711,7 +1711,6 @@ extern "C" {
|
||||
// gguf
|
||||
//
|
||||
|
||||
// TODO: can be removed if the API is extended for writing
|
||||
enum gguf_type {
|
||||
GGUF_TYPE_UINT8 = 0,
|
||||
GGUF_TYPE_INT8 = 1,
|
||||
@ -1735,10 +1734,14 @@ extern "C" {
|
||||
struct ggml_context ** ctx;
|
||||
};
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
||||
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
||||
|
||||
GGML_API void gguf_free(struct gguf_context * ctx);
|
||||
|
||||
GGML_API const char * gguf_type_name(enum gguf_type type);
|
||||
|
||||
GGML_API int gguf_get_version (struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
|
||||
@ -1747,13 +1750,11 @@ extern "C" {
|
||||
GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
|
||||
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
|
||||
|
||||
GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
|
||||
GGML_API void gguf_get_val (struct gguf_context * ctx, int i, void * val);
|
||||
|
||||
GGML_API const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i);
|
||||
GGML_API float gguf_get_arr_f32(struct gguf_context * ctx, int key_id, int i);
|
||||
|
||||
// results are undefined if the wrong type is used for the key
|
||||
GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
|
||||
GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
|
||||
@ -1764,12 +1765,60 @@ extern "C" {
|
||||
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
||||
GGML_API void gguf_get_arr_data(struct gguf_context * ctx, int i, void * data);
|
||||
GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
|
||||
|
||||
GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
|
||||
GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
|
||||
|
||||
// overrides existing values or adds a new one
|
||||
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
||||
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
|
||||
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
|
||||
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
|
||||
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
||||
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
||||
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
||||
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
||||
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
||||
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
||||
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
|
||||
|
||||
// set or add KV pairs from another context
|
||||
GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
|
||||
|
||||
// manage tensor info
|
||||
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
|
||||
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
|
||||
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
|
||||
|
||||
// writing gguf files can be done in 2 ways:
|
||||
//
|
||||
// - write the entire gguf_context to a binary file in a single pass:
|
||||
//
|
||||
// gguf_write_to_file(ctx, fname);
|
||||
//
|
||||
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
|
||||
//
|
||||
// FILE * f = fopen(fname, "wb");
|
||||
// fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
|
||||
// fwrite(f, ...);
|
||||
// void * data = gguf_meta_get_meta_data(ctx);
|
||||
// fseek(f, 0, SEEK_SET);
|
||||
// fwrite(f, data, gguf_get_meta_size(ctx));
|
||||
// free(data);
|
||||
// fclose(f);
|
||||
//
|
||||
|
||||
// write the entire context to a binary file
|
||||
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
|
||||
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
||||
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
|
||||
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
|
||||
|
||||
//
|
||||
// system info
|
||||
//
|
||||
|
2346
gguf-llama.cpp
2346
gguf-llama.cpp
File diff suppressed because it is too large
Load Diff
30
gguf-llama.h
30
gguf-llama.h
@ -111,6 +111,7 @@ extern "C" {
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool embedding; // embedding mode only
|
||||
};
|
||||
|
||||
// model file types
|
||||
enum llama_ftype {
|
||||
LLAMA_FTYPE_ALL_F32 = 0,
|
||||
@ -190,17 +191,12 @@ extern "C" {
|
||||
int32_t n_eval;
|
||||
};
|
||||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
|
||||
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
||||
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
|
||||
|
||||
LLAMA_API int llama_max_devices();
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
|
||||
|
||||
LLAMA_API bool llama_mmap_supported();
|
||||
LLAMA_API bool llama_mlock_supported();
|
||||
LLAMA_API int llama_max_devices(void);
|
||||
LLAMA_API bool llama_mmap_supported(void);
|
||||
LLAMA_API bool llama_mlock_supported(void);
|
||||
|
||||
// TODO: not great API - very likely to change
|
||||
// Initialize the llama + ggml backend
|
||||
@ -208,9 +204,9 @@ extern "C" {
|
||||
// Call once at the start of the program
|
||||
LLAMA_API void llama_backend_init(bool numa);
|
||||
// Call once at the end of the program - currently only used for MPI
|
||||
LLAMA_API void llama_backend_free();
|
||||
LLAMA_API void llama_backend_free(void);
|
||||
|
||||
LLAMA_API int64_t llama_time_us();
|
||||
LLAMA_API int64_t llama_time_us(void);
|
||||
|
||||
LLAMA_API struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
@ -377,9 +373,9 @@ extern "C" {
|
||||
char * str,
|
||||
int length);
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_nl(); // next-line
|
||||
LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(void); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_nl(void); // next-line
|
||||
|
||||
// Grammar
|
||||
//
|
||||
@ -459,6 +455,10 @@ extern "C" {
|
||||
// Print system information
|
||||
LLAMA_API const char * llama_print_system_info(void);
|
||||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
470
gguf-util.h
470
gguf-util.h
@ -1,470 +0,0 @@
|
||||
// GGUF counterpart of llama-util.h.
|
||||
// we may consider making it a part of ggml.c once GGUF work is complete.
|
||||
// this will require extra work to migrate this to pure C.
|
||||
// Contains wrappers around OS interfaces.
|
||||
|
||||
#ifndef GGUF_UTIL_H
|
||||
#define GGUF_UTIL_H
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
#include <cstdarg>
|
||||
#include <cstdlib>
|
||||
#include <climits>
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_MAPPED_FILES)
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
#if defined(_POSIX_MEMLOCK_RANGE)
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
#include <stdio.h> // for _fseeki64
|
||||
#endif
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
__attribute__((format(gnu_printf, 1, 2)))
|
||||
#else
|
||||
__attribute__((format(printf, 1, 2)))
|
||||
#endif
|
||||
#endif
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap, ap2;
|
||||
va_start(ap, fmt);
|
||||
va_copy(ap2, ap);
|
||||
int size = vsnprintf(NULL, 0, fmt, ap);
|
||||
GGML_ASSERT(size >= 0 && size < INT_MAX);
|
||||
std::vector<char> buf(size + 1);
|
||||
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
||||
GGML_ASSERT(size2 == size);
|
||||
va_end(ap2);
|
||||
va_end(ap);
|
||||
return std::string(buf.data(), size);
|
||||
}
|
||||
|
||||
// TODO: can we merge this one and gguf_context?
|
||||
struct gguf_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
gguf_file(const char * fname, const char * mode) {
|
||||
fp = std::fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
}
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
#ifdef _WIN32
|
||||
__int64 ret = _ftelli64(fp);
|
||||
#else
|
||||
long ret = std::ftell(fp);
|
||||
#endif
|
||||
GGML_ASSERT(ret != -1); // this really shouldn't fail
|
||||
return (size_t) ret;
|
||||
}
|
||||
|
||||
void seek(size_t offset, int whence) {
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
||||
#else
|
||||
int ret = std::fseek(fp, (long) offset, whence);
|
||||
#endif
|
||||
GGML_ASSERT(ret == 0); // same
|
||||
}
|
||||
|
||||
size_t write_str(const std::string & val) {
|
||||
size_t total_written = 0;
|
||||
const int32_t n = val.size();
|
||||
fwrite((const char *) &n, sizeof(n), 1, fp);
|
||||
total_written += sizeof(n);
|
||||
fwrite(val.c_str(), n, 1, fp);
|
||||
total_written += n;
|
||||
|
||||
return total_written;
|
||||
}
|
||||
|
||||
size_t write_i32(int32_t val) {
|
||||
fwrite((const char *) &val, sizeof(val), 1, fp);
|
||||
return sizeof(val);
|
||||
}
|
||||
|
||||
size_t write_u64(size_t val) {
|
||||
fwrite((const char *) &val, sizeof(val), 1, fp);
|
||||
return sizeof(val);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void write_val(const std::string & key, enum gguf_type type, const T & val) {
|
||||
write_str(key);
|
||||
fwrite((const char *) &type, sizeof(type), 1, fp);
|
||||
fwrite((const char *) &val, sizeof(val), 1, fp);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void write_arr(const std::string & key, enum gguf_type type, const std::vector<T> & val) {
|
||||
write_str(key);
|
||||
{
|
||||
const enum gguf_type tarr = GGUF_TYPE_ARRAY;
|
||||
fwrite((const char *) &tarr, sizeof(tarr), 1, fp);
|
||||
}
|
||||
|
||||
const int32_t n = val.size();
|
||||
fwrite((const char *) &type, sizeof(type), 1, fp);
|
||||
fwrite((const char *) &n, sizeof(n), 1, fp);
|
||||
fwrite(val.data(), sizeof(T), n, fp);
|
||||
}
|
||||
|
||||
void write_str(const std::string & key, enum gguf_type type, const std::string & val) {
|
||||
write_str(key);
|
||||
fwrite((const char *) &type, sizeof(type), 1, fp);
|
||||
|
||||
const int32_t n = val.size();
|
||||
fwrite((const char *) &n, sizeof(n), 1, fp);
|
||||
fwrite(val.c_str(), n, 1, fp);
|
||||
}
|
||||
|
||||
void write_str(const std::string & key, enum gguf_type type, const std::vector<std::string> & val) {
|
||||
write_str(key);
|
||||
{
|
||||
const enum gguf_type tarr = GGUF_TYPE_ARRAY;
|
||||
fwrite((const char *) &tarr, sizeof(tarr), 1, fp);
|
||||
}
|
||||
|
||||
const int32_t n = val.size();
|
||||
fwrite((const char *) &type, sizeof(type), 1, fp);
|
||||
fwrite((const char *) &n, sizeof(n), 1, fp);
|
||||
for (int i = 0; i < n; ++i) {
|
||||
const int32_t nstr = val[i].size();
|
||||
fwrite((const char *) &nstr, sizeof(nstr), 1, fp);
|
||||
fwrite(val[i].c_str(), nstr, 1, fp);
|
||||
}
|
||||
}
|
||||
|
||||
void write_zeros(size_t count) {
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
fputc(0, fp);
|
||||
}
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, len, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
||||
}
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, len, 1, fp);
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
||||
}
|
||||
}
|
||||
|
||||
~gguf_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(_WIN32)
|
||||
static std::string gguf_format_win_err(DWORD err) {
|
||||
LPSTR buf;
|
||||
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
|
||||
if (!size) {
|
||||
return "FormatMessageA failed";
|
||||
}
|
||||
std::string ret(buf, size);
|
||||
LocalFree(buf);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
struct gguf_mmap {
|
||||
void * addr;
|
||||
size_t size;
|
||||
|
||||
gguf_mmap(const gguf_mmap &) = delete;
|
||||
|
||||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
gguf_mmap(struct gguf_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_SHARED;
|
||||
// prefetch/readahead impairs performance on NUMA systems
|
||||
if (numa) { prefetch = 0; }
|
||||
#ifdef __linux__
|
||||
if (prefetch) { flags |= MAP_POPULATE; }
|
||||
#endif
|
||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
||||
if (addr == MAP_FAILED) {
|
||||
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
||||
}
|
||||
|
||||
if (prefetch > 0) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
if (numa) {
|
||||
// advise the kernel not to use readahead
|
||||
// (because the next page might not belong on the same node)
|
||||
if (madvise(addr, file->size, MADV_RANDOM)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~gguf_mmap() {
|
||||
munmap(addr, size);
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
gguf_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
|
||||
(void) numa;
|
||||
|
||||
size = file->size;
|
||||
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
||||
|
||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
DWORD error = GetLastError();
|
||||
|
||||
if (hMapping == NULL) {
|
||||
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
||||
error = GetLastError();
|
||||
CloseHandle(hMapping);
|
||||
|
||||
if (addr == NULL) {
|
||||
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T)size;
|
||||
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
||||
gguf_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
|
||||
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
}
|
||||
|
||||
~gguf_mmap() {
|
||||
if (!UnmapViewOfFile(addr)) {
|
||||
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
gguf_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
|
||||
(void) prefetch;
|
||||
(void) numa;
|
||||
|
||||
throw std::runtime_error(std::string("mmap not supported"));
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
// Represents some region of memory being locked using mlock or VirtualLock;
|
||||
// will automatically unlock on destruction.
|
||||
struct gguf_mlock {
|
||||
void * addr = NULL;
|
||||
size_t size = 0;
|
||||
bool failed_already = false;
|
||||
|
||||
gguf_mlock() {}
|
||||
gguf_mlock(const gguf_mlock &) = delete;
|
||||
|
||||
~gguf_mlock() {
|
||||
if (size) {
|
||||
raw_unlock(addr, size);
|
||||
}
|
||||
}
|
||||
|
||||
void init(void * ptr) {
|
||||
GGML_ASSERT(addr == NULL && size == 0);
|
||||
addr = ptr;
|
||||
}
|
||||
|
||||
void grow_to(size_t target_size) {
|
||||
GGML_ASSERT(addr);
|
||||
if (failed_already) {
|
||||
return;
|
||||
}
|
||||
size_t granularity = lock_granularity();
|
||||
target_size = (target_size + granularity - 1) & ~(granularity - 1);
|
||||
if (target_size > size) {
|
||||
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
|
||||
size = target_size;
|
||||
} else {
|
||||
failed_already = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _POSIX_MEMLOCK_RANGE
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
return (size_t) sysconf(_SC_PAGESIZE);
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
||||
#else
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
||||
#endif
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) {
|
||||
if (!mlock(addr, size)) {
|
||||
return true;
|
||||
} else {
|
||||
char* errmsg = std::strerror(errno);
|
||||
bool suggest = (errno == ENOMEM);
|
||||
|
||||
// Check if the resource limit is fine after all
|
||||
struct rlimit lock_limit;
|
||||
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
|
||||
suggest = false;
|
||||
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
|
||||
suggest = false;
|
||||
|
||||
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
||||
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#undef MLOCK_SUGGESTION
|
||||
|
||||
void raw_unlock(void * addr, size_t size) {
|
||||
if (munlock(addr, size)) {
|
||||
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
|
||||
}
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si);
|
||||
return (size_t) si.dwPageSize;
|
||||
}
|
||||
|
||||
bool raw_lock(void * ptr, size_t len) {
|
||||
for (int tries = 1; ; tries++) {
|
||||
if (VirtualLock(ptr, len)) {
|
||||
return true;
|
||||
}
|
||||
if (tries == 2) {
|
||||
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
||||
len, size, llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// It failed but this was only the first try; increase the working
|
||||
// set size and try again.
|
||||
SIZE_T min_ws_size, max_ws_size;
|
||||
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
|
||||
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
|
||||
gguf_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
// Per MSDN: "The maximum number of pages that a process can lock
|
||||
// is equal to the number of pages in its minimum working set minus
|
||||
// a small overhead."
|
||||
// Hopefully a megabyte is enough overhead:
|
||||
size_t increment = len + 1048576;
|
||||
// The minimum must be <= the maximum, so we need to increase both:
|
||||
min_ws_size += increment;
|
||||
max_ws_size += increment;
|
||||
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
|
||||
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
|
||||
gguf_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void raw_unlock(void * ptr, size_t len) {
|
||||
if (!VirtualUnlock(ptr, len)) {
|
||||
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
||||
gguf_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
size_t lock_granularity() {
|
||||
return (size_t) 65536;
|
||||
}
|
||||
|
||||
bool raw_lock(const void * addr, size_t len) {
|
||||
fprintf(stderr, "warning: mlock not supported on this system\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
void raw_unlock(const void * addr, size_t len) {}
|
||||
#endif
|
||||
};
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user