diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 39c75e0a7..e04feeae3 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -32,6 +32,7 @@ struct split_params { int n_split_tensors = 128; std::string input; std::string output; + bool no_tensor_first_split = false; bool dry_run = false; }; @@ -49,6 +50,7 @@ static void split_print_usage(const char * executable) { printf(" --merge merge multiple GGUF to a single GGUF\n"); printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors); printf(" --split-max-size N(M|G) max size per split\n"); + printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n"); printf(" --dry-run only print out a split plan and exit, without writing any new files\n"); printf("\n"); } @@ -100,6 +102,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p arg_found = true; params.dry_run = true; } + if (arg == "--no-tensor-first-split") { + arg_found = true; + params.no_tensor_first_split = true; + } if (is_op_set) { throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); @@ -200,10 +206,10 @@ struct split_strategy { // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits int i_split = -1; struct gguf_context * ctx_out = NULL; - auto new_ctx_out = [&]() { + auto new_ctx_out = [&](bool allow_no_tensors) { i_split++; if (ctx_out != NULL) { - if (gguf_get_n_tensors(ctx_out) == 0) { + if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) { fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n"); exit(EXIT_FAILURE); } @@ -220,7 +226,12 @@ struct split_strategy { }; // initialize ctx_out for the first split - new_ctx_out(); + new_ctx_out(false); + + // skip first split if no_tensor_first_split is set + if (params.no_tensor_first_split) { + new_ctx_out(true); + } // process tensors one by one size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata) @@ -230,7 +241,7 @@ struct split_strategy { size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT); size_t next_tensors_size = curr_tensors_size + n_bytes; if (should_split(i, next_tensors_size)) { - new_ctx_out(); + new_ctx_out(false); curr_tensors_size = n_bytes; } else { curr_tensors_size = next_tensors_size; diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh index 57588204d..7ca6fa7f2 100755 --- a/examples/gguf-split/tests.sh +++ b/examples/gguf-split/tests.sh @@ -55,15 +55,15 @@ $MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32 echo PASS echo -# 4. Split with no tensor in metadata -#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors -#echo PASS -#echo +# 4. Split with no tensors in the first split +$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors +echo PASS +echo # 4b. Test the sharded model is loading properly -#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32 -#echo PASS -#echo +$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32 +echo PASS +echo # 5. Merge #$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf diff --git a/ggml.c b/ggml.c index 74ecd5927..82179a125 100644 --- a/ggml.c +++ b/ggml.c @@ -21139,7 +21139,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } // read the tensor infos - { + if (ctx->header.n_tensors > 0) { ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info)); for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {