mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
gguf-split: add --no-tensor-first-split (#7072)
This commit is contained in:
parent
cf768b7e71
commit
842500144e
@ -32,6 +32,7 @@ struct split_params {
|
|||||||
int n_split_tensors = 128;
|
int n_split_tensors = 128;
|
||||||
std::string input;
|
std::string input;
|
||||||
std::string output;
|
std::string output;
|
||||||
|
bool no_tensor_first_split = false;
|
||||||
bool dry_run = false;
|
bool dry_run = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -49,6 +50,7 @@ static void split_print_usage(const char * executable) {
|
|||||||
printf(" --merge merge multiple GGUF to a single GGUF\n");
|
printf(" --merge merge multiple GGUF to a single GGUF\n");
|
||||||
printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
|
printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
|
||||||
printf(" --split-max-size N(M|G) max size per split\n");
|
printf(" --split-max-size N(M|G) max size per split\n");
|
||||||
|
printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
|
||||||
printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
|
printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
@ -100,6 +102,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
|
|||||||
arg_found = true;
|
arg_found = true;
|
||||||
params.dry_run = true;
|
params.dry_run = true;
|
||||||
}
|
}
|
||||||
|
if (arg == "--no-tensor-first-split") {
|
||||||
|
arg_found = true;
|
||||||
|
params.no_tensor_first_split = true;
|
||||||
|
}
|
||||||
|
|
||||||
if (is_op_set) {
|
if (is_op_set) {
|
||||||
throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
|
throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
|
||||||
@ -200,10 +206,10 @@ struct split_strategy {
|
|||||||
// because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
|
// because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
|
||||||
int i_split = -1;
|
int i_split = -1;
|
||||||
struct gguf_context * ctx_out = NULL;
|
struct gguf_context * ctx_out = NULL;
|
||||||
auto new_ctx_out = [&]() {
|
auto new_ctx_out = [&](bool allow_no_tensors) {
|
||||||
i_split++;
|
i_split++;
|
||||||
if (ctx_out != NULL) {
|
if (ctx_out != NULL) {
|
||||||
if (gguf_get_n_tensors(ctx_out) == 0) {
|
if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
|
||||||
fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
|
fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
@ -220,7 +226,12 @@ struct split_strategy {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// initialize ctx_out for the first split
|
// initialize ctx_out for the first split
|
||||||
new_ctx_out();
|
new_ctx_out(false);
|
||||||
|
|
||||||
|
// skip first split if no_tensor_first_split is set
|
||||||
|
if (params.no_tensor_first_split) {
|
||||||
|
new_ctx_out(true);
|
||||||
|
}
|
||||||
|
|
||||||
// process tensors one by one
|
// process tensors one by one
|
||||||
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
|
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
|
||||||
@ -230,7 +241,7 @@ struct split_strategy {
|
|||||||
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
|
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
|
||||||
size_t next_tensors_size = curr_tensors_size + n_bytes;
|
size_t next_tensors_size = curr_tensors_size + n_bytes;
|
||||||
if (should_split(i, next_tensors_size)) {
|
if (should_split(i, next_tensors_size)) {
|
||||||
new_ctx_out();
|
new_ctx_out(false);
|
||||||
curr_tensors_size = n_bytes;
|
curr_tensors_size = n_bytes;
|
||||||
} else {
|
} else {
|
||||||
curr_tensors_size = next_tensors_size;
|
curr_tensors_size = next_tensors_size;
|
||||||
|
@ -55,15 +55,15 @@ $MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
|
|||||||
echo PASS
|
echo PASS
|
||||||
echo
|
echo
|
||||||
|
|
||||||
# 4. Split with no tensor in metadata
|
# 4. Split with no tensors in the first split
|
||||||
#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
|
$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
|
||||||
#echo PASS
|
echo PASS
|
||||||
#echo
|
echo
|
||||||
|
|
||||||
# 4b. Test the sharded model is loading properly
|
# 4b. Test the sharded model is loading properly
|
||||||
#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
|
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
|
||||||
#echo PASS
|
echo PASS
|
||||||
#echo
|
echo
|
||||||
|
|
||||||
# 5. Merge
|
# 5. Merge
|
||||||
#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
|
#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
|
||||||
|
2
ggml.c
2
ggml.c
@ -21139,7 +21139,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||||||
}
|
}
|
||||||
|
|
||||||
// read the tensor infos
|
// read the tensor infos
|
||||||
{
|
if (ctx->header.n_tensors > 0) {
|
||||||
ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
|
ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
|
||||||
|
|
||||||
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||||
|
Loading…
Reference in New Issue
Block a user