mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
Allow quantize to only copy tensors, some other improvements (#2931)
* Allow the quantize tool to only copy tensors, which allows repackaging models.
* Slightly better logic when requantizing.
* Change the help message to go to `stdout`.
This commit is contained in:
parent
0d58936686
commit
5d6f19f16b
@ -35,6 +35,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||||
|
// Note: Ensure COPY comes after F32 to prevent ftype 0 from matching it.
|
||||||
|
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -71,12 +73,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
|
|||||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||||
//
|
//
|
||||||
void usage(const char * executable) {
|
void usage(const char * executable) {
|
||||||
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||||
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||||
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||||
fprintf(stderr, "\nAllowed quantization types:\n");
|
printf("\nAllowed quantization types:\n");
|
||||||
for (auto & it : QUANT_OPTIONS) {
|
for (auto & it : QUANT_OPTIONS) {
|
||||||
printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
|
if (it.name != "COPY") {
|
||||||
|
printf(" %2d or ", it.ftype);
|
||||||
|
} else {
|
||||||
|
printf(" ");
|
||||||
|
}
|
||||||
|
printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
|
||||||
}
|
}
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@ -121,6 +128,9 @@ int main(int argc, char ** argv) {
|
|||||||
// export as [inp path]/ggml-model-[ftype].gguf
|
// export as [inp path]/ggml-model-[ftype].gguf
|
||||||
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
|
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
|
||||||
arg_idx++;
|
arg_idx++;
|
||||||
|
if (ftype_str == "COPY") {
|
||||||
|
params.only_copy = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
fname_out = argv[arg_idx];
|
fname_out = argv[arg_idx];
|
||||||
@ -133,6 +143,10 @@ int main(int argc, char ** argv) {
|
|||||||
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
||||||
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
|
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
|
||||||
return 1;
|
return 1;
|
||||||
|
} else {
|
||||||
|
if (ftype_str == "COPY") {
|
||||||
|
params.only_copy = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
arg_idx++;
|
arg_idx++;
|
||||||
}
|
}
|
||||||
|
25
llama.cpp
25
llama.cpp
@ -4683,6 +4683,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
llm_load_arch(*ml, model);
|
llm_load_arch(*ml, model);
|
||||||
llm_load_hparams(*ml, model, 0, 0, 0);
|
llm_load_hparams(*ml, model, 0, 0, 0);
|
||||||
|
|
||||||
|
if (params->only_copy) {
|
||||||
|
ftype = model.ftype;
|
||||||
|
}
|
||||||
|
|
||||||
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
||||||
struct gguf_context * ctx_out = gguf_init_empty();
|
struct gguf_context * ctx_out = gguf_init_empty();
|
||||||
|
|
||||||
@ -4769,18 +4773,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
// quantize only 2D tensors
|
// quantize only 2D tensors
|
||||||
quantize &= (tensor->n_dims == 2);
|
quantize &= (tensor->n_dims == 2);
|
||||||
quantize &= params->quantize_output_tensor || name != "output.weight";
|
quantize &= params->quantize_output_tensor || name != "output.weight";
|
||||||
quantize &= quantized_type != tensor->type;
|
quantize &= !params->only_copy;
|
||||||
|
|
||||||
enum ggml_type new_type;
|
enum ggml_type new_type;
|
||||||
void * new_data;
|
void * new_data;
|
||||||
size_t new_size;
|
size_t new_size;
|
||||||
|
|
||||||
if (!quantize) {
|
if (quantize) {
|
||||||
new_type = tensor->type;
|
|
||||||
new_data = tensor->data;
|
|
||||||
new_size = ggml_nbytes(tensor);
|
|
||||||
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
|
||||||
} else {
|
|
||||||
new_type = quantized_type;
|
new_type = quantized_type;
|
||||||
#ifdef GGML_USE_K_QUANTS
|
#ifdef GGML_USE_K_QUANTS
|
||||||
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
||||||
@ -4879,7 +4878,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
// If we've decided to quantize to the same type the tensor is already
|
||||||
|
// in then there's nothing to do.
|
||||||
|
quantize = tensor->type != new_type;
|
||||||
|
}
|
||||||
|
if (!quantize) {
|
||||||
|
new_type = tensor->type;
|
||||||
|
new_data = tensor->data;
|
||||||
|
new_size = ggml_nbytes(tensor);
|
||||||
|
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||||
|
} else {
|
||||||
const size_t nelements = ggml_nelements(tensor);
|
const size_t nelements = ggml_nelements(tensor);
|
||||||
|
|
||||||
float * f32_data;
|
float * f32_data;
|
||||||
@ -5310,6 +5318,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|||||||
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
||||||
/*.allow_requantize =*/ false,
|
/*.allow_requantize =*/ false,
|
||||||
/*.quantize_output_tensor =*/ true,
|
/*.quantize_output_tensor =*/ true,
|
||||||
|
/*.only_copy =*/ false,
|
||||||
};
|
};
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
1
llama.h
1
llama.h
@ -164,6 +164,7 @@ extern "C" {
|
|||||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||||
bool quantize_output_tensor; // quantize output.weight
|
bool quantize_output_tensor; // quantize output.weight
|
||||||
|
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||||
} llama_model_quantize_params;
|
} llama_model_quantize_params;
|
||||||
|
|
||||||
// grammar types
|
// grammar types
|
||||||
|
Loading…
Reference in New Issue
Block a user