quantize : --pure option for disabling k-quant mixtures

mirror of https://github.com/ggerganov/llama.cpp.git
commit 8a86b95e87 (parent ee37e35dc5)
examples/quantize/quantize.cpp
@@ -68,13 +68,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 }
 
 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {
@@ -101,6 +102,8 @@ int main(int argc, char ** argv) {
             params.quantize_output_tensor = false;
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
             params.allow_requantize = true;
+        } else if (strcmp(argv[arg_idx], "--pure") == 0) {
+            params.pure = true;
         } else {
             usage(argv[0]);
         }
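
With the flag parsed above, an illustrative invocation would look as follows; the model paths and the Q4_K type are placeholders, not part of the commit:

    ./quantize --pure models/llama/ggml-model.gguf models/llama/ggml-model-quant.gguf Q4_K 8

Without --pure, a k-quant type produces a mixture in which selected tensors may receive a different (typically higher-precision) type; with --pure, every quantized tensor gets the chosen type.
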
llama.cpp
@@ -8380,7 +8380,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         if (quantize) {
             new_type = quantized_type;
-            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
+            if (!params->pure) {
+                new_type = get_k_quant_type(qs, new_type, tensor, ftype);
+            }
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
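
In effect, the hunk above makes get_k_quant_type() conditional: that function is what turns the base quantization type into a per-tensor mixture, and --pure skips it. A minimal self-contained sketch of that behavior, using toy stand-ins rather than the real llama.cpp internals:

    // sketch only: toy types and a toy mixture rule, not the real llama.cpp API
    #include <cstring>
    #include <cstdio>

    enum toy_qtype { TOY_Q4_K = 0, TOY_Q6_K = 1 };

    // stand-in for get_k_quant_type(): a k-quant mixture may give some tensors
    // (here, output.weight) a higher-precision type than the base
    static toy_qtype toy_mixture(const char * name, toy_qtype base) {
        return strcmp(name, "output.weight") == 0 ? TOY_Q6_K : base;
    }

    static toy_qtype pick_type(const char * name, toy_qtype base, bool pure) {
        toy_qtype new_type = base;
        if (!pure) {
            new_type = toy_mixture(name, new_type); // mixture override, as in the hunk
        }
        return new_type;
    }

    int main() {
        printf("%d\n", pick_type("output.weight", TOY_Q4_K, false)); // 1: mixture bumps to TOY_Q6_K
        printf("%d\n", pick_type("output.weight", TOY_Q4_K, true));  // 0: pure keeps TOY_Q4_K
        return 0;
    }
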
@@ -8835,6 +8837,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.allow_requantize       =*/ false,
         /*.quantize_output_tensor =*/ true,
         /*.only_copy              =*/ false,
+        /*.pure                   =*/ false,
     };
 
     return result;
llama.h
@@ -191,6 +191,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
     } llama_model_quantize_params;
 
     // grammar types
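
For callers of the library (rather than the quantize tool), a minimal usage sketch of the new field; the file names are placeholders, and the surrounding calls follow the public C API in llama.h as of this commit:

    #include "llama.h"

    int main(void) {
        llama_backend_init(false); // no NUMA

        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; // base type for every tensor
        params.pure  = true;                      // skip k-quant mixture overrides

        // returns 0 on success
        const int rc = llama_model_quantize("ggml-model-f16.gguf", "ggml-model-quant.gguf", &params);

        llama_backend_free();
        return rc;
    }
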