quantize: options for output and token embedding tensors qtype (#6239)

* quantize: be able to specify the output tensor type

* quantize: be able to specify the token embedding tensor type

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Kawrakow, 2024-03-22 19:47:14 +01:00, committed by GitHub
commit 1d0331c12a
parent dba1af6129
3 changed files with 61 additions and 26 deletions

examples/quantize/quantize.cpp

@@ -189,6 +189,18 @@ static void prepare_imatrix(const std::string& imatrix_file,
     }
 }
 
+static ggml_type parse_ggml_type(const char * arg) {
+    ggml_type result = GGML_TYPE_COUNT;
+    for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
+        auto type = ggml_type(j);
+        const auto * name = ggml_type_name(type);
+        if (name && strcmp(arg, name) == 0) {
+            result = type; break;
+        }
+    }
+    return result;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
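For reference, parse_ggml_type accepts exactly the names that ggml itself reports via ggml_type_name, e.g. "f16", "q8_0", "q6_K". A minimal sketch of its behavior (illustration only, not part of the commit):

    ggml_type t = parse_ggml_type("q8_0");     // name matches -> GGML_TYPE_Q8_0
    ggml_type u = parse_ggml_type("no-such");  // no match     -> GGML_TYPE_COUNT sentinel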
@@ -203,6 +215,18 @@ int main(int argc, char ** argv) {
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
             params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
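Together with the pre-existing positional arguments, the new flags can be combined in a single run. A hypothetical invocation (model file names are placeholders; the type arguments must be ggml type names as above):

    ./quantize --output-tensor-type q8_0 --token-embedding-type q8_0 \
        model-f16.gguf model-q4_k_m.gguf q4_K_M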

llama.cpp

@@ -12141,27 +12141,34 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
     if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
-        int nx = tensor->ne[0];
-        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-            new_type = GGML_TYPE_Q8_0;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if (new_type != GGML_TYPE_Q8_0) {
-            new_type = GGML_TYPE_Q6_K;
+        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->output_tensor_type;
+        } else {
+            int nx = tensor->ne[0];
+            if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (new_type != GGML_TYPE_Q8_0) {
+                new_type = GGML_TYPE_Q6_K;
+            }
         }
     } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
-            new_type = GGML_TYPE_Q2_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ3_S;
+        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->token_embedding_type;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+                new_type = GGML_TYPE_Q2_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
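Design note: the comparison against GGML_TYPE_COUNT works because the new fields default to GGML_TYPE_COUNT, which doubles as an "unset" sentinel; any smaller value is a concrete ggml type chosen by the user and takes precedence over the per-ftype heuristics. Condensed as a standalone sketch (hypothetical helper, not in the commit):

    // user override wins; otherwise fall back to the ftype-derived choice
    static ggml_type pick_type(ggml_type user_choice, ggml_type heuristic) {
        return user_choice < GGML_TYPE_COUNT ? user_choice : heuristic;
    }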
@@ -13051,6 +13058,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread                     =*/ 0,
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
+        /*.token_embedding_type        =*/ GGML_TYPE_COUNT,
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
         /*.only_copy                   =*/ false,
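Callers linking against the library rather than the quantize tool can set the same fields through the C API. A minimal sketch, assuming placeholder paths in-f16.gguf / out.gguf (the calls themselves are the ones declared in llama.h):

    #include "llama.h"

    int main(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M; // base mix for the remaining tensors
        params.output_tensor_type   = GGML_TYPE_Q8_0;            // force output.weight to q8_0
        params.token_embedding_type = GGML_TYPE_Q8_0;            // force token_embd.weight to q8_0

        llama_backend_init();
        const uint32_t rc = llama_model_quantize("in-f16.gguf", "out.gguf", &params);
        llama_backend_free();
        return rc == 0 ? 0 : 1;                                  // quantize returns 0 on success
    }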

llama.h

@@ -275,13 +275,15 @@ extern "C" {
     // model quantization parameters
     typedef struct llama_model_quantize_params {
         int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                           // quantize all tensors to the default type
         void * imatrix;                      // pointer to importance matrix data
     } llama_model_quantize_params;
 
     // grammar types