gemma : use more bits for the token_embd.weight tensor (#5650)

* gemma : use Q8_0 for the token_embd.weight tensor

* llama : quantize token_embd.weight using output type
This commit is contained in:
Georgi Gerganov 2024-02-22 23:23:46 +02:00 committed by GitHub
parent 847eedbdb2
commit 96633eeca1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -10498,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
return std::make_pair(i_layer, n_layer); return std::make_pair(i_layer, n_layer);
}; };
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
// with the quantization of the output tensor
if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
(LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
int nx = tensor->ne[0]; int nx = tensor->ne[0];
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
new_type = GGML_TYPE_Q8_0; new_type = GGML_TYPE_Q8_0;