llama : correction of the attn.v.weight quantization for IQ3_XS (#6209)

IQ3_XS was not mentioned, IQ3_S and IQ3_M were present twice. That PR corrects this in the manner which was probably intended initially.
2024-12-25 10:54:36 +00:00 · 2024-03-22 14:32:02 +01:00 · 2024-03-22 14:32:02 +01:00 · e80f06d2a1
commit e80f06d2a1
parent f77a8ffd3b
1 changed files with 1 additions and 7 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -12027,13 +12027,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {