From cfd3be76e37dab92c846d75a2421178f20db4a11 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Thu, 21 Mar 2024 13:59:38 +0100 Subject: [PATCH] ggml : same IQ4_NL quantization for CPU/CUDA/Metal (#6196) * Make quantize_row_iq4_nl do the same thing is quantization on CUDA * Make quantize_row_iq4_nl do the same thing is quantization on CUDA This time for real. backend-ops tests pass. * Now fix test-quantize-fns --------- Co-authored-by: Iwan Kawrakow --- ggml-quants.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 109dd6660..2eaca0593 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -11705,9 +11705,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, float * scales, float * weight, uint8_t * L, const int8_t * values, - const float * quant_weights) { - - const int ntry = 7; + const float * quant_weights, + const int ntry) { float sigma2 = 0; for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j]; @@ -11719,6 +11718,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block float max_scale = 0, amax_scale = 0; for (int ib = 0; ib < super_block_size/block_size; ++ib) { const float * xb = x + ib*block_size; + uint8_t * Lb = L + ib*block_size; if (quant_weights) { const float * qw = quant_weights + ib*block_size; for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); @@ -11736,12 +11736,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block scales[ib] = 0; continue; } - float d = -max/values[0]; + float d = ntry > 0 ? -max/values[0] : max/values[0]; float id = 1/d; float sumqx = 0, sumq2 = 0; for (int j = 0; j < block_size; ++j) { float al = id*xb[j]; int l = best_index_int8(16, values, al); + Lb[j] = l; float q = values[l]; float w = weight[j]; sumqx += w*q*xb[j]; @@ -11796,9 +11797,11 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block } } else { dh[0] = GGML_FP32_TO_FP16(scales[0]); - float id = scales[0] ? 1/scales[0] : 0; - for (int j = 0; j < super_block_size; ++j) { - L[j] = best_index_int8(16, values, id*x[j]); + if (ntry > 0) { + float id = scales[0] ? 1/scales[0] : 0; + for (int j = 0; j < super_block_size; ++j) { + L[j] = best_index_int8(16, values, id*x[j]); + } } } @@ -11823,7 +11826,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow for (int ibl = 0; ibl < nblock; ++ibl) { const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL; quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, - &scale, weight, L, kvalues_iq4nl, qw); + &scale, weight, L, kvalues_iq4nl, qw, 7); } src += n_per_row; qrow += nblock*sizeof(block_iq4_nl); @@ -11832,14 +11835,23 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow } void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_NL == 0); - block_iq4_nl * restrict y = vy; - quantize_row_iq4_nl_reference(x, y, k); + GGML_ASSERT(k%QK4_NL == 0); + int nblock = k/QK4_NL; + uint8_t L[QK4_NL]; + float weight[QK4_NL]; + uint16_t unused_h; + uint8_t * unused_l = NULL; + float scale; + block_iq4_nl * iq4 = (block_iq4_nl *)vy; + for (int ibl = 0; ibl < nblock; ++ibl) { + quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, + &scale, weight, L, kvalues_iq4nl, NULL, -1); + } } void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { assert(k % QK4_NL == 0); - quantize_iq4_nl(x, y, 1, k, NULL); + quantize_row_iq4_nl(x, y, k); } size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { @@ -11857,7 +11869,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow for (int ibl = 0; ibl < nblock; ++ibl) { const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL; quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l, - scales, weight, L, kvalues_iq4nl, qw); + scales, weight, L, kvalues_iq4nl, qw, 7); } src += n_per_row; qrow += nblock*sizeof(block_iq4_xs);