ggml : same IQ4_NL quantization for CPU/CUDA/Metal (#6196)

* Make quantize_row_iq4_nl do the same thing as quantization on CUDA

* Make quantize_row_iq4_nl do the same thing as quantization on CUDA

This time for real. backend-ops tests pass.

* Now fix test-quantize-fns

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow 2024-03-21 13:59:38 +01:00 committed by GitHub
parent 5b7b0ac8df
commit cfd3be76e3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -11705,9 +11705,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
float * scales, float * weight, uint8_t * L, float * scales, float * weight, uint8_t * L,
const int8_t * values, const int8_t * values,
const float * quant_weights) { const float * quant_weights,
const int ntry) {
const int ntry = 7;
float sigma2 = 0; float sigma2 = 0;
for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j]; for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
@ -11719,6 +11718,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
float max_scale = 0, amax_scale = 0; float max_scale = 0, amax_scale = 0;
for (int ib = 0; ib < super_block_size/block_size; ++ib) { for (int ib = 0; ib < super_block_size/block_size; ++ib) {
const float * xb = x + ib*block_size; const float * xb = x + ib*block_size;
uint8_t * Lb = L + ib*block_size;
if (quant_weights) { if (quant_weights) {
const float * qw = quant_weights + ib*block_size; const float * qw = quant_weights + ib*block_size;
for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
@ -11736,12 +11736,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
scales[ib] = 0; scales[ib] = 0;
continue; continue;
} }
float d = -max/values[0]; float d = ntry > 0 ? -max/values[0] : max/values[0];
float id = 1/d; float id = 1/d;
float sumqx = 0, sumq2 = 0; float sumqx = 0, sumq2 = 0;
for (int j = 0; j < block_size; ++j) { for (int j = 0; j < block_size; ++j) {
float al = id*xb[j]; float al = id*xb[j];
int l = best_index_int8(16, values, al); int l = best_index_int8(16, values, al);
Lb[j] = l;
float q = values[l]; float q = values[l];
float w = weight[j]; float w = weight[j];
sumqx += w*q*xb[j]; sumqx += w*q*xb[j];
@ -11796,11 +11797,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
} }
} else { } else {
dh[0] = GGML_FP32_TO_FP16(scales[0]); dh[0] = GGML_FP32_TO_FP16(scales[0]);
if (ntry > 0) {
float id = scales[0] ? 1/scales[0] : 0; float id = scales[0] ? 1/scales[0] : 0;
for (int j = 0; j < super_block_size; ++j) { for (int j = 0; j < super_block_size; ++j) {
L[j] = best_index_int8(16, values, id*x[j]); L[j] = best_index_int8(16, values, id*x[j]);
} }
} }
}
for (int i = 0; i < super_block_size/32; ++i) { for (int i = 0; i < super_block_size/32; ++i) {
for (int j = 0; j < 16; ++j) { for (int j = 0; j < 16; ++j) {
@ -11823,7 +11826,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
for (int ibl = 0; ibl < nblock; ++ibl) { for (int ibl = 0; ibl < nblock; ++ibl) {
const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL; const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
&scale, weight, L, kvalues_iq4nl, qw); &scale, weight, L, kvalues_iq4nl, qw, 7);
} }
src += n_per_row; src += n_per_row;
qrow += nblock*sizeof(block_iq4_nl); qrow += nblock*sizeof(block_iq4_nl);
@ -11832,14 +11835,23 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
} }
void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
assert(k % QK4_NL == 0); GGML_ASSERT(k%QK4_NL == 0);
block_iq4_nl * restrict y = vy; int nblock = k/QK4_NL;
quantize_row_iq4_nl_reference(x, y, k); uint8_t L[QK4_NL];
float weight[QK4_NL];
uint16_t unused_h;
uint8_t * unused_l = NULL;
float scale;
block_iq4_nl * iq4 = (block_iq4_nl *)vy;
for (int ibl = 0; ibl < nblock; ++ibl) {
quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
&scale, weight, L, kvalues_iq4nl, NULL, -1);
}
} }
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
assert(k % QK4_NL == 0); assert(k % QK4_NL == 0);
quantize_iq4_nl(x, y, 1, k, NULL); quantize_row_iq4_nl(x, y, k);
} }
size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
@ -11857,7 +11869,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
for (int ibl = 0; ibl < nblock; ++ibl) { for (int ibl = 0; ibl < nblock; ++ibl) {
const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL; const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l, quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
scales, weight, L, kvalues_iq4nl, qw); scales, weight, L, kvalues_iq4nl, qw, 7);
} }
src += n_per_row; src += n_per_row;
qrow += nblock*sizeof(block_iq4_xs); qrow += nblock*sizeof(block_iq4_xs);